From 693074b3371415d3ac98cb0b52612a482e16cd8c Mon Sep 17 00:00:00 2001
From: fszontagh <szf@fsociety.hu>
Date: Sun, 25 Feb 2024 07:24:07 +0000
Subject: [PATCH] remove submodule

---
 /dev/null |  102 ---------------------------------------------------
 1 files changed, 0 insertions(+), 102 deletions(-)

diff --git a/ggml/.editorconfig b/ggml/.editorconfig
deleted file mode 100644
index 6f987c0..0000000
--- a/ggml/.editorconfig
+++ /dev/null
@@ -1,22 +0,0 @@
-# https://EditorConfig.org
-
-# Top-most EditorConfig file
-root = true
-
-# Unix-style newlines with a newline ending every file, utf-8 charset
-[*]
-end_of_line = lf
-insert_final_newline = true
-trim_trailing_whitespace = true
-charset = utf-8
-indent_style = space
-indent_size = 4
-
-[*.md]
-indent_size = 2
-
-[Makefile]
-indent_style = tab
-
-[prompts/*.txt]
-insert_final_newline = unset
diff --git a/ggml/.github/workflows/ci.yml b/ggml/.github/workflows/ci.yml
deleted file mode 100644
index 4da1cd4..0000000
--- a/ggml/.github/workflows/ci.yml
+++ /dev/null
@@ -1,139 +0,0 @@
-name: CI
-
-on:
-  push:
-    branches: [ master ]
-  pull_request:
-    branches: [ master ]
-
-jobs:
-  test-ubuntu-opencl:
-    if: false
-    runs-on: ubuntu-latest
-    env:
-      GGML_NLOOP: 3
-      GGML_NITER: 1
-      GGML_N_THREADS: 2
-
-    steps:
-    - uses: actions/checkout@v3
-
-    - name: Dependencies
-      run: |
-        wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | sudo tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null
-        echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | sudo tee /etc/apt/sources.list.d/oneAPI.list
-        sudo apt-get update
-        sudo apt-get install -y --no-install-recommends llvm intel-oneapi-runtime-opencl intel-oneapi-runtime-compilers libclblast-dev
-    - name: Create Build Environment
-      run: mkdir build
-
-    - name: Configure CMake
-      working-directory: ./build
-      run: cmake -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DGGML_TEST_COVERAGE=ON -DGGML_CLBLAST=ON ..
-
-    - name: Build
-      working-directory: ./build
-      run: make
-
-    - name: Test
-      working-directory: ./build
-      run: ctest --verbose --timeout 900
-
-    - name: Test Coverage
-      working-directory: ./build
-      run: |
-        llvm-profdata merge -sparse tests/*.profraw -o ggml.profdata
-        llvm-cov      report ./bin/test-grad0 -instr-profile=ggml.profdata
-        llvm-cov      report ./bin/test-opt   -instr-profile=ggml.profdata
-
-  test-macos-metal:
-    runs-on: macos-13
-    env:
-      GGML_NLOOP: 3
-      GGML_NITER: 1
-      GGML_N_THREADS: 2
-
-    steps:
-    - uses: actions/checkout@v3
-
-    - name: Create Build Environment
-      run: mkdir build
-
-    - name: Configure CMake
-      working-directory: ./build
-      run: cmake -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DGGML_TEST_COVERAGE=ON ..
-
-    - name: Build
-      working-directory: ./build
-      run: make
-
-    - name: Test
-      working-directory: ./build
-      run: ctest --verbose --timeout 900
-
-    - name: Test Coverage
-      working-directory: ./build
-      run: |
-        xcrun llvm-profdata merge -sparse tests/*.profraw -o ggml.profdata
-        xcrun llvm-cov      report ./bin/test-grad0 -instr-profile=ggml.profdata
-        xcrun llvm-cov      report ./bin/test-opt   -instr-profile=ggml.profdata
-
-  build:
-
-    strategy:
-      matrix:
-        os: [ubuntu-latest, macos-latest]
-
-    runs-on: ${{ matrix.os }}
-
-    env:
-      GGML_NLOOP: 3
-      GGML_NITER: 1
-
-    steps:
-    - uses: actions/checkout@v3
-
-    - name: Dependencies for Ubuntu
-      if: matrix.os == 'ubuntu-latest'
-      run: |
-        sudo apt-get update
-        sudo apt-get install llvm
-
-    - name: Set GGML_N_THREADS for Ubuntu
-      run: echo "GGML_N_THREADS=2" >> $GITHUB_ENV
-      if: matrix.os == 'ubuntu-latest'
-
-    - name: Set GGML_N_THREADS for MacOS
-      run: echo "GGML_N_THREADS=2" >> $GITHUB_ENV
-      if: matrix.os == 'macos-latest'
-
-    - name: Create Build Environment
-      run: mkdir build
-
-    - name: Configure CMake
-      working-directory: ./build
-      run: cmake -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DGGML_TEST_COVERAGE=ON ..
-
-    - name: Build
-      working-directory: ./build
-      run: make
-
-    - name: Test
-      working-directory: ./build
-      run: ctest --verbose --timeout 900
-
-    - name: Test Coverage for Ubuntu
-      if: matrix.os == 'ubuntu-latest'
-      working-directory: ./build
-      run: |
-        llvm-profdata merge -sparse tests/*.profraw -o ggml.profdata
-        llvm-cov      report ./bin/test-grad0 -instr-profile=ggml.profdata
-        llvm-cov      report ./bin/test-opt   -instr-profile=ggml.profdata
-
-    - name: Test Coverage for MacOS
-      if: matrix.os == 'macos-latest'
-      working-directory: ./build
-      run: |
-        xcrun llvm-profdata merge -sparse tests/*.profraw -o ggml.profdata
-        xcrun llvm-cov      report ./bin/test-grad0 -instr-profile=ggml.profdata
-        xcrun llvm-cov      report ./bin/test-opt   -instr-profile=ggml.profdata
diff --git a/ggml/.gitignore b/ggml/.gitignore
deleted file mode 100644
index fbf3c63..0000000
--- a/ggml/.gitignore
+++ /dev/null
@@ -1,41 +0,0 @@
-build/
-build-debug/
-build-release/
-build-sanitize-addr/
-build-sanitize-thread/
-build-cov/
-build-ci-debug/
-build-ci-release/
-build-cublas/
-out/
-tmp/
-models/
-models-mnt
-
-compile_commands.json
-CMakeSettings.json
-.vs/
-.vscode/
-.clangd
-
-.exrc
-.cache
-.DS_Store
-.stablelm
-.gpt-2
-
-src/arm_neon.h
-tests/arm_neon.h
-
-zig-out/
-zig-cache/
-
-*.dot
-
-*.sw?
-
-__pycache__/
-
-# Model files
-ggml-model-f16.bin
-*.bat
diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
deleted file mode 100644
index ba0a745..0000000
--- a/ggml/CMakeLists.txt
+++ /dev/null
@@ -1,206 +0,0 @@
-cmake_minimum_required (VERSION 3.3)
-project(ggml VERSION 0.1.0)
-
-set(CMAKE_EXPORT_COMPILE_COMMANDS "on")
-set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
-set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib")
-
-if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
-    set(GGML_STANDALONE ON)
-    include(cmake/GitVars.cmake)
-    include(cmake/BuildTypes.cmake)
-else()
-    set(GGML_STANDALONE OFF)
-endif()
-
-if (EMSCRIPTEN)
-    set(BUILD_SHARED_LIBS_DEFAULT OFF)
-else()
-    if (MINGW)
-        set(BUILD_SHARED_LIBS_DEFAULT OFF)
-    else()
-        set(BUILD_SHARED_LIBS_DEFAULT ON)
-    endif()
-endif()
-
-# options
-
-option(BUILD_SHARED_LIBS            "ggml: build shared libs" ${BUILD_SHARED_LIBS_DEFAULT})
-
-option(GGML_ALL_WARNINGS            "ggml: enable all compiler warnings"                   ON)
-option(GGML_ALL_WARNINGS_3RD_PARTY  "ggml: enable all compiler warnings in 3rd party libs" OFF)
-
-option(GGML_SANITIZE_THREAD         "ggml: enable thread sanitizer"    OFF)
-option(GGML_SANITIZE_ADDRESS        "ggml: enable address sanitizer"   OFF)
-option(GGML_SANITIZE_UNDEFINED      "ggml: enable undefined sanitizer" OFF)
-
-option(GGML_BUILD_TESTS             "ggml: build tests"    ${GGML_STANDALONE})
-option(GGML_BUILD_EXAMPLES          "ggml: build examples" ${GGML_STANDALONE})
-
-option(GGML_TEST_COVERAGE           "ggml: enable test coverage" OFF)
-
-option(GGML_PERF                    "ggml: enable perf timings"          OFF)
-option(GGML_NO_ACCELERATE           "ggml: disable Accelerate framework" OFF)
-option(GGML_OPENBLAS                "ggml: use OpenBLAS"                 OFF)
-option(GGML_CLBLAST                 "ggml: use clBLAST"                  OFF)
-option(GGML_HIPBLAS                 "ggml: use hipBLAS"                  OFF)
-option(GGML_CUBLAS                  "ggml: use cuBLAS"                   OFF)
-option(GGML_METAL                   "ggml: use Metal"                    OFF)
-
-option(GGML_CUDA_FORCE_DMMV                 "ggml: use dmmv instead of mmvq CUDA kernels"     OFF)
-option(GGML_CUDA_FORCE_MMQ                  "ggml: use mmq kernels instead of cuBLAS"         OFF)
-set(GGML_CUDA_DMMV_X      "32" CACHE STRING "ggml: x stride for dmmv CUDA kernels")
-set(GGML_CUDA_MMV_Y        "1" CACHE STRING "ggml: y block size for mmv CUDA kernels")
-option(GGML_CUDA_F16                        "ggml: use 16 bit floats for some calculations"   OFF)
-set(GGML_CUDA_KQUANTS_ITER "2" CACHE STRING "ggml: iters./thread per block for Q2_K/Q6_K")
-set(GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
-                                            "ggml: max. batch size for using peer access")
-# sanitizers
-
-if (GGML_SANITIZE_THREAD)
-    set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS}   -fsanitize=thread")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=thread")
-endif()
-
-if (GGML_SANITIZE_ADDRESS)
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}     -fsanitize=address -fno-omit-frame-pointer")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address -fno-omit-frame-pointer")
-endif()
-
-if (GGML_SANITIZE_UNDEFINED)
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}     -fsanitize=undefined")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=undefined")
-endif()
-
-# instruction set specific
-option(GGML_AVX                     "ggml: enable AVX"                                     ON)
-option(GGML_AVX2                    "ggml: enable AVX2"                                    ON)
-option(GGML_AVX512                  "ggml: enable AVX512"                                  OFF)
-option(GGML_AVX512_VBMI             "ggml: enable AVX512-VBMI"                             OFF)
-option(GGML_AVX512_VNNI             "ggml: enable AVX512-VNNI"                             OFF)
-option(GGML_FMA                     "ggml: enable FMA"                                     ON)
-# in MSVC F16C is implied with AVX2/AVX512
-if (NOT MSVC)
-    option(GGML_F16C                "ggml: enable F16C"                                    ON)
-endif()
-
-#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -ffast-math")
-#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native")
-#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mcpu=native")
-
-# warning flags
-
-if (GGML_ALL_WARNINGS)
-    if (NOT MSVC)
-        set(c_flags   -Wall -Wpedantic -Wformat=2 -Wno-unused -Wstrict-prototypes)
-        set(cxx_flags -Wall -Wpedantic -Wformat=2)
-    else()
-        # todo : windows
-    endif()
-
-    add_compile_options(
-        "$<$<COMPILE_LANGUAGE:C>:${c_flags}>"
-        "$<$<COMPILE_LANGUAGE:CXX>:${cxx_flags}>"
-    )
-endif()
-
-if (NOT MSVC)
-    add_compile_options(
-        "$<$<COMPILE_LANGUAGE:C>:-Werror=vla>"
-        "$<$<COMPILE_LANGUAGE:CXX>:-Werror=vla>"
-        "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler;-Werror=vla>"
-    )
-endif()
-
-#
-# POSIX conformance
-#
-
-# clock_gettime came in POSIX.1b (1993)
-# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
-# posix_memalign came in POSIX.1-2001 / SUSv3
-# M_PI is an XSI extension since POSIX.1-2001 / SUSv3, came in XPG1 (1985)
-add_compile_definitions(_XOPEN_SOURCE=600)
-
-# Somehow in OpenBSD whenever POSIX conformance is specified
-# some string functions rely on locale_t availability,
-# which was introduced in POSIX.1-2008, forcing us to go higher
-if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
-    remove_definitions(-D_XOPEN_SOURCE=600)
-    add_compile_definitions(_XOPEN_SOURCE=700)
-endif()
-
-# Data types, macros and functions related to controlling CPU affinity
-# are available on Linux through GNU extensions in libc
-if (CMAKE_SYSTEM_NAME MATCHES "Linux")
-    add_compile_definitions(_GNU_SOURCE)
-endif()
-
-# RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1,
-# and on macOS its availability depends on enabling Darwin extensions
-# similarly on DragonFly, enabling BSD extensions is necessary
-if (CMAKE_SYSTEM_NAME MATCHES "Darwin")
-    add_compile_definitions(_DARWIN_C_SOURCE)
-endif()
-if (CMAKE_SYSTEM_NAME MATCHES "DragonFly")
-    add_compile_definitions(_DARWIN_C_SOURCE)
-endif()
-
-# alloca is a non-standard interface that is not visible on BSDs when
-# POSIX conformance is specified, but not all of them provide a clean way
-# to enable it in such cases
-if (CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
-    add_compile_definitions(__BSD_VISIBLE)
-endif()
-if (CMAKE_SYSTEM_NAME MATCHES "NetBSD")
-    add_compile_definitions(_NETBSD_SOURCE)
-endif()
-if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
-    add_compile_definitions(_BSD_SOURCE)
-endif()
-
-if (WHISPER_PERF)
-    set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_PERF)
-endif()
-
-# dependencies
-
-set(CMAKE_C_STANDARD   11)
-set(CMAKE_CXX_STANDARD 11)
-
-find_package(Threads REQUIRED)
-
-# main
-
-if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
-    set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
-    set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "RelWithDebInfo")
-endif ()
-
-if (GGML_BUILD_TESTS)
-    if (GGML_TEST_COVERAGE)
-        if (CMAKE_C_COMPILER_ID MATCHES "Clang")
-            set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS}   -fprofile-instr-generate -fcoverage-mapping")
-            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fprofile-instr-generate -fcoverage-mapping")
-        else()
-            message(WARNING "Test coverage is only supported for Clang")
-        endif()
-    endif()
-endif()
-
-add_subdirectory(src)
-
-if (GGML_BUILD_TESTS)
-    enable_testing()
-    add_subdirectory(tests)
-endif ()
-
-if (GGML_BUILD_EXAMPLES)
-    add_subdirectory(examples)
-endif ()
-
-configure_file(${CMAKE_CURRENT_SOURCE_DIR}/ggml.pc.in
-               ${CMAKE_CURRENT_BINARY_DIR}/ggml.pc
-               @ONLY)
-install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ggml.pc
-        DESTINATION share/pkgconfig)
diff --git a/ggml/LICENSE b/ggml/LICENSE
deleted file mode 100644
index fb7ff0c..0000000
--- a/ggml/LICENSE
+++ /dev/null
@@ -1,21 +0,0 @@
-MIT License
-
-Copyright (c) 2022 Georgi Gerganov
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
diff --git a/ggml/Package.swift b/ggml/Package.swift
deleted file mode 100644
index 0c3313d..0000000
--- a/ggml/Package.swift
+++ /dev/null
@@ -1,49 +0,0 @@
-// swift-tools-version: 5.5
-
-import PackageDescription
-
-let package = Package(
-    name: "ggml",
-    platforms: [
-        .macOS(.v12),
-        .iOS(.v14),
-        .watchOS(.v4),
-        .tvOS(.v14)
-    ],
-    products: [
-        .library(name: "ggml", targets: ["ggml"]),
-    ],
-    targets: [
-        .target(
-            name: "ggml",
-            path: ".",
-            exclude: [],
-            sources: [
-                "src/ggml.c",
-                "src/ggml-alloc.c",
-                "src/ggml-backend.c",
-                "src/ggml-quants.c",
-                "src/ggml-metal.m",
-            ],
-            resources: [
-                .process("src/ggml-metal.metal")
-            ],
-            publicHeadersPath: "include/ggml",
-            cSettings: [
-                .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
-                .define("GGML_USE_ACCELERATE"),
-                .unsafeFlags(["-fno-objc-arc"]),
-                .define("GGML_USE_METAL"),
-                // NOTE: NEW_LAPACK will required iOS version 16.4+
-                // We should consider add this in the future when we drop support for iOS 14
-                // (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
-                // .define("ACCELERATE_NEW_LAPACK"),
-                // .define("ACCELERATE_LAPACK_ILP64")
-            ],
-            linkerSettings: [
-                .linkedFramework("Accelerate")
-            ]
-        )
-    ],
-    cxxLanguageStandard: .cxx11
-)
diff --git a/ggml/README.md b/ggml/README.md
deleted file mode 100644
index 50d4c12..0000000
--- a/ggml/README.md
+++ /dev/null
@@ -1,179 +0,0 @@
-# ggml
-
-[Roadmap](https://github.com/users/ggerganov/projects/7) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205)
-
-Tensor library for machine learning
-
-***Note that this project is under active development. \
-Some of the development is currently happening in the [llama.cpp](https://github.com/ggerganov/llama.cpp) and [whisper.cpp](https://github.com/ggerganov/whisper.cpp) repos***
-
-## Features
-
-- Written in C
-- 16-bit float support
-- Integer quantization support (4-bit, 5-bit, 8-bit, etc.)
-- Automatic differentiation
-- ADAM and L-BFGS optimizers
-- Optimized for Apple Silicon
-- On x86 architectures utilizes AVX / AVX2 intrinsics
-- On ppc64 architectures utilizes VSX intrinsics
-- No third-party dependencies
-- Zero memory allocations during runtime
-
-## Updates
-
-- [X] Example of GPT-2 inference [examples/gpt-2](https://github.com/ggerganov/ggml/tree/master/examples/gpt-2)
-- [X] Example of GPT-J inference [examples/gpt-j](https://github.com/ggerganov/ggml/tree/master/examples/gpt-j)
-- [X] Example of Whisper inference [examples/whisper](https://github.com/ggerganov/ggml/tree/master/examples/whisper)
-- [X] Support 4-bit integer quantization https://github.com/ggerganov/ggml/pull/27
-- [X] Example of Cerebras-GPT inference [examples/gpt-2](https://github.com/ggerganov/ggml/tree/master/examples/gpt-2)
-- [ ] Example of FLAN-T5 inference https://github.com/ggerganov/ggml/pull/12
-- [X] Example of LLaMA inference [ggerganov/llama.cpp](https://github.com/ggerganov/llama.cpp)
-- [X] Example of LLaMA training [ggerganov/llama.cpp/examples/baby-llama](https://github.com/ggerganov/llama.cpp/tree/master/examples/baby-llama)
-- [X] Example of Falcon inference [cmp-nct/ggllm.cpp](https://github.com/cmp-nct/ggllm.cpp)
-- [X] Example of BLOOM inference [NouamaneTazi/bloomz.cpp](https://github.com/NouamaneTazi/bloomz.cpp)
-- [X] Example of RWKV inference [saharNooby/rwkv.cpp](https://github.com/saharNooby/rwkv.cpp)
-- [X] Example of SAM inference [examples/sam](https://github.com/ggerganov/ggml/tree/master/examples/sam)
-- [X] Idea for GPU support: https://github.com/ggerganov/llama.cpp/discussions/915
-- [X] Example of StableLM (GPT-NeoX) inference [examples/gpt-neox](https://github.com/ggerganov/ggml/tree/master/examples/gpt-neox)
-- [X] Example of BERT inference [skeskinen/bert.cpp](https://github.com/skeskinen/bert.cpp)
-- [X] Example of 💫 StarCoder inference [examples/starcoder](https://github.com/ggerganov/ggml/tree/master/examples/starcoder)
-- [X] Example of MPT inference [examples/mpt](https://github.com/ggerganov/ggml/tree/master/examples/mpt)
-- [X] Example of Replit inference [examples/replit](https://github.com/ggerganov/ggml/tree/master/examples/replit)
-- [X] Example of BioGPT inference [PABannier/biogpt.cpp](https://github.com/PABannier/biogpt.cpp)
-- [X] Example of Encodec inference [PABannier/encodec.cpp](https://github.com/PABannier/encodec.cpp)
-- [X] Example of CLIP inference [monatis/clip.cpp](https://github.com/monatis/clip.cpp)
-- [X] Example of MiniGPT4 inference [Maknee/minigpt4.cpp](https://github.com/Maknee/minigpt4.cpp)
-- [X] Example of ChatGLM inference [li-plus/chatglm.cpp](https://github.com/li-plus/chatglm.cpp)
-- [X] Example of Stable Diffusion inference [leejet/stable-diffusion.cpp](https://github.com/leejet/stable-diffusion.cpp)
-- [X] Example of Qwen inference [QwenLM/qwen.cpp](https://github.com/QwenLM/qwen.cpp)
-- [X] Example of YOLO inference [examples/yolo](https://github.com/ggerganov/ggml/tree/master/examples/yolo)
-- [X] Example of ViT inference [staghado/vit.cpp](https://github.com/staghado/vit.cpp)
-- [X] SeamlessM4T inference *(in development)* https://github.com/facebookresearch/seamless_communication/tree/main/ggml
-
-## Whisper inference (example)
-
-With ggml you can efficiently run [Whisper](examples/whisper) inference on the CPU.
-
-Memory requirements:
-
-| Model  | Disk   | Mem     |
-| ---    | ---    | ---     |
-| tiny   |  75 MB | ~280 MB |
-| base   | 142 MB | ~430 MB |
-| small  | 466 MB | ~1.0 GB |
-| medium | 1.5 GB | ~2.6 GB |
-| large  | 2.9 GB | ~4.7 GB |
-
-## GPT inference (example)
-
-With ggml you can efficiently run [GPT-2](examples/gpt-2) and [GPT-J](examples/gpt-j) inference on the CPU.
-
-Here is how to run the example programs:
-
-```bash
-# Build ggml + examples
-git clone https://github.com/ggerganov/ggml
-cd ggml
-mkdir build && cd build
-cmake ..
-make -j4 gpt-2-backend gpt-j
-
-# Run the GPT-2 small 117M model
-../examples/gpt-2/download-ggml-model.sh 117M
-./bin/gpt-2-backend -m models/gpt-2-117M/ggml-model.bin -p "This is an example"
-
-# Run the GPT-J 6B model (requires 12GB disk space and 16GB CPU RAM)
-../examples/gpt-j/download-ggml-model.sh 6B
-./bin/gpt-j -m models/gpt-j-6B/ggml-model.bin -p "This is an example"
-
-# Install Python dependencies
-python3 -m pip install -r ../requirements.txt
-
-# Run the Cerebras-GPT 111M model
-# Download from: https://huggingface.co/cerebras
-python3 ../examples/gpt-2/convert-cerebras-to-ggml.py /path/to/Cerebras-GPT-111M/
-./bin/gpt-2 -m /path/to/Cerebras-GPT-111M/ggml-model-f16.bin -p "This is an example"
-```
-
-The inference speeds that I get for the different models on my 32GB MacBook M1 Pro are as follows:
-
-| Model | Size  | Time / Token |
-| ---   | ---   | ---    |
-| GPT-2 |  117M |   5 ms |
-| GPT-2 |  345M |  12 ms |
-| GPT-2 |  774M |  23 ms |
-| GPT-2 | 1558M |  42 ms |
-| ---   | ---   | ---    |
-| GPT-J |    6B | 125 ms |
-
-For more information, checkout the corresponding programs in the [examples](examples) folder.
-
-## Using Metal (only with GPT-2)
-
-For GPT-2 models, offloading to GPU is possible. Note that it will not improve inference performances but will reduce power consumption and free up the CPU for other tasks.
-
-To enable GPU offloading on MacOS:
-
-```bash
-cmake -DGGML_METAL=ON -DBUILD_SHARED_LIBS=Off ..
-
-# add -ngl 1
-./bin/gpt-2 -t 4 -ngl 100 -m models/gpt-2-117M/ggml-model.bin -p "This is an example"
-```
-
-## Using cuBLAS
-
-```bash
-# fix the path to point to your CUDA compiler
-cmake -DGGML_CUBLAS=ON -DCMAKE_CUDA_COMPILER=/usr/local/cuda-12.1/bin/nvcc ..
-```
-
-## Using clBLAST
-
-```bash
-cmake -DGGML_CLBLAST=ON ..
-```
-## Compiling for Android
-
-Download and unzip the NDK from this download [page](https://developer.android.com/ndk/downloads). Set the NDK_ROOT_PATH environment variable or provide the absolute path to the CMAKE_ANDROID_NDK in the command below.
-
-```bash
-cmake .. \
-   -DCMAKE_SYSTEM_NAME=Android \
-   -DCMAKE_SYSTEM_VERSION=33 \
-   -DCMAKE_ANDROID_ARCH_ABI=arm64-v8a \
-   -DCMAKE_ANDROID_NDK=$NDK_ROOT_PATH
-   -DCMAKE_ANDROID_STL_TYPE=c++_shared 
-```
-
-```bash
-# Create directories
-adb shell 'mkdir /data/local/tmp/bin'
-adb shell 'mkdir /data/local/tmp/models'
-
-# Push the compiled binaries to the folder
-adb push bin/* /data/local/tmp/bin/
-
-# Push the ggml library
-adb push src/libggml.so /data/local/tmp/
-
-# Push model files
-adb push models/gpt-2-117M/ggml-model.bin /data/local/tmp/models/
-
-
-# Now lets do some inference ...
-adb shell
-
-# Now we are in shell
-cd /data/local/tmp
-export LD_LIBRARY_PATH=/data/local/tmp
-./bin/gpt-2-backend -m models/ggml-model.bin -p "this is an example"
-```
-
-## Resources
-
-- [GGML - Large Language Models for Everyone](https://github.com/rustformers/llm/blob/main/crates/ggml/README.md): a description of the GGML format provided by the maintainers of the `llm` Rust crate, which provides Rust bindings for GGML
-- [marella/ctransformers](https://github.com/marella/ctransformers): Python bindings for GGML models.
-- [go-skynet/go-ggml-transformers.cpp](https://github.com/go-skynet/go-ggml-transformers.cpp): Golang bindings for GGML models
-- [smspillaz/ggml-gobject](https://github.com/smspillaz/ggml-gobject): GObject-introspectable wrapper for use of GGML on the GNOME platform.
diff --git a/ggml/build.zig b/ggml/build.zig
deleted file mode 100644
index 5aa379d..0000000
--- a/ggml/build.zig
+++ /dev/null
@@ -1,158 +0,0 @@
-const std = @import("std");
-const builtin = @import("builtin");
-
-// Zig Version: 0.11.0
-// Zig Build Command: zig build
-// Zig Run Command: zig build -h
-//     zig build run_dolly-v2
-//     zig build run_gpt-2
-//     zig build run_gpt-j
-//     zig build run_gpt-neox
-//     zig build run_mnist
-//     zig build run_mpt
-//     zig build run_replit
-//     zig build run_starcoder
-//     zig build run_test-grad0
-//     zig build run_test-mul-mat0
-//     zig build run_test-mul-mat2
-//     zig build run_test-opt
-//     zig build run_test-vec1
-//     zig build run_test0
-//     zig build run_test1
-//     zig build run_test2
-//     zig build run_test3
-//     zig build run_zig_test0
-//     zig build run_zig_test1
-//     zig build run_zig_test2
-//     zig build run_zig_test3
-pub fn build(b: *std.build.Builder) void {
-    const target = b.standardTargetOptions(.{});
-    const optimize = b.standardOptimizeOption(.{});
-    const lib = b.addStaticLibrary(.{
-        .name = "ggml",
-        .target = target,
-        .optimize = optimize,
-    });
-    lib.addIncludePath(.{ .path = "./include" });
-    lib.addIncludePath(.{ .path = "./include/ggml" });
-    lib.addCSourceFiles(&.{
-        "src/ggml.c",
-    }, &.{"-std=c11"});
-    lib.linkLibC();
-    lib.linkLibCpp();
-    b.installArtifact(lib);
-
-    // examples
-    const examples = .{
-        "dolly-v2",
-        "gpt-2",
-        "gpt-j",
-        "gpt-neox",
-        "mnist",
-        "mpt",
-        "replit",
-        "starcoder",
-        // "whisper",
-    };
-    inline for (examples) |name| {
-        const exe = b.addExecutable(.{
-            .name = name,
-            .target = target,
-            .optimize = optimize,
-        });
-        exe.addIncludePath(.{ .path = "./include" });
-        exe.addIncludePath(.{ .path = "./include/ggml" });
-        exe.addIncludePath(.{ .path = "./examples" });
-        // exe.addIncludePath("./examples/whisper");
-        exe.addCSourceFiles(&.{
-            std.fmt.comptimePrint("examples/{s}/main.cpp", .{name}),
-            "examples/common.cpp",
-            "examples/common-ggml.cpp",
-            // "examples/whisper/whisper.cpp",
-        }, &.{"-std=c++11"});
-        exe.linkLibrary(lib);
-        b.installArtifact(exe);
-        const run_cmd = b.addRunArtifact(exe);
-        run_cmd.step.dependOn(b.getInstallStep());
-        if (b.args) |args| run_cmd.addArgs(args);
-        const run_step = b.step("run_" ++ name, "Run examples");
-        run_step.dependOn(&run_cmd.step);
-    }
-
-    // tests
-    const tests = if (builtin.target.cpu.arch == .x86_64) .{
-        // "test-blas0",
-        // "test-grad0",
-        "test-mul-mat0",
-        // "test-mul-mat1",
-        "test-mul-mat2",
-        // "test-opt",
-        // "test-svd0",
-        // "test-vec0",
-        "test-vec1",
-        // "test-vec2",
-        "test0",
-        "test1",
-        "test2",
-        "test3",
-    } else .{
-        // "test-blas0",
-        // "test-grad0",
-        "test-mul-mat0",
-        // "test-mul-mat1",
-        "test-mul-mat2",
-        // "test-opt",
-        // "test-svd0",
-        // "test-vec0",
-        // "test-vec1",
-        // "test-vec2",
-        "test0",
-        "test1",
-        "test2",
-        "test3",
-    };
-    inline for (tests) |name| {
-        const exe = b.addExecutable(.{
-            .name = name,
-            .target = target,
-            .optimize = optimize,
-        });
-        exe.addIncludePath(.{ .path = "./include" });
-        exe.addIncludePath(.{ .path = "./include/ggml" });
-        exe.addCSourceFiles(&.{
-            std.fmt.comptimePrint("tests/{s}.c", .{name}),
-        }, &.{"-std=c11"});
-        exe.linkLibrary(lib);
-        b.installArtifact(exe);
-        const run_cmd = b.addRunArtifact(exe);
-        run_cmd.step.dependOn(b.getInstallStep());
-        if (b.args) |args| run_cmd.addArgs(args);
-        const run_step = b.step("run_" ++ name, "Run tests");
-        run_step.dependOn(&run_cmd.step);
-    }
-
-    // zig_tests
-    const zig_tests = .{
-        "test0",
-        "test1",
-        "test2",
-        "test3",
-    };
-    inline for (zig_tests) |name| {
-        const exe = b.addExecutable(.{
-            .name = name,
-            .root_source_file = .{ .path = std.fmt.comptimePrint("tests/{s}.zig", .{name}) },
-            .target = target,
-            .optimize = optimize,
-        });
-        exe.addIncludePath(.{ .path = "./include" });
-        exe.addIncludePath(.{ .path = "./include/ggml" });
-        exe.linkLibrary(lib);
-        b.installArtifact(exe);
-        const run_cmd = b.addRunArtifact(exe);
-        run_cmd.step.dependOn(b.getInstallStep());
-        if (b.args) |args| run_cmd.addArgs(args);
-        const run_step = b.step("run_zig_" ++ name, "Run zig_tests");
-        run_step.dependOn(&run_cmd.step);
-    }
-}
diff --git a/ggml/ci/run.sh b/ggml/ci/run.sh
deleted file mode 100644
index 299da67..0000000
--- a/ggml/ci/run.sh
+++ /dev/null
@@ -1,395 +0,0 @@
-#/bin/bash
-#
-# sample usage:
-#
-# mkdir tmp
-#
-# # CPU-only build
-# bash ./ci/run.sh ./tmp/results ./tmp/mnt
-#
-# # with CUDA support
-# GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
-#
-
-if [ -z "$2" ]; then
-    echo "usage: $0 <output-dir> <mnt-dir>"
-    exit 1
-fi
-
-mkdir -p "$1"
-mkdir -p "$2"
-
-OUT=$(realpath "$1")
-MNT=$(realpath "$2")
-
-rm -v $OUT/*.log
-rm -v $OUT/*.exit
-rm -v $OUT/*.md
-
-sd=`dirname $0`
-cd $sd/../
-SRC=`pwd`
-
-CMAKE_EXTRA=""
-
-if [ ! -z ${GG_BUILD_CUDA} ]; then
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUBLAS=ON"
-fi
-
-if [ ! -z ${GG_BUILD_METAL} ]; then
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
-fi
-
-## helpers
-
-# download a file if it does not exist or if it is outdated
-function gg_wget {
-    local out=$1
-    local url=$2
-
-    local cwd=`pwd`
-
-    mkdir -p $out
-    cd $out
-
-    # should not re-download if file is the same
-    wget -nv -N $url
-
-    cd $cwd
-}
-
-function gg_printf {
-    printf -- "$@" >> $OUT/README.md
-}
-
-function gg_run {
-    ci=$1
-
-    set -o pipefail
-    set -x
-
-    gg_run_$ci | tee $OUT/$ci.log
-    cur=$?
-    echo "$cur" > $OUT/$ci.exit
-
-    set +x
-    set +o pipefail
-
-    gg_sum_$ci
-
-    ret=$((ret | cur))
-}
-
-## ci
-
-# ctest_debug
-
-function gg_run_ctest_debug {
-    cd ${SRC}
-
-    rm -rf build-ci-debug && mkdir build-ci-debug && cd build-ci-debug
-
-    set -e
-
-    (time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} ..     ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j                                              ) 2>&1 | tee -a $OUT/${ci}-make.log
-
-    if [ ! -z ${GG_BUILD_METAL} ]; then
-        export GGML_METAL_PATH_RESOURCES="$(pwd)/bin"
-    fi
-
-    (time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
-
-    set +e
-}
-
-function gg_sum_ctest_debug {
-    gg_printf '### %s\n\n' "${ci}"
-
-    gg_printf 'Runs ctest in debug mode\n'
-    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
-    gg_printf '```\n'
-    gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)"
-    gg_printf '```\n'
-    gg_printf '\n'
-}
-
-# ctest_release
-
-function gg_run_ctest_release {
-    cd ${SRC}
-
-    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
-
-    set -e
-
-    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} ..   ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j                                              ) 2>&1 | tee -a $OUT/${ci}-make.log
-
-    if [ ! -z ${GG_BUILD_METAL} ]; then
-        export GGML_METAL_PATH_RESOURCES="$(pwd)/bin"
-    fi
-
-    if [ -z $GG_BUILD_LOW_PERF ]; then
-        (time ctest --output-on-failure ) 2>&1 | tee -a $OUT/${ci}-ctest.log
-    else
-        (time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
-    fi
-
-    set +e
-}
-
-function gg_sum_ctest_release {
-    gg_printf '### %s\n\n' "${ci}"
-
-    gg_printf 'Runs ctest in release mode\n'
-    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
-    gg_printf '```\n'
-    gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)"
-    gg_printf '```\n'
-}
-
-# gpt_2
-
-function gg_run_gpt_2 {
-    cd ${SRC}
-
-    gg_wget models-mnt/gpt-2 https://huggingface.co/ggerganov/ggml/resolve/main/ggml-model-gpt-2-117M.bin
-
-    cd build-ci-release
-
-    set -e
-
-    model="../models-mnt/gpt-2/ggml-model-gpt-2-117M.bin"
-    prompts="../examples/prompts/gpt-2.txt"
-
-    (time ./bin/gpt-2-backend2 --model ${model} -s 1234 -n 64 -tt ${prompts}                       ) 2>&1 | tee -a $OUT/${ci}-tg.log
-    (time ./bin/gpt-2-backend2 --model ${model} -s 1234 -n 64 -p "I believe the meaning of life is") 2>&1 | tee -a $OUT/${ci}-tg.log
-
-    (time ./bin/gpt-2-batched --model ${model} -s 1234 -n 64 -np 8 -p "I believe the meaning of life is") 2>&1 | tee -a $OUT/${ci}-tg.log
-
-    set +e
-}
-
-function gg_sum_gpt_2 {
-    gg_printf '### %s\n\n' "${ci}"
-
-    gg_printf 'Runs short GPT-2 text generation\n'
-    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
-    gg_printf '```\n'
-    gg_printf '%s\n' "$(cat $OUT/${ci}-tg.log)"
-    gg_printf '```\n'
-}
-
-# mnist
-
-function gg_run_mnist {
-    cd ${SRC}
-
-    cd build-ci-release
-
-    set -e
-
-    mkdir -p models/mnist
-    python3 ../examples/mnist/convert-h5-to-ggml.py ../examples/mnist/models/mnist/mnist_model.state_dict
-
-    model_f32="./models/mnist/ggml-model-f32.bin"
-    samples="../examples/mnist/models/mnist/t10k-images.idx3-ubyte"
-
-    # first command runs and exports "mnist.ggml", the second command runs the exported model
-
-    (time ./bin/mnist     ${model_f32} ${samples} ) 2>&1 | tee -a $OUT/${ci}-mnist.log
-    (time ./bin/mnist-cpu ./mnist.ggml ${samples} ) 2>&1 | tee -a $OUT/${ci}-mnist.log
-
-    set +e
-}
-
-function gg_sum_mnist {
-    gg_printf '### %s\n\n' "${ci}"
-
-    gg_printf 'MNIST\n'
-    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
-    gg_printf '```\n'
-    gg_printf '%s\n' "$(cat $OUT/${ci}-mnist.log)"
-    gg_printf '```\n'
-}
-
-# whisper
-
-function gg_run_whisper {
-    cd ${SRC}
-
-    gg_wget models-mnt/whisper/ https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin
-    gg_wget models-mnt/whisper/ https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav
-
-    cd build-ci-release
-
-    set -e
-
-    path_models="../models-mnt/whisper/"
-    model_f16="${path_models}/ggml-base.en.bin"
-    audio_0="${path_models}/jfk.wav"
-
-    (time ./bin/whisper -m ${model_f16} -f ${audio_0} ) 2>&1 | tee -a $OUT/${ci}-main.log
-
-    grep -q "And so my fellow Americans" $OUT/${ci}-main.log
-
-    set +e
-}
-
-function gg_sum_whisper {
-    gg_printf '### %s\n\n' "${ci}"
-
-    gg_printf 'Runs short Whisper transcription\n'
-    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
-    gg_printf '```\n'
-    gg_printf '%s\n' "$(cat $OUT/${ci}-main.log)"
-    gg_printf '```\n'
-}
-
-# sam
-
-function gg_run_sam {
-    cd ${SRC}
-
-    gg_wget models-mnt/sam/ https://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pth
-    gg_wget models-mnt/sam/ https://raw.githubusercontent.com/YavorGIvanov/sam.cpp/ceafb7467bff7ec98e0c4f952e58a9eb8fd0238b/img.jpg
-
-    cd build-ci-release
-
-    set -e
-
-    path_models="../models-mnt/sam/"
-    model_f16="${path_models}/ggml-model-f16.bin"
-    img_0="${path_models}/img.jpg"
-
-    python3 ../examples/sam/convert-pth-to-ggml.py ${path_models}/sam_vit_b_01ec64.pth ${path_models}/ 1
-
-    (time ./bin/sam -m ${model_f16} -i ${img_0} ) 2>&1 | tee -a $OUT/${ci}-main.log
-
-    grep -q "bbox (371, 436), (144, 168)" $OUT/${ci}-main.log
-
-    set +e
-}
-
-function gg_sum_sam {
-    gg_printf '### %s\n\n' "${ci}"
-
-    gg_printf 'Run SAM\n'
-    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
-    gg_printf '```\n'
-    gg_printf '%s\n' "$(cat $OUT/${ci}-main.log)"
-    gg_printf '```\n'
-}
-
-# yolo
-
-function gg_run_yolo {
-    cd ${SRC}
-
-    gg_wget models-mnt/yolo/ https://pjreddie.com/media/files/yolov3-tiny.weights
-    gg_wget models-mnt/yolo/ https://raw.githubusercontent.com/pjreddie/darknet/master/data/dog.jpg
-
-    cd build-ci-release
-    cp -r ../examples/yolo/data .
-
-    set -e
-
-    path_models="../models-mnt/yolo/"
-
-    python3 ../examples/yolo/convert-yolov3-tiny.py ${path_models}/yolov3-tiny.weights
-
-    (time ./bin/yolov3-tiny -m yolov3-tiny.gguf -i ${path_models}/dog.jpg ) 2>&1 | tee -a $OUT/${ci}-main.log
-
-    grep -q "dog: 57%" $OUT/${ci}-main.log
-    grep -q "car: 52%" $OUT/${ci}-main.log
-    grep -q "truck: 56%" $OUT/${ci}-main.log
-    grep -q "bicycle: 59%" $OUT/${ci}-main.log
-
-    set +e
-}
-
-function gg_sum_yolo {
-    gg_printf '### %s\n\n' "${ci}"
-
-    gg_printf 'Run YOLO\n'
-    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
-    gg_printf '```\n'
-    gg_printf '%s\n' "$(cat $OUT/${ci}-main.log)"
-    gg_printf '```\n'
-}
-
-# mpt
-
-function gg_run_mpt {
-    cd ${SRC}
-
-    gg_wget models-mnt/mpt/7B/ https://huggingface.co/mosaicml/mpt-7b/raw/main/config.json
-    gg_wget models-mnt/mpt/7B/ https://huggingface.co/mosaicml/mpt-7b/raw/main/tokenizer.json
-    gg_wget models-mnt/mpt/7B/ https://huggingface.co/mosaicml/mpt-7b/raw/main/tokenizer_config.json
-    gg_wget models-mnt/mpt/7B/ https://huggingface.co/mosaicml/mpt-7b/raw/main/pytorch_model.bin.index.json
-    gg_wget models-mnt/mpt/7B/ https://huggingface.co/mosaicml/mpt-7b/raw/main/configuration_mpt.py
-    gg_wget models-mnt/mpt/7B/ https://huggingface.co/mosaicml/mpt-7b/resolve/main/pytorch_model-00001-of-00002.bin
-    gg_wget models-mnt/mpt/7B/ https://huggingface.co/mosaicml/mpt-7b/resolve/main/pytorch_model-00002-of-00002.bin
-
-    cd build-ci-release
-
-    set -e
-
-    path_models="../models-mnt/mpt/7B"
-    model_f16="${path_models}/ggml-model-f16.bin"
-    model_q4_0="${path_models}/ggml-model-q4_0.bin"
-
-    python3 ../examples/mpt/convert-h5-to-ggml.py ${path_models} 1
-    ./bin/mpt-quantize ${model_f16} ${model_q4_0} q4_0
-
-    (time ./bin/mpt --model ${model_f16}  -s 1234 -n 64 -p "I believe the meaning of life is") 2>&1 | tee -a $OUT/${ci}-tg.log
-    (time ./bin/mpt --model ${model_q4_0} -s 1234 -n 64 -p "I believe the meaning of life is") 2>&1 | tee -a $OUT/${ci}-tg.log
-
-    set +e
-}
-
-function gg_sum_mpt {
-    gg_printf '### %s\n\n' "${ci}"
-
-    gg_printf 'Runs short MPT text generation\n'
-    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
-    gg_printf '```\n'
-    gg_printf '%s\n' "$(cat $OUT/${ci}-tg.log)"
-    gg_printf '```\n'
-}
-
-## main
-
-if [ -z $GG_BUILD_LOW_PERF ]; then
-    rm -rf ${SRC}/models-mnt
-
-    mnt_models=${MNT}/models
-    mkdir -p ${mnt_models}
-    ln -sfn ${mnt_models} ${SRC}/models-mnt
-fi
-
-python3 -m pip install -r ${SRC}/requirements.txt
-
-ret=0
-
-test $ret -eq 0 && gg_run ctest_debug
-test $ret -eq 0 && gg_run ctest_release
-
-if [ ! -z ${GG_BUILD_METAL} ]; then
-    export GGML_METAL_PATH_RESOURCES="${SRC}/build-ci-release/bin"
-fi
-
-test $ret -eq 0 && gg_run gpt_2
-test $ret -eq 0 && gg_run mnist
-test $ret -eq 0 && gg_run whisper
-test $ret -eq 0 && gg_run sam
-test $ret -eq 0 && gg_run yolo
-
-if [ -z $GG_BUILD_LOW_PERF ]; then
-    if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 16 ]; then
-        test $ret -eq 0 && gg_run mpt
-    fi
-fi
-
-exit $ret
diff --git a/ggml/cmake/BuildTypes.cmake b/ggml/cmake/BuildTypes.cmake
deleted file mode 100644
index a9c7b6c..0000000
--- a/ggml/cmake/BuildTypes.cmake
+++ /dev/null
@@ -1,54 +0,0 @@
-# Add new build types
-
-# ReleaseGG - Release with enabled asserts
-
-SET(CMAKE_CXX_FLAGS_RELEASEGG
-    "-O3"
-    CACHE STRING "Flags used by the c++ compiler during release builds with enabled asserts."
-    FORCE )
-SET(CMAKE_C_FLAGS_RELEASEGG
-    "-O3"
-    CACHE STRING "Flags used by the compiler during release builds with enabled asserts."
-    FORCE )
-SET(CMAKE_EXE_LINKER_FLAGS_RELEASEGG
-    ""
-    CACHE STRING "Flags used for linking binaries during release builds with enabled asserts."
-    FORCE )
-SET(CMAKE_SHARED_LINKER_FLAGS_RELEASEGG
-    ""
-    CACHE STRING "Flags used by the shared libraries linker during release builds with enabled asserts."
-    FORCE )
-MARK_AS_ADVANCED(
-    CMAKE_CXX_FLAGS_RELEASEGG
-    CMAKE_C_FLAGS_RELEASEGG
-    CMAKE_EXE_LINKER_FLAGS_RELEASEGG
-    CMAKE_SHARED_LINKER_FLAGS_RELEASEGG )
-
-# RelWithDebInfoGG - RelWithDebInfo with enabled asserts
-
-SET(CMAKE_CXX_FLAGS_RELWITHDEBINFOGG
-    "-O2 -g"
-    CACHE STRING "Flags used by the c++ compiler during release builds with debug symbols and enabled asserts."
-    FORCE )
-SET(CMAKE_C_FLAGS_RELWITHDEBINFOGG
-    "-O2 -g"
-    CACHE STRING "Flags used by the compiler during release builds with debug symbols and enabled asserts."
-    FORCE )
-SET(CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFOGG
-    ""
-    CACHE STRING "Flags used for linking binaries during release builds with debug symbols and enabled asserts."
-    FORCE )
-SET(CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFOGG
-    ""
-    CACHE STRING "Flags used by the shared libraries linker during release builds with debug symbols and enabled asserts."
-    FORCE )
-MARK_AS_ADVANCED(
-    CMAKE_CXX_FLAGS_RELWITHDEBINFOGG
-    CMAKE_C_FLAGS_RELWITHDEBINFOGG
-    CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFOGG
-    CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFOGG )
-
-if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
-    set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
-    set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo" "ReleaseGG" "RelWithDebInfoGG")
-endif()
diff --git a/ggml/cmake/GitVars.cmake b/ggml/cmake/GitVars.cmake
deleted file mode 100644
index 1a4c24e..0000000
--- a/ggml/cmake/GitVars.cmake
+++ /dev/null
@@ -1,22 +0,0 @@
-find_package(Git)
-
-# the commit's SHA1
-execute_process(COMMAND
-    "${GIT_EXECUTABLE}" describe --match=NeVeRmAtCh --always --abbrev=8
-    WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
-    OUTPUT_VARIABLE GIT_SHA1
-    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
-
-# the date of the commit
-execute_process(COMMAND
-    "${GIT_EXECUTABLE}" log -1 --format=%ad --date=local
-    WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
-    OUTPUT_VARIABLE GIT_DATE
-    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
-
-# the subject of the commit
-execute_process(COMMAND
-    "${GIT_EXECUTABLE}" log -1 --format=%s
-    WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
-    OUTPUT_VARIABLE GIT_COMMIT_SUBJECT
-    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
diff --git a/ggml/docs/gguf.md b/ggml/docs/gguf.md
deleted file mode 100644
index bb63f4f..0000000
--- a/ggml/docs/gguf.md
+++ /dev/null
@@ -1,631 +0,0 @@
-# GGUF
-
-GGUF is a file format for storing models for inference with GGML and executors based on GGML. GGUF is a binary format that is designed for fast loading and saving of models, and for ease of reading. Models are traditionally developed using PyTorch or another framework, and then converted to GGUF for use in GGML.
-
-It is a successor file format to GGML, GGMF and GGJT, and is designed to be unambiguous by containing all the information needed to load a model. It is also designed to be extensible, so that new information can be added to models without breaking compatibility.
-
-For more information about the motivation behind GGUF, see [Historical State of Affairs](#historical-state-of-affairs).
-
-## Specification
-
-GGUF is a format based on the existing GGJT, but makes a few changes to the format to make it more extensible and easier to use. The following features are desired:
-
-- Single-file deployment: they can be easily distributed and loaded, and do not require any external files for additional information.
-- Extensible: new features can be added to GGML-based executors/new information can be added to GGUF models without breaking compatibility with existing models.
-- `mmap` compatibility: models can be loaded using `mmap` for fast loading and saving.
-- Easy to use: models can be easily loaded and saved using a small amount of code, with no need for external libraries, regardless of the language used.
-- Full information: all information needed to load a model is contained in the model file, and no additional information needs to be provided by the user.
-
-The key difference between GGJT and GGUF is the use of a key-value structure for the hyperparameters (now referred to as metadata), rather than a list of untyped values. This allows for new metadata to be added without breaking compatibility with existing models, and to annotate the model with additional information that may be useful for inference or for identifying the model.
-
-### File Structure
-
-GGUF files are structured as follows. They use a global alignment specified in the `general.alignment` metadata field, referred to as `ALIGNMENT` below. Where required, the file is padded with `0x00` bytes to the next multiple of `general.alignment`.
-
-Fields, including arrays, are written sequentially without alignment unless otherwise specified.
-
-Models are little-endian by default. They can also come in big-endian for use with big-endian computers; in this case, all values (including metadata values and tensors) will also be big-endian. At the time of writing, there is no way to determine if a model is big-endian; this may be rectified in future versions. If no additional information is provided, assume the model is little-endian.
-
-```c
-enum ggml_type: uint32_t {
-    GGML_TYPE_F32  = 0,
-    GGML_TYPE_F16  = 1,
-    GGML_TYPE_Q4_0 = 2,
-    GGML_TYPE_Q4_1 = 3,
-    // GGML_TYPE_Q4_2 = 4, support has been removed
-    // GGML_TYPE_Q4_3 (5) support has been removed
-    GGML_TYPE_Q5_0 = 6,
-    GGML_TYPE_Q5_1 = 7,
-    GGML_TYPE_Q8_0 = 8,
-    GGML_TYPE_Q8_1 = 9,
-    // k-quantizations
-    GGML_TYPE_Q2_K = 10,
-    GGML_TYPE_Q3_K = 11,
-    GGML_TYPE_Q4_K = 12,
-    GGML_TYPE_Q5_K = 13,
-    GGML_TYPE_Q6_K = 14,
-    GGML_TYPE_Q8_K = 15,
-    GGML_TYPE_I8,
-    GGML_TYPE_I16,
-    GGML_TYPE_I32,
-    GGML_TYPE_COUNT,
-};
-
-enum gguf_metadata_value_type: uint32_t {
-    // The value is a 8-bit unsigned integer.
-    GGUF_METADATA_VALUE_TYPE_UINT8 = 0,
-    // The value is a 8-bit signed integer.
-    GGUF_METADATA_VALUE_TYPE_INT8 = 1,
-    // The value is a 16-bit unsigned little-endian integer.
-    GGUF_METADATA_VALUE_TYPE_UINT16 = 2,
-    // The value is a 16-bit signed little-endian integer.
-    GGUF_METADATA_VALUE_TYPE_INT16 = 3,
-    // The value is a 32-bit unsigned little-endian integer.
-    GGUF_METADATA_VALUE_TYPE_UINT32 = 4,
-    // The value is a 32-bit signed little-endian integer.
-    GGUF_METADATA_VALUE_TYPE_INT32 = 5,
-    // The value is a 32-bit IEEE754 floating point number.
-    GGUF_METADATA_VALUE_TYPE_FLOAT32 = 6,
-    // The value is a boolean.
-    // 1-byte value where 0 is false and 1 is true.
-    // Anything else is invalid, and should be treated as either the model being invalid or the reader being buggy.
-    GGUF_METADATA_VALUE_TYPE_BOOL = 7,
-    // The value is a UTF-8 non-null-terminated string, with length prepended.
-    GGUF_METADATA_VALUE_TYPE_STRING = 8,
-    // The value is an array of other values, with the length and type prepended.
-    ///
-    // Arrays can be nested, and the length of the array is the number of elements in the array, not the number of bytes.
-    GGUF_METADATA_VALUE_TYPE_ARRAY = 9,
-    // The value is a 64-bit unsigned little-endian integer.
-    GGUF_METADATA_VALUE_TYPE_UINT64 = 10,
-    // The value is a 64-bit signed little-endian integer.
-    GGUF_METADATA_VALUE_TYPE_INT64 = 11,
-    // The value is a 64-bit IEEE754 floating point number.
-    GGUF_METADATA_VALUE_TYPE_FLOAT64 = 12,
-}
-
-// A string in GGUF.
-struct gguf_string_t {
-    // The length of the string, in bytes.
-    uint64_t len;
-    // The string as a UTF-8 non-null-terminated string.
-    char string[len];
-}
-
-union gguf_metadata_value_t {
-    uint8_t uint8;
-    int8_t int8;
-    uint16_t uint16;
-    int16_t int16;
-    uint32_t uint32;
-    int32_t int32;
-    float float32;
-    uint64_t uint64;
-    int64_t int64;
-    double float64;
-    bool bool_;
-    gguf_string_t string;
-    struct {
-        // Any value type is valid, including arrays.
-        gguf_metadata_value_type type;
-        // Number of elements, not bytes
-        uint64_t len;
-        // The array of values.
-        gguf_metadata_value_t array[len];
-    } array;
-};
-
-struct gguf_metadata_kv_t {
-    // The key of the metadata. It is a standard GGUF string, with the following caveats:
-    // - It must be a valid ASCII string.
-    // - It must be a hierarchical key, where each segment is `lower_snake_case` and separated by a `.`.
-    // - It must be at most 2^16-1/65535 bytes long.
-    // Any keys that do not follow these rules are invalid.
-    gguf_string_t key;
-
-    // The type of the value.
-    // Must be one of the `gguf_metadata_value_type` values.
-    gguf_metadata_value_type value_type;
-    // The value.
-    gguf_metadata_value_t value;
-};
-
-struct gguf_header_t {
-    // Magic number to announce that this is a GGUF file.
-    // Must be `GGUF` at the byte level: `0x47` `0x47` `0x55` `0x46`.
-    // Your executor might do little-endian byte order, so it might be
-    // check for 0x46554747 and letting the endianness cancel out.
-    // Consider being *very* explicit about the byte order here.
-    uint32_t magic;
-    // The version of the format implemented.
-    // Must be `3` for version described in this spec, which introduces big-endian support.
-    //
-    // This version should only be increased for structural changes to the format.
-    // Changes that do not affect the structure of the file should instead update the metadata
-    // to signify the change.
-    uint32_t version;
-    // The number of tensors in the file.
-    // This is explicit, instead of being included in the metadata, to ensure it is always present
-    // for loading the tensors.
-    uint64_t tensor_count;
-    // The number of metadata key-value pairs.
-    uint64_t metadata_kv_count;
-    // The metadata key-value pairs.
-    gguf_metadata_kv_t metadata_kv[metadata_kv_count];
-};
-
-uint64_t align_offset(uint64_t offset) {
-    return offset + (ALIGNMENT - (offset % ALIGNMENT)) % ALIGNMENT;
-}
-
-struct gguf_tensor_info_t {
-    // The name of the tensor. It is a standard GGUF string, with the caveat that
-    // it must be at most 64 bytes long.
-    gguf_string_t name;
-    // The number of dimensions in the tensor.
-    // Currently at most 4, but this may change in the future.
-    uint32_t n_dimensions;
-    // The dimensions of the tensor.
-    uint64_t dimensions[n_dimensions];
-    // The type of the tensor.
-    ggml_type type;
-    // The offset of the tensor's data in this file in bytes.
-    //
-    // This offset is relative to `tensor_data`, not to the start
-    // of the file, to make it easier for writers to write the file.
-    // Readers should consider exposing this offset relative to the
-    // file to make it easier to read the data.
-    //
-    // Must be a multiple of `ALIGNMENT`. That is, `align_offset(offset) == offset`.
-    uint64_t offset;
-};
-
-struct gguf_file_t {
-    // The header of the file.
-    gguf_header_t header;
-
-    // Tensor infos, which can be used to locate the tensor data.
-    gguf_tensor_info_t tensor_infos[header.tensor_count];
-
-    // Padding to the nearest multiple of `ALIGNMENT`.
-    //
-    // That is, if `sizeof(header) + sizeof(tensor_infos)` is not a multiple of `ALIGNMENT`,
-    // this padding is added to make it so.
-    //
-    // This can be calculated as `align_offset(position) - position`, where `position` is
-    // the position of the end of `tensor_infos` (i.e. `sizeof(header) + sizeof(tensor_infos)`).
-    uint8_t _padding[];
-
-    // Tensor data.
-    //
-    // This is arbitrary binary data corresponding to the weights of the model. This data should be close
-    // or identical to the data in the original model file, but may be different due to quantization or
-    // other optimizations for inference. Any such deviations should be recorded in the metadata or as
-    // part of the architecture definition.
-    //
-    // Each tensor's data must be stored within this array, and located through its `tensor_infos` entry.
-    // The offset of each tensor's data must be a multiple of `ALIGNMENT`, and the space between tensors
-    // should be padded to `ALIGNMENT` bytes.
-    uint8_t tensor_data[];
-};
-```
-
-## Standardized key-value pairs
-
-The following key-value pairs are standardized. This list may grow in the future as more use cases are discovered. Where possible, names are shared with the original model definitions to make it easier to map between the two.
-
-Not all of these are required, but they are all recommended. Keys that are required are bolded. For omitted pairs, the reader should assume that the value is unknown and either default or error as appropriate.
-
-The community can develop their own key-value pairs to carry additional data. However, these should be namespaced with the relevant community name to avoid collisions. For example, the `rustformers` community might use `rustformers.` as a prefix for all of their keys.
-
-If a particular community key is widely used, it may be promoted to a standardized key.
-
-By convention, most counts/lengths/etc are `uint64` unless otherwise specified. This is to allow for larger models to be supported in the future. Some models may use `uint32` for their values; it is recommended that readers support both.
-
-### General
-
-#### Required
-
-- **`general.architecture: string`**: describes what architecture this model implements. All lowercase ASCII, with only `[a-z0-9]+` characters allowed. Known values include:
-  - `llama`
-  - `mpt`
-  - `gptneox`
-  - `gptj`
-  - `gpt2`
-  - `bloom`
-  - `falcon`
-  - `rwkv`
-- **`general.quantization_version: uint32`**: The version of the quantization format. Not required if the model is not quantized (i.e. no tensors are quantized). If any tensors are quantized, this _must_ be present. This is separate to the quantization scheme of the tensors itself; the quantization version may change without changing the scheme's name (e.g. the quantization scheme is Q5_K, and the quantization version is 4).
-- **`general.alignment: uint32`**: the global alignment to use, as described above. This can vary to allow for different alignment schemes, but it must be a multiple of 8. Some writers may not write the alignment. If the alignment is **not** specified, assume it is `32`.
-
-#### General metadata
-
-- `general.name`: The name of the model. This should be a human-readable name that can be used to identify the model. It should be unique within the community that the model is defined in.
-- `general.author`: The author of the model.
-- `general.url`: URL to the model's homepage. This can be a GitHub repo, a paper, etc.
-- `general.description: string`: free-form description of the model including anything that isn't covered by the other fields
-- `general.license: string`: License of the model, expressed as a [SPDX license expression](https://spdx.github.io/spdx-spec/v2-draft/SPDX-license-expressions/) (e.g. `"MIT OR Apache-2.0`). Do not include any other information, such as the license text or the URL to the license.
-- `general.file_type: uint32`: An enumerated value describing the type of the majority of the tensors in the file. Optional; can be inferred from the tensor types.
-  - `ALL_F32 = 0`
-  - `MOSTLY_F16 = 1`
-  - `MOSTLY_Q4_0 = 2`
-  - `MOSTLY_Q4_1 = 3`
-  - `MOSTLY_Q4_1_SOME_F16 = 4`
-  - `MOSTLY_Q4_2 = 5` (support removed)
-  - `MOSTLY_Q4_3 = 6` (support removed)
-  - `MOSTLY_Q8_0 = 7`
-  - `MOSTLY_Q5_0 = 8`
-  - `MOSTLY_Q5_1 = 9`
-  - `MOSTLY_Q2_K = 10`
-  - `MOSTLY_Q3_K_S = 11`
-  - `MOSTLY_Q3_K_M = 12`
-  - `MOSTLY_Q3_K_L = 13`
-  - `MOSTLY_Q4_K_S = 14`
-  - `MOSTLY_Q4_K_M = 15`
-  - `MOSTLY_Q5_K_S = 16`
-  - `MOSTLY_Q5_K_M = 17`
-  - `MOSTLY_Q6_K = 18`
-
-#### Source metadata
-
-Information about where this model came from. This is useful for tracking the provenance of the model, and for finding the original source if the model is modified. For a model that was converted from GGML, for example, these keys would point to the model that was converted from.
-
-- `general.source.url: string`: URL to the source of the model. Can be a GitHub repo, a paper, etc.
-- `general.source.huggingface.repository: string`: Hugging Face model repository that this model is either hosted on or based on
-
-### LLM
-
-In the following, `[llm]` is used to fill in for the name of a specific LLM architecture. For example, `llama` for LLaMA, `mpt` for MPT, etc. If mentioned in an architecture's section, it is required for that architecture, but not all keys are required for all architectures. Consult the relevant section for more information.
-
-- `[llm].context_length: uint64`: Also known as `n_ctx`. length of the context (in tokens) that the model was trained on. For most architectures, this is the hard limit on the length of the input. Architectures, like RWKV, that are not reliant on transformer-style attention may be able to handle larger inputs, but this is not guaranteed.
-- `[llm].embedding_length: uint64`: Also known as `n_embd`. Embedding layer size.
-- `[llm].block_count: uint64`: The number of blocks of attention+feed-forward layers (i.e. the bulk of the LLM). Does not include the input or embedding layers.
-- `[llm].feed_forward_length: uint64`: Also known as `n_ff`. The length of the feed-forward layer.
-- `[llm].use_parallel_residual: bool`: Whether or not the parallel residual logic should be used.
-- `[llm].tensor_data_layout: string`: When a model is converted to GGUF, tensors may be rearranged to improve performance. This key describes the layout of the tensor data. This is not required; if not present, it is assumed to be `reference`.
-  - `reference`: tensors are laid out in the same order as the original model
-  - further options can be found for each architecture in their respective sections
-- `[llm].expert_count: uint32`: Number of experts in MoE models (optional for non-MoE arches).
-- `[llm].expert_used_count: uint32`: Number of experts used during each token token evaluation (optional for non-MoE arches).
-
-#### Attention
-
-- `[llm].attention.head_count: uint64`: Also known as `n_head`. Number of attention heads.
-- `[llm].attention.head_count_kv: uint64`: The number of heads per group used in Grouped-Query-Attention. If not present or if present and equal to `[llm].attention.head_count`, the model does not use GQA.
-- `[llm].attention.max_alibi_bias: float32`: The maximum bias to use for ALiBI.
-- `[llm].attention.clamp_kqv: float32`: Value (`C`) to clamp the values of the `Q`, `K`, and `V` tensors between (`[-C, C]`).
-- `[llm].attention.layer_norm_epsilon: float32`: Layer normalization epsilon.
-- `[llm].attention.layer_norm_rms_epsilon: float32`: Layer RMS normalization epsilon.
-- `[llm].attention.key_length: uint32`: The optional size of a key head, $d_k$. If not specified, it will be `n_embd / n_head`.
-- `[llm].attention.value_length: uint32`: The optional size of a value head, $d_v$. If not specified, it will be `n_embd / n_head`.
-
-#### RoPE
-
-- `[llm].rope.dimension_count: uint64`: The number of rotary dimensions for RoPE.
-- `[llm].rope.freq_base: float32`: The base frequency for RoPE.
-
-##### Scaling
-
-The following keys describe RoPE scaling parameters:
-
-- `[llm].rope.scaling.type: string`: Can be `none`, `linear`, or `yarn`.
-- `[llm].rope.scaling.factor: float32`: A scale factor for RoPE to adjust the context length.
-- `[llm].rope.scaling.original_context_length: uint32_t`: The original context length of the base model.
-- `[llm].rope.scaling.finetuned: bool`: True if model has been finetuned with RoPE scaling.
-
-Note that older models may not have these keys, and may instead use the following key:
-
-- `[llm].rope.scale_linear: float32`: A linear scale factor for RoPE to adjust the context length.
-
-It is recommended that models use the newer keys if possible, as they are more flexible and allow for more complex scaling schemes. Executors will need to support both indefinitely.
-
-#### Models
-
-The following sections describe the metadata for each model architecture. Each key specified _must_ be present.
-
-##### LLaMA
-
-- `llama.context_length`
-- `llama.embedding_length`
-- `llama.block_count`
-- `llama.feed_forward_length`
-- `llama.rope.dimension_count`
-- `llama.attention.head_count`
-- `llama.attention.layer_norm_rms_epsilon`
-
-###### Optional
-
-- `llama.rope.scale`
-- `llama.attention.head_count_kv`
-- `llama.tensor_data_layout`:
-  - `Meta AI original pth`:
-    ```python
-    def permute(weights: NDArray, n_head: int) -> NDArray:
-        return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
-                    .swapaxes(1, 2)
-                    .reshape(weights.shape))
-    ```
-- `llama.expert_count`
-- `llama.expert_used_count`
-
-##### MPT
-
-- `mpt.context_length`
-- `mpt.embedding_length`
-- `mpt.block_count`
-- `mpt.attention.head_count`
-- `mpt.attention.alibi_bias_max`
-- `mpt.attention.clip_kqv`
-- `mpt.attention.layer_norm_epsilon`
-
-##### GPT-NeoX
-
-- `gptneox.context_length`
-- `gptneox.embedding_length`
-- `gptneox.block_count`
-- `gptneox.use_parallel_residual`
-- `gptneox.rope.dimension_count`
-- `gptneox.attention.head_count`
-- `gptneox.attention.layer_norm_epsilon`
-
-###### Optional
-
-- `gptneox.rope.scale`
-
-##### GPT-J
-
-- `gptj.context_length`
-- `gptj.embedding_length`
-- `gptj.block_count`
-- `gptj.rope.dimension_count`
-- `gptj.attention.head_count`
-- `gptj.attention.layer_norm_epsilon`
-
-###### Optional
-
-- `gptj.rope.scale`
-
-##### GPT-2
-
-- `gpt2.context_length`
-- `gpt2.embedding_length`
-- `gpt2.block_count`
-- `gpt2.attention.head_count`
-- `gpt2.attention.layer_norm_epsilon`
-
-##### BLOOM
-
-- `bloom.context_length`
-- `bloom.embedding_length`
-- `bloom.block_count`
-- `bloom.feed_forward_length`
-- `bloom.attention.head_count`
-- `bloom.attention.layer_norm_epsilon`
-
-##### Falcon
-
-- `falcon.context_length`
-- `falcon.embedding_length`
-- `falcon.block_count`
-- `falcon.attention.head_count`
-- `falcon.attention.head_count_kv`
-- `falcon.attention.use_norm`
-- `falcon.attention.layer_norm_epsilon`
-
-###### Optional
-
-- `falcon.tensor_data_layout`:
-
-  - `jploski` (author of the original GGML implementation of Falcon):
-
-    ```python
-    # The original query_key_value tensor contains n_head_kv "kv groups",
-    # each consisting of n_head/n_head_kv query weights followed by one key
-    # and one value weight (shared by all query heads in the kv group).
-    # This layout makes it a big pain to work with in GGML.
-    # So we rearrange them here,, so that we have n_head query weights
-    # followed by n_head_kv key weights followed by n_head_kv value weights,
-    # in contiguous fashion.
-
-    if "query_key_value" in src:
-        qkv = model[src].view(
-            n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)
-
-        q = qkv[:, :-2 ].reshape(n_head * head_dim, head_dim * n_head)
-        k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
-        v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)
-
-        model[src] = torch.cat((q,k,v)).reshape_as(model[src])
-    ```
-
-##### RWKV
-
-The vocabulary size is the same as the number of rows in the `head` matrix.
-
-- `rwkv.architecture_version: uint32`: The only allowed value currently is 4. Version 5 is expected to appear some time in the future.
-- `rwkv.context_length: uint64`: Length of the context used during training or fine-tuning. RWKV is able to handle larger context than this limit, but the output quality may suffer.
-- `rwkv.block_count: uint64`
-- `rwkv.embedding_length: uint64`
-- `rwkv.feed_forward_length: uint64`
-
-##### Whisper
-
-Keys that do not have types defined should be assumed to share definitions with `llm.` keys.
-(For example, `whisper.context_length` is equivalent to `llm.context_length`.)
-This is because they are both transformer models.
-
-- `whisper.encoder.context_length`
-- `whisper.encoder.embedding_length`
-- `whisper.encoder.block_count`
-- `whisper.encoder.mels_count: uint64`
-- `whisper.encoder.attention.head_count`
-
-- `whisper.decoder.context_length`
-- `whisper.decoder.embedding_length`
-- `whisper.decoder.block_count`
-- `whisper.decoder.attention.head_count`
-
-#### Prompting
-
-**TODO**: Include prompt format, and/or metadata about how it should be used (instruction, conversation, autocomplete, etc).
-
-### LoRA
-
-**TODO**: Figure out what metadata is needed for LoRA. Probably desired features:
-
-- match an existing model exactly, so that it can't be misapplied
-- be marked as a LoRA so executors won't try to run it by itself
-
-Should this be an architecture, or should it share the details of the original model with additional fields to mark it as a LoRA?
-
-### Tokenizer
-
-The following keys are used to describe the tokenizer of the model. It is recommended that model authors support as many of these as possible, as it will allow for better tokenization quality with supported executors.
-
-#### GGML
-
-GGML supports an embedded vocabulary that enables inference of the model, but implementations of tokenization using this vocabulary (i.e. `llama.cpp`'s tokenizer) may have lower accuracy than the original tokenizer used for the model. When a more accurate tokenizer is available and supported, it should be used instead.
-
-It is not guaranteed to be standardized across models, and may change in the future. It is recommended that model authors use a more standardized tokenizer if possible.
-
-- `tokenizer.ggml.model: string`: The name of the tokenizer model.
-  - `llama`: Llama style SentencePiece (tokens and scores extracted from HF `tokenizer.model`)
-  - `replit`: Replit style SentencePiece (tokens and scores extracted from HF `spiece.model`)
-  - `gpt2`: GPT-2 / GPT-NeoX style BPE (tokens extracted from HF `tokenizer.json`)
-  - `rwkv`: RWKV tokenizer
-- `tokenizer.ggml.tokens: array[string]`: A list of tokens indexed by the token ID used by the model.
-- `tokenizer.ggml.scores: array[float32]`: If present, the score/probability of each token. If not present, all tokens are assumed to have equal probability. If present, it must have the same length and index as `tokens`.
-- `tokenizer.ggml.token_type: array[int32]`: The token type (1=normal, 2=unknown, 3=control, 4=user defined, 5=unused, 6=byte). If present, it must have the same length and index as `tokens`.
-- `tokenizer.ggml.merges: array[string]`: If present, the merges of the tokenizer. If not present, the tokens are assumed to be atomic.
-- `tokenizer.ggml.added_tokens: array[string]`: If present, tokens that were added after training.
-
-##### Special tokens
-
-- `tokenizer.ggml.bos_token_id: uint32`: Beginning of sequence marker
-- `tokenizer.ggml.eos_token_id: uint32`: End of sequence marker
-- `tokenizer.ggml.unknown_token_id: uint32`: Unknown token
-- `tokenizer.ggml.separator_token_id: uint32`: Separator token
-- `tokenizer.ggml.padding_token_id: uint32`: Padding token
-
-#### Hugging Face
-
-Hugging Face maintains their own `tokenizers` library that supports a wide variety of tokenizers. If your executor uses this library, it may be able to use the model's tokenizer directly.
-
-- `tokenizer.huggingface.json: string`: the entirety of the HF `tokenizer.json` for a given model (e.g. <https://huggingface.co/mosaicml/mpt-7b-instruct/blob/main/tokenizer.json>). Included for compatibility with executors that support HF tokenizers directly.
-
-#### Other
-
-Other tokenizers may be used, but are not necessarily standardized. They may be executor-specific. They will be documented here as they are discovered/further developed.
-
-- `tokenizer.rwkv.world: string`: a RWKV World tokenizer, like [this](https://github.com/BlinkDL/ChatRWKV/blob/main/tokenizer/rwkv_vocab_v20230424.txt). This text file should be included verbatim.
-- `tokenizer.chat_template : string`: a Jinja template that specifies the input format expected by the model. For more details see: <https://huggingface.co/docs/transformers/main/en/chat_templating>
-
-### Computation graph
-
-This is a future extension and still needs to be discussed, and may necessitate a new GGUF version. At the time of writing, the primary blocker is the stabilization of the computation graph format.
-
-A sample computation graph of GGML nodes could be included in the model itself, allowing an executor to run the model without providing its own implementation of the architecture. This would allow for a more consistent experience across executors, and would allow for more complex architectures to be supported without requiring the executor to implement them.
-
-## Standardized tensor names
-
-To minimize complexity and maximize compatibility, it is recommended that models using the transformer architecture use the following naming convention for their tensors:
-
-### Base layers
-
-`AA.weight` `AA.bias`
-
-where `AA` can be:
-
-- `token_embd`: Token embedding layer
-- `pos_embd`: Position embedding layer
-- `output_norm`: Output normalization layer
-- `output`: Output layer
-
-### Attention and feed-forward layer blocks
-
-`blk.N.BB.weight` `blk.N.BB.bias`
-
-where N signifies the block number a layer belongs to, and where `BB` could be:
-
-- `attn_norm`: Attention normalization layer
-- `attn_norm_2`: Attention normalization layer
-- `attn_qkv`: Attention query-key-value layer
-- `attn_q`: Attention query layer
-- `attn_k`: Attention key layer
-- `attn_v`: Attention value layer
-- `attn_output`: Attention output layer
-
-- `ffn_norm`: Feed-forward network normalization layer
-- `ffn_up`: Feed-forward network "up" layer
-- `ffn_gate`: Feed-forward network "gate" layer
-- `ffn_down`: Feed-forward network "down" layer
-- `ffn_gate_inp`: Expert-routing layer for the Fee-forward network in MoE models
-- `ffn_gate_exp`: Feed-forward network "gate" layer per expert in MoE models
-- `ffn_down_exp`: Feed-forward network "down" layer per expert in MoE models
-- `ffn_up_exp`: Feed-forward network "up" layer per expert in MoE models
-
-## Version History
-
-This document is actively updated to describe the current state of the metadata, and these changes are not tracked outside of the commits.
-
-However, the format _itself_ has changed. The following sections describe the changes to the format itself.
-
-### v3
-
-Adds big-endian support.
-
-### v2
-
-Most countable values (lengths, etc) were changed from `uint32` to `uint64` to allow for larger models to be supported in the future.
-
-### v1
-
-Initial version.
-
-## Historical State of Affairs
-
-The following information is provided for context, but is not necessary to understand the rest of this document.
-
-### Overview
-
-At present, there are three GGML file formats floating around for LLMs:
-
-- **GGML** (unversioned): baseline format, with no versioning or alignment.
-- **GGMF** (versioned): the same as GGML, but with versioning. Only one version exists.
-- **GGJT**: Aligns the tensors to allow for use with `mmap`, which requires alignment. v1, v2 and v3 are identical, but the latter versions use a different quantization scheme that is incompatible with previous versions.
-
-GGML is primarily used by the examples in `ggml`, while GGJT is used by `llama.cpp` models. Other executors may use any of the three formats, but this is not 'officially' supported.
-
-These formats share the same fundamental structure:
-
-- a magic number with an optional version number
-- model-specific hyperparameters, including
-  - metadata about the model, such as the number of layers, the number of heads, etc.
-  - a `ftype` that describes the type of the majority of the tensors,
-    - for GGML files, the quantization version is encoded in the `ftype` divided by 1000
-- an embedded vocabulary, which is a list of strings with length prepended. The GGMF/GGJT formats embed a float32 score next to the strings.
-- finally, a list of tensors with their length-prepended name, type, and (aligned, in the case of GGJT) tensor data
-
-Notably, this structure does not identify what model architecture the model belongs to, nor does it offer any flexibility for changing the structure of the hyperparameters. This means that the only way to add new hyperparameters is to add them to the end of the list, which is a breaking change for existing models.
-
-### Drawbacks
-
-Unfortunately, over the last few months, there are a few issues that have become apparent with the existing models:
-
-- There's no way to identify which model architecture a given model is for, because that information isn't present
-  - Similarly, existing programs cannot intelligently fail upon encountering new architectures
-- Adding or removing any new hyperparameters is a breaking change, which is impossible for a reader to detect without using heuristics
-- Each model architecture requires its own conversion script to their architecture's variant of GGML
-- Maintaining backwards compatibility without breaking the structure of the format requires clever tricks, like packing the quantization version into the ftype, which are not guaranteed to be picked up by readers/writers, and are not consistent between the two formats
-
-### Why not other formats?
-
-There are a few other formats that could be used, but issues include:
-
-- requiring additional dependencies to load or save the model, which is complicated in a C environment
-- limited or no support for 4-bit quantization
-- existing cultural expectations (e.g. whether or not the model is a directory or a file)
-- lack of support for embedded vocabularies
-- lack of control over direction of future development
-
-Ultimately, it is likely that GGUF will remain necessary for the foreseeable future, and it is better to have a single format that is well-documented and supported by all executors than to contort an existing format to fit the needs of GGML.
diff --git a/ggml/examples/CMakeLists.txt b/ggml/examples/CMakeLists.txt
deleted file mode 100644
index 340f647..0000000
--- a/ggml/examples/CMakeLists.txt
+++ /dev/null
@@ -1,31 +0,0 @@
-if (GGML_ALL_WARNINGS)
-  if (NOT MSVC)
-      set(cxx_flags
-          # TODO(marella): Add other warnings.
-          -Wpedantic
-          -Wunused-variable
-          -Wno-unused-function
-          -Wno-multichar
-      )
-      add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:${cxx_flags}>")
-  endif()
-endif()
-
-add_library(common STATIC common.cpp)
-target_include_directories(common PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
-
-add_library(common-ggml STATIC common-ggml.cpp)
-target_link_libraries(common-ggml PRIVATE ggml)
-target_include_directories(common-ggml PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
-
-add_subdirectory(gpt-2)
-add_subdirectory(gpt-j)
-add_subdirectory(whisper)
-add_subdirectory(mnist)
-add_subdirectory(gpt-neox)
-add_subdirectory(dolly-v2)
-add_subdirectory(replit)
-add_subdirectory(mpt)
-add_subdirectory(starcoder)
-add_subdirectory(sam)
-add_subdirectory(yolo)
diff --git a/ggml/examples/common-ggml.cpp b/ggml/examples/common-ggml.cpp
deleted file mode 100644
index 06a0f37..0000000
--- a/ggml/examples/common-ggml.cpp
+++ /dev/null
@@ -1,243 +0,0 @@
-#include "common-ggml.h"
-
-#include <regex>
-#include <map>
-
-static const std::map<std::string, enum ggml_ftype> GGML_FTYPE_MAP = {
-    {"q4_0", GGML_FTYPE_MOSTLY_Q4_0},
-    {"q4_1", GGML_FTYPE_MOSTLY_Q4_1},
-    {"q5_0", GGML_FTYPE_MOSTLY_Q5_0},
-    {"q5_1", GGML_FTYPE_MOSTLY_Q5_1},
-    {"q8_0", GGML_FTYPE_MOSTLY_Q8_0},
-    {"q2_k", GGML_FTYPE_MOSTLY_Q2_K},
-    {"q3_k", GGML_FTYPE_MOSTLY_Q3_K},
-    {"q4_k", GGML_FTYPE_MOSTLY_Q4_K},
-    {"q5_k", GGML_FTYPE_MOSTLY_Q5_K},
-    {"q6_k", GGML_FTYPE_MOSTLY_Q6_K},
-};
-
-void ggml_print_ftypes(FILE * fp) {
-    for (auto it = GGML_FTYPE_MAP.begin(); it != GGML_FTYPE_MAP.end(); it++) {
-        fprintf(fp, "  type = \"%s\" or %d\n", it->first.c_str(), it->second);
-    }
-}
-
-enum ggml_ftype ggml_parse_ftype(const char * str) {
-    enum ggml_ftype ftype;
-    if (str[0] == 'q') {
-        const auto it = GGML_FTYPE_MAP.find(str);
-        if (it == GGML_FTYPE_MAP.end()) {
-            fprintf(stderr, "%s: unknown ftype '%s'\n", __func__, str);
-            return GGML_FTYPE_UNKNOWN;
-        }
-        ftype = it->second;
-    } else {
-        ftype = (enum ggml_ftype) atoi(str);
-    }
-
-    return ftype;
-}
-
-bool ggml_common_quantize_0(
-        std::ifstream & finp,
-        std::ofstream & fout,
-        const ggml_ftype ftype,
-        const std::vector<std::string> & to_quant,
-        const std::vector<std::string> & to_skip) {
-
-    ggml_type qtype = GGML_TYPE_F32;
-
-    switch (ftype) {
-        case GGML_FTYPE_MOSTLY_Q4_0: qtype = GGML_TYPE_Q4_0; break;
-        case GGML_FTYPE_MOSTLY_Q4_1: qtype = GGML_TYPE_Q4_1; break;
-        case GGML_FTYPE_MOSTLY_Q5_0: qtype = GGML_TYPE_Q5_0; break;
-        case GGML_FTYPE_MOSTLY_Q5_1: qtype = GGML_TYPE_Q5_1; break;
-        case GGML_FTYPE_MOSTLY_Q8_0: qtype = GGML_TYPE_Q8_0; break;
-        case GGML_FTYPE_MOSTLY_Q2_K: qtype = GGML_TYPE_Q2_K; break;
-        case GGML_FTYPE_MOSTLY_Q3_K: qtype = GGML_TYPE_Q3_K; break;
-        case GGML_FTYPE_MOSTLY_Q4_K: qtype = GGML_TYPE_Q4_K; break;
-        case GGML_FTYPE_MOSTLY_Q5_K: qtype = GGML_TYPE_Q5_K; break;
-        case GGML_FTYPE_MOSTLY_Q6_K: qtype = GGML_TYPE_Q6_K; break;
-        case GGML_FTYPE_UNKNOWN:
-        case GGML_FTYPE_ALL_F32:
-        case GGML_FTYPE_MOSTLY_F16:
-        case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16:
-        case GGML_FTYPE_MOSTLY_IQ2_XXS:
-        case GGML_FTYPE_MOSTLY_IQ2_XS:
-                {
-                    fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
-                    return false;
-                }
-    };
-
-    if (!ggml_is_quantized(qtype)) {
-        fprintf(stderr, "%s: invalid quantization type %d (%s)\n", __func__, qtype, ggml_type_name(qtype));
-        return false;
-    }
-
-    size_t total_size_org = 0;
-    size_t total_size_new = 0;
-
-    std::vector<float> work;
-
-    std::vector<uint8_t>     data_u8;
-    std::vector<ggml_fp16_t> data_f16;
-    std::vector<float>       data_f32;
-
-    std::vector<int64_t> hist_all(1 << 4, 0);
-
-    while (true) {
-        int32_t n_dims;
-        int32_t length;
-        int32_t ttype;
-
-        finp.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
-        finp.read(reinterpret_cast<char *>(&length), sizeof(length));
-        finp.read(reinterpret_cast<char *>(&ttype),  sizeof(ttype));
-
-        if (finp.eof()) {
-            break;
-        }
-
-        int32_t nelements = 1;
-        int32_t ne[4] = { 1, 1, 1, 1 };
-        for (int i = 0; i < n_dims; ++i) {
-            finp.read (reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
-            nelements *= ne[i];
-        }
-
-        std::string name(length, 0);
-        finp.read (&name[0], length);
-
-        printf("%64s - [%5d, %5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ne[2], ggml_type_name((ggml_type) ttype));
-
-        bool quantize = false;
-
-        // check if we should quantize this tensor
-        for (const auto & s : to_quant) {
-            if (std::regex_match(name, std::regex(s))) {
-                quantize = true;
-                break;
-            }
-        }
-
-        // check if we should skip this tensor
-        for (const auto & s : to_skip) {
-            if (std::regex_match(name, std::regex(s))) {
-                quantize = false;
-                break;
-            }
-        }
-
-        // quantize only 2D tensors
-        quantize &= (n_dims == 2);
-
-        if (quantize) {
-            if (ttype != GGML_TYPE_F32 && ttype != GGML_TYPE_F16) {
-                fprintf(stderr, "%s: unsupported ttype %d (%s) for integer quantization\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
-                return false;
-            }
-
-            if (ttype == GGML_TYPE_F16) {
-                data_f16.resize(nelements);
-                finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
-                data_f32.resize(nelements);
-                for (int i = 0; i < nelements; ++i) {
-                    data_f32[i] = ggml_fp16_to_fp32(data_f16[i]);
-                }
-            } else {
-                data_f32.resize(nelements);
-                finp.read(reinterpret_cast<char *>(data_f32.data()), nelements * sizeof(float));
-            }
-
-            ttype = qtype;
-        } else {
-            const int bpe = (ttype == 0) ? sizeof(float) : sizeof(uint16_t);
-
-            data_u8.resize(nelements*bpe);
-            finp.read(reinterpret_cast<char *>(data_u8.data()), nelements * bpe);
-        }
-
-        fout.write(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
-        fout.write(reinterpret_cast<char *>(&length), sizeof(length));
-        fout.write(reinterpret_cast<char *>(&ttype),  sizeof(ttype));
-        for (int i = 0; i < n_dims; ++i) {
-            fout.write(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
-        }
-        fout.write(&name[0], length);
-
-        if (quantize) {
-            work.resize(nelements); // for quantization
-
-            size_t cur_size = 0;
-            std::vector<int64_t> hist_cur(1 << 4, 0);
-
-            switch ((ggml_type) ttype) {
-                case GGML_TYPE_Q4_0:
-                case GGML_TYPE_Q4_1:
-                case GGML_TYPE_Q5_0:
-                case GGML_TYPE_Q5_1:
-                case GGML_TYPE_Q8_0:
-                case GGML_TYPE_Q2_K:
-                case GGML_TYPE_Q3_K:
-                case GGML_TYPE_Q4_K:
-                case GGML_TYPE_Q5_K:
-                case GGML_TYPE_Q6_K:
-                    {
-                        cur_size = ggml_quantize_chunk((ggml_type) ttype, data_f32.data(), work.data(), 0, nelements, hist_cur.data());
-                    } break;
-                case GGML_TYPE_F32:
-                case GGML_TYPE_F16:
-                case GGML_TYPE_I8:
-                case GGML_TYPE_I16:
-                case GGML_TYPE_I32:
-                case GGML_TYPE_Q8_1:
-                case GGML_TYPE_Q8_K:
-                case GGML_TYPE_IQ2_XXS:
-                case GGML_TYPE_IQ2_XS:
-                case GGML_TYPE_COUNT:
-                    {
-                        fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
-                        return false;
-                    }
-            }
-
-            fout.write(reinterpret_cast<char *>(work.data()), cur_size);
-            total_size_new += cur_size;
-
-            printf("size = %8.2f MB -> %8.2f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
-            for (int i = 0; i < (int) hist_cur.size(); ++i) {
-                hist_all[i] += hist_cur[i];
-            }
-
-            for (int i = 0; i < (int) hist_cur.size(); ++i) {
-                printf("%5.3f ", hist_cur[i] / (float)nelements);
-            }
-            printf("\n");
-        } else {
-            printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0);
-            fout.write(reinterpret_cast<char *>(data_u8.data()), data_u8.size());
-            total_size_new += data_u8.size();
-        }
-
-        total_size_org += nelements * sizeof(float);
-    }
-
-    printf("%s: model size  = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
-    printf("%s: quant size  = %8.2f MB | ftype = %d (%s)\n", __func__, total_size_new/1024.0/1024.0, ftype, ggml_type_name(qtype));
-
-    {
-        int64_t sum_all = 0;
-        for (int i = 0; i < (int) hist_all.size(); ++i) {
-            sum_all += hist_all[i];
-        }
-
-        printf("%s: hist: ", __func__);
-        for (int i = 0; i < (int) hist_all.size(); ++i) {
-            printf("%5.3f ", hist_all[i] / (float)sum_all);
-        }
-        printf("\n");
-    }
-
-    return true;
-}
diff --git a/ggml/examples/common-ggml.h b/ggml/examples/common-ggml.h
deleted file mode 100644
index 477de34..0000000
--- a/ggml/examples/common-ggml.h
+++ /dev/null
@@ -1,18 +0,0 @@
-#pragma once
-
-#include "ggml.h"
-
-#include <fstream>
-#include <vector>
-#include <string>
-
-enum ggml_ftype ggml_parse_ftype(const char * str);
-
-void ggml_print_ftypes(FILE * fp = stderr);
-
-bool ggml_common_quantize_0(
-        std::ifstream & finp,
-        std::ofstream & fout,
-        const ggml_ftype ftype,
-        const std::vector<std::string> & to_quant,
-        const std::vector<std::string> & to_skip);
diff --git a/ggml/examples/common.cpp b/ggml/examples/common.cpp
deleted file mode 100644
index 603c655..0000000
--- a/ggml/examples/common.cpp
+++ /dev/null
@@ -1,817 +0,0 @@
-#define _USE_MATH_DEFINES // for M_PI
-
-#include "common.h"
-
-// third-party utilities
-// use your favorite implementations
-#define DR_WAV_IMPLEMENTATION
-#include "dr_wav.h"
-
-#include <cmath>
-#include <cstring>
-#include <fstream>
-#include <regex>
-#include <locale>
-#include <codecvt>
-#include <sstream>
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-// Function to check if the next argument exists
-std::string get_next_arg(int& i, int argc, char** argv, const std::string& flag, gpt_params& params) {
-    if (i + 1 < argc && argv[i + 1][0] != '-') {
-        return argv[++i];
-    } else {
-        fprintf(stderr, "error: %s requires one argument.\n", flag.c_str());
-        gpt_print_usage(argc, argv, params);
-        exit(0);
-    }
-}
-
-bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
-    for (int i = 1; i < argc; i++) {
-        std::string arg = argv[i];
-
-        if (arg == "-s" || arg == "--seed") {
-            params.seed = std::stoi(get_next_arg(i, argc, argv, arg, params));
-        } else if (arg == "-t" || arg == "--threads") {
-            params.n_threads = std::stoi(get_next_arg(i, argc, argv, arg, params));
-        } else if (arg == "-p" || arg == "--prompt") {
-            params.prompt = get_next_arg(i, argc, argv, arg, params);
-        } else if (arg == "-n" || arg == "--n_predict") {
-            params.n_predict = std::stoi(get_next_arg(i, argc, argv, arg, params));
-        } else if (arg == "-np" || arg == "--n_parallel") {
-            params.n_parallel = std::stoi(get_next_arg(i, argc, argv, arg, params));
-        } else if (arg == "--top_k") {
-            params.top_k = std::stoi(get_next_arg(i, argc, argv, arg, params));
-        } else if (arg == "--top_p") {
-            params.top_p = std::stof(get_next_arg(i, argc, argv, arg, params));
-        } else if (arg == "--temp") {
-            params.temp = std::stof(get_next_arg(i, argc, argv, arg, params));
-        } else if (arg == "--repeat-last-n") {
-            params.repeat_last_n = std::stoi(get_next_arg(i, argc, argv, arg, params));
-        } else if (arg == "--repeat-penalty") {
-            params.repeat_penalty = std::stof(get_next_arg(i, argc, argv, arg, params));
-        } else if (arg == "-b" || arg == "--batch_size") {
-            params.n_batch= std::stoi(get_next_arg(i, argc, argv, arg, params));
-        } else if (arg == "-c" || arg == "--context") {
-            params.n_ctx= std::stoi(get_next_arg(i, argc, argv, arg, params));
-        } else if (arg == "-ngl" || arg == "--gpu-layers" || arg == "--n-gpu-layers") {
-            params.n_gpu_layers = std::stoi(get_next_arg(i, argc, argv, arg, params));
-        } else if (arg == "--ignore-eos") {
-            params.ignore_eos = true;
-        } else if (arg == "-m" || arg == "--model") {
-            params.model = get_next_arg(i, argc, argv, arg, params);
-        } else if (arg == "-i" || arg == "--interactive") {
-            params.interactive = true;
-        } else if (arg == "-ip" || arg == "--interactive-port") {
-            params.interactive = true;
-            params.interactive_port = std::stoi(get_next_arg(i, argc, argv, arg, params));
-        } else if (arg == "-h" || arg == "--help") {
-            gpt_print_usage(argc, argv, params);
-            exit(0);
-        } else if (arg == "-f" || arg == "--file") {
-            get_next_arg(i, argc, argv, arg, params);
-            std::ifstream file(argv[i]);
-            if (!file) {
-                fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
-                break;
-            }
-            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
-            if (params.prompt.back() == '\n') {
-                params.prompt.pop_back();
-            }
-        } else if (arg == "-tt" || arg == "--token_test") {
-            params.token_test = get_next_arg(i, argc, argv, arg, params);
-        }
-        else {
-            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
-            gpt_print_usage(argc, argv, params);
-            exit(0);
-        }
-    }
-
-    return true;
-}
-
-void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
-    fprintf(stderr, "usage: %s [options]\n", argv[0]);
-    fprintf(stderr, "\n");
-    fprintf(stderr, "options:\n");
-    fprintf(stderr, "  -h, --help            show this help message and exit\n");
-    fprintf(stderr, "  -s SEED, --seed SEED  RNG seed (default: -1)\n");
-    fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
-    fprintf(stderr, "  -p PROMPT, --prompt PROMPT\n");
-    fprintf(stderr, "                        prompt to start generation with (default: random)\n");
-    fprintf(stderr, "  -f FNAME, --file FNAME\n");
-    fprintf(stderr, "                        load prompt from a file\n");
-    fprintf(stderr, "  -tt TOKEN_TEST, --token_test TOKEN_TEST\n");
-    fprintf(stderr, "                        test tokenization\n");
-    fprintf(stderr, "  -n N, --n_predict N   number of tokens to predict (default: %d)\n", params.n_predict);
-    fprintf(stderr, "  --top_k N             top-k sampling (default: %d)\n", params.top_k);
-    fprintf(stderr, "  --top_p N             top-p sampling (default: %.1f)\n", params.top_p);
-    fprintf(stderr, "  --temp N              temperature (default: %.1f)\n", params.temp);
-    fprintf(stderr, "  --repeat-last-n N     last n tokens to consider for penalize (default: %d, 0 = disabled)\n", params.repeat_last_n);
-    fprintf(stderr, "  --repeat-penalty N    penalize repeat sequence of tokens (default: %.2f, 1.0 = disabled)\n", (double)params.repeat_penalty);
-    fprintf(stderr, "  -b N, --batch_size N  batch size for prompt processing (default: %d)\n", params.n_batch);
-    fprintf(stderr, "  -c N, --context N     context / KV cache size (default: %d)\n", params.n_ctx);
-    fprintf(stderr, "  --ignore-eos          ignore EOS token during generation\n");
-    fprintf(stderr, "  -ngl N, --gpu-layers N  number of layers to offload to GPU on supported models (default: %d)\n", params.n_gpu_layers);
-    fprintf(stderr, "  -m FNAME, --model FNAME\n");
-    fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
-    fprintf(stderr, "\n");
-}
-
-std::string gpt_random_prompt(std::mt19937 & rng) {
-    const int r = rng() % 10;
-    switch (r) {
-        case 0: return "So";
-        case 1: return "Once upon a time";
-        case 2: return "When";
-        case 3: return "The";
-        case 4: return "After";
-        case 5: return "If";
-        case 6: return "import";
-        case 7: return "He";
-        case 8: return "She";
-        case 9: return "They";
-        default: return "To";
-    }
-
-    return "The";
-}
-
-std::string trim(const std::string & s) {
-    std::regex e("^\\s+|\\s+$");
-    return std::regex_replace(s, e, "");
-}
-
-std::string replace(const std::string & s, const std::string & from, const std::string & to) {
-    std::string result = s;
-    size_t pos = 0;
-    while ((pos = result.find(from, pos)) != std::string::npos) {
-        result.replace(pos, from.length(), to);
-        pos += to.length();
-    }
-    return result;
-}
-
-void gpt_vocab::add_special_token(const std::string & token) {
-    special_tokens.push_back(token);
-}
-
-std::map<std::string, int32_t> json_parse(const std::string & fname) {
-    std::map<std::string, int32_t> result;
-
-    // read file into string
-    std::string json;
-    {
-        std::ifstream ifs(fname);
-        if (!ifs) {
-            fprintf(stderr, "Failed to open %s\n", fname.c_str());
-            exit(1);
-        }
-
-        json = std::string((std::istreambuf_iterator<char>(ifs)),
-                (std::istreambuf_iterator<char>()));
-    }
-
-    if (json[0] != '{') {
-        return result;
-    }
-
-    // parse json
-    {
-        bool has_key  = false;
-        bool in_token = false;
-
-        std::string str_key = "";
-        std::string str_val = "";
-
-        int n = json.size();
-        for (int i = 1; i < n; ++i) {
-            if (!in_token) {
-                if (json[i] == ' ') continue;
-                if (json[i] == '"') {
-                    in_token = true;
-                    continue;
-                }
-            } else {
-                if (json[i] == '\\' && i+1 < n) {
-                    if (has_key == false) {
-                        str_key += json[i];
-                    } else {
-                        str_val += json[i];
-                    }
-                    ++i;
-                } else if (json[i] == '"') {
-                    if (has_key == false) {
-                        has_key = true;
-                        ++i;
-                        while (json[i] == ' ') ++i;
-                        ++i; // :
-                        while (json[i] == ' ') ++i;
-                        if (json[i] != '\"') {
-                            while (json[i] != ',' && json[i] != '}') {
-                                str_val += json[i++];
-                            }
-                            has_key = false;
-                        } else {
-                            in_token = true;
-                            continue;
-                        }
-                    } else {
-                        has_key = false;
-                    }
-
-                    str_key = ::replace(str_key, "\\u0120", " " ); // \u0120 -> space
-                    str_key = ::replace(str_key, "\\u010a", "\n"); // \u010a -> new line
-                    str_key = ::replace(str_key, "\\\"",    "\""); // \\\"   -> "
-
-                    try {
-                        result[str_key] = std::stoi(str_val);
-                    } catch (...) {
-                        //fprintf(stderr, "%s: ignoring key '%s' with value '%s'\n", fname.c_str(), str_key.c_str(), str_val.c_str());
-
-                    }
-                    str_key = "";
-                    str_val = "";
-                    in_token = false;
-                    continue;
-                }
-                if (has_key == false) {
-                    str_key += json[i];
-                } else {
-                    str_val += json[i];
-                }
-            }
-        }
-    }
-
-    return result;
-}
-
-std::string convert_to_utf8(const std::wstring & input) {
-    std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
-    return converter.to_bytes(input);
-}
-
-
-std::wstring convert_to_wstring(const std::string & input) {
-    std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
-    return converter.from_bytes(input);
-}
-
-void gpt_split_words(std::string str, std::vector<std::string>& words) {
-    const std::string pattern = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
-    const std::regex re(pattern);
-    std::smatch m;
-
-    while (std::regex_search(str, m, re)) {
-        for (auto x : m) {
-            words.push_back(x);
-        }
-        str = m.suffix();
-    }
-}
-
-std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
-    std::vector<std::string> words;
-
-    // first split the text into words
-    {
-        std::string str = text;
-
-        // Generate the subpattern from the special_tokens vector if it's not empty
-        if (!vocab.special_tokens.empty()) {
-            const std::regex escape(R"([\[\\\^\$\.\|\?\*\+\(\)\{\}])");
-            std::string special_tokens_subpattern;
-            for (const auto & token : vocab.special_tokens) {
-                if (!special_tokens_subpattern.empty()) {
-                    special_tokens_subpattern += "|";
-                }
-                special_tokens_subpattern += std::regex_replace(token, escape, R"(\$&)");
-            }
-
-            std::regex re(special_tokens_subpattern);
-            std::smatch m;
-            // Split the text by special tokens.
-            while (std::regex_search(str, m, re)) {
-                // Split the substrings in-between special tokens into words.
-                gpt_split_words(m.prefix(), words);
-                // Add matched special tokens as words.
-                for (auto x : m) {
-                    words.push_back(x);
-                }
-                str = m.suffix();
-            }
-            // Remaining text without special tokens will be handled below.
-        }
-
-        gpt_split_words(str, words);
-    }
-
-    // find the longest token that forms each word in words:
-    std::vector<gpt_vocab::id> tokens;
-    for (const auto & word : words) {
-        for (int i = 0; i < (int) word.size(); ){
-            for (int j = word.size() - 1; j >= i; j--){
-                auto cand = word.substr(i, j-i+1);
-                auto it = vocab.token_to_id.find(cand);
-                if (it != vocab.token_to_id.end()){ // word.substr(i, j-i+1) in vocab
-                    tokens.push_back(it->second);
-                    i = j + 1;
-                    break;
-                }
-                else if (j == i){ // word.substr(i, 1) has no matching
-                    fprintf(stderr, "%s: unknown token '%s'\n", __func__, word.substr(i, 1).data());
-                    i++;
-                }
-            }
-        }
-    }
-
-    return tokens;
-}
-
-std::vector<gpt_vocab::id> parse_tokens_from_string(const std::string& input, char delimiter) {
-    std::vector<gpt_vocab::id> output;
-    std::stringstream ss(input);
-    std::string token;
-
-    while (std::getline(ss, token, delimiter)) {
-        output.push_back(std::stoi(token));
-    }
-
-    return output;
-}
-
-std::map<std::string, std::vector<gpt_vocab::id>> extract_tests_from_file(const std::string & fpath_test){
-    if (fpath_test.empty()){
-        fprintf(stderr, "%s : No test file found.\n", __func__);
-        return std::map<std::string, std::vector<gpt_vocab::id>>();
-    }
-
-    std::map<std::string, std::vector<gpt_vocab::id>> tests;
-
-    auto fin = std::ifstream(fpath_test, std::ios_base::in);
-    const char * delimeter = " => ";
-    const char del_tok = ',';
-    std::string line;
-    while (std::getline(fin, line)) {
-        size_t delimiterPos = line.find(delimeter);
-        if (delimiterPos != std::string::npos) {
-            std::string text = line.substr(0, delimiterPos);
-            std::string s_tokens = line.substr(delimiterPos + std::strlen(delimeter));
-            tests[text] = parse_tokens_from_string(s_tokens, del_tok);
-        }
-    }
-    return tests;
-}
-
-void test_gpt_tokenizer(gpt_vocab & vocab, const std::string & fpath_test){
-    std::map<std::string, std::vector<gpt_vocab::id>> tests = extract_tests_from_file(fpath_test);
-
-    size_t n_fails = 0;
-
-    for (const auto & test : tests) {
-        std::vector<gpt_vocab::id> tokens = gpt_tokenize(vocab, test.first);
-
-        if (tokens != test.second){
-            n_fails++;
-
-            // print out failure cases
-            fprintf(stderr, "%s : failed test: '%s'\n", __func__, test.first.c_str());
-            fprintf(stderr, "%s : tokens in hf:   ", __func__);
-            for (const auto & t : test.second) {
-                fprintf(stderr, "%s(%d), ", vocab.id_to_token[t].c_str(), t);
-            }
-            fprintf(stderr, "\n");
-            fprintf(stderr, "%s : tokens in ggml: ", __func__);
-            for (const auto & t : tokens) {
-                fprintf(stderr, "%s(%d), ", vocab.id_to_token[t].c_str(), t);
-            }
-            fprintf(stderr, "\n");
-        }
-    }
-
-    fprintf(stderr, "%s : %zu tests failed out of %zu tests.\n", __func__, n_fails, tests.size());
-}
-
-bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) {
-    printf("%s: loading vocab from '%s'\n", __func__, fname.c_str());
-
-    vocab.token_to_id = ::json_parse(fname);
-
-    for (const auto & kv : vocab.token_to_id) {
-        vocab.id_to_token[kv.second] = kv.first;
-    }
-
-    printf("%s: vocab size = %d\n", __func__, (int) vocab.token_to_id.size());
-
-    // print the vocabulary
-    //for (auto kv : vocab.token_to_id) {
-    //    printf("'%s' -> %d\n", kv.first.data(), kv.second);
-    //}
-
-    return true;
-}
-
-gpt_vocab::id gpt_sample_top_k_top_p(
-        const gpt_vocab & vocab,
-        const float * logits,
-        int    top_k,
-        double top_p,
-        double temp,
-        std::mt19937 & rng) {
-    int n_logits = vocab.id_to_token.size();
-
-    std::vector<std::pair<double, gpt_vocab::id>> logits_id;
-    logits_id.reserve(n_logits);
-
-    {
-        const double scale = 1.0/temp;
-        for (int i = 0; i < n_logits; ++i) {
-            logits_id.push_back(std::make_pair(logits[i]*scale, i));
-        }
-    }
-
-    // find the top K tokens
-    std::partial_sort(
-            logits_id.begin(),
-            logits_id.begin() + top_k, logits_id.end(),
-            [](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) {
-        return a.first > b.first;
-    });
-
-    logits_id.resize(top_k);
-
-    double maxl = -INFINITY;
-    for (const auto & kv : logits_id) {
-        maxl = std::max(maxl, kv.first);
-    }
-
-    // compute probs for the top K tokens
-    std::vector<double> probs;
-    probs.reserve(logits_id.size());
-
-    double sum = 0.0;
-    for (const auto & kv : logits_id) {
-        double p = exp(kv.first - maxl);
-        probs.push_back(p);
-        sum += p;
-    }
-
-    // normalize the probs
-    for (auto & p : probs) {
-        p /= sum;
-    }
-
-    if (top_p < 1.0f) {
-        double cumsum = 0.0f;
-        for (int i = 0; i < top_k; i++) {
-            cumsum += probs[i];
-            if (cumsum >= top_p) {
-                top_k = i + 1;
-                probs.resize(top_k);
-                logits_id.resize(top_k);
-                break;
-            }
-        }
-
-        cumsum = 1.0/cumsum;
-        for (int i = 0; i < (int) probs.size(); i++) {
-            probs[i] *= cumsum;
-        }
-    }
-
-    //printf("\n");
-    //for (int i = 0; i < (int) probs.size(); i++) {
-    //    printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]);
-    //}
-    //exit(0);
-
-    std::discrete_distribution<> dist(probs.begin(), probs.end());
-    int idx = dist(rng);
-
-    return logits_id[idx].second;
-}
-
-gpt_vocab::id gpt_sample_top_k_top_p_repeat(
-        const gpt_vocab & vocab,
-        const float * logits,
-        const int32_t * last_n_tokens_data,
-        size_t last_n_tokens_data_size,
-        int    top_k,
-        double top_p,
-        double temp,
-        int repeat_last_n,
-        float repeat_penalty,
-        std::mt19937 & rng) {
-
-    int n_logits = vocab.id_to_token.size();
-
-    const auto * plogits = logits;
-
-    const auto last_n_tokens = std::vector<int32_t>(last_n_tokens_data, last_n_tokens_data + last_n_tokens_data_size);
-
-    if (temp <= 0) {
-        // select the token with the highest logit directly
-        float max_logit = plogits[0];
-        gpt_vocab::id max_id = 0;
-
-        for (int i = 1; i < n_logits; ++i) {
-            if (plogits[i] > max_logit) {
-                max_logit = plogits[i];
-                max_id = i;
-            }
-        }
-        return max_id;
-    }
-
-
-    std::vector<std::pair<double, gpt_vocab::id>> logits_id;
-    logits_id.reserve(n_logits);
-
-    {
-        const float scale = 1.0f/temp;
-        for (int i = 0; i < n_logits; ++i) {
-            // repetition penalty from ctrl paper (https://arxiv.org/abs/1909.05858)
-            // credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main
-            if (repeat_last_n > 0 && std::find(last_n_tokens.end()-repeat_last_n, last_n_tokens.end(), i) != last_n_tokens.end()) {
-                // if score < 0 then repetition penalty has to multiplied to reduce the previous token probability
-                if (plogits[i] < 0.0f) {
-                    logits_id.push_back(std::make_pair(plogits[i]*scale*repeat_penalty, i));
-                } else {
-                    logits_id.push_back(std::make_pair(plogits[i]*scale/repeat_penalty, i));
-                }
-            } else {
-                logits_id.push_back(std::make_pair(plogits[i]*scale, i));
-            }
-        }
-    }
-
-    // find the top K tokens
-    std::partial_sort(
-            logits_id.begin(),
-            logits_id.begin() + top_k, logits_id.end(),
-            [](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) {
-        return a.first > b.first;
-    });
-
-    logits_id.resize(top_k);
-
-    double maxl = -INFINITY;
-    for (const auto & kv : logits_id) {
-        maxl = std::max(maxl, kv.first);
-    }
-
-    // compute probs for the top K tokens
-    std::vector<double> probs;
-    probs.reserve(logits_id.size());
-
-    double sum = 0.0;
-    for (const auto & kv : logits_id) {
-        double p = exp(kv.first - maxl);
-        probs.push_back(p);
-        sum += p;
-    }
-
-    // normalize the probs
-    for (auto & p : probs) {
-        p /= sum;
-    }
-
-    if (top_p < 1.0f) {
-        double cumsum = 0.0f;
-        for (int i = 0; i < top_k; i++) {
-            cumsum += probs[i];
-            if (cumsum >= top_p) {
-                top_k = i + 1;
-                probs.resize(top_k);
-                logits_id.resize(top_k);
-                break;
-            }
-        }
-
-        cumsum = 1.0/cumsum;
-        for (int i = 0; i < (int) probs.size(); i++) {
-            probs[i] *= cumsum;
-        }
-    }
-
-//    printf("\n");
-//    for (int i = 0; i < (int) probs.size(); i++) {
-//    for (int i = 0; i < 10; i++) {
-//        printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]);
-//    }
-
-    std::discrete_distribution<> dist(probs.begin(), probs.end());
-    int idx = dist(rng);
-
-    return logits_id[idx].second;
-
-}
-
-bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) {
-    drwav wav;
-    std::vector<uint8_t> wav_data; // used for pipe input from stdin
-
-    if (fname == "-") {
-        {
-            uint8_t buf[1024];
-            while (true)
-            {
-                const size_t n = fread(buf, 1, sizeof(buf), stdin);
-                if (n == 0) {
-                    break;
-                }
-                wav_data.insert(wav_data.end(), buf, buf + n);
-            }
-        }
-
-        if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) {
-            fprintf(stderr, "error: failed to open WAV file from stdin\n");
-            return false;
-        }
-
-        fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size());
-    }
-    else if (drwav_init_file(&wav, fname.c_str(), nullptr) == false) {
-        fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname.c_str());
-        return false;
-    }
-
-    if (wav.channels != 1 && wav.channels != 2) {
-        fprintf(stderr, "%s: WAV file '%s' must be mono or stereo\n", __func__, fname.c_str());
-        return false;
-    }
-
-    if (stereo && wav.channels != 2) {
-        fprintf(stderr, "%s: WAV file '%s' must be stereo for diarization\n", __func__, fname.c_str());
-        return false;
-    }
-
-    if (wav.sampleRate != COMMON_SAMPLE_RATE) {
-        fprintf(stderr, "%s: WAV file '%s' must be %i kHz\n", __func__, fname.c_str(), COMMON_SAMPLE_RATE/1000);
-        return false;
-    }
-
-    if (wav.bitsPerSample != 16) {
-        fprintf(stderr, "%s: WAV file '%s' must be 16-bit\n", __func__, fname.c_str());
-        return false;
-    }
-
-    const uint64_t n = wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size()/(wav.channels*wav.bitsPerSample/8);
-
-    std::vector<int16_t> pcm16;
-    pcm16.resize(n*wav.channels);
-    drwav_read_pcm_frames_s16(&wav, n, pcm16.data());
-    drwav_uninit(&wav);
-
-    // convert to mono, float
-    pcmf32.resize(n);
-    if (wav.channels == 1) {
-        for (uint64_t i = 0; i < n; i++) {
-            pcmf32[i] = float(pcm16[i])/32768.0f;
-        }
-    } else {
-        for (uint64_t i = 0; i < n; i++) {
-            pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
-        }
-    }
-
-    if (stereo) {
-        // convert to stereo, float
-        pcmf32s.resize(2);
-
-        pcmf32s[0].resize(n);
-        pcmf32s[1].resize(n);
-        for (uint64_t i = 0; i < n; i++) {
-            pcmf32s[0][i] = float(pcm16[2*i])/32768.0f;
-            pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f;
-        }
-    }
-
-    return true;
-}
-
-void high_pass_filter(std::vector<float> & data, float cutoff, float sample_rate) {
-    const float rc = 1.0f / (2.0f * M_PI * cutoff);
-    const float dt = 1.0f / sample_rate;
-    const float alpha = dt / (rc + dt);
-
-    float y = data[0];
-
-    for (size_t i = 1; i < data.size(); i++) {
-        y = alpha * (y + data[i] - data[i - 1]);
-        data[i] = y;
-    }
-}
-
-bool vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) {
-    const int n_samples      = pcmf32.size();
-    const int n_samples_last = (sample_rate * last_ms) / 1000;
-
-    if (n_samples_last >= n_samples) {
-        // not enough samples - assume no speech
-        return false;
-    }
-
-    if (freq_thold > 0.0f) {
-        high_pass_filter(pcmf32, freq_thold, sample_rate);
-    }
-
-    float energy_all  = 0.0f;
-    float energy_last = 0.0f;
-
-    for (int i = 0; i < n_samples; i++) {
-        energy_all += fabsf(pcmf32[i]);
-
-        if (i >= n_samples - n_samples_last) {
-            energy_last += fabsf(pcmf32[i]);
-        }
-    }
-
-    energy_all  /= n_samples;
-    energy_last /= n_samples_last;
-
-    if (verbose) {
-        fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold);
-    }
-
-    if (energy_last > vad_thold*energy_all) {
-        return false;
-    }
-
-    return true;
-}
-
-float similarity(const std::string & s0, const std::string & s1) {
-    const size_t len0 = s0.size() + 1;
-    const size_t len1 = s1.size() + 1;
-
-    std::vector<int> col(len1, 0);
-    std::vector<int> prevCol(len1, 0);
-
-    for (size_t i = 0; i < len1; i++) {
-        prevCol[i] = i;
-    }
-
-    for (size_t i = 0; i < len0; i++) {
-        col[0] = i;
-        for (size_t j = 1; j < len1; j++) {
-            col[j] = std::min(std::min(1 + col[j - 1], 1 + prevCol[j]), prevCol[j - 1] + (i > 0 && s0[i - 1] == s1[j - 1] ? 0 : 1));
-        }
-        col.swap(prevCol);
-    }
-
-    const float dist = prevCol[len1 - 1];
-
-    return 1.0f - (dist / std::max(s0.size(), s1.size()));
-}
-
-bool sam_params_parse(int argc, char ** argv, sam_params & params) {
-    for (int i = 1; i < argc; i++) {
-        std::string arg = argv[i];
-
-        if (arg == "-s" || arg == "--seed") {
-            params.seed = std::stoi(argv[++i]);
-        } else if (arg == "-t" || arg == "--threads") {
-            params.n_threads = std::stoi(argv[++i]);
-        } else if (arg == "-m" || arg == "--model") {
-            params.model = argv[++i];
-        } else if (arg == "-i" || arg == "--inp") {
-            params.fname_inp = argv[++i];
-        } else if (arg == "-o" || arg == "--out") {
-            params.fname_out = argv[++i];
-        } else if (arg == "-h" || arg == "--help") {
-            sam_print_usage(argc, argv, params);
-            exit(0);
-        } else {
-            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
-            sam_print_usage(argc, argv, params);
-            exit(0);
-        }
-    }
-
-    return true;
-}
-
-void sam_print_usage(int /*argc*/, char ** argv, const sam_params & params) {
-    fprintf(stderr, "usage: %s [options]\n", argv[0]);
-    fprintf(stderr, "\n");
-    fprintf(stderr, "options:\n");
-    fprintf(stderr, "  -h, --help            show this help message and exit\n");
-    fprintf(stderr, "  -s SEED, --seed SEED  RNG seed (default: -1)\n");
-    fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
-    fprintf(stderr, "  -m FNAME, --model FNAME\n");
-    fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
-    fprintf(stderr, "  -i FNAME, --inp FNAME\n");
-    fprintf(stderr, "                        input file (default: %s)\n", params.fname_inp.c_str());
-    fprintf(stderr, "  -o FNAME, --out FNAME\n");
-    fprintf(stderr, "                        output file (default: %s)\n", params.fname_out.c_str());
-    fprintf(stderr, "\n");
-}
diff --git a/ggml/examples/common.h b/ggml/examples/common.h
deleted file mode 100644
index 54f0b00..0000000
--- a/ggml/examples/common.h
+++ /dev/null
@@ -1,279 +0,0 @@
-// Various helper functions and utilities
-
-#pragma once
-
-#include <string>
-#include <map>
-#include <vector>
-#include <random>
-#include <thread>
-#include <ctime>
-#include <fstream>
-
-#define COMMON_SAMPLE_RATE 16000
-
-//
-// GPT CLI argument parsing
-//
-
-struct gpt_params {
-    int32_t seed         = -1;   // RNG seed
-    int32_t n_threads    = std::min(4, (int32_t) std::thread::hardware_concurrency());
-    int32_t n_predict    = 200;  // new tokens to predict
-    int32_t n_parallel   = 1;    // number of parallel streams
-    int32_t n_batch      = 8;    // batch size for prompt processing
-    int32_t n_ctx        = 2048; // context size (this is the KV cache max size)
-    int32_t n_gpu_layers = 0;    // number of layers to offlload to the GPU
-
-    bool ignore_eos = false; // ignore EOS token when generating text
-
-    // sampling parameters
-    int32_t top_k          = 40;
-    float   top_p          = 0.9f;
-    float   temp           = 0.9f;
-    int32_t repeat_last_n  = 64;
-    float   repeat_penalty = 1.00f;
-
-    std::string model      = "models/gpt-2-117M/ggml-model.bin"; // model path
-    std::string prompt     = "";
-    std::string token_test = "";
-
-    bool    interactive      = false;
-    int32_t interactive_port = -1;
-};
-
-bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
-
-void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
-
-std::string gpt_random_prompt(std::mt19937 & rng);
-
-//
-// Vocab utils
-//
-
-std::string trim(const std::string & s);
-
-std::string replace(
-        const std::string & s,
-        const std::string & from,
-        const std::string & to);
-
-struct gpt_vocab {
-    using id    = int32_t;
-    using token = std::string;
-
-    std::map<token, id> token_to_id;
-    std::map<id, token> id_to_token;
-    std::vector<std::string> special_tokens;
-
-    void add_special_token(const std::string & token);
-};
-
-// poor-man's JSON parsing
-std::map<std::string, int32_t> json_parse(const std::string & fname);
-
-std::string convert_to_utf8(const std::wstring & input);
-
-std::wstring convert_to_wstring(const std::string & input);
-
-void gpt_split_words(std::string str, std::vector<std::string>& words);
-
-// split text into tokens
-//
-// ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53
-//
-// Regex (Python):
-// r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
-//
-// Regex (C++):
-// R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)"
-//
-std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text);
-
-// test outputs of gpt_tokenize
-//
-//   - compare with tokens generated by the huggingface tokenizer
-//   - test cases are chosen based on the model's main language (under 'prompt' directory)
-//   - if all sentences are tokenized identically, print 'All tests passed.'
-//   - otherwise, print sentence, huggingface tokens, ggml tokens
-//
-void test_gpt_tokenizer(gpt_vocab & vocab, const std::string & fpath_test);
-
-// load the tokens from encoder.json
-bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab);
-
-// sample next token given probabilities for each embedding
-//
-//   - consider only the top K tokens
-//   - from them, consider only the top tokens with cumulative probability > P
-//
-// TODO: not sure if this implementation is correct
-// TODO: temperature is not implemented
-//
-gpt_vocab::id gpt_sample_top_k_top_p(
-        const gpt_vocab & vocab,
-        const float * logits,
-        int    top_k,
-        double top_p,
-        double temp,
-        std::mt19937 & rng);
-
-gpt_vocab::id gpt_sample_top_k_top_p_repeat(
-        const gpt_vocab & vocab,
-        const float * logits,
-        const int32_t * last_n_tokens_data,
-        size_t last_n_tokens_data_size,
-        int    top_k,
-        double top_p,
-        double temp,
-        int repeat_last_n,
-        float repeat_penalty,
-        std::mt19937 & rng);
-
-//
-// Audio utils
-//
-
-// Read WAV audio file and store the PCM data into pcmf32
-// The sample rate of the audio must be equal to COMMON_SAMPLE_RATE
-// If stereo flag is set and the audio has 2 channels, the pcmf32s will contain 2 channel PCM
-bool read_wav(
-        const std::string & fname,
-        std::vector<float> & pcmf32,
-        std::vector<std::vector<float>> & pcmf32s,
-        bool stereo);
-
-// Write PCM data into WAV audio file
-class wav_writer {
-private:
-    std::ofstream file;
-    uint32_t dataSize = 0;
-    std::string wav_filename;
-
-    bool write_header(const uint32_t sample_rate,
-                      const uint16_t bits_per_sample,
-                      const uint16_t channels) {
-
-        file.write("RIFF", 4);
-        file.write("\0\0\0\0", 4);    // Placeholder for file size
-        file.write("WAVE", 4);
-        file.write("fmt ", 4);
-
-        const uint32_t sub_chunk_size = 16;
-        const uint16_t audio_format = 1;      // PCM format
-        const uint32_t byte_rate = sample_rate * channels * bits_per_sample / 8;
-        const uint16_t block_align = channels * bits_per_sample / 8;
-
-        file.write(reinterpret_cast<const char *>(&sub_chunk_size), 4);
-        file.write(reinterpret_cast<const char *>(&audio_format), 2);
-        file.write(reinterpret_cast<const char *>(&channels), 2);
-        file.write(reinterpret_cast<const char *>(&sample_rate), 4);
-        file.write(reinterpret_cast<const char *>(&byte_rate), 4);
-        file.write(reinterpret_cast<const char *>(&block_align), 2);
-        file.write(reinterpret_cast<const char *>(&bits_per_sample), 2);
-        file.write("data", 4);
-        file.write("\0\0\0\0", 4);    // Placeholder for data size
-
-        return true;
-    }
-
-    // It is assumed that PCM data is normalized to a range from -1 to 1
-    bool write_audio(const float * data, size_t length) {
-        for (size_t i = 0; i < length; ++i) {
-            const int16_t intSample = data[i] * 32767;
-            file.write(reinterpret_cast<const char *>(&intSample), sizeof(int16_t));
-            dataSize += sizeof(int16_t);
-        }
-        if (file.is_open()) {
-            file.seekp(4, std::ios::beg);
-            uint32_t fileSize = 36 + dataSize;
-            file.write(reinterpret_cast<char *>(&fileSize), 4);
-            file.seekp(40, std::ios::beg);
-            file.write(reinterpret_cast<char *>(&dataSize), 4);
-            file.seekp(0, std::ios::end);
-        }
-        return true;
-    }
-
-    bool open_wav(const std::string & filename) {
-        if (filename != wav_filename) {
-            if (file.is_open()) {
-                file.close();
-            }
-        }
-        if (!file.is_open()) {
-            file.open(filename, std::ios::binary);
-            wav_filename = filename;
-            dataSize = 0;
-        }
-        return file.is_open();
-    }
-
-public:
-    bool open(const std::string & filename,
-              const    uint32_t   sample_rate,
-              const    uint16_t   bits_per_sample,
-              const    uint16_t   channels) {
-
-        if (open_wav(filename)) {
-            write_header(sample_rate, bits_per_sample, channels);
-        } else {
-            return false;
-        }
-
-        return true;
-    }
-
-    bool close() {
-        file.close();
-        return true;
-    }
-
-    bool write(const float * data, size_t length) {
-        return write_audio(data, length);
-    }
-
-    ~wav_writer() {
-        if (file.is_open()) {
-            file.close();
-        }
-    }
-};
-
-
-// Apply a high-pass frequency filter to PCM audio
-// Suppresses frequencies below cutoff Hz
-void high_pass_filter(
-        std::vector<float> & data,
-        float cutoff,
-        float sample_rate);
-
-// Basic voice activity detection (VAD) using audio energy adaptive threshold
-bool vad_simple(
-        std::vector<float> & pcmf32,
-        int   sample_rate,
-        int   last_ms,
-        float vad_thold,
-        float freq_thold,
-        bool  verbose);
-
-// compute similarity between two strings using Levenshtein distance
-float similarity(const std::string & s0, const std::string & s1);
-
-//
-// SAM argument parsing
-//
-
-struct sam_params {
-    int32_t seed      = -1; // RNG seed
-    int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
-
-    std::string model     = "models/sam-vit-b/ggml-model-f16.bin"; // model path
-    std::string fname_inp = "img.jpg";
-    std::string fname_out = "img.out";
-};
-
-bool sam_params_parse(int argc, char ** argv, sam_params & params);
-
-void sam_print_usage(int argc, char ** argv, const sam_params & params);
diff --git a/ggml/examples/dolly-v2/CMakeLists.txt b/ggml/examples/dolly-v2/CMakeLists.txt
deleted file mode 100644
index b2d5556..0000000
--- a/ggml/examples/dolly-v2/CMakeLists.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-#
-# dollyv2
-
-set(TEST_TARGET dollyv2)
-add_executable(${TEST_TARGET} main.cpp)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)
-
-#
-# dollyv2-quantize
-
-set(TEST_TARGET dollyv2-quantize)
-add_executable(${TEST_TARGET} quantize.cpp)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)
diff --git a/ggml/examples/dolly-v2/README.md b/ggml/examples/dolly-v2/README.md
deleted file mode 100644
index add9738..0000000
--- a/ggml/examples/dolly-v2/README.md
+++ /dev/null
@@ -1,187 +0,0 @@
-# Dolly-V2
-
-Transformer architecture: GPT-NeoX
-
-Modeled from examples/stablelm
-
-Ref: https://github.com/databrickslabs/dolly
-
-Ref: https://github.com/stability-AI/stableLM/#stablelm-alpha
-
-## Usage
-
-```bash
-# get the repo and build it
-git clone https://github.com/ggerganov/ggml
-cd ggml
-mkdir build && cd build
-cmake ..
-make -j
-
-# get the Dolly-V2 3B model
-git clone https://huggingface.co/databricks/dolly-v2-3b
-
-# install Python dependencies
-python3 -m pip install -r ../requirements.txt
-
-# convert model to FP16
-python3 ../examples/dolly-v2/convert-h5-to-ggml.py ./dolly-v2-3b/ 1
-
-# run inference using FP16 precision
-./bin/dollyv2 -m ./dolly-v2-3b/ggml-model-f16.bin -p "State the meaning of life." -t 6 -n 64
-
-main: seed = 1683218142
-dollyv2_model_load: loading model from './dolly-v2-3b/ggml-model-f16.bin' - please wait ...
-dollyv2_model_load: n_vocab = 50280
-dollyv2_model_load: n_ctx   = 2048
-dollyv2_model_load: n_embd  = 2560
-dollyv2_model_load: n_head  = 32
-dollyv2_model_load: n_layer = 32
-dollyv2_model_load: n_rot   = 20
-dollyv2_model_load: ftype   = 1
-dollyv2_model_load: ggml ctx size = 7374.91 MB
-dollyv2_model_load: memory_size =   640.00 MB, n_mem = 65536
-dollyv2_model_load: ................................................ done
-dollyv2_model_load: model size =  5295.10 MB / num tensors = 388
-main: number of tokens in prompt = 32
-main: token[0] =  30003, Below
-main: token[1] =    310,  is
-main: token[2] =    271,  an
-main: token[3] =   9775,  instruction
-main: token[4] =    326,  that
-main: token[5] =   8631,  describes
-main: token[6] =    247,  a
-main: token[7] =   4836,  task
-main: token[8] =    964, .
-main: token[9] =  19566,  Write
-main: token[10] =    247,  a
-main: token[11] =   2380,  response
-main: token[12] =    326,  that
-main: token[13] =  20420,  appropriately
-main: token[14] =  29141,  completes
-main: token[15] =    253,  the
-main: token[16] =   2748,  request
-main: token[17] =    964, .
-main: token[18] =    187, 
-
-main: token[19] =    187, 
-
-main: token[20] =  50278, ### Instruction:
-main: token[21] =    187, 
-
-main: token[22] =   5443, State
-main: token[23] =    253,  the
-main: token[24] =   4495,  meaning
-main: token[25] =    273,  of
-main: token[26] =   1495,  life
-main: token[27] =    964, .
-main: token[28] =    187, 
-
-main: token[29] =    187, 
-
-main: token[30] =  50279, ### Response:
-main: token[31] =    187, 
-
-
-Below is an instruction that describes a task. Write a response that appropriately completes the request.
-
-### Instruction:
-State the meaning of life.
-
-### Response:
-The meaning of life is to love and be loved.
-
-### End
-
-main: mem per token = 16136720 bytes
-main:     load time =  2202.58 ms
-main:   sample time =     2.57 ms
-main:  predict time =  1497.14 ms / 33.27 ms per token
-main:    total time =  6187.27 ms
-```
-
-## 5-bit integer quantization mode
-
-```bash
-# quantize the model to 5-bits using Q5_0 quantization
-./bin/dollyv2-quantize ./dolly-v2-3b/ggml-model-f16.bin ./dolly-v2-3b/ggml-model-q5_0.bin q5_0
-
-# run the quantized model
-./bin/dollyv2 -m ./dolly-v2-3b/ggml-model-q5_0.bin -p "State the meaning of life." -t 6 -n 64
-
-main: seed = 1683218518
-dollyv2_model_load: loading model from './dolly-v2-3b/ggml-model-q5_0.bin' - please wait ...
-dollyv2_model_load: n_vocab = 50280
-dollyv2_model_load: n_ctx   = 2048
-dollyv2_model_load: n_embd  = 2560
-dollyv2_model_load: n_head  = 32
-dollyv2_model_load: n_layer = 32
-dollyv2_model_load: n_rot   = 20
-dollyv2_model_load: ftype   = 8
-dollyv2_model_load: ggml ctx size = 3902.68 MB
-dollyv2_model_load: memory_size =   640.00 MB, n_mem = 65536
-dollyv2_model_load: ................................................ done
-dollyv2_model_load: model size =  1822.87 MB / num tensors = 388
-main: number of tokens in prompt = 32
-main: token[0] =  30003, Below
-main: token[1] =    310,  is
-main: token[2] =    271,  an
-main: token[3] =   9775,  instruction
-main: token[4] =    326,  that
-main: token[5] =   8631,  describes
-main: token[6] =    247,  a
-main: token[7] =   4836,  task
-main: token[8] =    964, .
-main: token[9] =  19566,  Write
-main: token[10] =    247,  a
-main: token[11] =   2380,  response
-main: token[12] =    326,  that
-main: token[13] =  20420,  appropriately
-main: token[14] =  29141,  completes
-main: token[15] =    253,  the
-main: token[16] =   2748,  request
-main: token[17] =    964, .
-main: token[18] =    187, 
-
-main: token[19] =    187, 
-
-main: token[20] =  50278, ### Instruction:
-main: token[21] =    187, 
-
-main: token[22] =   5443, State
-main: token[23] =    253,  the
-main: token[24] =   4495,  meaning
-main: token[25] =    273,  of
-main: token[26] =   1495,  life
-main: token[27] =    964, .
-main: token[28] =    187, 
-
-main: token[29] =    187, 
-
-main: token[30] =  50279, ### Response:
-main: token[31] =    187, 
-
-
-Below is an instruction that describes a task. Write a response that appropriately completes the request.
-
-### Instruction:
-State the meaning of life.
-
-### Response:
-The meaning of life is the discovery of the true self.
-
-### End
-
-main: mem per token = 16127760 bytes
-main:     load time =  1011.09 ms
-main:   sample time =     2.79 ms
-main:  predict time =  1271.62 ms / 27.64 ms per token
-main:    total time =  2802.51 ms
-```
-
-## Notes
-
-- No guarantees for correctness
-- The tokenizer is currently hacked - probably works only for English
-- Non-parallel residual is not supported
-- Contributions and improvements are welcome
diff --git a/ggml/examples/dolly-v2/convert-h5-to-ggml.py b/ggml/examples/dolly-v2/convert-h5-to-ggml.py
deleted file mode 100644
index 0019810..0000000
--- a/ggml/examples/dolly-v2/convert-h5-to-ggml.py
+++ /dev/null
@@ -1,116 +0,0 @@
-import sys
-import struct
-import json
-import numpy as np
-
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-if len(sys.argv) < 3:
-    print("Usage: convert-h5-to-ggml.py dir-model [use-f32]\n")
-    print("  ftype == 0 -> float32")
-    print("  ftype == 1 -> float16")
-    sys.exit(1)
-
-# output in the same directory as the model
-dir_model = sys.argv[1]
-fname_out = sys.argv[1] + "/ggml-model.bin"
-
-with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
-    encoder = json.load(f)
-
-with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
-    hparams = json.load(f)
-
-# possible data types
-#   ftype == 0 -> float32
-#   ftype == 1 -> float16
-#
-# map from ftype to string
-ftype_str = ["f32", "f16"]
-
-ftype = 1
-if len(sys.argv) > 2:
-    ftype = int(sys.argv[2])
-    if ftype < 0 or ftype > 1:
-        print("Invalid ftype: " + str(ftype))
-        sys.exit(1)
-    fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin"
-
-
-tokenizer = AutoTokenizer.from_pretrained(dir_model)
-model = AutoModelForCausalLM.from_pretrained(dir_model, low_cpu_mem_usage=True)
-#print (model)
-
-#print(tokenizer.encode('I believe the meaning of life is'))
-
-list_vars = model.state_dict()
-for name in list_vars.keys():
-    print(name, list_vars[name].shape, list_vars[name].dtype)
-
-fout = open(fname_out, "wb")
-
-print(hparams)
-
-fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
-fout.write(struct.pack("i", hparams["vocab_size"]))
-fout.write(struct.pack("i", hparams["max_position_embeddings"]))
-fout.write(struct.pack("i", hparams["hidden_size"]))
-fout.write(struct.pack("i", hparams["num_attention_heads"]))
-fout.write(struct.pack("i", hparams["num_hidden_layers"]))
-fout.write(struct.pack("i", int(hparams["rotary_pct"]*(hparams["hidden_size"]//hparams["num_attention_heads"]))))
-fout.write(struct.pack("i", hparams["use_parallel_residual"]))
-fout.write(struct.pack("i", ftype))
-
-# TODO: temporary hack to not deal with implementing the tokenizer
-dot_token = tokenizer.encode('.')[0]
-for i in range(hparams["vocab_size"]):
-    text = tokenizer.decode([dot_token, i]).encode('utf-8')
-    # remove the first byte (it's always '.')
-    text = text[1:]
-    fout.write(struct.pack("i", len(text)))
-    fout.write(text)
-
-for name in list_vars.keys():
-    data = list_vars[name].squeeze().numpy()
-    print("Processing variable: " + name + " with shape: ", data.shape)
-
-    # we don't need these
-    if name.endswith(".attention.masked_bias") or     \
-       name.endswith(".attention.bias") or \
-       name.endswith(".attention.rotary_emb.inv_freq"):
-        print("  Skipping variable: " + name)
-        continue
-
-    n_dims = len(data.shape);
-
-    # ftype == 0 -> float32, ftype == 1 -> float16
-    ftype_cur = 0;
-    if ftype != 0:
-        if name[-7:] == ".weight" and n_dims == 2:
-            print("  Converting to float16")
-            data = data.astype(np.float16)
-            ftype_cur = 1
-        else:
-            print("  Converting to float32")
-            data = data.astype(np.float32)
-            ftype_cur = 0
-    else:
-        if data.dtype != np.float32:
-            print("  Converting to float32")
-            data = data.astype(np.float32)
-            ftype_cur = 0
-
-    # header
-    str = name.encode('utf-8')
-    fout.write(struct.pack("iii", n_dims, len(str), ftype_cur))
-    for i in range(n_dims):
-        fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
-    fout.write(str);
-
-    # data
-    data.tofile(fout)
-
-fout.close()
-
-print("Done. Output file: " + fname_out)
-print("")
diff --git a/ggml/examples/dolly-v2/main.cpp b/ggml/examples/dolly-v2/main.cpp
deleted file mode 100644
index 9e35996..0000000
--- a/ggml/examples/dolly-v2/main.cpp
+++ /dev/null
@@ -1,968 +0,0 @@
-#include "ggml/ggml.h"
-
-#include "common.h"
-#include "common-ggml.h"
-
-#include <cassert>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <cinttypes>
-#include <fstream>
-#include <iostream>
-#include <map>
-#include <string>
-#include <vector>
-
-#if !defined(_WIN32)
-#define DOLLY_INTERACTIVE_PORT
-#endif
-
-#if defined(DOLLY_INTERACTIVE_PORT)
-#include <arpa/inet.h>
-#include <netinet/in.h>
-#include <sys/socket.h>
-#include <unistd.h>
-#endif
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-// default hparams (Dolly-V2 3B)
-struct dollyv2_hparams {
-    int32_t n_vocab = 50254; // tokenizer.vocab_size
-    int32_t n_ctx   = 2048;  // model.config.max_position_embeddings
-    int32_t n_embd  = 2560;  // model.config.hidden_size
-    int32_t n_head  = 32;    // model.config.num_attention_heads
-    int32_t n_layer = 32;    // model.config.num_hidden_layers
-    int32_t n_rot   = 20;    // rotary_pct[25%] * (n_embd / n_head)
-    int32_t par_res = 1; // 1 = true, 0 = false
-    int32_t ftype   = GGML_FTYPE_MOSTLY_F16;
-    float   eps     = 1e-5f;
-};
-
-const std::string INSTRUCTION_KEY = "### Instruction:";
-const std::string RESPONSE_KEY    = "### Response:";
-const std::string END_KEY         = "### End";
-const std::string INTRO_BLURB     = "Below is an instruction that describes a task. Write a response that appropriately completes the request.";
-
-// dollyv2 prompt format
-std::string prompt_for_generation(const std::string& instruction) {
-    return INTRO_BLURB + "\n\n" + INSTRUCTION_KEY + "\n" + instruction + "\n\n" + RESPONSE_KEY + "\n";
-}
-
-struct dollyv2_layer {
-    // pre normalization
-    struct ggml_tensor * ln_1_g;
-    struct ggml_tensor * ln_1_b;
-
-    // attention
-    struct ggml_tensor * c_attn_attn_w;
-    struct ggml_tensor * c_attn_attn_b;
-
-    struct ggml_tensor * c_attn_proj_w;
-    struct ggml_tensor * c_attn_proj_b;
-
-    // post normalization
-    struct ggml_tensor * ln_2_g;
-    struct ggml_tensor * ln_2_b;
-
-    // ff
-    struct ggml_tensor * c_mlp_fc_w;
-    struct ggml_tensor * c_mlp_fc_b;
-
-    struct ggml_tensor * c_mlp_proj_w;
-    struct ggml_tensor * c_mlp_proj_b;
-};
-
-struct dollyv2_model {
-    dollyv2_hparams hparams;
-
-    // normalization
-    struct ggml_tensor * ln_f_g;
-    struct ggml_tensor * ln_f_b;
-
-    struct ggml_tensor * wte; // position embedding
-
-    struct ggml_tensor * lmh_g; // language model head
-    //struct ggml_tensor * lmh_b; // language model bias
-
-    std::vector<dollyv2_layer> layers;
-
-    // key + value memory
-    struct ggml_tensor * memory_k;
-    struct ggml_tensor * memory_v;
-
-    //
-    struct ggml_context * ctx;
-    std::map<std::string, struct ggml_tensor *> tensors;
-};
-
-// load the model's weights from a file
-bool dollyv2_model_load(const std::string & fname, dollyv2_model & model, gpt_vocab & vocab) {
-    printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
-
-    auto fin = std::ifstream(fname, std::ios::binary);
-    if (!fin) {
-        fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
-        return false;
-    }
-
-    // verify magic
-    {
-        uint32_t magic;
-        fin.read((char *) &magic, sizeof(magic));
-        if (magic != GGML_FILE_MAGIC) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
-            return false;
-        }
-    }
-
-    // load hparams
-    {
-        auto & hparams = model.hparams;
-
-        fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
-        fin.read((char *) &hparams.n_ctx,   sizeof(hparams.n_ctx));
-        fin.read((char *) &hparams.n_embd,  sizeof(hparams.n_embd));
-        fin.read((char *) &hparams.n_head,  sizeof(hparams.n_head));
-        fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-        fin.read((char *) &hparams.n_rot,   sizeof(hparams.n_rot));
-        fin.read((char *) &hparams.par_res, sizeof(hparams.par_res));
-        fin.read((char *) &hparams.ftype,   sizeof(hparams.ftype));
-
-        const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
-
-        printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
-        printf("%s: n_ctx   = %d\n", __func__, hparams.n_ctx);
-        printf("%s: n_embd  = %d\n", __func__, hparams.n_embd);
-        printf("%s: n_head  = %d\n", __func__, hparams.n_head);
-        printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
-        printf("%s: n_rot   = %d\n", __func__, hparams.n_rot);
-        printf("%s: par_res = %d\n", __func__, hparams.par_res);
-        printf("%s: ftype   = %d\n", __func__, hparams.ftype);
-        printf("%s: qntvr   = %d\n", __func__, qntvr);
-
-        hparams.ftype %= GGML_QNT_VERSION_FACTOR;
-    }
-
-    // load vocab
-    {
-        const int32_t n_vocab = model.hparams.n_vocab;
-
-        std::string word;
-        std::vector<char> buf(128);
-
-        for (int i = 0; i < n_vocab; i++) {
-            uint32_t len;
-            fin.read((char *) &len, sizeof(len));
-
-            buf.resize(len);
-            fin.read((char *) buf.data(), len);
-            word.assign(buf.data(), len);
-
-            vocab.token_to_id[word] = i;
-            vocab.id_to_token[i] = word;
-        }
-
-        vocab.add_special_token("### End");
-        vocab.add_special_token("### Instruction:");
-        vocab.add_special_token("### Response:");
-    }
-
-    // for the big tensors, we have the option to store the data in 16-bit floats or quantized
-    // in order to save memory and also to speed up the computation
-    ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
-    if (wtype == GGML_TYPE_COUNT) {
-        fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
-                __func__, fname.c_str(), model.hparams.ftype);
-        return false;
-    }
-
-    auto & ctx = model.ctx;
-
-    size_t ctx_size = 0;
-
-    {
-        const auto & hparams = model.hparams;
-
-        const int n_embd  = hparams.n_embd;
-        const int n_layer = hparams.n_layer;
-        const int n_ctx   = hparams.n_ctx;
-        const int n_vocab = hparams.n_vocab;
-
-        ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_g
-        ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_b
-
-        ctx_size += ggml_row_size(wtype, n_embd*n_vocab); // wte
-
-        ctx_size += ggml_row_size(wtype, n_embd*n_vocab); // lmh_g
-        //ctx_size += ggml_row_size(GGML_TYPE_F32, n_vocab); // lmh_b
-
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_g
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_b
-
-        ctx_size += n_layer*(ggml_row_size(wtype,         3*n_embd*n_embd)); // c_attn_attn_w
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 3*n_embd));        // c_attn_attn_b
-
-        ctx_size += n_layer*(ggml_row_size(wtype,         n_embd*n_embd)); // c_attn_proj_w
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd*n_embd)); // c_attn_proj_b
-
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_g
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_b
-
-        ctx_size += n_layer*(ggml_row_size(wtype,         4*n_embd*n_embd)); // c_mlp_fc_w
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 4*n_embd));        // c_mlp_fc_b
-
-        ctx_size += n_layer*(ggml_row_size(wtype,         4*n_embd*n_embd)); // c_mlp_proj_w
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32,   n_embd));        // c_mlp_proj_b
-
-        ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_k
-        ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_v
-
-        ctx_size += (6 + 16*n_layer)*512; // object overhead
-
-        printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
-    }
-
-    // create the ggml context
-    {
-        struct ggml_init_params params = {
-            /*.mem_size   =*/ ctx_size,
-            /*.mem_buffer =*/ NULL,
-            /*.no_alloc   =*/ false,
-        };
-
-        model.ctx = ggml_init(params);
-        if (!model.ctx) {
-            fprintf(stderr, "%s: ggml_init() failed\n", __func__);
-            return false;
-        }
-    }
-
-    // prepare memory for the weights
-    {
-        const auto & hparams = model.hparams;
-
-        const int n_embd  = hparams.n_embd;
-        const int n_layer = hparams.n_layer;
-        const int n_vocab = hparams.n_vocab;
-
-        model.layers.resize(n_layer);
-
-        model.wte    = ggml_new_tensor_2d(ctx, wtype,         n_embd, n_vocab);
-
-        model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-        model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-
-        model.lmh_g  = ggml_new_tensor_2d(ctx, wtype,         n_embd, n_vocab);
-        //model.lmh_b  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_vocab);
-
-        // map by name
-        model.tensors["gpt_neox.embed_in.weight"] = model.wte;
-
-        model.tensors["gpt_neox.final_layer_norm.weight"] = model.ln_f_g;
-        model.tensors["gpt_neox.final_layer_norm.bias"]   = model.ln_f_b;
-
-        model.tensors["embed_out.weight"] = model.lmh_g;
-        //model.tensors["lm_head.bias"]   = model.lmh_b;
-
-        for (int i = 0; i < n_layer; ++i) {
-            auto & layer = model.layers[i];
-
-            layer.ln_1_g          = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-            layer.ln_1_b          = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            layer.c_attn_attn_w   = ggml_new_tensor_2d(ctx, wtype,           n_embd, 3*n_embd);
-            layer.c_attn_attn_b   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd);
-
-            layer.c_attn_proj_w   = ggml_new_tensor_2d(ctx, wtype,           n_embd,   n_embd);
-            layer.c_attn_proj_b   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            layer.ln_2_g          = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-            layer.ln_2_b          = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            layer.c_mlp_fc_w      = ggml_new_tensor_2d(ctx, wtype,           n_embd, 4*n_embd);
-            layer.c_mlp_fc_b      = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
-
-            layer.c_mlp_proj_w    = ggml_new_tensor_2d(ctx, wtype,         4*n_embd,   n_embd);
-            layer.c_mlp_proj_b    = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            // map by name
-
-            // unmapped: attention.rotary_emb, mlp.act
-
-            model.tensors["gpt_neox.layers." + std::to_string(i) + ".input_layernorm.weight"] = layer.ln_1_g;
-            model.tensors["gpt_neox.layers." + std::to_string(i) + ".input_layernorm.bias"]   = layer.ln_1_b;
-
-            model.tensors["gpt_neox.layers." + std::to_string(i) + ".attention.query_key_value.weight"] = layer.c_attn_attn_w;
-            model.tensors["gpt_neox.layers." + std::to_string(i) + ".attention.query_key_value.bias"]   = layer.c_attn_attn_b;
-
-            model.tensors["gpt_neox.layers." + std::to_string(i) + ".attention.dense.weight"] = layer.c_attn_proj_w;
-            model.tensors["gpt_neox.layers." + std::to_string(i) + ".attention.dense.bias"]   = layer.c_attn_proj_b;
-
-            model.tensors["gpt_neox.layers." + std::to_string(i) + ".post_attention_layernorm.weight"] = layer.ln_2_g;
-            model.tensors["gpt_neox.layers." + std::to_string(i) + ".post_attention_layernorm.bias"]   = layer.ln_2_b;
-
-            model.tensors["gpt_neox.layers." + std::to_string(i) + ".mlp.dense_h_to_4h.weight"] = layer.c_mlp_fc_w;
-            model.tensors["gpt_neox.layers." + std::to_string(i) + ".mlp.dense_h_to_4h.bias"]   = layer.c_mlp_fc_b;
-
-            model.tensors["gpt_neox.layers." + std::to_string(i) + ".mlp.dense_4h_to_h.weight"] = layer.c_mlp_proj_w;
-            model.tensors["gpt_neox.layers." + std::to_string(i) + ".mlp.dense_4h_to_h.bias"]   = layer.c_mlp_proj_b;
-        }
-    }
-
-    // key + value memory
-    {
-        const auto & hparams = model.hparams;
-
-        const int n_embd  = hparams.n_embd;
-        const int n_layer = hparams.n_layer;
-        const int n_ctx   = hparams.n_ctx;
-
-        const int64_t n_mem      = n_layer*n_ctx;
-        const int64_t n_elements = n_embd*n_mem;
-
-        model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
-        model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
-
-        const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
-
-        printf("%s: memory_size = %8.2f MB, n_mem = %" PRId64 "\n", __func__, memory_size/1024.0/1024.0, n_mem);
-    }
-
-    // load weights
-    {
-        int n_tensors = 0;
-        size_t total_size = 0;
-
-        printf("%s: ", __func__);
-
-        while (true) {
-            int32_t n_dims;
-            int32_t length;
-            int32_t ttype;
-
-            fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
-            fin.read(reinterpret_cast<char *>(&length), sizeof(length));
-            fin.read(reinterpret_cast<char *>(&ttype),  sizeof(ttype));
-
-            if (fin.eof()) {
-                break;
-            }
-
-            int32_t nelements = 1;
-            int32_t ne[2] = { 1, 1 };
-            for (int i = 0; i < n_dims; ++i) {
-                fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
-                nelements *= ne[i];
-            }
-
-            std::string name(length, 0);
-            fin.read(&name[0], length);
-
-            if (model.tensors.find(name) == model.tensors.end()) {
-                fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str());
-                return false;
-            }
-
-            auto tensor = model.tensors[name];
-            if (ggml_nelements(tensor) != nelements) {
-                fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.c_str());
-                return false;
-            }
-
-            if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
-                fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%5d, %5d], expected [%5d, %5d]\n",
-                        __func__, name.c_str(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]);
-                return false;
-            }
-
-            // for debugging
-            if (0) {
-                printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
-            }
-
-            const size_t bpe = ggml_type_size(ggml_type(ttype));
-
-            if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
-                fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
-                        __func__, name.c_str(), ggml_nbytes(tensor), nelements*bpe);
-                return false;
-            }
-
-            fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
-
-            total_size += ggml_nbytes(tensor);
-            if (++n_tensors % 8 == 0) {
-                printf(".");
-                fflush(stdout);
-            }
-        }
-
-        printf(" done\n");
-
-        printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors);
-    }
-
-    fin.close();
-
-    return true;
-}
-
-// feed-forward network
-ggml_tensor * gpt_neox_ff(
-        const dollyv2_layer & layer,
-        ggml_context        * ctx0,
-        ggml_tensor         * inp,
-        float                 eps) {
-    ggml_tensor * cur = ggml_norm(ctx0, inp, eps);
-
-    cur = ggml_add(ctx0,
-        ggml_mul(ctx0,
-            ggml_repeat(ctx0, layer.ln_2_g, cur),
-            cur),
-        ggml_repeat(ctx0, layer.ln_2_b, cur));
-
-    cur = ggml_mul_mat(ctx0,
-            layer.c_mlp_fc_w,
-            cur);
-
-    cur = ggml_add(ctx0,
-            ggml_repeat(ctx0, layer.c_mlp_fc_b, cur),
-            cur);
-
-    // GELU activation
-    cur = ggml_gelu(ctx0, cur);
-
-    // projection
-    // cur = proj_w*cur + proj_b
-    cur = ggml_mul_mat(ctx0,
-            layer.c_mlp_proj_w,
-            cur);
-
-    cur = ggml_add(ctx0,
-            ggml_repeat(ctx0, layer.c_mlp_proj_b, cur),
-            cur);
-    return cur;
-}
-
-// evaluate the transformer
-//
-//   - model:     the model
-//   - n_threads: number of threads to use
-//   - n_past:    the context size so far
-//   - embd_inp:  the embeddings of the tokens in the context
-//   - embd_w:    the predicted logits for the next token
-//
-bool dollyv2_eval(
-        const dollyv2_model & model,
-        const int n_threads,
-        const int n_past,
-        const std::vector<gpt_vocab::id> & embd_inp,
-              std::vector<float>         & embd_w,
-              size_t                     & mem_per_token) {
-    const int N = embd_inp.size();
-
-    const auto & hparams = model.hparams;
-
-    const int n_embd  = hparams.n_embd;
-    const int n_layer = hparams.n_layer;
-    const int n_ctx   = hparams.n_ctx;
-    const int n_head  = hparams.n_head;
-    const int n_vocab = hparams.n_vocab;
-    const int n_rot   = hparams.n_rot;
-
-    static size_t buf_size = 256u*1024*1024;
-    static void * buf = malloc(buf_size);
-
-    if (mem_per_token > 0 && mem_per_token*N > buf_size) {
-        const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
-        //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
-
-        // reallocate
-        buf_size = buf_size_new;
-        buf = realloc(buf, buf_size);
-        if (buf == nullptr) {
-            fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
-            return false;
-        }
-    }
-
-    struct ggml_init_params params = {
-        /*.mem_size   =*/ buf_size,
-        /*.mem_buffer =*/ buf,
-        /*.no_alloc   =*/ false,
-    };
-
-    struct ggml_context * ctx0 = ggml_init(params);
-    struct ggml_cgraph * gf = ggml_new_graph(ctx0);
-
-    // KQ_pos - contains the positions
-    struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
-    int * data = (int *) KQ_pos->data;
-    for (int i = 0; i < N; ++i) {
-        data[i] = n_past + i;
-    }
-
-    struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
-    memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
-
-    // wte
-    struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.wte, embd);
-
-    for (int il = 0; il < n_layer; ++il) {
-        struct ggml_tensor * cur;
-
-        // self-attention
-        {
-            {
-                cur = ggml_norm(ctx0, inpL, hparams.eps);
-
-                cur = ggml_add(ctx0,
-                        ggml_mul(ctx0,
-                            ggml_repeat(ctx0, model.layers[il].ln_1_g, cur),
-                            cur),
-                        ggml_repeat(ctx0, model.layers[il].ln_1_b, cur));
-            }
-
-            // compute QKV
-            {
-                cur = ggml_mul_mat(ctx0,
-                        model.layers[il].c_attn_attn_w,
-                        cur);
-
-                cur = ggml_add(ctx0,
-                        ggml_repeat(ctx0, model.layers[il].c_attn_attn_b, cur),
-                        cur);
-            }
-
-            struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 0*sizeof(float)*n_embd/n_head));
-            struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 1*sizeof(float)*n_embd/n_head));
-            struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 2*sizeof(float)*n_embd/n_head));
-
-            // using mode = 2 for GPT-NeoX mode
-            Qcur = ggml_rope_inplace(ctx0, Qcur, KQ_pos, n_rot, 2, 0);
-            Kcur = ggml_rope_inplace(ctx0, Kcur, KQ_pos, n_rot, 2, 0);
-
-            // store key and value to memory
-            {
-                Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd, N));
-
-                struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
-                struct ggml_tensor * v = ggml_view_2d(ctx0, model.memory_v, N, n_embd,
-                        (   n_ctx)*ggml_element_size(model.memory_v),
-                        (il*n_ctx)*ggml_element_size(model.memory_v)*n_embd + n_past*ggml_element_size(model.memory_v));
-
-                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
-                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
-            }
-
-            // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
-            struct ggml_tensor * Q =
-                ggml_permute(ctx0,
-                        Qcur,
-                        0, 2, 1, 3);
-
-            // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
-            struct ggml_tensor * K =
-                ggml_permute(ctx0,
-                        ggml_reshape_3d(ctx0,
-                            ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
-                            n_embd/n_head, n_head, n_past + N),
-                        0, 2, 1, 3);
-
-            // K * Q
-            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
-
-            // KQ_scaled = KQ / sqrt(n_embd/n_head)
-            struct ggml_tensor * KQ_scaled =
-                ggml_scale_inplace(ctx0,
-                        KQ,
-                        1.0f/sqrt(float(n_embd)/n_head));
-
-            // KQ_masked = mask_past(KQ_scaled)
-            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
-
-            // KQ = soft_max(KQ_masked)
-            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
-
-            // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
-            struct ggml_tensor * V =
-                ggml_view_3d(ctx0, model.memory_v,
-                        n_past + N, n_embd/n_head, n_head,
-                        n_ctx*ggml_element_size(model.memory_v),
-                        n_ctx*ggml_element_size(model.memory_v)*n_embd/n_head,
-                        il*n_ctx*ggml_element_size(model.memory_v)*n_embd);
-
-            // KQV = transpose(V) * KQ_soft_max
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
-
-            // KQV_merged = KQV.permute(0, 2, 1, 3)
-            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
-
-            // cur = KQV_merged.contiguous().view(n_embd, N)
-            cur = ggml_cpy(ctx0,
-                    KQV_merged,
-                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
-
-            // projection
-            {
-                cur = ggml_mul_mat(ctx0,
-                        model.layers[il].c_attn_proj_w,
-                        cur);
-
-                cur = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].c_attn_proj_b, cur), cur);
-            }
-        }
-
-        if (hparams.par_res == 0) {
-            struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpL);
-
-            cur = gpt_neox_ff(model.layers[il], ctx0, inpFF, hparams.eps);
-
-            // input for next layer
-            inpL = ggml_add(ctx0, cur, inpFF);
-        } else {
-            struct ggml_tensor * inpFF = cur;
-
-            // this is independent of the self-attention result, so it could be done in parallel to the self-attention
-            // note here we pass inpL instead of cur
-            cur = gpt_neox_ff(model.layers[il], ctx0, inpL, hparams.eps);
-
-            // layer input + FF
-            cur  = ggml_add(ctx0, cur, inpFF);
-
-            // input for next layer
-            inpL = ggml_add(ctx0, cur, inpL);
-        }
-
-    }
-
-    // norm
-    {
-        inpL = ggml_norm(ctx0, inpL, hparams.eps);
-
-        // inpL = ln_f_g*inpL + ln_f_b
-        inpL = ggml_add(ctx0,
-                ggml_mul(ctx0,
-                    ggml_repeat(ctx0, model.ln_f_g, inpL),
-                    inpL),
-                ggml_repeat(ctx0, model.ln_f_b, inpL));
-    }
-
-    // lm_head
-    {
-        inpL = ggml_mul_mat(ctx0, model.lmh_g, inpL);
-
-        //inpL = ggml_add(ctx0,
-        //        ggml_repeat(ctx0, model.lmh_b, inpL),
-        //        inpL);
-    }
-
-    // logits -> probs
-    //inpL = ggml_soft_max_inplace(ctx0, inpL);
-
-    // run the computation
-    ggml_build_forward_expand(gf, inpL);
-    ggml_graph_compute_with_ctx(ctx0, gf, n_threads);
-
-    //if (n_past%100 == 0) {
-    //    ggml_graph_print   (&gf);
-    //    ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
-    //}
-
-    //embd_w.resize(n_vocab*N);
-    //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
-
-    // return result for just the last token
-    embd_w.resize(n_vocab);
-    memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
-
-    if (mem_per_token == 0) {
-        mem_per_token = ggml_used_mem(ctx0)/N;
-    }
-    //printf("used_mem = %zu\n", ggml_used_mem(ctx0));
-
-    ggml_free(ctx0);
-
-    return true;
-}
-
-std::string execute_prompt(
-        const dollyv2_model &model,
-        gpt_vocab &vocab,
-        const std::string &prompt,
-        gpt_params &params,
-        std::mt19937 &rng,
-        int64_t t_load_us,
-        int64_t t_sample_us,
-        int64_t t_predict_us,
-        size_t mem_per_token,
-        int n_past,
-        bool stream_response_to_cout = false) {
-    std::string output = "";
-    std::vector<float> logits;
-
-    // tokenize the prompt
-    std::vector<gpt_vocab::id> embd_inp = ::gpt_tokenize(vocab, prompt);
-
-    params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int)embd_inp.size());
-
-    printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
-    for (size_t i = 0; i < embd_inp.size(); i++) {
-        printf("%s: token[%zu] = %6d, %s\n", __func__, i, embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str());
-    }
-    printf("\n");
-
-    std::vector<gpt_vocab::id> embd;
-
-    dollyv2_eval(model, params.n_threads, 0, {0, 1, 2, 3}, logits, mem_per_token);
-
-    const int32_t end_token = vocab.token_to_id["### End"];
-
-    for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) {
-        // predict
-        if (embd.size() > 0) {
-            const int64_t t_start_us = ggml_time_us();
-
-            if (!dollyv2_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) {
-                printf("Failed to predict\n");
-                return output;
-            }
-
-            t_predict_us += ggml_time_us() - t_start_us;
-        }
-
-        n_past += embd.size();
-        embd.clear();
-
-        if (i >= embd_inp.size()) {
-            // sample next token
-            const int top_k = params.top_k;
-            const float top_p = params.top_p;
-            const float temp = params.temp;
-
-            const int n_vocab = model.hparams.n_vocab;
-
-            gpt_vocab::id id = 0;
-
-            {
-                const int64_t t_start_sample_us = ggml_time_us();
-
-                id = gpt_sample_top_k_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng);
-
-                t_sample_us += ggml_time_us() - t_start_sample_us;
-            }
-
-            // add it to the context
-            embd.push_back(id);
-        } else {
-            // if here, it means we are still processing the input prompt
-            for (size_t k = i; k < embd_inp.size(); k++) {
-                embd.push_back(embd_inp[k]);
-                if (int32_t(embd.size()) > params.n_batch) {
-                    break;
-                }
-            }
-            i += embd.size() - 1;
-        }
-
-        // display text
-        for (auto id : embd) {
-            output += vocab.id_to_token[id];
-            if (stream_response_to_cout) {
-                printf("%s", vocab.id_to_token[id].c_str());
-            }
-        }
-        if (stream_response_to_cout) {
-            fflush(stdout);
-        }
-
-        // end of text token
-        if (embd.back() == 0 || (end_token > 0 && embd.back() == end_token)) {
-            return output;
-        }
-    }
-    return output;
-}
-
-#if defined(DOLLY_INTERACTIVE_PORT)
-int setup_port(const int port) {
-    int sockfd = socket(AF_INET, SOCK_STREAM, 0);
-    if (sockfd < 0) {
-        fprintf(stderr, "%s: Failed to create new socket\n", __func__);
-        return -1;
-    }
-
-    sockaddr_in servaddr;
-    std::memset(&servaddr, 0, sizeof(servaddr));
-
-    servaddr.sin_family = AF_INET;
-    servaddr.sin_addr.s_addr = htonl(INADDR_ANY);
-    servaddr.sin_port = htons(port);
-
-    if (bind(sockfd, (struct sockaddr *)&servaddr, sizeof(servaddr)) < 0) {
-        fprintf(stderr, "%s: Failed to bind to port %i\n", __func__, port);
-        return -1;
-    }
-
-    if (listen(sockfd, 10) < 0) {
-        fprintf(stderr, "%s: Failed to listen to socket on port %i\n", __func__, port);
-        return -1;
-    }
-    return sockfd;
-}
-
-std::string read_from_port(int sockfd, int clientfd) {
-    if (clientfd < 0) {
-        fprintf(stderr, "%s: Failed to accept new connection\n", __func__);
-        return "";
-    }
-
-    char buffer[4096];
-    std::memset(buffer, 0, sizeof(buffer));
-
-    if (read(clientfd, buffer, sizeof(buffer)) < 0) {
-        fprintf(stderr, "%s: Failed to read from client\n", __func__);
-    } else {
-        std::cout << "Received: " << buffer;
-        return std::string(buffer);
-    }
-    return std::string("");
-}
-#endif
-
-int main(int argc, char ** argv) {
-    ggml_time_init();
-
-    const int64_t t_main_start_us = ggml_time_us();
-
-    gpt_params params;
-    params.model = "models/dolly-v2-3b/ggml-model-f16.bin";
-
-    if (gpt_params_parse(argc, argv, params) == false) {
-        return 1;
-    }
-
-    if (params.seed < 0) {
-        params.seed = time(NULL);
-    }
-
-    printf("%s: seed = %d\n", __func__, params.seed);
-
-    std::mt19937 rng(params.seed);
-
-    int64_t t_load_us = 0;
-    int64_t t_sample_us = 0;
-    int64_t t_predict_us = 0;
-
-    // determine the required inference memory per token:
-    size_t mem_per_token = 0;
-
-    int n_past = 0;
-
-    gpt_vocab vocab;
-    dollyv2_model model;
-
-    // load the model
-    {
-        const int64_t t_start_us = ggml_time_us();
-
-        if (!dollyv2_model_load(params.model, model, vocab)) {
-            fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
-            return 1;
-        }
-
-        t_load_us = ggml_time_us() - t_start_us;
-
-        test_gpt_tokenizer(vocab, params.token_test);
-    }
-
-#if defined(DOLLY_INTERACTIVE_PORT)
-    int sockfd = -1;
-    if (params.interactive_port != -1) {
-        sockfd = setup_port(params.interactive_port);
-        if (sockfd == -1) {
-            return 1;
-        }
-        fprintf(stdout, "Model is ready on port %i\n", params.interactive_port);
-        fflush(stdout);
-    }
-#endif
-
-    if (params.interactive || params.interactive_port != -1) {
-        while (true) {
-            std::string prompt_input;
-#if defined(DOLLY_INTERACTIVE_PORT)
-            int clientfd = -1;
-            if (params.interactive_port != -1) {
-                sockaddr_in clientaddr;
-                socklen_t clientaddrlen = sizeof(clientaddr);
-                clientfd = accept(sockfd, (struct sockaddr *)&clientaddr, &clientaddrlen);
-                prompt_input = read_from_port(sockfd, clientfd);
-            } else
-#endif
-            {
-                printf("Please enter your quesiton:\n>");
-                fflush(stdout);
-
-                std::getline(std::cin, prompt_input);
-            }
-
-            if (strcmp(prompt_input.c_str(), "exit") == 0) {
-                break;
-            }
-
-            const std::string prompt = prompt_for_generation(prompt_input);
-            // call the model
-            const std::string response = execute_prompt(model, vocab, prompt, params, rng, t_load_us, t_sample_us, t_predict_us, mem_per_token, n_past, true);
-
-#if defined(DOLLY_INTERACTIVE_PORT)
-            if (params.interactive_port != -1) {
-                if (write(clientfd, response.c_str(), response.size()) < 0) {
-                    fprintf(stderr, "%s: Failed to write answer '%s' to client\n", __func__, response.c_str());
-                }
-
-                if (close(clientfd) < 0) {
-                    fprintf(stderr, "%s: Failed to close client socket\n", __func__);
-                }
-            } else
-#endif
-            {
-                printf("%s\n\n", response.c_str());
-            }
-            fflush(stdout);
-        }
-    } else {
-        if (params.prompt.empty()) {
-            params.prompt = gpt_random_prompt(rng);
-        }
-
-        const std::string prompt = prompt_for_generation(params.prompt);
-        execute_prompt(model, vocab, prompt, params, rng, t_load_us, t_sample_us, t_predict_us, mem_per_token, n_past, true);
-    }
-
-    // report timing
-    {
-        const int64_t t_main_end_us = ggml_time_us();
-
-        printf("\n\n");
-        printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token);
-        printf("%s:     load time = %8.2f ms\n", __func__, t_load_us / 1000.0f);
-        printf("%s:   sample time = %8.2f ms\n", __func__, t_sample_us / 1000.0f);
-        printf("%s:  predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us / 1000.0f, t_predict_us / 1000.0f / n_past);
-        printf("%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us) / 1000.0f);
-    }
-
-    ggml_free(model.ctx);
-
-#if defined(DOLLY_INTERACTIVE_PORT)
-    if (params.interactive_port != -1 && close(sockfd) < 0) {
-        fprintf(stderr, "%s: Failed to close server socket\n", __func__);
-    }
-#endif
-
-    return 0;
-}
diff --git a/ggml/examples/dolly-v2/quantize.cpp b/ggml/examples/dolly-v2/quantize.cpp
deleted file mode 100644
index 0c0d24c..0000000
--- a/ggml/examples/dolly-v2/quantize.cpp
+++ /dev/null
@@ -1,178 +0,0 @@
-#include "ggml/ggml.h"
-
-#include "common.h"
-#include "common-ggml.h"
-
-#include <cassert>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <fstream>
-#include <map>
-#include <string>
-#include <vector>
-#include <regex>
-
-// default hparams (dollyv2 3B)
-struct dollyv2_hparams {
-    int32_t n_vocab = 50254; // tokenizer.vocab_size
-    int32_t n_ctx   = 2048;  // model.config.max_position_embeddings
-    int32_t n_embd  = 2560;  // model.config.hidden_size
-    int32_t n_head  = 32;    // model.config.num_attention_heads
-    int32_t n_layer = 32;    // model.config.num_hidden_layers
-    int32_t n_rot   = 20;    // rotary_pct[25%] * (n_embd / n_head)
-    int32_t par_res = 1; // 1 = true, 0 = false
-    int32_t ftype   = GGML_FTYPE_MOSTLY_F16;
-};
-
-// quantize a model
-bool dollyv2_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) {
-    gpt_vocab vocab;
-
-    printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
-
-    auto finp = std::ifstream(fname_inp, std::ios::binary);
-    if (!finp) {
-        fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__, fname_inp.c_str());
-        return false;
-    }
-
-    auto fout = std::ofstream(fname_out, std::ios::binary);
-    if (!fout) {
-        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname_out.c_str());
-        return false;
-    }
-
-    // verify magic
-    {
-        uint32_t magic;
-        finp.read((char *) &magic, sizeof(magic));
-        if (magic != GGML_FILE_MAGIC) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str());
-            return false;
-        }
-
-        fout.write((char *) &magic, sizeof(magic));
-    }
-
-    dollyv2_hparams hparams;
-
-    // load hparams
-    {
-        finp.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
-        finp.read((char *) &hparams.n_ctx,   sizeof(hparams.n_ctx));
-        finp.read((char *) &hparams.n_embd,  sizeof(hparams.n_embd));
-        finp.read((char *) &hparams.n_head,  sizeof(hparams.n_head));
-        finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-        finp.read((char *) &hparams.n_rot,   sizeof(hparams.n_rot));
-        finp.read((char *) &hparams.par_res, sizeof(hparams.par_res));
-        finp.read((char *) &hparams.ftype,   sizeof(hparams.ftype));
-
-        const int32_t qntvr_src =    hparams.ftype / GGML_QNT_VERSION_FACTOR;
-        const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
-
-        printf("%s: n_vocab     = %d\n", __func__, hparams.n_vocab);
-        printf("%s: n_ctx       = %d\n", __func__, hparams.n_ctx);
-        printf("%s: n_embd      = %d\n", __func__, hparams.n_embd);
-        printf("%s: n_head      = %d\n", __func__, hparams.n_head);
-        printf("%s: n_layer     = %d\n", __func__, hparams.n_layer);
-        printf("%s: par_res     = %d\n", __func__, hparams.par_res);
-        printf("%s: ftype (src) = %d\n", __func__, hparams.ftype);
-        printf("%s: qntvr (src) = %d\n", __func__, qntvr_src);
-        printf("%s: ftype (dst) = %d\n", __func__, ftype_dst);
-        printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
-
-        fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
-        fout.write((char *) &hparams.n_ctx,   sizeof(hparams.n_ctx));
-        fout.write((char *) &hparams.n_embd,  sizeof(hparams.n_embd));
-        fout.write((char *) &hparams.n_head,  sizeof(hparams.n_head));
-        fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-        fout.write((char *) &hparams.n_rot,   sizeof(hparams.n_rot));
-        fout.write((char *) &hparams.par_res, sizeof(hparams.par_res));
-        fout.write((char *) &ftype_dst,       sizeof(ftype_dst));
-    }
-
-    // load vocab
-    {
-        const int32_t n_vocab = hparams.n_vocab;
-
-        std::string word;
-        for (int i = 0; i < n_vocab; i++) {
-            uint32_t len;
-            finp.read ((char *) &len, sizeof(len));
-            fout.write((char *) &len, sizeof(len));
-
-            word.resize(len);
-            finp.read ((char *) word.data(), len);
-            fout.write((char *) word.data(), len);
-
-            vocab.token_to_id[word] = i;
-            vocab.id_to_token[i] = word;
-        }
-    }
-
-    // regexes of tensor names to be quantized
-    const std::vector<std::string> to_quant = {
-        ".*weight",
-    };
-
-    if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) {
-        fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str());
-        return false;
-    }
-
-    finp.close();
-    fout.close();
-
-    return true;
-}
-
-// usage:
-//  ./dollyv2-quantize models/dolly-v2-3B/ggml-model.bin models/dolly-v2-3B/ggml-model-quant.bin type
-//
-int main(int argc, char ** argv) {
-    if (argc != 4) {
-        fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
-        ggml_print_ftypes(stderr);
-        return 1;
-    }
-
-    // needed to initialize f16 tables
-    {
-        struct ggml_init_params params = { 0, NULL, false };
-        struct ggml_context * ctx = ggml_init(params);
-        ggml_free(ctx);
-    }
-
-    const std::string fname_inp = argv[1];
-    const std::string fname_out = argv[2];
-
-    const ggml_ftype ftype = ggml_parse_ftype(argv[3]);
-
-    const int64_t t_main_start_us = ggml_time_us();
-
-    int64_t t_quantize_us = 0;
-
-    // load the model
-    {
-        const int64_t t_start_us = ggml_time_us();
-
-        if (!dollyv2_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
-            fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
-            return 1;
-        }
-
-        t_quantize_us = ggml_time_us() - t_start_us;
-    }
-
-    // report timing
-    {
-        const int64_t t_main_end_us = ggml_time_us();
-
-        printf("\n");
-        printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f);
-        printf("%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
-    }
-
-    return 0;
-}
diff --git a/ggml/examples/dr_wav.h b/ggml/examples/dr_wav.h
deleted file mode 100644
index fd3e95b..0000000
--- a/ggml/examples/dr_wav.h
+++ /dev/null
@@ -1,6434 +0,0 @@
-/*
-WAV audio loader and writer. Choice of public domain or MIT-0. See license statements at the end of this file.
-dr_wav - v0.12.16 - 2020-12-02
-
-David Reid - mackron@gmail.com
-
-GitHub: https://github.com/mackron/dr_libs
-*/
-
-/*
-RELEASE NOTES - VERSION 0.12
-============================
-Version 0.12 includes breaking changes to custom chunk handling.
-
-
-Changes to Chunk Callback
--------------------------
-dr_wav supports the ability to fire a callback when a chunk is encounted (except for WAVE and FMT chunks). The callback has been updated to include both the
-container (RIFF or Wave64) and the FMT chunk which contains information about the format of the data in the wave file.
-
-Previously, there was no direct way to determine the container, and therefore no way to discriminate against the different IDs in the chunk header (RIFF and
-Wave64 containers encode chunk ID's differently). The `container` parameter can be used to know which ID to use.
-
-Sometimes it can be useful to know the data format at the time the chunk callback is fired. A pointer to a `drwav_fmt` object is now passed into the chunk
-callback which will give you information about the data format. To determine the sample format, use `drwav_fmt_get_format()`. This will return one of the
-`DR_WAVE_FORMAT_*` tokens.
-*/
-
-/*
-Introduction
-============
-This is a single file library. To use it, do something like the following in one .c file.
-    
-    ```c
-    #define DR_WAV_IMPLEMENTATION
-    #include "dr_wav.h"
-    ```
-
-You can then #include this file in other parts of the program as you would with any other header file. Do something like the following to read audio data:
-
-    ```c
-    drwav wav;
-    if (!drwav_init_file(&wav, "my_song.wav", NULL)) {
-        // Error opening WAV file.
-    }
-
-    drwav_int32* pDecodedInterleavedPCMFrames = malloc(wav.totalPCMFrameCount * wav.channels * sizeof(drwav_int32));
-    size_t numberOfSamplesActuallyDecoded = drwav_read_pcm_frames_s32(&wav, wav.totalPCMFrameCount, pDecodedInterleavedPCMFrames);
-
-    ...
-
-    drwav_uninit(&wav);
-    ```
-
-If you just want to quickly open and read the audio data in a single operation you can do something like this:
-
-    ```c
-    unsigned int channels;
-    unsigned int sampleRate;
-    drwav_uint64 totalPCMFrameCount;
-    float* pSampleData = drwav_open_file_and_read_pcm_frames_f32("my_song.wav", &channels, &sampleRate, &totalPCMFrameCount, NULL);
-    if (pSampleData == NULL) {
-        // Error opening and reading WAV file.
-    }
-
-    ...
-
-    drwav_free(pSampleData);
-    ```
-
-The examples above use versions of the API that convert the audio data to a consistent format (32-bit signed PCM, in this case), but you can still output the
-audio data in its internal format (see notes below for supported formats):
-
-    ```c
-    size_t framesRead = drwav_read_pcm_frames(&wav, wav.totalPCMFrameCount, pDecodedInterleavedPCMFrames);
-    ```
-
-You can also read the raw bytes of audio data, which could be useful if dr_wav does not have native support for a particular data format:
-
-    ```c
-    size_t bytesRead = drwav_read_raw(&wav, bytesToRead, pRawDataBuffer);
-    ```
-
-dr_wav can also be used to output WAV files. This does not currently support compressed formats. To use this, look at `drwav_init_write()`,
-`drwav_init_file_write()`, etc. Use `drwav_write_pcm_frames()` to write samples, or `drwav_write_raw()` to write raw data in the "data" chunk.
-
-    ```c
-    drwav_data_format format;
-    format.container = drwav_container_riff;     // <-- drwav_container_riff = normal WAV files, drwav_container_w64 = Sony Wave64.
-    format.format = DR_WAVE_FORMAT_PCM;          // <-- Any of the DR_WAVE_FORMAT_* codes.
-    format.channels = 2;
-    format.sampleRate = 44100;
-    format.bitsPerSample = 16;
-    drwav_init_file_write(&wav, "data/recording.wav", &format, NULL);
-
-    ...
-
-    drwav_uint64 framesWritten = drwav_write_pcm_frames(pWav, frameCount, pSamples);
-    ```
-
-dr_wav has seamless support the Sony Wave64 format. The decoder will automatically detect it and it should Just Work without any manual intervention.
-
-
-Build Options
-=============
-#define these options before including this file.
-
-#define DR_WAV_NO_CONVERSION_API
-  Disables conversion APIs such as `drwav_read_pcm_frames_f32()` and `drwav_s16_to_f32()`.
-
-#define DR_WAV_NO_STDIO
-  Disables APIs that initialize a decoder from a file such as `drwav_init_file()`, `drwav_init_file_write()`, etc.
-
-
-
-Notes
-=====
-- Samples are always interleaved.
-- The default read function does not do any data conversion. Use `drwav_read_pcm_frames_f32()`, `drwav_read_pcm_frames_s32()` and `drwav_read_pcm_frames_s16()`
-  to read and convert audio data to 32-bit floating point, signed 32-bit integer and signed 16-bit integer samples respectively. Tested and supported internal
-  formats include the following:
-  - Unsigned 8-bit PCM
-  - Signed 12-bit PCM
-  - Signed 16-bit PCM
-  - Signed 24-bit PCM
-  - Signed 32-bit PCM
-  - IEEE 32-bit floating point
-  - IEEE 64-bit floating point
-  - A-law and u-law
-  - Microsoft ADPCM
-  - IMA ADPCM (DVI, format code 0x11)
-- dr_wav will try to read the WAV file as best it can, even if it's not strictly conformant to the WAV format.
-*/
-
-#ifndef dr_wav_h
-#define dr_wav_h
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define DRWAV_STRINGIFY(x)      #x
-#define DRWAV_XSTRINGIFY(x)     DRWAV_STRINGIFY(x)
-
-#define DRWAV_VERSION_MAJOR     0
-#define DRWAV_VERSION_MINOR     12
-#define DRWAV_VERSION_REVISION  16
-#define DRWAV_VERSION_STRING    DRWAV_XSTRINGIFY(DRWAV_VERSION_MAJOR) "." DRWAV_XSTRINGIFY(DRWAV_VERSION_MINOR) "." DRWAV_XSTRINGIFY(DRWAV_VERSION_REVISION)
-
-#include <stddef.h> /* For size_t. */
-
-/* Sized types. */
-typedef   signed char           drwav_int8;
-typedef unsigned char           drwav_uint8;
-typedef   signed short          drwav_int16;
-typedef unsigned short          drwav_uint16;
-typedef   signed int            drwav_int32;
-typedef unsigned int            drwav_uint32;
-#if defined(_MSC_VER)
-    typedef   signed __int64    drwav_int64;
-    typedef unsigned __int64    drwav_uint64;
-#else
-    #if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)))
-        #pragma GCC diagnostic push
-        #pragma GCC diagnostic ignored "-Wlong-long"
-        #if defined(__clang__)
-            #pragma GCC diagnostic ignored "-Wc++11-long-long"
-        #endif
-    #endif
-    typedef   signed long long  drwav_int64;
-    typedef unsigned long long  drwav_uint64;
-    #if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)))
-        #pragma GCC diagnostic pop
-    #endif
-#endif
-#if defined(__LP64__) || defined(_WIN64) || (defined(__x86_64__) && !defined(__ILP32__)) || defined(_M_X64) || defined(__ia64) || defined (_M_IA64) || defined(__aarch64__) || defined(__powerpc64__)
-    typedef drwav_uint64        drwav_uintptr;
-#else
-    typedef drwav_uint32        drwav_uintptr;
-#endif
-typedef drwav_uint8             drwav_bool8;
-typedef drwav_uint32            drwav_bool32;
-#define DRWAV_TRUE              1
-#define DRWAV_FALSE             0
-
-#if !defined(DRWAV_API)
-    #if defined(DRWAV_DLL)
-        #if defined(_WIN32)
-            #define DRWAV_DLL_IMPORT  __declspec(dllimport)
-            #define DRWAV_DLL_EXPORT  __declspec(dllexport)
-            #define DRWAV_DLL_PRIVATE static
-        #else
-            #if defined(__GNUC__) && __GNUC__ >= 4
-                #define DRWAV_DLL_IMPORT  __attribute__((visibility("default")))
-                #define DRWAV_DLL_EXPORT  __attribute__((visibility("default")))
-                #define DRWAV_DLL_PRIVATE __attribute__((visibility("hidden")))
-            #else
-                #define DRWAV_DLL_IMPORT
-                #define DRWAV_DLL_EXPORT
-                #define DRWAV_DLL_PRIVATE static
-            #endif
-        #endif
-
-        #if defined(DR_WAV_IMPLEMENTATION) || defined(DRWAV_IMPLEMENTATION)
-            #define DRWAV_API  DRWAV_DLL_EXPORT
-        #else
-            #define DRWAV_API  DRWAV_DLL_IMPORT
-        #endif
-        #define DRWAV_PRIVATE DRWAV_DLL_PRIVATE
-    #else
-        #define DRWAV_API extern
-        #define DRWAV_PRIVATE static
-    #endif
-#endif
-
-typedef drwav_int32 drwav_result;
-#define DRWAV_SUCCESS                        0
-#define DRWAV_ERROR                         -1   /* A generic error. */
-#define DRWAV_INVALID_ARGS                  -2
-#define DRWAV_INVALID_OPERATION             -3
-#define DRWAV_OUT_OF_MEMORY                 -4
-#define DRWAV_OUT_OF_RANGE                  -5
-#define DRWAV_ACCESS_DENIED                 -6
-#define DRWAV_DOES_NOT_EXIST                -7
-#define DRWAV_ALREADY_EXISTS                -8
-#define DRWAV_TOO_MANY_OPEN_FILES           -9
-#define DRWAV_INVALID_FILE                  -10
-#define DRWAV_TOO_BIG                       -11
-#define DRWAV_PATH_TOO_LONG                 -12
-#define DRWAV_NAME_TOO_LONG                 -13
-#define DRWAV_NOT_DIRECTORY                 -14
-#define DRWAV_IS_DIRECTORY                  -15
-#define DRWAV_DIRECTORY_NOT_EMPTY           -16
-#define DRWAV_END_OF_FILE                   -17
-#define DRWAV_NO_SPACE                      -18
-#define DRWAV_BUSY                          -19
-#define DRWAV_IO_ERROR                      -20
-#define DRWAV_INTERRUPT                     -21
-#define DRWAV_UNAVAILABLE                   -22
-#define DRWAV_ALREADY_IN_USE                -23
-#define DRWAV_BAD_ADDRESS                   -24
-#define DRWAV_BAD_SEEK                      -25
-#define DRWAV_BAD_PIPE                      -26
-#define DRWAV_DEADLOCK                      -27
-#define DRWAV_TOO_MANY_LINKS                -28
-#define DRWAV_NOT_IMPLEMENTED               -29
-#define DRWAV_NO_MESSAGE                    -30
-#define DRWAV_BAD_MESSAGE                   -31
-#define DRWAV_NO_DATA_AVAILABLE             -32
-#define DRWAV_INVALID_DATA                  -33
-#define DRWAV_TIMEOUT                       -34
-#define DRWAV_NO_NETWORK                    -35
-#define DRWAV_NOT_UNIQUE                    -36
-#define DRWAV_NOT_SOCKET                    -37
-#define DRWAV_NO_ADDRESS                    -38
-#define DRWAV_BAD_PROTOCOL                  -39
-#define DRWAV_PROTOCOL_UNAVAILABLE          -40
-#define DRWAV_PROTOCOL_NOT_SUPPORTED        -41
-#define DRWAV_PROTOCOL_FAMILY_NOT_SUPPORTED -42
-#define DRWAV_ADDRESS_FAMILY_NOT_SUPPORTED  -43
-#define DRWAV_SOCKET_NOT_SUPPORTED          -44
-#define DRWAV_CONNECTION_RESET              -45
-#define DRWAV_ALREADY_CONNECTED             -46
-#define DRWAV_NOT_CONNECTED                 -47
-#define DRWAV_CONNECTION_REFUSED            -48
-#define DRWAV_NO_HOST                       -49
-#define DRWAV_IN_PROGRESS                   -50
-#define DRWAV_CANCELLED                     -51
-#define DRWAV_MEMORY_ALREADY_MAPPED         -52
-#define DRWAV_AT_END                        -53
-
-/* Common data formats. */
-#define DR_WAVE_FORMAT_PCM          0x1
-#define DR_WAVE_FORMAT_ADPCM        0x2
-#define DR_WAVE_FORMAT_IEEE_FLOAT   0x3
-#define DR_WAVE_FORMAT_ALAW         0x6
-#define DR_WAVE_FORMAT_MULAW        0x7
-#define DR_WAVE_FORMAT_DVI_ADPCM    0x11
-#define DR_WAVE_FORMAT_EXTENSIBLE   0xFFFE
-
-/* Constants. */
-#ifndef DRWAV_MAX_SMPL_LOOPS
-#define DRWAV_MAX_SMPL_LOOPS        1
-#endif
-
-/* Flags to pass into drwav_init_ex(), etc. */
-#define DRWAV_SEQUENTIAL            0x00000001
-
-DRWAV_API void drwav_version(drwav_uint32* pMajor, drwav_uint32* pMinor, drwav_uint32* pRevision);
-DRWAV_API const char* drwav_version_string(void);
-
-typedef enum
-{
-    drwav_seek_origin_start,
-    drwav_seek_origin_current
-} drwav_seek_origin;
-
-typedef enum
-{
-    drwav_container_riff,
-    drwav_container_w64,
-    drwav_container_rf64
-} drwav_container;
-
-typedef struct
-{
-    union
-    {
-        drwav_uint8 fourcc[4];
-        drwav_uint8 guid[16];
-    } id;
-
-    /* The size in bytes of the chunk. */
-    drwav_uint64 sizeInBytes;
-
-    /*
-    RIFF = 2 byte alignment.
-    W64  = 8 byte alignment.
-    */
-    unsigned int paddingSize;
-} drwav_chunk_header;
-
-typedef struct
-{
-    /*
-    The format tag exactly as specified in the wave file's "fmt" chunk. This can be used by applications
-    that require support for data formats not natively supported by dr_wav.
-    */
-    drwav_uint16 formatTag;
-
-    /* The number of channels making up the audio data. When this is set to 1 it is mono, 2 is stereo, etc. */
-    drwav_uint16 channels;
-
-    /* The sample rate. Usually set to something like 44100. */
-    drwav_uint32 sampleRate;
-
-    /* Average bytes per second. You probably don't need this, but it's left here for informational purposes. */
-    drwav_uint32 avgBytesPerSec;
-
-    /* Block align. This is equal to the number of channels * bytes per sample. */
-    drwav_uint16 blockAlign;
-
-    /* Bits per sample. */
-    drwav_uint16 bitsPerSample;
-
-    /* The size of the extended data. Only used internally for validation, but left here for informational purposes. */
-    drwav_uint16 extendedSize;
-
-    /*
-    The number of valid bits per sample. When <formatTag> is equal to WAVE_FORMAT_EXTENSIBLE, <bitsPerSample>
-    is always rounded up to the nearest multiple of 8. This variable contains information about exactly how
-    many bits are valid per sample. Mainly used for informational purposes.
-    */
-    drwav_uint16 validBitsPerSample;
-
-    /* The channel mask. Not used at the moment. */
-    drwav_uint32 channelMask;
-
-    /* The sub-format, exactly as specified by the wave file. */
-    drwav_uint8 subFormat[16];
-} drwav_fmt;
-
-DRWAV_API drwav_uint16 drwav_fmt_get_format(const drwav_fmt* pFMT);
-
-
-/*
-Callback for when data is read. Return value is the number of bytes actually read.
-
-pUserData   [in]  The user data that was passed to drwav_init() and family.
-pBufferOut  [out] The output buffer.
-bytesToRead [in]  The number of bytes to read.
-
-Returns the number of bytes actually read.
-
-A return value of less than bytesToRead indicates the end of the stream. Do _not_ return from this callback until
-either the entire bytesToRead is filled or you have reached the end of the stream.
-*/
-typedef size_t (* drwav_read_proc)(void* pUserData, void* pBufferOut, size_t bytesToRead);
-
-/*
-Callback for when data is written. Returns value is the number of bytes actually written.
-
-pUserData    [in]  The user data that was passed to drwav_init_write() and family.
-pData        [out] A pointer to the data to write.
-bytesToWrite [in]  The number of bytes to write.
-
-Returns the number of bytes actually written.
-
-If the return value differs from bytesToWrite, it indicates an error.
-*/
-typedef size_t (* drwav_write_proc)(void* pUserData, const void* pData, size_t bytesToWrite);
-
-/*
-Callback for when data needs to be seeked.
-
-pUserData [in] The user data that was passed to drwav_init() and family.
-offset    [in] The number of bytes to move, relative to the origin. Will never be negative.
-origin    [in] The origin of the seek - the current position or the start of the stream.
-
-Returns whether or not the seek was successful.
-
-Whether or not it is relative to the beginning or current position is determined by the "origin" parameter which will be either drwav_seek_origin_start or
-drwav_seek_origin_current.
-*/
-typedef drwav_bool32 (* drwav_seek_proc)(void* pUserData, int offset, drwav_seek_origin origin);
-
-/*
-Callback for when drwav_init_ex() finds a chunk.
-
-pChunkUserData    [in] The user data that was passed to the pChunkUserData parameter of drwav_init_ex() and family.
-onRead            [in] A pointer to the function to call when reading.
-onSeek            [in] A pointer to the function to call when seeking.
-pReadSeekUserData [in] The user data that was passed to the pReadSeekUserData parameter of drwav_init_ex() and family.
-pChunkHeader      [in] A pointer to an object containing basic header information about the chunk. Use this to identify the chunk.
-container         [in] Whether or not the WAV file is a RIFF or Wave64 container. If you're unsure of the difference, assume RIFF.
-pFMT              [in] A pointer to the object containing the contents of the "fmt" chunk.
-
-Returns the number of bytes read + seeked.
-
-To read data from the chunk, call onRead(), passing in pReadSeekUserData as the first parameter. Do the same for seeking with onSeek(). The return value must
-be the total number of bytes you have read _plus_ seeked.
-
-Use the `container` argument to discriminate the fields in `pChunkHeader->id`. If the container is `drwav_container_riff` or `drwav_container_rf64` you should
-use `id.fourcc`, otherwise you should use `id.guid`.
-
-The `pFMT` parameter can be used to determine the data format of the wave file. Use `drwav_fmt_get_format()` to get the sample format, which will be one of the
-`DR_WAVE_FORMAT_*` identifiers. 
-
-The read pointer will be sitting on the first byte after the chunk's header. You must not attempt to read beyond the boundary of the chunk.
-*/
-typedef drwav_uint64 (* drwav_chunk_proc)(void* pChunkUserData, drwav_read_proc onRead, drwav_seek_proc onSeek, void* pReadSeekUserData, const drwav_chunk_header* pChunkHeader, drwav_container container, const drwav_fmt* pFMT);
-
-typedef struct
-{
-    void* pUserData;
-    void* (* onMalloc)(size_t sz, void* pUserData);
-    void* (* onRealloc)(void* p, size_t sz, void* pUserData);
-    void  (* onFree)(void* p, void* pUserData);
-} drwav_allocation_callbacks;
-
-/* Structure for internal use. Only used for loaders opened with drwav_init_memory(). */
-typedef struct
-{
-    const drwav_uint8* data;
-    size_t dataSize;
-    size_t currentReadPos;
-} drwav__memory_stream;
-
-/* Structure for internal use. Only used for writers opened with drwav_init_memory_write(). */
-typedef struct
-{
-    void** ppData;
-    size_t* pDataSize;
-    size_t dataSize;
-    size_t dataCapacity;
-    size_t currentWritePos;
-} drwav__memory_stream_write;
-
-typedef struct
-{
-    drwav_container container;  /* RIFF, W64. */
-    drwav_uint32 format;        /* DR_WAVE_FORMAT_* */
-    drwav_uint32 channels;
-    drwav_uint32 sampleRate;
-    drwav_uint32 bitsPerSample;
-} drwav_data_format;
-
-
-/* See the following for details on the 'smpl' chunk: https://sites.google.com/site/musicgapi/technical-documents/wav-file-format#smpl */
-typedef struct
-{
-    drwav_uint32 cuePointId;
-    drwav_uint32 type;
-    drwav_uint32 start;
-    drwav_uint32 end;
-    drwav_uint32 fraction;
-    drwav_uint32 playCount;
-} drwav_smpl_loop;
-
- typedef struct
-{
-    drwav_uint32 manufacturer;
-    drwav_uint32 product;
-    drwav_uint32 samplePeriod;
-    drwav_uint32 midiUnityNotes;
-    drwav_uint32 midiPitchFraction;
-    drwav_uint32 smpteFormat;
-    drwav_uint32 smpteOffset;
-    drwav_uint32 numSampleLoops;
-    drwav_uint32 samplerData;
-    drwav_smpl_loop loops[DRWAV_MAX_SMPL_LOOPS];
-} drwav_smpl;
-
-typedef struct
-{
-    /* A pointer to the function to call when more data is needed. */
-    drwav_read_proc onRead;
-
-    /* A pointer to the function to call when data needs to be written. Only used when the drwav object is opened in write mode. */
-    drwav_write_proc onWrite;
-
-    /* A pointer to the function to call when the wav file needs to be seeked. */
-    drwav_seek_proc onSeek;
-
-    /* The user data to pass to callbacks. */
-    void* pUserData;
-
-    /* Allocation callbacks. */
-    drwav_allocation_callbacks allocationCallbacks;
-
-
-    /* Whether or not the WAV file is formatted as a standard RIFF file or W64. */
-    drwav_container container;
-
-
-    /* Structure containing format information exactly as specified by the wav file. */
-    drwav_fmt fmt;
-
-    /* The sample rate. Will be set to something like 44100. */
-    drwav_uint32 sampleRate;
-
-    /* The number of channels. This will be set to 1 for monaural streams, 2 for stereo, etc. */
-    drwav_uint16 channels;
-
-    /* The bits per sample. Will be set to something like 16, 24, etc. */
-    drwav_uint16 bitsPerSample;
-
-    /* Equal to fmt.formatTag, or the value specified by fmt.subFormat if fmt.formatTag is equal to 65534 (WAVE_FORMAT_EXTENSIBLE). */
-    drwav_uint16 translatedFormatTag;
-
-    /* The total number of PCM frames making up the audio data. */
-    drwav_uint64 totalPCMFrameCount;
-
-
-    /* The size in bytes of the data chunk. */
-    drwav_uint64 dataChunkDataSize;
-    
-    /* The position in the stream of the first byte of the data chunk. This is used for seeking. */
-    drwav_uint64 dataChunkDataPos;
-
-    /* The number of bytes remaining in the data chunk. */
-    drwav_uint64 bytesRemaining;
-
-
-    /*
-    Only used in sequential write mode. Keeps track of the desired size of the "data" chunk at the point of initialization time. Always
-    set to 0 for non-sequential writes and when the drwav object is opened in read mode. Used for validation.
-    */
-    drwav_uint64 dataChunkDataSizeTargetWrite;
-
-    /* Keeps track of whether or not the wav writer was initialized in sequential mode. */
-    drwav_bool32 isSequentialWrite;
-
-
-    /* smpl chunk. */
-    drwav_smpl smpl;
-
-
-    /* A hack to avoid a DRWAV_MALLOC() when opening a decoder with drwav_init_memory(). */
-    drwav__memory_stream memoryStream;
-    drwav__memory_stream_write memoryStreamWrite;
-
-    /* Generic data for compressed formats. This data is shared across all block-compressed formats. */
-    struct
-    {
-        drwav_uint64 iCurrentPCMFrame;  /* The index of the next PCM frame that will be read by drwav_read_*(). This is used with "totalPCMFrameCount" to ensure we don't read excess samples at the end of the last block. */
-    } compressed;
-    
-    /* Microsoft ADPCM specific data. */
-    struct
-    {
-        drwav_uint32 bytesRemainingInBlock;
-        drwav_uint16 predictor[2];
-        drwav_int32  delta[2];
-        drwav_int32  cachedFrames[4];  /* Samples are stored in this cache during decoding. */
-        drwav_uint32 cachedFrameCount;
-        drwav_int32  prevFrames[2][2]; /* The previous 2 samples for each channel (2 channels at most). */
-    } msadpcm;
-
-    /* IMA ADPCM specific data. */
-    struct
-    {
-        drwav_uint32 bytesRemainingInBlock;
-        drwav_int32  predictor[2];
-        drwav_int32  stepIndex[2];
-        drwav_int32  cachedFrames[16]; /* Samples are stored in this cache during decoding. */
-        drwav_uint32 cachedFrameCount;
-    } ima;
-} drwav;
-
-
-/*
-Initializes a pre-allocated drwav object for reading.
-
-pWav                         [out]          A pointer to the drwav object being initialized.
-onRead                       [in]           The function to call when data needs to be read from the client.
-onSeek                       [in]           The function to call when the read position of the client data needs to move.
-onChunk                      [in, optional] The function to call when a chunk is enumerated at initialized time.
-pUserData, pReadSeekUserData [in, optional] A pointer to application defined data that will be passed to onRead and onSeek.
-pChunkUserData               [in, optional] A pointer to application defined data that will be passed to onChunk.
-flags                        [in, optional] A set of flags for controlling how things are loaded.
-
-Returns true if successful; false otherwise.
-
-Close the loader with drwav_uninit().
-
-This is the lowest level function for initializing a WAV file. You can also use drwav_init_file() and drwav_init_memory()
-to open the stream from a file or from a block of memory respectively.
-
-Possible values for flags:
-  DRWAV_SEQUENTIAL: Never perform a backwards seek while loading. This disables the chunk callback and will cause this function
-                    to return as soon as the data chunk is found. Any chunks after the data chunk will be ignored.
-
-drwav_init() is equivalent to "drwav_init_ex(pWav, onRead, onSeek, NULL, pUserData, NULL, 0);".
-
-The onChunk callback is not called for the WAVE or FMT chunks. The contents of the FMT chunk can be read from pWav->fmt
-after the function returns.
-
-See also: drwav_init_file(), drwav_init_memory(), drwav_uninit()
-*/
-DRWAV_API drwav_bool32 drwav_init(drwav* pWav, drwav_read_proc onRead, drwav_seek_proc onSeek, void* pUserData, const drwav_allocation_callbacks* pAllocationCallbacks);
-DRWAV_API drwav_bool32 drwav_init_ex(drwav* pWav, drwav_read_proc onRead, drwav_seek_proc onSeek, drwav_chunk_proc onChunk, void* pReadSeekUserData, void* pChunkUserData, drwav_uint32 flags, const drwav_allocation_callbacks* pAllocationCallbacks);
-
-/*
-Initializes a pre-allocated drwav object for writing.
-
-onWrite   [in]           The function to call when data needs to be written.
-onSeek    [in]           The function to call when the write position needs to move.
-pUserData [in, optional] A pointer to application defined data that will be passed to onWrite and onSeek.
-
-Returns true if successful; false otherwise.
-
-Close the writer with drwav_uninit().
-
-This is the lowest level function for initializing a WAV file. You can also use drwav_init_file_write() and drwav_init_memory_write()
-to open the stream from a file or from a block of memory respectively.
-
-If the total sample count is known, you can use drwav_init_write_sequential(). This avoids the need for dr_wav to perform
-a post-processing step for storing the total sample count and the size of the data chunk which requires a backwards seek.
-
-See also: drwav_init_file_write(), drwav_init_memory_write(), drwav_uninit()
-*/
-DRWAV_API drwav_bool32 drwav_init_write(drwav* pWav, const drwav_data_format* pFormat, drwav_write_proc onWrite, drwav_seek_proc onSeek, void* pUserData, const drwav_allocation_callbacks* pAllocationCallbacks);
-DRWAV_API drwav_bool32 drwav_init_write_sequential(drwav* pWav, const drwav_data_format* pFormat, drwav_uint64 totalSampleCount, drwav_write_proc onWrite, void* pUserData, const drwav_allocation_callbacks* pAllocationCallbacks);
-DRWAV_API drwav_bool32 drwav_init_write_sequential_pcm_frames(drwav* pWav, const drwav_data_format* pFormat, drwav_uint64 totalPCMFrameCount, drwav_write_proc onWrite, void* pUserData, const drwav_allocation_callbacks* pAllocationCallbacks);
-
-/*
-Utility function to determine the target size of the entire data to be written (including all headers and chunks).
-
-Returns the target size in bytes.
-
-Useful if the application needs to know the size to allocate.
-
-Only writing to the RIFF chunk and one data chunk is currently supported.
-
-See also: drwav_init_write(), drwav_init_file_write(), drwav_init_memory_write()
-*/
-DRWAV_API drwav_uint64 drwav_target_write_size_bytes(const drwav_data_format* pFormat, drwav_uint64 totalSampleCount);
-
-/*
-Uninitializes the given drwav object.
-
-Use this only for objects initialized with drwav_init*() functions (drwav_init(), drwav_init_ex(), drwav_init_write(), drwav_init_write_sequential()).
-*/
-DRWAV_API drwav_result drwav_uninit(drwav* pWav);
-
-
-/*
-Reads raw audio data.
-
-This is the lowest level function for reading audio data. It simply reads the given number of
-bytes of the raw internal sample data.
-
-Consider using drwav_read_pcm_frames_s16(), drwav_read_pcm_frames_s32() or drwav_read_pcm_frames_f32() for
-reading sample data in a consistent format.
-
-pBufferOut can be NULL in which case a seek will be performed.
-
-Returns the number of bytes actually read.
-*/
-DRWAV_API size_t drwav_read_raw(drwav* pWav, size_t bytesToRead, void* pBufferOut);
-
-/*
-Reads up to the specified number of PCM frames from the WAV file.
-
-The output data will be in the file's internal format, converted to native-endian byte order. Use
-drwav_read_pcm_frames_s16/f32/s32() to read data in a specific format.
-
-If the return value is less than <framesToRead> it means the end of the file has been reached or
-you have requested more PCM frames than can possibly fit in the output buffer.
-
-This function will only work when sample data is of a fixed size and uncompressed. If you are
-using a compressed format consider using drwav_read_raw() or drwav_read_pcm_frames_s16/s32/f32().
-
-pBufferOut can be NULL in which case a seek will be performed.
-*/
-DRWAV_API drwav_uint64 drwav_read_pcm_frames(drwav* pWav, drwav_uint64 framesToRead, void* pBufferOut);
-DRWAV_API drwav_uint64 drwav_read_pcm_frames_le(drwav* pWav, drwav_uint64 framesToRead, void* pBufferOut);
-DRWAV_API drwav_uint64 drwav_read_pcm_frames_be(drwav* pWav, drwav_uint64 framesToRead, void* pBufferOut);
-
-/*
-Seeks to the given PCM frame.
-
-Returns true if successful; false otherwise.
-*/
-DRWAV_API drwav_bool32 drwav_seek_to_pcm_frame(drwav* pWav, drwav_uint64 targetFrameIndex);
-
-
-/*
-Writes raw audio data.
-
-Returns the number of bytes actually written. If this differs from bytesToWrite, it indicates an error.
-*/
-DRWAV_API size_t drwav_write_raw(drwav* pWav, size_t bytesToWrite, const void* pData);
-
-/*
-Writes PCM frames.
-
-Returns the number of PCM frames written.
-
-Input samples need to be in native-endian byte order. On big-endian architectures the input data will be converted to
-little-endian. Use drwav_write_raw() to write raw audio data without performing any conversion.
-*/
-DRWAV_API drwav_uint64 drwav_write_pcm_frames(drwav* pWav, drwav_uint64 framesToWrite, const void* pData);
-DRWAV_API drwav_uint64 drwav_write_pcm_frames_le(drwav* pWav, drwav_uint64 framesToWrite, const void* pData);
-DRWAV_API drwav_uint64 drwav_write_pcm_frames_be(drwav* pWav, drwav_uint64 framesToWrite, const void* pData);
-
-
-/* Conversion Utilities */
-#ifndef DR_WAV_NO_CONVERSION_API
-
-/*
-Reads a chunk of audio data and converts it to signed 16-bit PCM samples.
-
-pBufferOut can be NULL in which case a seek will be performed.
-
-Returns the number of PCM frames actually read.
-
-If the return value is less than <framesToRead> it means the end of the file has been reached.
-*/
-DRWAV_API drwav_uint64 drwav_read_pcm_frames_s16(drwav* pWav, drwav_uint64 framesToRead, drwav_int16* pBufferOut);
-DRWAV_API drwav_uint64 drwav_read_pcm_frames_s16le(drwav* pWav, drwav_uint64 framesToRead, drwav_int16* pBufferOut);
-DRWAV_API drwav_uint64 drwav_read_pcm_frames_s16be(drwav* pWav, drwav_uint64 framesToRead, drwav_int16* pBufferOut);
-
-/* Low-level function for converting unsigned 8-bit PCM samples to signed 16-bit PCM samples. */
-DRWAV_API void drwav_u8_to_s16(drwav_int16* pOut, const drwav_uint8* pIn, size_t sampleCount);
-
-/* Low-level function for converting signed 24-bit PCM samples to signed 16-bit PCM samples. */
-DRWAV_API void drwav_s24_to_s16(drwav_int16* pOut, const drwav_uint8* pIn, size_t sampleCount);
-
-/* Low-level function for converting signed 32-bit PCM samples to signed 16-bit PCM samples. */
-DRWAV_API void drwav_s32_to_s16(drwav_int16* pOut, const drwav_int32* pIn, size_t sampleCount);
-
-/* Low-level function for converting IEEE 32-bit floating point samples to signed 16-bit PCM samples. */
-DRWAV_API void drwav_f32_to_s16(drwav_int16* pOut, const float* pIn, size_t sampleCount);
-
-/* Low-level function for converting IEEE 64-bit floating point samples to signed 16-bit PCM samples. */
-DRWAV_API void drwav_f64_to_s16(drwav_int16* pOut, const double* pIn, size_t sampleCount);
-
-/* Low-level function for converting A-law samples to signed 16-bit PCM samples. */
-DRWAV_API void drwav_alaw_to_s16(drwav_int16* pOut, const drwav_uint8* pIn, size_t sampleCount);
-
-/* Low-level function for converting u-law samples to signed 16-bit PCM samples. */
-DRWAV_API void drwav_mulaw_to_s16(drwav_int16* pOut, const drwav_uint8* pIn, size_t sampleCount);
-
-
-/*
-Reads a chunk of audio data and converts it to IEEE 32-bit floating point samples.
-
-pBufferOut can be NULL in which case a seek will be performed.
-
-Returns the number of PCM frames actually read.
-
-If the return value is less than <framesToRead> it means the end of the file has been reached.
-*/
-DRWAV_API drwav_uint64 drwav_read_pcm_frames_f32(drwav* pWav, drwav_uint64 framesToRead, float* pBufferOut);
-DRWAV_API drwav_uint64 drwav_read_pcm_frames_f32le(drwav* pWav, drwav_uint64 framesToRead, float* pBufferOut);
-DRWAV_API drwav_uint64 drwav_read_pcm_frames_f32be(drwav* pWav, drwav_uint64 framesToRead, float* pBufferOut);
-
-/* Low-level function for converting unsigned 8-bit PCM samples to IEEE 32-bit floating point samples. */
-DRWAV_API void drwav_u8_to_f32(float* pOut, const drwav_uint8* pIn, size_t sampleCount);
-
-/* Low-level function for converting signed 16-bit PCM samples to IEEE 32-bit floating point samples. */
-DRWAV_API void drwav_s16_to_f32(float* pOut, const drwav_int16* pIn, size_t sampleCount);
-
-/* Low-level function for converting signed 24-bit PCM samples to IEEE 32-bit floating point samples. */
-DRWAV_API void drwav_s24_to_f32(float* pOut, const drwav_uint8* pIn, size_t sampleCount);
-
-/* Low-level function for converting signed 32-bit PCM samples to IEEE 32-bit floating point samples. */
-DRWAV_API void drwav_s32_to_f32(float* pOut, const drwav_int32* pIn, size_t sampleCount);
-
-/* Low-level function for converting IEEE 64-bit floating point samples to IEEE 32-bit floating point samples. */
-DRWAV_API void drwav_f64_to_f32(float* pOut, const double* pIn, size_t sampleCount);
-
-/* Low-level function for converting A-law samples to IEEE 32-bit floating point samples. */
-DRWAV_API void drwav_alaw_to_f32(float* pOut, const drwav_uint8* pIn, size_t sampleCount);
-
-/* Low-level function for converting u-law samples to IEEE 32-bit floating point samples. */
-DRWAV_API void drwav_mulaw_to_f32(float* pOut, const drwav_uint8* pIn, size_t sampleCount);
-
-
-/*
-Reads a chunk of audio data and converts it to signed 32-bit PCM samples.
-
-pBufferOut can be NULL in which case a seek will be performed.
-
-Returns the number of PCM frames actually read.
-
-If the return value is less than <framesToRead> it means the end of the file has been reached.
-*/
-DRWAV_API drwav_uint64 drwav_read_pcm_frames_s32(drwav* pWav, drwav_uint64 framesToRead, drwav_int32* pBufferOut);
-DRWAV_API drwav_uint64 drwav_read_pcm_frames_s32le(drwav* pWav, drwav_uint64 framesToRead, drwav_int32* pBufferOut);
-DRWAV_API drwav_uint64 drwav_read_pcm_frames_s32be(drwav* pWav, drwav_uint64 framesToRead, drwav_int32* pBufferOut);
-
-/* Low-level function for converting unsigned 8-bit PCM samples to signed 32-bit PCM samples. */
-DRWAV_API void drwav_u8_to_s32(drwav_int32* pOut, const drwav_uint8* pIn, size_t sampleCount);
-
-/* Low-level function for converting signed 16-bit PCM samples to signed 32-bit PCM samples. */
-DRWAV_API void drwav_s16_to_s32(drwav_int32* pOut, const drwav_int16* pIn, size_t sampleCount);
-
-/* Low-level function for converting signed 24-bit PCM samples to signed 32-bit PCM samples. */
-DRWAV_API void drwav_s24_to_s32(drwav_int32* pOut, const drwav_uint8* pIn, size_t sampleCount);
-
-/* Low-level function for converting IEEE 32-bit floating point samples to signed 32-bit PCM samples. */
-DRWAV_API void drwav_f32_to_s32(drwav_int32* pOut, const float* pIn, size_t sampleCount);
-
-/* Low-level function for converting IEEE 64-bit floating point samples to signed 32-bit PCM samples. */
-DRWAV_API void drwav_f64_to_s32(drwav_int32* pOut, const double* pIn, size_t sampleCount);
-
-/* Low-level function for converting A-law samples to signed 32-bit PCM samples. */
-DRWAV_API void drwav_alaw_to_s32(drwav_int32* pOut, const drwav_uint8* pIn, size_t sampleCount);
-
-/* Low-level function for converting u-law samples to signed 32-bit PCM samples. */
-DRWAV_API void drwav_mulaw_to_s32(drwav_int32* pOut, const drwav_uint8* pIn, size_t sampleCount);
-
-#endif  /* DR_WAV_NO_CONVERSION_API */
-
-
-/* High-Level Convenience Helpers */
-
-#ifndef DR_WAV_NO_STDIO
-/*
-Helper for initializing a wave file for reading using stdio.
-
-This holds the internal FILE object until drwav_uninit() is called. Keep this in mind if you're caching drwav
-objects because the operating system may restrict the number of file handles an application can have open at
-any given time.
-*/
-DRWAV_API drwav_bool32 drwav_init_file(drwav* pWav, const char* filename, const drwav_allocation_callbacks* pAllocationCallbacks);
-DRWAV_API drwav_bool32 drwav_init_file_ex(drwav* pWav, const char* filename, drwav_chunk_proc onChunk, void* pChunkUserData, drwav_uint32 flags, const drwav_allocation_callbacks* pAllocationCallbacks);
-DRWAV_API drwav_bool32 drwav_init_file_w(drwav* pWav, const wchar_t* filename, const drwav_allocation_callbacks* pAllocationCallbacks);
-DRWAV_API drwav_bool32 drwav_init_file_ex_w(drwav* pWav, const wchar_t* filename, drwav_chunk_proc onChunk, void* pChunkUserData, drwav_uint32 flags, const drwav_allocation_callbacks* pAllocationCallbacks);
-
-/*
-Helper for initializing a wave file for writing using stdio.
-
-This holds the internal FILE object until drwav_uninit() is called. Keep this in mind if you're caching drwav
-objects because the operating system may restrict the number of file handles an application can have open at
-any given time.
-*/
-DRWAV_API drwav_bool32 drwav_init_file_write(drwav* pWav, const char* filename, const drwav_data_format* pFormat, const drwav_allocation_callbacks* pAllocationCallbacks);
-DRWAV_API drwav_bool32 drwav_init_file_write_sequential(drwav* pWav, const char* filename, const drwav_data_format* pFormat, drwav_uint64 totalSampleCount, const drwav_allocation_callbacks* pAllocationCallbacks);
-DRWAV_API drwav_bool32 drwav_init_file_write_sequential_pcm_frames(drwav* pWav, const char* filename, const drwav_data_format* pFormat, drwav_uint64 totalPCMFrameCount, const drwav_allocation_callbacks* pAllocationCallbacks);
-DRWAV_API drwav_bool32 drwav_init_file_write_w(drwav* pWav, const wchar_t* filename, const drwav_data_format* pFormat, const drwav_allocation_callbacks* pAllocationCallbacks);
-DRWAV_API drwav_bool32 drwav_init_file_write_sequential_w(drwav* pWav, const wchar_t* filename, const drwav_data_format* pFormat, drwav_uint64 totalSampleCount, const drwav_allocation_callbacks* pAllocationCallbacks);
-DRWAV_API drwav_bool32 drwav_init_file_write_sequential_pcm_frames_w(drwav* pWav, const wchar_t* filename, const drwav_data_format* pFormat, drwav_uint64 totalPCMFrameCount, const drwav_allocation_callbacks* pAllocationCallbacks);
-#endif  /* DR_WAV_NO_STDIO */
-
-/*
-Helper for initializing a loader from a pre-allocated memory buffer.
-
-This does not create a copy of the data. It is up to the application to ensure the buffer remains valid for
-the lifetime of the drwav object.
-
-The buffer should contain the contents of the entire wave file, not just the sample data.
-*/
-DRWAV_API drwav_bool32 drwav_init_memory(drwav* pWav, const void* data, size_t dataSize, const drwav_allocation_callbacks* pAllocationCallbacks);
-DRWAV_API drwav_bool32 drwav_init_memory_ex(drwav* pWav, const void* data, size_t dataSize, drwav_chunk_proc onChunk, void* pChunkUserData, drwav_uint32 flags, const drwav_allocation_callbacks* pAllocationCallbacks);
-
-/*
-Helper for initializing a writer which outputs data to a memory buffer.
-
-dr_wav will manage the memory allocations, however it is up to the caller to free the data with drwav_free().
-
-The buffer will remain allocated even after drwav_uninit() is called. The buffer should not be considered valid
-until after drwav_uninit() has been called.
-*/
-DRWAV_API drwav_bool32 drwav_init_memory_write(drwav* pWav, void** ppData, size_t* pDataSize, const drwav_data_format* pFormat, const drwav_allocation_callbacks* pAllocationCallbacks);
-DRWAV_API drwav_bool32 drwav_init_memory_write_sequential(drwav* pWav, void** ppData, size_t* pDataSize, const drwav_data_format* pFormat, drwav_uint64 totalSampleCount, const drwav_allocation_callbacks* pAllocationCallbacks);
-DRWAV_API drwav_bool32 drwav_init_memory_write_sequential_pcm_frames(drwav* pWav, void** ppData, size_t* pDataSize, const drwav_data_format* pFormat, drwav_uint64 totalPCMFrameCount, const drwav_allocation_callbacks* pAllocationCallbacks);
-
-
-#ifndef DR_WAV_NO_CONVERSION_API
-/*
-Opens and reads an entire wav file in a single operation.
-
-The return value is a heap-allocated buffer containing the audio data. Use drwav_free() to free the buffer.
-*/
-DRWAV_API drwav_int16* drwav_open_and_read_pcm_frames_s16(drwav_read_proc onRead, drwav_seek_proc onSeek, void* pUserData, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks);
-DRWAV_API float* drwav_open_and_read_pcm_frames_f32(drwav_read_proc onRead, drwav_seek_proc onSeek, void* pUserData, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks);
-DRWAV_API drwav_int32* drwav_open_and_read_pcm_frames_s32(drwav_read_proc onRead, drwav_seek_proc onSeek, void* pUserData, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks);
-#ifndef DR_WAV_NO_STDIO
-/*
-Opens and decodes an entire wav file in a single operation.
-
-The return value is a heap-allocated buffer containing the audio data. Use drwav_free() to free the buffer.
-*/
-DRWAV_API drwav_int16* drwav_open_file_and_read_pcm_frames_s16(const char* filename, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks);
-DRWAV_API float* drwav_open_file_and_read_pcm_frames_f32(const char* filename, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks);
-DRWAV_API drwav_int32* drwav_open_file_and_read_pcm_frames_s32(const char* filename, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks);
-DRWAV_API drwav_int16* drwav_open_file_and_read_pcm_frames_s16_w(const wchar_t* filename, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks);
-DRWAV_API float* drwav_open_file_and_read_pcm_frames_f32_w(const wchar_t* filename, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks);
-DRWAV_API drwav_int32* drwav_open_file_and_read_pcm_frames_s32_w(const wchar_t* filename, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks);
-#endif
-/*
-Opens and decodes an entire wav file from a block of memory in a single operation.
-
-The return value is a heap-allocated buffer containing the audio data. Use drwav_free() to free the buffer.
-*/
-DRWAV_API drwav_int16* drwav_open_memory_and_read_pcm_frames_s16(const void* data, size_t dataSize, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks);
-DRWAV_API float* drwav_open_memory_and_read_pcm_frames_f32(const void* data, size_t dataSize, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks);
-DRWAV_API drwav_int32* drwav_open_memory_and_read_pcm_frames_s32(const void* data, size_t dataSize, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks);
-#endif
-
-/* Frees data that was allocated internally by dr_wav. */
-DRWAV_API void drwav_free(void* p, const drwav_allocation_callbacks* pAllocationCallbacks);
-
-/* Converts bytes from a wav stream to a sized type of native endian. */
-DRWAV_API drwav_uint16 drwav_bytes_to_u16(const drwav_uint8* data);
-DRWAV_API drwav_int16 drwav_bytes_to_s16(const drwav_uint8* data);
-DRWAV_API drwav_uint32 drwav_bytes_to_u32(const drwav_uint8* data);
-DRWAV_API drwav_int32 drwav_bytes_to_s32(const drwav_uint8* data);
-DRWAV_API drwav_uint64 drwav_bytes_to_u64(const drwav_uint8* data);
-DRWAV_API drwav_int64 drwav_bytes_to_s64(const drwav_uint8* data);
-
-/* Compares a GUID for the purpose of checking the type of a Wave64 chunk. */
-DRWAV_API drwav_bool32 drwav_guid_equal(const drwav_uint8 a[16], const drwav_uint8 b[16]);
-
-/* Compares a four-character-code for the purpose of checking the type of a RIFF chunk. */
-DRWAV_API drwav_bool32 drwav_fourcc_equal(const drwav_uint8* a, const char* b);
-
-#ifdef __cplusplus
-}
-#endif
-#endif  /* dr_wav_h */
-
-
-/************************************************************************************************************************************************************
- ************************************************************************************************************************************************************
-
- IMPLEMENTATION
-
- ************************************************************************************************************************************************************
- ************************************************************************************************************************************************************/
-#if defined(DR_WAV_IMPLEMENTATION) || defined(DRWAV_IMPLEMENTATION)
-#ifndef dr_wav_c
-#define dr_wav_c
-
-#include <stdlib.h>
-#include <string.h> /* For memcpy(), memset() */
-#include <limits.h> /* For INT_MAX */
-
-#ifndef DR_WAV_NO_STDIO
-#include <stdio.h>
-#include <wchar.h>
-#endif
-
-/* Standard library stuff. */
-#ifndef DRWAV_ASSERT
-#include <assert.h>
-#define DRWAV_ASSERT(expression)           assert(expression)
-#endif
-#ifndef DRWAV_MALLOC
-#define DRWAV_MALLOC(sz)                   malloc((sz))
-#endif
-#ifndef DRWAV_REALLOC
-#define DRWAV_REALLOC(p, sz)               realloc((p), (sz))
-#endif
-#ifndef DRWAV_FREE
-#define DRWAV_FREE(p)                      free((p))
-#endif
-#ifndef DRWAV_COPY_MEMORY
-#define DRWAV_COPY_MEMORY(dst, src, sz)    memcpy((dst), (src), (sz))
-#endif
-#ifndef DRWAV_ZERO_MEMORY
-#define DRWAV_ZERO_MEMORY(p, sz)           memset((p), 0, (sz))
-#endif
-#ifndef DRWAV_ZERO_OBJECT
-#define DRWAV_ZERO_OBJECT(p)               DRWAV_ZERO_MEMORY((p), sizeof(*p))
-#endif
-
-#define drwav_countof(x)                   (sizeof(x) / sizeof(x[0]))
-#define drwav_align(x, a)                  ((((x) + (a) - 1) / (a)) * (a))
-#define drwav_min(a, b)                    (((a) < (b)) ? (a) : (b))
-#define drwav_max(a, b)                    (((a) > (b)) ? (a) : (b))
-#define drwav_clamp(x, lo, hi)             (drwav_max((lo), drwav_min((hi), (x))))
-
-#define DRWAV_MAX_SIMD_VECTOR_SIZE         64  /* 64 for AVX-512 in the future. */
-
-/* CPU architecture. */
-#if defined(__x86_64__) || defined(_M_X64)
-    #define DRWAV_X64
-#elif defined(__i386) || defined(_M_IX86)
-    #define DRWAV_X86
-#elif defined(__arm__) || defined(_M_ARM)
-    #define DRWAV_ARM
-#endif
-
-#ifdef _MSC_VER
-    #define DRWAV_INLINE __forceinline
-#elif defined(__GNUC__)
-    /*
-    I've had a bug report where GCC is emitting warnings about functions possibly not being inlineable. This warning happens when
-    the __attribute__((always_inline)) attribute is defined without an "inline" statement. I think therefore there must be some
-    case where "__inline__" is not always defined, thus the compiler emitting these warnings. When using -std=c89 or -ansi on the
-    command line, we cannot use the "inline" keyword and instead need to use "__inline__". In an attempt to work around this issue
-    I am using "__inline__" only when we're compiling in strict ANSI mode.
-    */
-    #if defined(__STRICT_ANSI__)
-        #define DRWAV_INLINE __inline__ __attribute__((always_inline))
-    #else
-        #define DRWAV_INLINE inline __attribute__((always_inline))
-    #endif
-#elif defined(__WATCOMC__)
-    #define DRWAV_INLINE __inline
-#else
-    #define DRWAV_INLINE
-#endif
-
-#if defined(SIZE_MAX)
-    #define DRWAV_SIZE_MAX  SIZE_MAX
-#else
-    #if defined(_WIN64) || defined(_LP64) || defined(__LP64__)
-        #define DRWAV_SIZE_MAX  ((drwav_uint64)0xFFFFFFFFFFFFFFFF)
-    #else
-        #define DRWAV_SIZE_MAX  0xFFFFFFFF
-    #endif
-#endif
-
-#if defined(_MSC_VER) && _MSC_VER >= 1400
-    #define DRWAV_HAS_BYTESWAP16_INTRINSIC
-    #define DRWAV_HAS_BYTESWAP32_INTRINSIC
-    #define DRWAV_HAS_BYTESWAP64_INTRINSIC
-#elif defined(__clang__)
-    #if defined(__has_builtin)
-        #if __has_builtin(__builtin_bswap16)
-            #define DRWAV_HAS_BYTESWAP16_INTRINSIC
-        #endif
-        #if __has_builtin(__builtin_bswap32)
-            #define DRWAV_HAS_BYTESWAP32_INTRINSIC
-        #endif
-        #if __has_builtin(__builtin_bswap64)
-            #define DRWAV_HAS_BYTESWAP64_INTRINSIC
-        #endif
-    #endif
-#elif defined(__GNUC__)
-    #if ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))
-        #define DRWAV_HAS_BYTESWAP32_INTRINSIC
-        #define DRWAV_HAS_BYTESWAP64_INTRINSIC
-    #endif
-    #if ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8))
-        #define DRWAV_HAS_BYTESWAP16_INTRINSIC
-    #endif
-#endif
-
-DRWAV_API void drwav_version(drwav_uint32* pMajor, drwav_uint32* pMinor, drwav_uint32* pRevision)
-{
-    if (pMajor) {
-        *pMajor = DRWAV_VERSION_MAJOR;
-    }
-
-    if (pMinor) {
-        *pMinor = DRWAV_VERSION_MINOR;
-    }
-
-    if (pRevision) {
-        *pRevision = DRWAV_VERSION_REVISION;
-    }
-}
-
-DRWAV_API const char* drwav_version_string(void)
-{
-    return DRWAV_VERSION_STRING;
-}
-
-/*
-These limits are used for basic validation when initializing the decoder. If you exceed these limits, first of all: what on Earth are
-you doing?! (Let me know, I'd be curious!) Second, you can adjust these by #define-ing them before the dr_wav implementation.
-*/
-#ifndef DRWAV_MAX_SAMPLE_RATE
-#define DRWAV_MAX_SAMPLE_RATE       384000
-#endif
-#ifndef DRWAV_MAX_CHANNELS
-#define DRWAV_MAX_CHANNELS          256
-#endif
-#ifndef DRWAV_MAX_BITS_PER_SAMPLE
-#define DRWAV_MAX_BITS_PER_SAMPLE   64
-#endif
-
-static const drwav_uint8 drwavGUID_W64_RIFF[16] = {0x72,0x69,0x66,0x66, 0x2E,0x91, 0xCF,0x11, 0xA5,0xD6, 0x28,0xDB,0x04,0xC1,0x00,0x00};    /* 66666972-912E-11CF-A5D6-28DB04C10000 */
-static const drwav_uint8 drwavGUID_W64_WAVE[16] = {0x77,0x61,0x76,0x65, 0xF3,0xAC, 0xD3,0x11, 0x8C,0xD1, 0x00,0xC0,0x4F,0x8E,0xDB,0x8A};    /* 65766177-ACF3-11D3-8CD1-00C04F8EDB8A */
-/*static const drwav_uint8 drwavGUID_W64_JUNK[16] = {0x6A,0x75,0x6E,0x6B, 0xF3,0xAC, 0xD3,0x11, 0x8C,0xD1, 0x00,0xC0,0x4F,0x8E,0xDB,0x8A};*/    /* 6B6E756A-ACF3-11D3-8CD1-00C04F8EDB8A */
-static const drwav_uint8 drwavGUID_W64_FMT [16] = {0x66,0x6D,0x74,0x20, 0xF3,0xAC, 0xD3,0x11, 0x8C,0xD1, 0x00,0xC0,0x4F,0x8E,0xDB,0x8A};    /* 20746D66-ACF3-11D3-8CD1-00C04F8EDB8A */
-static const drwav_uint8 drwavGUID_W64_FACT[16] = {0x66,0x61,0x63,0x74, 0xF3,0xAC, 0xD3,0x11, 0x8C,0xD1, 0x00,0xC0,0x4F,0x8E,0xDB,0x8A};    /* 74636166-ACF3-11D3-8CD1-00C04F8EDB8A */
-static const drwav_uint8 drwavGUID_W64_DATA[16] = {0x64,0x61,0x74,0x61, 0xF3,0xAC, 0xD3,0x11, 0x8C,0xD1, 0x00,0xC0,0x4F,0x8E,0xDB,0x8A};    /* 61746164-ACF3-11D3-8CD1-00C04F8EDB8A */
-static const drwav_uint8 drwavGUID_W64_SMPL[16] = {0x73,0x6D,0x70,0x6C, 0xF3,0xAC, 0xD3,0x11, 0x8C,0xD1, 0x00,0xC0,0x4F,0x8E,0xDB,0x8A};    /* 6C706D73-ACF3-11D3-8CD1-00C04F8EDB8A */
-
-static DRWAV_INLINE drwav_bool32 drwav__guid_equal(const drwav_uint8 a[16], const drwav_uint8 b[16])
-{
-    int i;
-    for (i = 0; i < 16; i += 1) {
-        if (a[i] != b[i]) {
-            return DRWAV_FALSE;
-        }
-    }
-
-    return DRWAV_TRUE;
-}
-
-static DRWAV_INLINE drwav_bool32 drwav__fourcc_equal(const drwav_uint8* a, const char* b)
-{
-    return
-        a[0] == b[0] &&
-        a[1] == b[1] &&
-        a[2] == b[2] &&
-        a[3] == b[3];
-}
-
-
-
-static DRWAV_INLINE int drwav__is_little_endian(void)
-{
-#if defined(DRWAV_X86) || defined(DRWAV_X64)
-    return DRWAV_TRUE;
-#elif defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && __BYTE_ORDER == __LITTLE_ENDIAN
-    return DRWAV_TRUE;
-#else
-    int n = 1;
-    return (*(char*)&n) == 1;
-#endif
-}
-
-static DRWAV_INLINE drwav_uint16 drwav__bytes_to_u16(const drwav_uint8* data)
-{
-    return (data[0] << 0) | (data[1] << 8);
-}
-
-static DRWAV_INLINE drwav_int16 drwav__bytes_to_s16(const drwav_uint8* data)
-{
-    return (short)drwav__bytes_to_u16(data);
-}
-
-static DRWAV_INLINE drwav_uint32 drwav__bytes_to_u32(const drwav_uint8* data)
-{
-    return (data[0] << 0) | (data[1] << 8) | (data[2] << 16) | (data[3] << 24);
-}
-
-static DRWAV_INLINE drwav_int32 drwav__bytes_to_s32(const drwav_uint8* data)
-{
-    return (drwav_int32)drwav__bytes_to_u32(data);
-}
-
-static DRWAV_INLINE drwav_uint64 drwav__bytes_to_u64(const drwav_uint8* data)
-{
-    return
-        ((drwav_uint64)data[0] <<  0) | ((drwav_uint64)data[1] <<  8) | ((drwav_uint64)data[2] << 16) | ((drwav_uint64)data[3] << 24) |
-        ((drwav_uint64)data[4] << 32) | ((drwav_uint64)data[5] << 40) | ((drwav_uint64)data[6] << 48) | ((drwav_uint64)data[7] << 56);
-}
-
-static DRWAV_INLINE drwav_int64 drwav__bytes_to_s64(const drwav_uint8* data)
-{
-    return (drwav_int64)drwav__bytes_to_u64(data);
-}
-
-static DRWAV_INLINE void drwav__bytes_to_guid(const drwav_uint8* data, drwav_uint8* guid)
-{
-    int i;
-    for (i = 0; i < 16; ++i) {
-        guid[i] = data[i];
-    }
-}
-
-
-static DRWAV_INLINE drwav_uint16 drwav__bswap16(drwav_uint16 n)
-{
-#ifdef DRWAV_HAS_BYTESWAP16_INTRINSIC
-    #if defined(_MSC_VER)
-        return _byteswap_ushort(n);
-    #elif defined(__GNUC__) || defined(__clang__)
-        return __builtin_bswap16(n);
-    #else
-        #error "This compiler does not support the byte swap intrinsic."
-    #endif
-#else
-    return ((n & 0xFF00) >> 8) |
-           ((n & 0x00FF) << 8);
-#endif
-}
-
-static DRWAV_INLINE drwav_uint32 drwav__bswap32(drwav_uint32 n)
-{
-#ifdef DRWAV_HAS_BYTESWAP32_INTRINSIC
-    #if defined(_MSC_VER)
-        return _byteswap_ulong(n);
-    #elif defined(__GNUC__) || defined(__clang__)
-        #if defined(DRWAV_ARM) && (defined(__ARM_ARCH) && __ARM_ARCH >= 6) && !defined(DRWAV_64BIT)   /* <-- 64-bit inline assembly has not been tested, so disabling for now. */
-            /* Inline assembly optimized implementation for ARM. In my testing, GCC does not generate optimized code with __builtin_bswap32(). */
-            drwav_uint32 r;
-            __asm__ __volatile__ (
-            #if defined(DRWAV_64BIT)
-                "rev %w[out], %w[in]" : [out]"=r"(r) : [in]"r"(n)   /* <-- This is untested. If someone in the community could test this, that would be appreciated! */
-            #else
-                "rev %[out], %[in]" : [out]"=r"(r) : [in]"r"(n)
-            #endif
-            );
-            return r;
-        #else
-            return __builtin_bswap32(n);
-        #endif
-    #else
-        #error "This compiler does not support the byte swap intrinsic."
-    #endif
-#else
-    return ((n & 0xFF000000) >> 24) |
-           ((n & 0x00FF0000) >>  8) |
-           ((n & 0x0000FF00) <<  8) |
-           ((n & 0x000000FF) << 24);
-#endif
-}
-
-static DRWAV_INLINE drwav_uint64 drwav__bswap64(drwav_uint64 n)
-{
-#ifdef DRWAV_HAS_BYTESWAP64_INTRINSIC
-    #if defined(_MSC_VER)
-        return _byteswap_uint64(n);
-    #elif defined(__GNUC__) || defined(__clang__)
-        return __builtin_bswap64(n);
-    #else
-        #error "This compiler does not support the byte swap intrinsic."
-    #endif
-#else
-    /* Weird "<< 32" bitshift is required for C89 because it doesn't support 64-bit constants. Should be optimized out by a good compiler. */
-    return ((n & ((drwav_uint64)0xFF000000 << 32)) >> 56) |
-           ((n & ((drwav_uint64)0x00FF0000 << 32)) >> 40) |
-           ((n & ((drwav_uint64)0x0000FF00 << 32)) >> 24) |
-           ((n & ((drwav_uint64)0x000000FF << 32)) >>  8) |
-           ((n & ((drwav_uint64)0xFF000000      )) <<  8) |
-           ((n & ((drwav_uint64)0x00FF0000      )) << 24) |
-           ((n & ((drwav_uint64)0x0000FF00      )) << 40) |
-           ((n & ((drwav_uint64)0x000000FF      )) << 56);
-#endif
-}
-
-
-static DRWAV_INLINE drwav_int16 drwav__bswap_s16(drwav_int16 n)
-{
-    return (drwav_int16)drwav__bswap16((drwav_uint16)n);
-}
-
-static DRWAV_INLINE void drwav__bswap_samples_s16(drwav_int16* pSamples, drwav_uint64 sampleCount)
-{
-    drwav_uint64 iSample;
-    for (iSample = 0; iSample < sampleCount; iSample += 1) {
-        pSamples[iSample] = drwav__bswap_s16(pSamples[iSample]);
-    }
-}
-
-
-static DRWAV_INLINE void drwav__bswap_s24(drwav_uint8* p)
-{
-    drwav_uint8 t;
-    t = p[0];
-    p[0] = p[2];
-    p[2] = t;
-}
-
-static DRWAV_INLINE void drwav__bswap_samples_s24(drwav_uint8* pSamples, drwav_uint64 sampleCount)
-{
-    drwav_uint64 iSample;
-    for (iSample = 0; iSample < sampleCount; iSample += 1) {
-        drwav_uint8* pSample = pSamples + (iSample*3);
-        drwav__bswap_s24(pSample);
-    }
-}
-
-
-static DRWAV_INLINE drwav_int32 drwav__bswap_s32(drwav_int32 n)
-{
-    return (drwav_int32)drwav__bswap32((drwav_uint32)n);
-}
-
-static DRWAV_INLINE void drwav__bswap_samples_s32(drwav_int32* pSamples, drwav_uint64 sampleCount)
-{
-    drwav_uint64 iSample;
-    for (iSample = 0; iSample < sampleCount; iSample += 1) {
-        pSamples[iSample] = drwav__bswap_s32(pSamples[iSample]);
-    }
-}
-
-
-static DRWAV_INLINE float drwav__bswap_f32(float n)
-{
-    union {
-        drwav_uint32 i;
-        float f;
-    } x;
-    x.f = n;
-    x.i = drwav__bswap32(x.i);
-
-    return x.f;
-}
-
-static DRWAV_INLINE void drwav__bswap_samples_f32(float* pSamples, drwav_uint64 sampleCount)
-{
-    drwav_uint64 iSample;
-    for (iSample = 0; iSample < sampleCount; iSample += 1) {
-        pSamples[iSample] = drwav__bswap_f32(pSamples[iSample]);
-    }
-}
-
-
-static DRWAV_INLINE double drwav__bswap_f64(double n)
-{
-    union {
-        drwav_uint64 i;
-        double f;
-    } x;
-    x.f = n;
-    x.i = drwav__bswap64(x.i);
-
-    return x.f;
-}
-
-static DRWAV_INLINE void drwav__bswap_samples_f64(double* pSamples, drwav_uint64 sampleCount)
-{
-    drwav_uint64 iSample;
-    for (iSample = 0; iSample < sampleCount; iSample += 1) {
-        pSamples[iSample] = drwav__bswap_f64(pSamples[iSample]);
-    }
-}
-
-
-static DRWAV_INLINE void drwav__bswap_samples_pcm(void* pSamples, drwav_uint64 sampleCount, drwav_uint32 bytesPerSample)
-{
-    /* Assumes integer PCM. Floating point PCM is done in drwav__bswap_samples_ieee(). */
-    switch (bytesPerSample)
-    {
-        case 2: /* s16, s12 (loosely packed) */
-        {
-            drwav__bswap_samples_s16((drwav_int16*)pSamples, sampleCount);
-        } break;
-        case 3: /* s24 */
-        {
-            drwav__bswap_samples_s24((drwav_uint8*)pSamples, sampleCount);
-        } break;
-        case 4: /* s32 */
-        {
-            drwav__bswap_samples_s32((drwav_int32*)pSamples, sampleCount);
-        } break;
-        default:
-        {
-            /* Unsupported format. */
-            DRWAV_ASSERT(DRWAV_FALSE);
-        } break;
-    }
-}
-
-static DRWAV_INLINE void drwav__bswap_samples_ieee(void* pSamples, drwav_uint64 sampleCount, drwav_uint32 bytesPerSample)
-{
-    switch (bytesPerSample)
-    {
-    #if 0   /* Contributions welcome for f16 support. */
-        case 2: /* f16 */
-        {
-            drwav__bswap_samples_f16((drwav_float16*)pSamples, sampleCount);
-        } break;
-    #endif
-        case 4: /* f32 */
-        {
-            drwav__bswap_samples_f32((float*)pSamples, sampleCount);
-        } break;
-        case 8: /* f64 */
-        {
-            drwav__bswap_samples_f64((double*)pSamples, sampleCount);
-        } break;
-        default:
-        {
-            /* Unsupported format. */
-            DRWAV_ASSERT(DRWAV_FALSE);
-        } break;
-    }
-}
-
-static DRWAV_INLINE void drwav__bswap_samples(void* pSamples, drwav_uint64 sampleCount, drwav_uint32 bytesPerSample, drwav_uint16 format)
-{
-    switch (format)
-    {
-        case DR_WAVE_FORMAT_PCM:
-        {
-            drwav__bswap_samples_pcm(pSamples, sampleCount, bytesPerSample);
-        } break;
-
-        case DR_WAVE_FORMAT_IEEE_FLOAT:
-        {
-            drwav__bswap_samples_ieee(pSamples, sampleCount, bytesPerSample);
-        } break;
-
-        case DR_WAVE_FORMAT_ALAW:
-        case DR_WAVE_FORMAT_MULAW:
-        {
-            drwav__bswap_samples_s16((drwav_int16*)pSamples, sampleCount);
-        } break;
-
-        case DR_WAVE_FORMAT_ADPCM:
-        case DR_WAVE_FORMAT_DVI_ADPCM:
-        default:
-        {
-            /* Unsupported format. */
-            DRWAV_ASSERT(DRWAV_FALSE);
-        } break;
-    }
-}
-
-
-static void* drwav__malloc_default(size_t sz, void* pUserData)
-{
-    (void)pUserData;
-    return DRWAV_MALLOC(sz);
-}
-
-static void* drwav__realloc_default(void* p, size_t sz, void* pUserData)
-{
-    (void)pUserData;
-    return DRWAV_REALLOC(p, sz);
-}
-
-static void drwav__free_default(void* p, void* pUserData)
-{
-    (void)pUserData;
-    DRWAV_FREE(p);
-}
-
-
-static void* drwav__malloc_from_callbacks(size_t sz, const drwav_allocation_callbacks* pAllocationCallbacks)
-{
-    if (pAllocationCallbacks == NULL) {
-        return NULL;
-    }
-
-    if (pAllocationCallbacks->onMalloc != NULL) {
-        return pAllocationCallbacks->onMalloc(sz, pAllocationCallbacks->pUserData);
-    }
-
-    /* Try using realloc(). */
-    if (pAllocationCallbacks->onRealloc != NULL) {
-        return pAllocationCallbacks->onRealloc(NULL, sz, pAllocationCallbacks->pUserData);
-    }
-
-    return NULL;
-}
-
-static void* drwav__realloc_from_callbacks(void* p, size_t szNew, size_t szOld, const drwav_allocation_callbacks* pAllocationCallbacks)
-{
-    if (pAllocationCallbacks == NULL) {
-        return NULL;
-    }
-
-    if (pAllocationCallbacks->onRealloc != NULL) {
-        return pAllocationCallbacks->onRealloc(p, szNew, pAllocationCallbacks->pUserData);
-    }
-
-    /* Try emulating realloc() in terms of malloc()/free(). */
-    if (pAllocationCallbacks->onMalloc != NULL && pAllocationCallbacks->onFree != NULL) {
-        void* p2;
-
-        p2 = pAllocationCallbacks->onMalloc(szNew, pAllocationCallbacks->pUserData);
-        if (p2 == NULL) {
-            return NULL;
-        }
-
-        if (p != NULL) {
-            DRWAV_COPY_MEMORY(p2, p, szOld);
-            pAllocationCallbacks->onFree(p, pAllocationCallbacks->pUserData);
-        }
-
-        return p2;
-    }
-
-    return NULL;
-}
-
-static void drwav__free_from_callbacks(void* p, const drwav_allocation_callbacks* pAllocationCallbacks)
-{
-    if (p == NULL || pAllocationCallbacks == NULL) {
-        return;
-    }
-
-    if (pAllocationCallbacks->onFree != NULL) {
-        pAllocationCallbacks->onFree(p, pAllocationCallbacks->pUserData);
-    }
-}
-
-
-static drwav_allocation_callbacks drwav_copy_allocation_callbacks_or_defaults(const drwav_allocation_callbacks* pAllocationCallbacks)
-{
-    if (pAllocationCallbacks != NULL) {
-        /* Copy. */
-        return *pAllocationCallbacks;
-    } else {
-        /* Defaults. */
-        drwav_allocation_callbacks allocationCallbacks;
-        allocationCallbacks.pUserData = NULL;
-        allocationCallbacks.onMalloc  = drwav__malloc_default;
-        allocationCallbacks.onRealloc = drwav__realloc_default;
-        allocationCallbacks.onFree    = drwav__free_default;
-        return allocationCallbacks;
-    }
-}
-
-
-static DRWAV_INLINE drwav_bool32 drwav__is_compressed_format_tag(drwav_uint16 formatTag)
-{
-    return
-        formatTag == DR_WAVE_FORMAT_ADPCM ||
-        formatTag == DR_WAVE_FORMAT_DVI_ADPCM;
-}
-
-static unsigned int drwav__chunk_padding_size_riff(drwav_uint64 chunkSize)
-{
-    return (unsigned int)(chunkSize % 2);
-}
-
-static unsigned int drwav__chunk_padding_size_w64(drwav_uint64 chunkSize)
-{
-    return (unsigned int)(chunkSize % 8);
-}
-
-static drwav_uint64 drwav_read_pcm_frames_s16__msadpcm(drwav* pWav, drwav_uint64 samplesToRead, drwav_int16* pBufferOut);
-static drwav_uint64 drwav_read_pcm_frames_s16__ima(drwav* pWav, drwav_uint64 samplesToRead, drwav_int16* pBufferOut);
-static drwav_bool32 drwav_init_write__internal(drwav* pWav, const drwav_data_format* pFormat, drwav_uint64 totalSampleCount);
-
-static drwav_result drwav__read_chunk_header(drwav_read_proc onRead, void* pUserData, drwav_container container, drwav_uint64* pRunningBytesReadOut, drwav_chunk_header* pHeaderOut)
-{
-    if (container == drwav_container_riff || container == drwav_container_rf64) {
-        drwav_uint8 sizeInBytes[4];
-
-        if (onRead(pUserData, pHeaderOut->id.fourcc, 4) != 4) {
-            return DRWAV_AT_END;
-        }
-
-        if (onRead(pUserData, sizeInBytes, 4) != 4) {
-            return DRWAV_INVALID_FILE;
-        }
-
-        pHeaderOut->sizeInBytes = drwav__bytes_to_u32(sizeInBytes);
-        pHeaderOut->paddingSize = drwav__chunk_padding_size_riff(pHeaderOut->sizeInBytes);
-        *pRunningBytesReadOut += 8;
-    } else {
-        drwav_uint8 sizeInBytes[8];
-
-        if (onRead(pUserData, pHeaderOut->id.guid, 16) != 16) {
-            return DRWAV_AT_END;
-        }
-
-        if (onRead(pUserData, sizeInBytes, 8) != 8) {
-            return DRWAV_INVALID_FILE;
-        }
-
-        pHeaderOut->sizeInBytes = drwav__bytes_to_u64(sizeInBytes) - 24;    /* <-- Subtract 24 because w64 includes the size of the header. */
-        pHeaderOut->paddingSize = drwav__chunk_padding_size_w64(pHeaderOut->sizeInBytes);
-        *pRunningBytesReadOut += 24;
-    }
-
-    return DRWAV_SUCCESS;
-}
-
-static drwav_bool32 drwav__seek_forward(drwav_seek_proc onSeek, drwav_uint64 offset, void* pUserData)
-{
-    drwav_uint64 bytesRemainingToSeek = offset;
-    while (bytesRemainingToSeek > 0) {
-        if (bytesRemainingToSeek > 0x7FFFFFFF) {
-            if (!onSeek(pUserData, 0x7FFFFFFF, drwav_seek_origin_current)) {
-                return DRWAV_FALSE;
-            }
-            bytesRemainingToSeek -= 0x7FFFFFFF;
-        } else {
-            if (!onSeek(pUserData, (int)bytesRemainingToSeek, drwav_seek_origin_current)) {
-                return DRWAV_FALSE;
-            }
-            bytesRemainingToSeek = 0;
-        }
-    }
-
-    return DRWAV_TRUE;
-}
-
-static drwav_bool32 drwav__seek_from_start(drwav_seek_proc onSeek, drwav_uint64 offset, void* pUserData)
-{
-    if (offset <= 0x7FFFFFFF) {
-        return onSeek(pUserData, (int)offset, drwav_seek_origin_start);
-    }
-
-    /* Larger than 32-bit seek. */
-    if (!onSeek(pUserData, 0x7FFFFFFF, drwav_seek_origin_start)) {
-        return DRWAV_FALSE;
-    }
-    offset -= 0x7FFFFFFF;
-
-    for (;;) {
-        if (offset <= 0x7FFFFFFF) {
-            return onSeek(pUserData, (int)offset, drwav_seek_origin_current);
-        }
-
-        if (!onSeek(pUserData, 0x7FFFFFFF, drwav_seek_origin_current)) {
-            return DRWAV_FALSE;
-        }
-        offset -= 0x7FFFFFFF;
-    }
-
-    /* Should never get here. */
-    /*return DRWAV_TRUE; */
-}
-
-
-static drwav_bool32 drwav__read_fmt(drwav_read_proc onRead, drwav_seek_proc onSeek, void* pUserData, drwav_container container, drwav_uint64* pRunningBytesReadOut, drwav_fmt* fmtOut)
-{
-    drwav_chunk_header header;
-    drwav_uint8 fmt[16];
-
-    if (drwav__read_chunk_header(onRead, pUserData, container, pRunningBytesReadOut, &header) != DRWAV_SUCCESS) {
-        return DRWAV_FALSE;
-    }
-
-
-    /* Skip non-fmt chunks. */
-    while (((container == drwav_container_riff || container == drwav_container_rf64) && !drwav__fourcc_equal(header.id.fourcc, "fmt ")) || (container == drwav_container_w64 && !drwav__guid_equal(header.id.guid, drwavGUID_W64_FMT))) {
-        if (!drwav__seek_forward(onSeek, header.sizeInBytes + header.paddingSize, pUserData)) {
-            return DRWAV_FALSE;
-        }
-        *pRunningBytesReadOut += header.sizeInBytes + header.paddingSize;
-
-        /* Try the next header. */
-        if (drwav__read_chunk_header(onRead, pUserData, container, pRunningBytesReadOut, &header) != DRWAV_SUCCESS) {
-            return DRWAV_FALSE;
-        }
-    }
-
-
-    /* Validation. */
-    if (container == drwav_container_riff || container == drwav_container_rf64) {
-        if (!drwav__fourcc_equal(header.id.fourcc, "fmt ")) {
-            return DRWAV_FALSE;
-        }
-    } else {
-        if (!drwav__guid_equal(header.id.guid, drwavGUID_W64_FMT)) {
-            return DRWAV_FALSE;
-        }
-    }
-
-
-    if (onRead(pUserData, fmt, sizeof(fmt)) != sizeof(fmt)) {
-        return DRWAV_FALSE;
-    }
-    *pRunningBytesReadOut += sizeof(fmt);
-
-    fmtOut->formatTag      = drwav__bytes_to_u16(fmt + 0);
-    fmtOut->channels       = drwav__bytes_to_u16(fmt + 2);
-    fmtOut->sampleRate     = drwav__bytes_to_u32(fmt + 4);
-    fmtOut->avgBytesPerSec = drwav__bytes_to_u32(fmt + 8);
-    fmtOut->blockAlign     = drwav__bytes_to_u16(fmt + 12);
-    fmtOut->bitsPerSample  = drwav__bytes_to_u16(fmt + 14);
-
-    fmtOut->extendedSize       = 0;
-    fmtOut->validBitsPerSample = 0;
-    fmtOut->channelMask        = 0;
-    memset(fmtOut->subFormat, 0, sizeof(fmtOut->subFormat));
-
-    if (header.sizeInBytes > 16) {
-        drwav_uint8 fmt_cbSize[2];
-        int bytesReadSoFar = 0;
-
-        if (onRead(pUserData, fmt_cbSize, sizeof(fmt_cbSize)) != sizeof(fmt_cbSize)) {
-            return DRWAV_FALSE;    /* Expecting more data. */
-        }
-        *pRunningBytesReadOut += sizeof(fmt_cbSize);
-
-        bytesReadSoFar = 18;
-
-        fmtOut->extendedSize = drwav__bytes_to_u16(fmt_cbSize);
-        if (fmtOut->extendedSize > 0) {
-            /* Simple validation. */
-            if (fmtOut->formatTag == DR_WAVE_FORMAT_EXTENSIBLE) {
-                if (fmtOut->extendedSize != 22) {
-                    return DRWAV_FALSE;
-                }
-            }
-
-            if (fmtOut->formatTag == DR_WAVE_FORMAT_EXTENSIBLE) {
-                drwav_uint8 fmtext[22];
-                if (onRead(pUserData, fmtext, fmtOut->extendedSize) != fmtOut->extendedSize) {
-                    return DRWAV_FALSE;    /* Expecting more data. */
-                }
-
-                fmtOut->validBitsPerSample = drwav__bytes_to_u16(fmtext + 0);
-                fmtOut->channelMask        = drwav__bytes_to_u32(fmtext + 2);
-                drwav__bytes_to_guid(fmtext + 6, fmtOut->subFormat);
-            } else {
-                if (!onSeek(pUserData, fmtOut->extendedSize, drwav_seek_origin_current)) {
-                    return DRWAV_FALSE;
-                }
-            }
-            *pRunningBytesReadOut += fmtOut->extendedSize;
-
-            bytesReadSoFar += fmtOut->extendedSize;
-        }
-
-        /* Seek past any leftover bytes. For w64 the leftover will be defined based on the chunk size. */
-        if (!onSeek(pUserData, (int)(header.sizeInBytes - bytesReadSoFar), drwav_seek_origin_current)) {
-            return DRWAV_FALSE;
-        }
-        *pRunningBytesReadOut += (header.sizeInBytes - bytesReadSoFar);
-    }
-
-    if (header.paddingSize > 0) {
-        if (!onSeek(pUserData, header.paddingSize, drwav_seek_origin_current)) {
-            return DRWAV_FALSE;
-        }
-        *pRunningBytesReadOut += header.paddingSize;
-    }
-
-    return DRWAV_TRUE;
-}
-
-
-static size_t drwav__on_read(drwav_read_proc onRead, void* pUserData, void* pBufferOut, size_t bytesToRead, drwav_uint64* pCursor)
-{
-    size_t bytesRead;
-
-    DRWAV_ASSERT(onRead != NULL);
-    DRWAV_ASSERT(pCursor != NULL);
-
-    bytesRead = onRead(pUserData, pBufferOut, bytesToRead);
-    *pCursor += bytesRead;
-    return bytesRead;
-}
-
-#if 0
-static drwav_bool32 drwav__on_seek(drwav_seek_proc onSeek, void* pUserData, int offset, drwav_seek_origin origin, drwav_uint64* pCursor)
-{
-    DRWAV_ASSERT(onSeek != NULL);
-    DRWAV_ASSERT(pCursor != NULL);
-
-    if (!onSeek(pUserData, offset, origin)) {
-        return DRWAV_FALSE;
-    }
-
-    if (origin == drwav_seek_origin_start) {
-        *pCursor = offset;
-    } else {
-        *pCursor += offset;
-    }
-
-    return DRWAV_TRUE;
-}
-#endif
-
-
-
-static drwav_uint32 drwav_get_bytes_per_pcm_frame(drwav* pWav)
-{
-    /*
-    The bytes per frame is a bit ambiguous. It can be either be based on the bits per sample, or the block align. The way I'm doing it here
-    is that if the bits per sample is a multiple of 8, use floor(bitsPerSample*channels/8), otherwise fall back to the block align.
-    */
-    if ((pWav->bitsPerSample & 0x7) == 0) {
-        /* Bits per sample is a multiple of 8. */
-        return (pWav->bitsPerSample * pWav->fmt.channels) >> 3;
-    } else {
-        return pWav->fmt.blockAlign;
-    }
-}
-
-DRWAV_API drwav_uint16 drwav_fmt_get_format(const drwav_fmt* pFMT)
-{
-    if (pFMT == NULL) {
-        return 0;
-    }
-
-    if (pFMT->formatTag != DR_WAVE_FORMAT_EXTENSIBLE) {
-        return pFMT->formatTag;
-    } else {
-        return drwav__bytes_to_u16(pFMT->subFormat);    /* Only the first two bytes are required. */
-    }
-}
-
-static drwav_bool32 drwav_preinit(drwav* pWav, drwav_read_proc onRead, drwav_seek_proc onSeek, void* pReadSeekUserData, const drwav_allocation_callbacks* pAllocationCallbacks)
-{
-    if (pWav == NULL || onRead == NULL || onSeek == NULL) {
-        return DRWAV_FALSE;
-    }
-
-    DRWAV_ZERO_MEMORY(pWav, sizeof(*pWav));
-    pWav->onRead    = onRead;
-    pWav->onSeek    = onSeek;
-    pWav->pUserData = pReadSeekUserData;
-    pWav->allocationCallbacks = drwav_copy_allocation_callbacks_or_defaults(pAllocationCallbacks);
-
-    if (pWav->allocationCallbacks.onFree == NULL || (pWav->allocationCallbacks.onMalloc == NULL && pWav->allocationCallbacks.onRealloc == NULL)) {
-        return DRWAV_FALSE;    /* Invalid allocation callbacks. */
-    }
-
-    return DRWAV_TRUE;
-}
-
-static drwav_bool32 drwav_init__internal(drwav* pWav, drwav_chunk_proc onChunk, void* pChunkUserData, drwav_uint32 flags)
-{
-    /* This function assumes drwav_preinit() has been called beforehand. */
-
-    drwav_uint64 cursor;    /* <-- Keeps track of the byte position so we can seek to specific locations. */
-    drwav_bool32 sequential;
-    drwav_uint8 riff[4];
-    drwav_fmt fmt;
-    unsigned short translatedFormatTag;
-    drwav_bool32 foundDataChunk;
-    drwav_uint64 dataChunkSize = 0; /* <-- Important! Don't explicitly set this to 0 anywhere else. Calculation of the size of the data chunk is performed in different paths depending on the container. */
-    drwav_uint64 sampleCountFromFactChunk = 0;  /* Same as dataChunkSize - make sure this is the only place this is initialized to 0. */
-    drwav_uint64 chunkSize;
-
-    cursor = 0;
-    sequential = (flags & DRWAV_SEQUENTIAL) != 0;
-
-    /* The first 4 bytes should be the RIFF identifier. */
-    if (drwav__on_read(pWav->onRead, pWav->pUserData, riff, sizeof(riff), &cursor) != sizeof(riff)) {
-        return DRWAV_FALSE;
-    }
-
-    /*
-    The first 4 bytes can be used to identify the container. For RIFF files it will start with "RIFF" and for
-    w64 it will start with "riff".
-    */
-    if (drwav__fourcc_equal(riff, "RIFF")) {
-        pWav->container = drwav_container_riff;
-    } else if (drwav__fourcc_equal(riff, "riff")) {
-        int i;
-        drwav_uint8 riff2[12];
-
-        pWav->container = drwav_container_w64;
-
-        /* Check the rest of the GUID for validity. */
-        if (drwav__on_read(pWav->onRead, pWav->pUserData, riff2, sizeof(riff2), &cursor) != sizeof(riff2)) {
-            return DRWAV_FALSE;
-        }
-
-        for (i = 0; i < 12; ++i) {
-            if (riff2[i] != drwavGUID_W64_RIFF[i+4]) {
-                return DRWAV_FALSE;
-            }
-        }
-    } else if (drwav__fourcc_equal(riff, "RF64")) {
-        pWav->container = drwav_container_rf64;
-    } else {
-        return DRWAV_FALSE;   /* Unknown or unsupported container. */
-    }
-
-
-    if (pWav->container == drwav_container_riff || pWav->container == drwav_container_rf64) {
-        drwav_uint8 chunkSizeBytes[4];
-        drwav_uint8 wave[4];
-
-        /* RIFF/WAVE */
-        if (drwav__on_read(pWav->onRead, pWav->pUserData, chunkSizeBytes, sizeof(chunkSizeBytes), &cursor) != sizeof(chunkSizeBytes)) {
-            return DRWAV_FALSE;
-        }
-
-        if (pWav->container == drwav_container_riff) {
-            if (drwav__bytes_to_u32(chunkSizeBytes) < 36) {
-                return DRWAV_FALSE;    /* Chunk size should always be at least 36 bytes. */
-            }
-        } else {
-            if (drwav__bytes_to_u32(chunkSizeBytes) != 0xFFFFFFFF) {
-                return DRWAV_FALSE;    /* Chunk size should always be set to -1/0xFFFFFFFF for RF64. The actual size is retrieved later. */
-            }
-        }
-
-        if (drwav__on_read(pWav->onRead, pWav->pUserData, wave, sizeof(wave), &cursor) != sizeof(wave)) {
-            return DRWAV_FALSE;
-        }
-
-        if (!drwav__fourcc_equal(wave, "WAVE")) {
-            return DRWAV_FALSE;    /* Expecting "WAVE". */
-        }
-    } else {
-        drwav_uint8 chunkSizeBytes[8];
-        drwav_uint8 wave[16];
-
-        /* W64 */
-        if (drwav__on_read(pWav->onRead, pWav->pUserData, chunkSizeBytes, sizeof(chunkSizeBytes), &cursor) != sizeof(chunkSizeBytes)) {
-            return DRWAV_FALSE;
-        }
-
-        if (drwav__bytes_to_u64(chunkSizeBytes) < 80) {
-            return DRWAV_FALSE;
-        }
-
-        if (drwav__on_read(pWav->onRead, pWav->pUserData, wave, sizeof(wave), &cursor) != sizeof(wave)) {
-            return DRWAV_FALSE;
-        }
-
-        if (!drwav__guid_equal(wave, drwavGUID_W64_WAVE)) {
-            return DRWAV_FALSE;
-        }
-    }
-
-
-    /* For RF64, the "ds64" chunk must come next, before the "fmt " chunk. */
-    if (pWav->container == drwav_container_rf64) {
-        drwav_uint8 sizeBytes[8];
-        drwav_uint64 bytesRemainingInChunk;
-        drwav_chunk_header header;
-        drwav_result result = drwav__read_chunk_header(pWav->onRead, pWav->pUserData, pWav->container, &cursor, &header);
-        if (result != DRWAV_SUCCESS) {
-            return DRWAV_FALSE;
-        }
-
-        if (!drwav__fourcc_equal(header.id.fourcc, "ds64")) {
-            return DRWAV_FALSE; /* Expecting "ds64". */
-        }
-
-        bytesRemainingInChunk = header.sizeInBytes + header.paddingSize;
-
-        /* We don't care about the size of the RIFF chunk - skip it. */
-        if (!drwav__seek_forward(pWav->onSeek, 8, pWav->pUserData)) {
-            return DRWAV_FALSE;
-        }
-        bytesRemainingInChunk -= 8;
-        cursor += 8;
-
-
-        /* Next 8 bytes is the size of the "data" chunk. */
-        if (drwav__on_read(pWav->onRead, pWav->pUserData, sizeBytes, sizeof(sizeBytes), &cursor) != sizeof(sizeBytes)) {
-            return DRWAV_FALSE;
-        }
-        bytesRemainingInChunk -= 8;
-        dataChunkSize = drwav__bytes_to_u64(sizeBytes);
-
-
-        /* Next 8 bytes is the same count which we would usually derived from the FACT chunk if it was available. */
-        if (drwav__on_read(pWav->onRead, pWav->pUserData, sizeBytes, sizeof(sizeBytes), &cursor) != sizeof(sizeBytes)) {
-            return DRWAV_FALSE;
-        }
-        bytesRemainingInChunk -= 8;
-        sampleCountFromFactChunk = drwav__bytes_to_u64(sizeBytes);
-
-
-        /* Skip over everything else. */
-        if (!drwav__seek_forward(pWav->onSeek, bytesRemainingInChunk, pWav->pUserData)) {
-            return DRWAV_FALSE;
-        }
-        cursor += bytesRemainingInChunk;
-    }
-
-
-    /* The next bytes should be the "fmt " chunk. */
-    if (!drwav__read_fmt(pWav->onRead, pWav->onSeek, pWav->pUserData, pWav->container, &cursor, &fmt)) {
-        return DRWAV_FALSE;    /* Failed to read the "fmt " chunk. */
-    }
-
-    /* Basic validation. */
-    if ((fmt.sampleRate    == 0 || fmt.sampleRate    > DRWAV_MAX_SAMPLE_RATE)     ||
-        (fmt.channels      == 0 || fmt.channels      > DRWAV_MAX_CHANNELS)        ||
-        (fmt.bitsPerSample == 0 || fmt.bitsPerSample > DRWAV_MAX_BITS_PER_SAMPLE) ||
-        fmt.blockAlign == 0) {
-        return DRWAV_FALSE; /* Probably an invalid WAV file. */
-    }
-
-
-    /* Translate the internal format. */
-    translatedFormatTag = fmt.formatTag;
-    if (translatedFormatTag == DR_WAVE_FORMAT_EXTENSIBLE) {
-        translatedFormatTag = drwav__bytes_to_u16(fmt.subFormat + 0);
-    }
-
-
-    /*
-    We need to enumerate over each chunk for two reasons:
-      1) The "data" chunk may not be the next one
-      2) We may want to report each chunk back to the client
-    
-    In order to correctly report each chunk back to the client we will need to keep looping until the end of the file.
-    */
-    foundDataChunk = DRWAV_FALSE;
-
-    /* The next chunk we care about is the "data" chunk. This is not necessarily the next chunk so we'll need to loop. */
-    for (;;)
-    {
-        drwav_chunk_header header;
-        drwav_result result = drwav__read_chunk_header(pWav->onRead, pWav->pUserData, pWav->container, &cursor, &header);
-        if (result != DRWAV_SUCCESS) {
-            if (!foundDataChunk) {
-                return DRWAV_FALSE;
-            } else {
-                break;  /* Probably at the end of the file. Get out of the loop. */
-            }
-        }
-
-        /* Tell the client about this chunk. */
-        if (!sequential && onChunk != NULL) {
-            drwav_uint64 callbackBytesRead = onChunk(pChunkUserData, pWav->onRead, pWav->onSeek, pWav->pUserData, &header, pWav->container, &fmt);
-
-            /*
-            dr_wav may need to read the contents of the chunk, so we now need to seek back to the position before
-            we called the callback.
-            */
-            if (callbackBytesRead > 0) {
-                if (!drwav__seek_from_start(pWav->onSeek, cursor, pWav->pUserData)) {
-                    return DRWAV_FALSE;
-                }
-            }
-        }
-        
-
-        if (!foundDataChunk) {
-            pWav->dataChunkDataPos = cursor;
-        }
-
-        chunkSize = header.sizeInBytes;
-        if (pWav->container == drwav_container_riff || pWav->container == drwav_container_rf64) {
-            if (drwav__fourcc_equal(header.id.fourcc, "data")) {
-                foundDataChunk = DRWAV_TRUE;
-                if (pWav->container != drwav_container_rf64) {  /* The data chunk size for RF64 will always be set to 0xFFFFFFFF here. It was set to it's true value earlier. */
-                    dataChunkSize = chunkSize;
-                }
-            }
-        } else {
-            if (drwav__guid_equal(header.id.guid, drwavGUID_W64_DATA)) {
-                foundDataChunk = DRWAV_TRUE;
-                dataChunkSize = chunkSize;
-            }
-        }
-
-        /*
-        If at this point we have found the data chunk and we're running in sequential mode, we need to break out of this loop. The reason for
-        this is that we would otherwise require a backwards seek which sequential mode forbids.
-        */
-        if (foundDataChunk && sequential) {
-            break;
-        }
-
-        /* Optional. Get the total sample count from the FACT chunk. This is useful for compressed formats. */
-        if (pWav->container == drwav_container_riff) {
-            if (drwav__fourcc_equal(header.id.fourcc, "fact")) {
-                drwav_uint32 sampleCount;
-                if (drwav__on_read(pWav->onRead, pWav->pUserData, &sampleCount, 4, &cursor) != 4) {
-                    return DRWAV_FALSE;
-                }
-                chunkSize -= 4;
-
-                if (!foundDataChunk) {
-                    pWav->dataChunkDataPos = cursor;
-                }
-
-                /*
-                The sample count in the "fact" chunk is either unreliable, or I'm not understanding it properly. For now I am only enabling this
-                for Microsoft ADPCM formats.
-                */
-                if (pWav->translatedFormatTag == DR_WAVE_FORMAT_ADPCM) {
-                    sampleCountFromFactChunk = sampleCount;
-                } else {
-                    sampleCountFromFactChunk = 0;
-                }
-            }
-        } else if (pWav->container == drwav_container_w64) {
-            if (drwav__guid_equal(header.id.guid, drwavGUID_W64_FACT)) {
-                if (drwav__on_read(pWav->onRead, pWav->pUserData, &sampleCountFromFactChunk, 8, &cursor) != 8) {
-                    return DRWAV_FALSE;
-                }
-                chunkSize -= 8;
-
-                if (!foundDataChunk) {
-                    pWav->dataChunkDataPos = cursor;
-                }
-            }
-        } else if (pWav->container == drwav_container_rf64) {
-            /* We retrieved the sample count from the ds64 chunk earlier so no need to do that here. */
-        }
-
-        /* "smpl" chunk. */
-        if (pWav->container == drwav_container_riff || pWav->container == drwav_container_rf64) {
-            if (drwav__fourcc_equal(header.id.fourcc, "smpl")) {
-                drwav_uint8 smplHeaderData[36];    /* 36 = size of the smpl header section, not including the loop data. */
-                if (chunkSize >= sizeof(smplHeaderData)) {
-                    drwav_uint64 bytesJustRead = drwav__on_read(pWav->onRead, pWav->pUserData, smplHeaderData, sizeof(smplHeaderData), &cursor);
-                    chunkSize -= bytesJustRead;
-
-                    if (bytesJustRead == sizeof(smplHeaderData)) {
-                        drwav_uint32 iLoop;
-
-                        pWav->smpl.manufacturer      = drwav__bytes_to_u32(smplHeaderData+0);
-                        pWav->smpl.product           = drwav__bytes_to_u32(smplHeaderData+4);
-                        pWav->smpl.samplePeriod      = drwav__bytes_to_u32(smplHeaderData+8);
-                        pWav->smpl.midiUnityNotes    = drwav__bytes_to_u32(smplHeaderData+12);
-                        pWav->smpl.midiPitchFraction = drwav__bytes_to_u32(smplHeaderData+16);
-                        pWav->smpl.smpteFormat       = drwav__bytes_to_u32(smplHeaderData+20);
-                        pWav->smpl.smpteOffset       = drwav__bytes_to_u32(smplHeaderData+24);
-                        pWav->smpl.numSampleLoops    = drwav__bytes_to_u32(smplHeaderData+28);
-                        pWav->smpl.samplerData       = drwav__bytes_to_u32(smplHeaderData+32);
-
-                        for (iLoop = 0; iLoop < pWav->smpl.numSampleLoops && iLoop < drwav_countof(pWav->smpl.loops); ++iLoop) {
-                            drwav_uint8 smplLoopData[24];  /* 24 = size of a loop section in the smpl chunk. */
-                            bytesJustRead = drwav__on_read(pWav->onRead, pWav->pUserData, smplLoopData, sizeof(smplLoopData), &cursor);
-                            chunkSize -= bytesJustRead;
-
-                            if (bytesJustRead == sizeof(smplLoopData)) {
-                                pWav->smpl.loops[iLoop].cuePointId = drwav__bytes_to_u32(smplLoopData+0);
-                                pWav->smpl.loops[iLoop].type       = drwav__bytes_to_u32(smplLoopData+4);
-                                pWav->smpl.loops[iLoop].start      = drwav__bytes_to_u32(smplLoopData+8);
-                                pWav->smpl.loops[iLoop].end        = drwav__bytes_to_u32(smplLoopData+12);
-                                pWav->smpl.loops[iLoop].fraction   = drwav__bytes_to_u32(smplLoopData+16);
-                                pWav->smpl.loops[iLoop].playCount  = drwav__bytes_to_u32(smplLoopData+20);
-                            } else {
-                                break;  /* Break from the smpl loop for loop. */
-                            }
-                        }
-                    }
-                } else {
-                    /* Looks like invalid data. Ignore the chunk. */
-                }
-            }
-        } else {
-            if (drwav__guid_equal(header.id.guid, drwavGUID_W64_SMPL)) {
-                /*
-                This path will be hit when a W64 WAV file contains a smpl chunk. I don't have a sample file to test this path, so a contribution
-                is welcome to add support for this.
-                */
-            }
-        }
-
-        /* Make sure we seek past the padding. */
-        chunkSize += header.paddingSize;
-        if (!drwav__seek_forward(pWav->onSeek, chunkSize, pWav->pUserData)) {
-            break;
-        }
-        cursor += chunkSize;
-
-        if (!foundDataChunk) {
-            pWav->dataChunkDataPos = cursor;
-        }
-    }
-
-    /* If we haven't found a data chunk, return an error. */
-    if (!foundDataChunk) {
-        return DRWAV_FALSE;
-    }
-
-    /* We may have moved passed the data chunk. If so we need to move back. If running in sequential mode we can assume we are already sitting on the data chunk. */
-    if (!sequential) {
-        if (!drwav__seek_from_start(pWav->onSeek, pWav->dataChunkDataPos, pWav->pUserData)) {
-            return DRWAV_FALSE;
-        }
-        cursor = pWav->dataChunkDataPos;
-    }
-    
-
-    /* At this point we should be sitting on the first byte of the raw audio data. */
-
-    pWav->fmt                 = fmt;
-    pWav->sampleRate          = fmt.sampleRate;
-    pWav->channels            = fmt.channels;
-    pWav->bitsPerSample       = fmt.bitsPerSample;
-    pWav->bytesRemaining      = dataChunkSize;
-    pWav->translatedFormatTag = translatedFormatTag;
-    pWav->dataChunkDataSize   = dataChunkSize;
-
-    if (sampleCountFromFactChunk != 0) {
-        pWav->totalPCMFrameCount = sampleCountFromFactChunk;
-    } else {
-        pWav->totalPCMFrameCount = dataChunkSize / drwav_get_bytes_per_pcm_frame(pWav);
-
-        if (pWav->translatedFormatTag == DR_WAVE_FORMAT_ADPCM) {
-            drwav_uint64 totalBlockHeaderSizeInBytes;
-            drwav_uint64 blockCount = dataChunkSize / fmt.blockAlign;
-
-            /* Make sure any trailing partial block is accounted for. */
-            if ((blockCount * fmt.blockAlign) < dataChunkSize) {
-                blockCount += 1;
-            }
-
-            /* We decode two samples per byte. There will be blockCount headers in the data chunk. This is enough to know how to calculate the total PCM frame count. */
-            totalBlockHeaderSizeInBytes = blockCount * (6*fmt.channels);
-            pWav->totalPCMFrameCount = ((dataChunkSize - totalBlockHeaderSizeInBytes) * 2) / fmt.channels;
-        }
-        if (pWav->translatedFormatTag == DR_WAVE_FORMAT_DVI_ADPCM) {
-            drwav_uint64 totalBlockHeaderSizeInBytes;
-            drwav_uint64 blockCount = dataChunkSize / fmt.blockAlign;
-
-            /* Make sure any trailing partial block is accounted for. */
-            if ((blockCount * fmt.blockAlign) < dataChunkSize) {
-                blockCount += 1;
-            }
-
-            /* We decode two samples per byte. There will be blockCount headers in the data chunk. This is enough to know how to calculate the total PCM frame count. */
-            totalBlockHeaderSizeInBytes = blockCount * (4*fmt.channels);
-            pWav->totalPCMFrameCount = ((dataChunkSize - totalBlockHeaderSizeInBytes) * 2) / fmt.channels;
-
-            /* The header includes a decoded sample for each channel which acts as the initial predictor sample. */
-            pWav->totalPCMFrameCount += blockCount;
-        }
-    }
-
-    /* Some formats only support a certain number of channels. */
-    if (pWav->translatedFormatTag == DR_WAVE_FORMAT_ADPCM || pWav->translatedFormatTag == DR_WAVE_FORMAT_DVI_ADPCM) {
-        if (pWav->channels > 2) {
-            return DRWAV_FALSE;
-        }
-    }
-
-#ifdef DR_WAV_LIBSNDFILE_COMPAT
-    /*
-    I use libsndfile as a benchmark for testing, however in the version I'm using (from the Windows installer on the libsndfile website),
-    it appears the total sample count libsndfile uses for MS-ADPCM is incorrect. It would seem they are computing the total sample count
-    from the number of blocks, however this results in the inclusion of extra silent samples at the end of the last block. The correct
-    way to know the total sample count is to inspect the "fact" chunk, which should always be present for compressed formats, and should
-    always include the sample count. This little block of code below is only used to emulate the libsndfile logic so I can properly run my
-    correctness tests against libsndfile, and is disabled by default.
-    */
-    if (pWav->translatedFormatTag == DR_WAVE_FORMAT_ADPCM) {
-        drwav_uint64 blockCount = dataChunkSize / fmt.blockAlign;
-        pWav->totalPCMFrameCount = (((blockCount * (fmt.blockAlign - (6*pWav->channels))) * 2)) / fmt.channels;  /* x2 because two samples per byte. */
-    }
-    if (pWav->translatedFormatTag == DR_WAVE_FORMAT_DVI_ADPCM) {
-        drwav_uint64 blockCount = dataChunkSize / fmt.blockAlign;
-        pWav->totalPCMFrameCount = (((blockCount * (fmt.blockAlign - (4*pWav->channels))) * 2) + (blockCount * pWav->channels)) / fmt.channels;
-    }
-#endif
-
-    return DRWAV_TRUE;
-}
-
-DRWAV_API drwav_bool32 drwav_init(drwav* pWav, drwav_read_proc onRead, drwav_seek_proc onSeek, void* pUserData, const drwav_allocation_callbacks* pAllocationCallbacks)
-{
-    return drwav_init_ex(pWav, onRead, onSeek, NULL, pUserData, NULL, 0, pAllocationCallbacks);
-}
-
-DRWAV_API drwav_bool32 drwav_init_ex(drwav* pWav, drwav_read_proc onRead, drwav_seek_proc onSeek, drwav_chunk_proc onChunk, void* pReadSeekUserData, void* pChunkUserData, drwav_uint32 flags, const drwav_allocation_callbacks* pAllocationCallbacks)
-{
-    if (!drwav_preinit(pWav, onRead, onSeek, pReadSeekUserData, pAllocationCallbacks)) {
-        return DRWAV_FALSE;
-    }
-
-    return drwav_init__internal(pWav, onChunk, pChunkUserData, flags);
-}
-
-
-static drwav_uint32 drwav__riff_chunk_size_riff(drwav_uint64 dataChunkSize)
-{
-    drwav_uint64 chunkSize = 4 + 24 + dataChunkSize + drwav__chunk_padding_size_riff(dataChunkSize); /* 4 = "WAVE". 24 = "fmt " chunk. */
-    if (chunkSize > 0xFFFFFFFFUL) {
-        chunkSize = 0xFFFFFFFFUL;
-    }
-
-    return (drwav_uint32)chunkSize; /* Safe cast due to the clamp above. */
-}
-
-static drwav_uint32 drwav__data_chunk_size_riff(drwav_uint64 dataChunkSize)
-{
-    if (dataChunkSize <= 0xFFFFFFFFUL) {
-        return (drwav_uint32)dataChunkSize;
-    } else {
-        return 0xFFFFFFFFUL;
-    }
-}
-
-static drwav_uint64 drwav__riff_chunk_size_w64(drwav_uint64 dataChunkSize)
-{
-    drwav_uint64 dataSubchunkPaddingSize = drwav__chunk_padding_size_w64(dataChunkSize);
-
-    return 80 + 24 + dataChunkSize + dataSubchunkPaddingSize;   /* +24 because W64 includes the size of the GUID and size fields. */
-}
-
-static drwav_uint64 drwav__data_chunk_size_w64(drwav_uint64 dataChunkSize)
-{
-    return 24 + dataChunkSize;        /* +24 because W64 includes the size of the GUID and size fields. */
-}
-
-static drwav_uint64 drwav__riff_chunk_size_rf64(drwav_uint64 dataChunkSize)
-{
-    drwav_uint64 chunkSize = 4 + 36 + 24 + dataChunkSize + drwav__chunk_padding_size_riff(dataChunkSize); /* 4 = "WAVE". 36 = "ds64" chunk. 24 = "fmt " chunk. */
-    if (chunkSize > 0xFFFFFFFFUL) {
-        chunkSize = 0xFFFFFFFFUL;
-    }
-
-    return chunkSize;
-}
-
-static drwav_uint64 drwav__data_chunk_size_rf64(drwav_uint64 dataChunkSize)
-{
-    return dataChunkSize;
-}
-
-
-static size_t drwav__write(drwav* pWav, const void* pData, size_t dataSize)
-{
-    DRWAV_ASSERT(pWav          != NULL);
-    DRWAV_ASSERT(pWav->onWrite != NULL);
-
-    /* Generic write. Assumes no byte reordering required. */
-    return pWav->onWrite(pWav->pUserData, pData, dataSize);
-}
-
-static size_t drwav__write_u16ne_to_le(drwav* pWav, drwav_uint16 value)
-{
-    DRWAV_ASSERT(pWav          != NULL);
-    DRWAV_ASSERT(pWav->onWrite != NULL);
-
-    if (!drwav__is_little_endian()) {
-        value = drwav__bswap16(value);
-    }
-
-    return drwav__write(pWav, &value, 2);
-}
-
-static size_t drwav__write_u32ne_to_le(drwav* pWav, drwav_uint32 value)
-{
-    DRWAV_ASSERT(pWav          != NULL);
-    DRWAV_ASSERT(pWav->onWrite != NULL);
-
-    if (!drwav__is_little_endian()) {
-        value = drwav__bswap32(value);
-    }
-
-    return drwav__write(pWav, &value, 4);
-}
-
-static size_t drwav__write_u64ne_to_le(drwav* pWav, drwav_uint64 value)
-{
-    DRWAV_ASSERT(pWav          != NULL);
-    DRWAV_ASSERT(pWav->onWrite != NULL);
-
-    if (!drwav__is_little_endian()) {
-        value = drwav__bswap64(value);
-    }
-
-    return drwav__write(pWav, &value, 8);
-}
-
-
-static drwav_bool32 drwav_preinit_write(drwav* pWav, const drwav_data_format* pFormat, drwav_bool32 isSequential, drwav_write_proc onWrite, drwav_seek_proc onSeek, void* pUserData, const drwav_allocation_callbacks* pAllocationCallbacks)
-{
-    if (pWav == NULL || onWrite == NULL) {
-        return DRWAV_FALSE;
-    }
-
-    if (!isSequential && onSeek == NULL) {
-        return DRWAV_FALSE; /* <-- onSeek is required when in non-sequential mode. */
-    }
-
-    /* Not currently supporting compressed formats. Will need to add support for the "fact" chunk before we enable this. */
-    if (pFormat->format == DR_WAVE_FORMAT_EXTENSIBLE) {
-        return DRWAV_FALSE;
-    }
-    if (pFormat->format == DR_WAVE_FORMAT_ADPCM || pFormat->format == DR_WAVE_FORMAT_DVI_ADPCM) {
-        return DRWAV_FALSE;
-    }
-
-    DRWAV_ZERO_MEMORY(pWav, sizeof(*pWav));
-    pWav->onWrite   = onWrite;
-    pWav->onSeek    = onSeek;
-    pWav->pUserData = pUserData;
-    pWav->allocationCallbacks = drwav_copy_allocation_callbacks_or_defaults(pAllocationCallbacks);
-
-    if (pWav->allocationCallbacks.onFree == NULL || (pWav->allocationCallbacks.onMalloc == NULL && pWav->allocationCallbacks.onRealloc == NULL)) {
-        return DRWAV_FALSE;    /* Invalid allocation callbacks. */
-    }
-
-    pWav->fmt.formatTag = (drwav_uint16)pFormat->format;
-    pWav->fmt.channels = (drwav_uint16)pFormat->channels;
-    pWav->fmt.sampleRate = pFormat->sampleRate;
-    pWav->fmt.avgBytesPerSec = (drwav_uint32)((pFormat->bitsPerSample * pFormat->sampleRate * pFormat->channels) / 8);
-    pWav->fmt.blockAlign = (drwav_uint16)((pFormat->channels * pFormat->bitsPerSample) / 8);
-    pWav->fmt.bitsPerSample = (drwav_uint16)pFormat->bitsPerSample;
-    pWav->fmt.extendedSize = 0;
-    pWav->isSequentialWrite = isSequential;
-
-    return DRWAV_TRUE;
-}
-
-static drwav_bool32 drwav_init_write__internal(drwav* pWav, const drwav_data_format* pFormat, drwav_uint64 totalSampleCount)
-{
-    /* The function assumes drwav_preinit_write() was called beforehand. */
-
-    size_t runningPos = 0;
-    drwav_uint64 initialDataChunkSize = 0;
-    drwav_uint64 chunkSizeFMT;
-
-    /*
-    The initial values for the "RIFF" and "data" chunks depends on whether or not we are initializing in sequential mode or not. In
-    sequential mode we set this to its final values straight away since they can be calculated from the total sample count. In non-
-    sequential mode we initialize it all to zero and fill it out in drwav_uninit() using a backwards seek.
-    */
-    if (pWav->isSequentialWrite) {
-        initialDataChunkSize = (totalSampleCount * pWav->fmt.bitsPerSample) / 8;
-
-        /*
-        The RIFF container has a limit on the number of samples. drwav is not allowing this. There's no practical limits for Wave64
-        so for the sake of simplicity I'm not doing any validation for that.
-        */
-        if (pFormat->container == drwav_container_riff) {
-            if (initialDataChunkSize > (0xFFFFFFFFUL - 36)) {
-                return DRWAV_FALSE; /* Not enough room to store every sample. */
-            }
-        }
-    }
-
-    pWav->dataChunkDataSizeTargetWrite = initialDataChunkSize;
-
-
-    /* "RIFF" chunk. */
-    if (pFormat->container == drwav_container_riff) {
-        drwav_uint32 chunkSizeRIFF = 28 + (drwav_uint32)initialDataChunkSize;   /* +28 = "WAVE" + [sizeof "fmt " chunk] */
-        runningPos += drwav__write(pWav, "RIFF", 4);
-        runningPos += drwav__write_u32ne_to_le(pWav, chunkSizeRIFF);
-        runningPos += drwav__write(pWav, "WAVE", 4);
-    } else if (pFormat->container == drwav_container_w64) {
-        drwav_uint64 chunkSizeRIFF = 80 + 24 + initialDataChunkSize;            /* +24 because W64 includes the size of the GUID and size fields. */
-        runningPos += drwav__write(pWav, drwavGUID_W64_RIFF, 16);
-        runningPos += drwav__write_u64ne_to_le(pWav, chunkSizeRIFF);
-        runningPos += drwav__write(pWav, drwavGUID_W64_WAVE, 16);
-    } else if (pFormat->container == drwav_container_rf64) {
-        runningPos += drwav__write(pWav, "RF64", 4);
-        runningPos += drwav__write_u32ne_to_le(pWav, 0xFFFFFFFF);               /* Always 0xFFFFFFFF for RF64. Set to a proper value in the "ds64" chunk. */
-        runningPos += drwav__write(pWav, "WAVE", 4);
-    }
-
-    
-    /* "ds64" chunk (RF64 only). */
-    if (pFormat->container == drwav_container_rf64) {
-        drwav_uint32 initialds64ChunkSize = 28;                                 /* 28 = [Size of RIFF (8 bytes)] + [Size of DATA (8 bytes)] + [Sample Count (8 bytes)] + [Table Length (4 bytes)]. Table length always set to 0. */
-        drwav_uint64 initialRiffChunkSize = 8 + initialds64ChunkSize + initialDataChunkSize;    /* +8 for the ds64 header. */
-
-        runningPos += drwav__write(pWav, "ds64", 4);
-        runningPos += drwav__write_u32ne_to_le(pWav, initialds64ChunkSize);     /* Size of ds64. */
-        runningPos += drwav__write_u64ne_to_le(pWav, initialRiffChunkSize);     /* Size of RIFF. Set to true value at the end. */
-        runningPos += drwav__write_u64ne_to_le(pWav, initialDataChunkSize);     /* Size of DATA. Set to true value at the end. */
-        runningPos += drwav__write_u64ne_to_le(pWav, totalSampleCount);         /* Sample count. */
-        runningPos += drwav__write_u32ne_to_le(pWav, 0);                        /* Table length. Always set to zero in our case since we're not doing any other chunks than "DATA". */
-    }
-
-
-    /* "fmt " chunk. */
-    if (pFormat->container == drwav_container_riff || pFormat->container == drwav_container_rf64) {
-        chunkSizeFMT = 16;
-        runningPos += drwav__write(pWav, "fmt ", 4);
-        runningPos += drwav__write_u32ne_to_le(pWav, (drwav_uint32)chunkSizeFMT);
-    } else if (pFormat->container == drwav_container_w64) {
-        chunkSizeFMT = 40;
-        runningPos += drwav__write(pWav, drwavGUID_W64_FMT, 16);
-        runningPos += drwav__write_u64ne_to_le(pWav, chunkSizeFMT);
-    }
-
-    runningPos += drwav__write_u16ne_to_le(pWav, pWav->fmt.formatTag);
-    runningPos += drwav__write_u16ne_to_le(pWav, pWav->fmt.channels);
-    runningPos += drwav__write_u32ne_to_le(pWav, pWav->fmt.sampleRate);
-    runningPos += drwav__write_u32ne_to_le(pWav, pWav->fmt.avgBytesPerSec);
-    runningPos += drwav__write_u16ne_to_le(pWav, pWav->fmt.blockAlign);
-    runningPos += drwav__write_u16ne_to_le(pWav, pWav->fmt.bitsPerSample);
-
-    pWav->dataChunkDataPos = runningPos;
-
-    /* "data" chunk. */
-    if (pFormat->container == drwav_container_riff) {
-        drwav_uint32 chunkSizeDATA = (drwav_uint32)initialDataChunkSize;
-        runningPos += drwav__write(pWav, "data", 4);
-        runningPos += drwav__write_u32ne_to_le(pWav, chunkSizeDATA);
-    } else if (pFormat->container == drwav_container_w64) {
-        drwav_uint64 chunkSizeDATA = 24 + initialDataChunkSize;     /* +24 because W64 includes the size of the GUID and size fields. */
-        runningPos += drwav__write(pWav, drwavGUID_W64_DATA, 16);
-        runningPos += drwav__write_u64ne_to_le(pWav, chunkSizeDATA);
-    } else if (pFormat->container == drwav_container_rf64) {
-        runningPos += drwav__write(pWav, "data", 4);
-        runningPos += drwav__write_u32ne_to_le(pWav, 0xFFFFFFFF);   /* Always set to 0xFFFFFFFF for RF64. The true size of the data chunk is specified in the ds64 chunk. */
-    }
-
-    /*
-    The runningPos variable is incremented in the section above but is left unused which is causing some static analysis tools to detect it
-    as a dead store. I'm leaving this as-is for safety just in case I want to expand this function later to include other tags and want to
-    keep track of the running position for whatever reason. The line below should silence the static analysis tools.
-    */
-    (void)runningPos;
-
-    /* Set some properties for the client's convenience. */
-    pWav->container = pFormat->container;
-    pWav->channels = (drwav_uint16)pFormat->channels;
-    pWav->sampleRate = pFormat->sampleRate;
-    pWav->bitsPerSample = (drwav_uint16)pFormat->bitsPerSample;
-    pWav->translatedFormatTag = (drwav_uint16)pFormat->format;
-
-    return DRWAV_TRUE;
-}
-
-
-DRWAV_API drwav_bool32 drwav_init_write(drwav* pWav, const drwav_data_format* pFormat, drwav_write_proc onWrite, drwav_seek_proc onSeek, void* pUserData, const drwav_allocation_callbacks* pAllocationCallbacks)
-{
-    if (!drwav_preinit_write(pWav, pFormat, DRWAV_FALSE, onWrite, onSeek, pUserData, pAllocationCallbacks)) {
-        return DRWAV_FALSE;
-    }
-
-    return drwav_init_write__internal(pWav, pFormat, 0);               /* DRWAV_FALSE = Not Sequential */
-}
-
-DRWAV_API drwav_bool32 drwav_init_write_sequential(drwav* pWav, const drwav_data_format* pFormat, drwav_uint64 totalSampleCount, drwav_write_proc onWrite, void* pUserData, const drwav_allocation_callbacks* pAllocationCallbacks)
-{
-    if (!drwav_preinit_write(pWav, pFormat, DRWAV_TRUE, onWrite, NULL, pUserData, pAllocationCallbacks)) {
-        return DRWAV_FALSE;
-    }
-
-    return drwav_init_write__internal(pWav, pFormat, totalSampleCount); /* DRWAV_TRUE = Sequential */
-}
-
-DRWAV_API drwav_bool32 drwav_init_write_sequential_pcm_frames(drwav* pWav, const drwav_data_format* pFormat, drwav_uint64 totalPCMFrameCount, drwav_write_proc onWrite, void* pUserData, const drwav_allocation_callbacks* pAllocationCallbacks)
-{
-    if (pFormat == NULL) {
-        return DRWAV_FALSE;
-    }
-
-    return drwav_init_write_sequential(pWav, pFormat, totalPCMFrameCount*pFormat->channels, onWrite, pUserData, pAllocationCallbacks);
-}
-
-DRWAV_API drwav_uint64 drwav_target_write_size_bytes(const drwav_data_format* pFormat, drwav_uint64 totalSampleCount)
-{
-    /* Casting totalSampleCount to drwav_int64 for VC6 compatibility. No issues in practice because nobody is going to exhaust the whole 63 bits. */
-    drwav_uint64 targetDataSizeBytes = (drwav_uint64)((drwav_int64)totalSampleCount * pFormat->channels * pFormat->bitsPerSample/8.0);
-    drwav_uint64 riffChunkSizeBytes;
-    drwav_uint64 fileSizeBytes = 0;
-
-    if (pFormat->container == drwav_container_riff) {
-        riffChunkSizeBytes = drwav__riff_chunk_size_riff(targetDataSizeBytes);
-        fileSizeBytes = (8 + riffChunkSizeBytes);   /* +8 because WAV doesn't include the size of the ChunkID and ChunkSize fields. */
-    } else if (pFormat->container == drwav_container_w64) {
-        riffChunkSizeBytes = drwav__riff_chunk_size_w64(targetDataSizeBytes);
-        fileSizeBytes = riffChunkSizeBytes;
-    } else if (pFormat->container == drwav_container_rf64) {
-        riffChunkSizeBytes = drwav__riff_chunk_size_rf64(targetDataSizeBytes);
-        fileSizeBytes = (8 + riffChunkSizeBytes);   /* +8 because WAV doesn't include the size of the ChunkID and ChunkSize fields. */
-    }
-
-    return fileSizeBytes;
-}
-
-
-#ifndef DR_WAV_NO_STDIO
-
-/* drwav_result_from_errno() is only used for fopen() and wfopen() so putting it inside DR_WAV_NO_STDIO for now. If something else needs this later we can move it out. */
-#include <errno.h>
-static drwav_result drwav_result_from_errno(int e)
-{
-    switch (e)
-    {
-        case 0: return DRWAV_SUCCESS;
-    #ifdef EPERM
-        case EPERM: return DRWAV_INVALID_OPERATION;
-    #endif
-    #ifdef ENOENT
-        case ENOENT: return DRWAV_DOES_NOT_EXIST;
-    #endif
-    #ifdef ESRCH
-        case ESRCH: return DRWAV_DOES_NOT_EXIST;
-    #endif
-    #ifdef EINTR
-        case EINTR: return DRWAV_INTERRUPT;
-    #endif
-    #ifdef EIO
-        case EIO: return DRWAV_IO_ERROR;
-    #endif
-    #ifdef ENXIO
-        case ENXIO: return DRWAV_DOES_NOT_EXIST;
-    #endif
-    #ifdef E2BIG
-        case E2BIG: return DRWAV_INVALID_ARGS;
-    #endif
-    #ifdef ENOEXEC
-        case ENOEXEC: return DRWAV_INVALID_FILE;
-    #endif
-    #ifdef EBADF
-        case EBADF: return DRWAV_INVALID_FILE;
-    #endif
-    #ifdef ECHILD
-        case ECHILD: return DRWAV_ERROR;
-    #endif
-    #ifdef EAGAIN
-        case EAGAIN: return DRWAV_UNAVAILABLE;
-    #endif
-    #ifdef ENOMEM
-        case ENOMEM: return DRWAV_OUT_OF_MEMORY;
-    #endif
-    #ifdef EACCES
-        case EACCES: return DRWAV_ACCESS_DENIED;
-    #endif
-    #ifdef EFAULT
-        case EFAULT: return DRWAV_BAD_ADDRESS;
-    #endif
-    #ifdef ENOTBLK
-        case ENOTBLK: return DRWAV_ERROR;
-    #endif
-    #ifdef EBUSY
-        case EBUSY: return DRWAV_BUSY;
-    #endif
-    #ifdef EEXIST
-        case EEXIST: return DRWAV_ALREADY_EXISTS;
-    #endif
-    #ifdef EXDEV
-        case EXDEV: return DRWAV_ERROR;
-    #endif
-    #ifdef ENODEV
-        case ENODEV: return DRWAV_DOES_NOT_EXIST;
-    #endif
-    #ifdef ENOTDIR
-        case ENOTDIR: return DRWAV_NOT_DIRECTORY;
-    #endif
-    #ifdef EISDIR
-        case EISDIR: return DRWAV_IS_DIRECTORY;
-    #endif
-    #ifdef EINVAL
-        case EINVAL: return DRWAV_INVALID_ARGS;
-    #endif
-    #ifdef ENFILE
-        case ENFILE: return DRWAV_TOO_MANY_OPEN_FILES;
-    #endif
-    #ifdef EMFILE
-        case EMFILE: return DRWAV_TOO_MANY_OPEN_FILES;
-    #endif
-    #ifdef ENOTTY
-        case ENOTTY: return DRWAV_INVALID_OPERATION;
-    #endif
-    #ifdef ETXTBSY
-        case ETXTBSY: return DRWAV_BUSY;
-    #endif
-    #ifdef EFBIG
-        case EFBIG: return DRWAV_TOO_BIG;
-    #endif
-    #ifdef ENOSPC
-        case ENOSPC: return DRWAV_NO_SPACE;
-    #endif
-    #ifdef ESPIPE
-        case ESPIPE: return DRWAV_BAD_SEEK;
-    #endif
-    #ifdef EROFS
-        case EROFS: return DRWAV_ACCESS_DENIED;
-    #endif
-    #ifdef EMLINK
-        case EMLINK: return DRWAV_TOO_MANY_LINKS;
-    #endif
-    #ifdef EPIPE
-        case EPIPE: return DRWAV_BAD_PIPE;
-    #endif
-    #ifdef EDOM
-        case EDOM: return DRWAV_OUT_OF_RANGE;
-    #endif
-    #ifdef ERANGE
-        case ERANGE: return DRWAV_OUT_OF_RANGE;
-    #endif
-    #ifdef EDEADLK
-        case EDEADLK: return DRWAV_DEADLOCK;
-    #endif
-    #ifdef ENAMETOOLONG
-        case ENAMETOOLONG: return DRWAV_PATH_TOO_LONG;
-    #endif
-    #ifdef ENOLCK
-        case ENOLCK: return DRWAV_ERROR;
-    #endif
-    #ifdef ENOSYS
-        case ENOSYS: return DRWAV_NOT_IMPLEMENTED;
-    #endif
-    #ifdef ENOTEMPTY
-        case ENOTEMPTY: return DRWAV_DIRECTORY_NOT_EMPTY;
-    #endif
-    #ifdef ELOOP
-        case ELOOP: return DRWAV_TOO_MANY_LINKS;
-    #endif
-    #ifdef ENOMSG
-        case ENOMSG: return DRWAV_NO_MESSAGE;
-    #endif
-    #ifdef EIDRM
-        case EIDRM: return DRWAV_ERROR;
-    #endif
-    #ifdef ECHRNG
-        case ECHRNG: return DRWAV_ERROR;
-    #endif
-    #ifdef EL2NSYNC
-        case EL2NSYNC: return DRWAV_ERROR;
-    #endif
-    #ifdef EL3HLT
-        case EL3HLT: return DRWAV_ERROR;
-    #endif
-    #ifdef EL3RST
-        case EL3RST: return DRWAV_ERROR;
-    #endif
-    #ifdef ELNRNG
-        case ELNRNG: return DRWAV_OUT_OF_RANGE;
-    #endif
-    #ifdef EUNATCH
-        case EUNATCH: return DRWAV_ERROR;
-    #endif
-    #ifdef ENOCSI
-        case ENOCSI: return DRWAV_ERROR;
-    #endif
-    #ifdef EL2HLT
-        case EL2HLT: return DRWAV_ERROR;
-    #endif
-    #ifdef EBADE
-        case EBADE: return DRWAV_ERROR;
-    #endif
-    #ifdef EBADR
-        case EBADR: return DRWAV_ERROR;
-    #endif
-    #ifdef EXFULL
-        case EXFULL: return DRWAV_ERROR;
-    #endif
-    #ifdef ENOANO
-        case ENOANO: return DRWAV_ERROR;
-    #endif
-    #ifdef EBADRQC
-        case EBADRQC: return DRWAV_ERROR;
-    #endif
-    #ifdef EBADSLT
-        case EBADSLT: return DRWAV_ERROR;
-    #endif
-    #ifdef EBFONT
-        case EBFONT: return DRWAV_INVALID_FILE;
-    #endif
-    #ifdef ENOSTR
-        case ENOSTR: return DRWAV_ERROR;
-    #endif
-    #ifdef ENODATA
-        case ENODATA: return DRWAV_NO_DATA_AVAILABLE;
-    #endif
-    #ifdef ETIME
-        case ETIME: return DRWAV_TIMEOUT;
-    #endif
-    #ifdef ENOSR
-        case ENOSR: return DRWAV_NO_DATA_AVAILABLE;
-    #endif
-    #ifdef ENONET
-        case ENONET: return DRWAV_NO_NETWORK;
-    #endif
-    #ifdef ENOPKG
-        case ENOPKG: return DRWAV_ERROR;
-    #endif
-    #ifdef EREMOTE
-        case EREMOTE: return DRWAV_ERROR;
-    #endif
-    #ifdef ENOLINK
-        case ENOLINK: return DRWAV_ERROR;
-    #endif
-    #ifdef EADV
-        case EADV: return DRWAV_ERROR;
-    #endif
-    #ifdef ESRMNT
-        case ESRMNT: return DRWAV_ERROR;
-    #endif
-    #ifdef ECOMM
-        case ECOMM: return DRWAV_ERROR;
-    #endif
-    #ifdef EPROTO
-        case EPROTO: return DRWAV_ERROR;
-    #endif
-    #ifdef EMULTIHOP
-        case EMULTIHOP: return DRWAV_ERROR;
-    #endif
-    #ifdef EDOTDOT
-        case EDOTDOT: return DRWAV_ERROR;
-    #endif
-    #ifdef EBADMSG
-        case EBADMSG: return DRWAV_BAD_MESSAGE;
-    #endif
-    #ifdef EOVERFLOW
-        case EOVERFLOW: return DRWAV_TOO_BIG;
-    #endif
-    #ifdef ENOTUNIQ
-        case ENOTUNIQ: return DRWAV_NOT_UNIQUE;
-    #endif
-    #ifdef EBADFD
-        case EBADFD: return DRWAV_ERROR;
-    #endif
-    #ifdef EREMCHG
-        case EREMCHG: return DRWAV_ERROR;
-    #endif
-    #ifdef ELIBACC
-        case ELIBACC: return DRWAV_ACCESS_DENIED;
-    #endif
-    #ifdef ELIBBAD
-        case ELIBBAD: return DRWAV_INVALID_FILE;
-    #endif
-    #ifdef ELIBSCN
-        case ELIBSCN: return DRWAV_INVALID_FILE;
-    #endif
-    #ifdef ELIBMAX
-        case ELIBMAX: return DRWAV_ERROR;
-    #endif
-    #ifdef ELIBEXEC
-        case ELIBEXEC: return DRWAV_ERROR;
-    #endif
-    #ifdef EILSEQ
-        case EILSEQ: return DRWAV_INVALID_DATA;
-    #endif
-    #ifdef ERESTART
-        case ERESTART: return DRWAV_ERROR;
-    #endif
-    #ifdef ESTRPIPE
-        case ESTRPIPE: return DRWAV_ERROR;
-    #endif
-    #ifdef EUSERS
-        case EUSERS: return DRWAV_ERROR;
-    #endif
-    #ifdef ENOTSOCK
-        case ENOTSOCK: return DRWAV_NOT_SOCKET;
-    #endif
-    #ifdef EDESTADDRREQ
-        case EDESTADDRREQ: return DRWAV_NO_ADDRESS;
-    #endif
-    #ifdef EMSGSIZE
-        case EMSGSIZE: return DRWAV_TOO_BIG;
-    #endif
-    #ifdef EPROTOTYPE
-        case EPROTOTYPE: return DRWAV_BAD_PROTOCOL;
-    #endif
-    #ifdef ENOPROTOOPT
-        case ENOPROTOOPT: return DRWAV_PROTOCOL_UNAVAILABLE;
-    #endif
-    #ifdef EPROTONOSUPPORT
-        case EPROTONOSUPPORT: return DRWAV_PROTOCOL_NOT_SUPPORTED;
-    #endif
-    #ifdef ESOCKTNOSUPPORT
-        case ESOCKTNOSUPPORT: return DRWAV_SOCKET_NOT_SUPPORTED;
-    #endif
-    #ifdef EOPNOTSUPP
-        case EOPNOTSUPP: return DRWAV_INVALID_OPERATION;
-    #endif
-    #ifdef EPFNOSUPPORT
-        case EPFNOSUPPORT: return DRWAV_PROTOCOL_FAMILY_NOT_SUPPORTED;
-    #endif
-    #ifdef EAFNOSUPPORT
-        case EAFNOSUPPORT: return DRWAV_ADDRESS_FAMILY_NOT_SUPPORTED;
-    #endif
-    #ifdef EADDRINUSE
-        case EADDRINUSE: return DRWAV_ALREADY_IN_USE;
-    #endif
-    #ifdef EADDRNOTAVAIL
-        case EADDRNOTAVAIL: return DRWAV_ERROR;
-    #endif
-    #ifdef ENETDOWN
-        case ENETDOWN: return DRWAV_NO_NETWORK;
-    #endif
-    #ifdef ENETUNREACH
-        case ENETUNREACH: return DRWAV_NO_NETWORK;
-    #endif
-    #ifdef ENETRESET
-        case ENETRESET: return DRWAV_NO_NETWORK;
-    #endif
-    #ifdef ECONNABORTED
-        case ECONNABORTED: return DRWAV_NO_NETWORK;
-    #endif
-    #ifdef ECONNRESET
-        case ECONNRESET: return DRWAV_CONNECTION_RESET;
-    #endif
-    #ifdef ENOBUFS
-        case ENOBUFS: return DRWAV_NO_SPACE;
-    #endif
-    #ifdef EISCONN
-        case EISCONN: return DRWAV_ALREADY_CONNECTED;
-    #endif
-    #ifdef ENOTCONN
-        case ENOTCONN: return DRWAV_NOT_CONNECTED;
-    #endif
-    #ifdef ESHUTDOWN
-        case ESHUTDOWN: return DRWAV_ERROR;
-    #endif
-    #ifdef ETOOMANYREFS
-        case ETOOMANYREFS: return DRWAV_ERROR;
-    #endif
-    #ifdef ETIMEDOUT
-        case ETIMEDOUT: return DRWAV_TIMEOUT;
-    #endif
-    #ifdef ECONNREFUSED
-        case ECONNREFUSED: return DRWAV_CONNECTION_REFUSED;
-    #endif
-    #ifdef EHOSTDOWN
-        case EHOSTDOWN: return DRWAV_NO_HOST;
-    #endif
-    #ifdef EHOSTUNREACH
-        case EHOSTUNREACH: return DRWAV_NO_HOST;
-    #endif
-    #ifdef EALREADY
-        case EALREADY: return DRWAV_IN_PROGRESS;
-    #endif
-    #ifdef EINPROGRESS
-        case EINPROGRESS: return DRWAV_IN_PROGRESS;
-    #endif
-    #ifdef ESTALE
-        case ESTALE: return DRWAV_INVALID_FILE;
-    #endif
-    #ifdef EUCLEAN
-        case EUCLEAN: return DRWAV_ERROR;
-    #endif
-    #ifdef ENOTNAM
-        case ENOTNAM: return DRWAV_ERROR;
-    #endif
-    #ifdef ENAVAIL
-        case ENAVAIL: return DRWAV_ERROR;
-    #endif
-    #ifdef EISNAM
-        case EISNAM: return DRWAV_ERROR;
-    #endif
-    #ifdef EREMOTEIO
-        case EREMOTEIO: return DRWAV_IO_ERROR;
-    #endif
-    #ifdef EDQUOT
-        case EDQUOT: return DRWAV_NO_SPACE;
-    #endif
-    #ifdef ENOMEDIUM
-        case ENOMEDIUM: return DRWAV_DOES_NOT_EXIST;
-    #endif
-    #ifdef EMEDIUMTYPE
-        case EMEDIUMTYPE: return DRWAV_ERROR;
-    #endif
-    #ifdef ECANCELED
-        case ECANCELED: return DRWAV_CANCELLED;
-    #endif
-    #ifdef ENOKEY
-        case ENOKEY: return DRWAV_ERROR;
-    #endif
-    #ifdef EKEYEXPIRED
-        case EKEYEXPIRED: return DRWAV_ERROR;
-    #endif
-    #ifdef EKEYREVOKED
-        case EKEYREVOKED: return DRWAV_ERROR;
-    #endif
-    #ifdef EKEYREJECTED
-        case EKEYREJECTED: return DRWAV_ERROR;
-    #endif
-    #ifdef EOWNERDEAD
-        case EOWNERDEAD: return DRWAV_ERROR;
-    #endif
-    #ifdef ENOTRECOVERABLE
-        case ENOTRECOVERABLE: return DRWAV_ERROR;
-    #endif
-    #ifdef ERFKILL
-        case ERFKILL: return DRWAV_ERROR;
-    #endif
-    #ifdef EHWPOISON
-        case EHWPOISON: return DRWAV_ERROR;
-    #endif
-        default: return DRWAV_ERROR;
-    }
-}
-
-static drwav_result drwav_fopen(FILE** ppFile, const char* pFilePath, const char* pOpenMode)
-{
-#if _MSC_VER && _MSC_VER >= 1400
-    errno_t err;
-#endif
-
-    if (ppFile != NULL) {
-        *ppFile = NULL;  /* Safety. */
-    }
-
-    if (pFilePath == NULL || pOpenMode == NULL || ppFile == NULL) {
-        return DRWAV_INVALID_ARGS;
-    }
-
-#if _MSC_VER && _MSC_VER >= 1400
-    err = fopen_s(ppFile, pFilePath, pOpenMode);
-    if (err != 0) {
-        return drwav_result_from_errno(err);
-    }
-#else
-#if defined(_WIN32) || defined(__APPLE__)
-    *ppFile = fopen(pFilePath, pOpenMode);
-#else
-    #if defined(_FILE_OFFSET_BITS) && _FILE_OFFSET_BITS == 64 && defined(_LARGEFILE64_SOURCE)
-        *ppFile = fopen64(pFilePath, pOpenMode);
-    #else
-        *ppFile = fopen(pFilePath, pOpenMode);
-    #endif
-#endif
-    if (*ppFile == NULL) {
-        drwav_result result = drwav_result_from_errno(errno);
-        if (result == DRWAV_SUCCESS) {
-            result = DRWAV_ERROR;   /* Just a safety check to make sure we never ever return success when pFile == NULL. */
-        }
-
-        return result;
-    }
-#endif
-
-    return DRWAV_SUCCESS;
-}
-
-/*
-_wfopen() isn't always available in all compilation environments.
-
-    * Windows only.
-    * MSVC seems to support it universally as far back as VC6 from what I can tell (haven't checked further back).
-    * MinGW-64 (both 32- and 64-bit) seems to support it.
-    * MinGW wraps it in !defined(__STRICT_ANSI__).
-    * OpenWatcom wraps it in !defined(_NO_EXT_KEYS).
-
-This can be reviewed as compatibility issues arise. The preference is to use _wfopen_s() and _wfopen() as opposed to the wcsrtombs()
-fallback, so if you notice your compiler not detecting this properly I'm happy to look at adding support.
-*/
-#if defined(_WIN32)
-    #if defined(_MSC_VER) || defined(__MINGW64__) || (!defined(__STRICT_ANSI__) && !defined(_NO_EXT_KEYS))
-        #define DRWAV_HAS_WFOPEN
-    #endif
-#endif
-
-static drwav_result drwav_wfopen(FILE** ppFile, const wchar_t* pFilePath, const wchar_t* pOpenMode, const drwav_allocation_callbacks* pAllocationCallbacks)
-{
-    if (ppFile != NULL) {
-        *ppFile = NULL;  /* Safety. */
-    }
-
-    if (pFilePath == NULL || pOpenMode == NULL || ppFile == NULL) {
-        return DRWAV_INVALID_ARGS;
-    }
-
-#if defined(DRWAV_HAS_WFOPEN)
-    {
-        /* Use _wfopen() on Windows. */
-    #if defined(_MSC_VER) && _MSC_VER >= 1400
-        errno_t err = _wfopen_s(ppFile, pFilePath, pOpenMode);
-        if (err != 0) {
-            return drwav_result_from_errno(err);
-        }
-    #else
-        *ppFile = _wfopen(pFilePath, pOpenMode);
-        if (*ppFile == NULL) {
-            return drwav_result_from_errno(errno);
-        }
-    #endif
-        (void)pAllocationCallbacks;
-    }
-#else
-    /*
-    Use fopen() on anything other than Windows. Requires a conversion. This is annoying because fopen() is locale specific. The only real way I can
-    think of to do this is with wcsrtombs(). Note that wcstombs() is apparently not thread-safe because it uses a static global mbstate_t object for
-    maintaining state. I've checked this with -std=c89 and it works, but if somebody get's a compiler error I'll look into improving compatibility.
-    */
-    {
-        mbstate_t mbs;
-        size_t lenMB;
-        const wchar_t* pFilePathTemp = pFilePath;
-        char* pFilePathMB = NULL;
-        char pOpenModeMB[32] = {0};
-
-        /* Get the length first. */
-        DRWAV_ZERO_OBJECT(&mbs);
-        lenMB = wcsrtombs(NULL, &pFilePathTemp, 0, &mbs);
-        if (lenMB == (size_t)-1) {
-            return drwav_result_from_errno(errno);
-        }
-
-        pFilePathMB = (char*)drwav__malloc_from_callbacks(lenMB + 1, pAllocationCallbacks);
-        if (pFilePathMB == NULL) {
-            return DRWAV_OUT_OF_MEMORY;
-        }
-
-        pFilePathTemp = pFilePath;
-        DRWAV_ZERO_OBJECT(&mbs);
-        wcsrtombs(pFilePathMB, &pFilePathTemp, lenMB + 1, &mbs);
-
-        /* The open mode should always consist of ASCII characters so we should be able to do a trivial conversion. */
-        {
-            size_t i = 0;
-            for (;;) {
-                if (pOpenMode[i] == 0) {
-                    pOpenModeMB[i] = '\0';
-                    break;
-                }
-
-                pOpenModeMB[i] = (char)pOpenMode[i];
-                i += 1;
-            }
-        }
-
-        *ppFile = fopen(pFilePathMB, pOpenModeMB);
-
-        drwav__free_from_callbacks(pFilePathMB, pAllocationCallbacks);
-    }
-
-    if (*ppFile == NULL) {
-        return DRWAV_ERROR;
-    }
-#endif
-
-    return DRWAV_SUCCESS;
-}
-
-
-static size_t drwav__on_read_stdio(void* pUserData, void* pBufferOut, size_t bytesToRead)
-{
-    return fread(pBufferOut, 1, bytesToRead, (FILE*)pUserData);
-}
-
-static size_t drwav__on_write_stdio(void* pUserData, const void* pData, size_t bytesToWrite)
-{
-    return fwrite(pData, 1, bytesToWrite, (FILE*)pUserData);
-}
-
-static drwav_bool32 drwav__on_seek_stdio(void* pUserData, int offset, drwav_seek_origin origin)
-{
-    return fseek((FILE*)pUserData, offset, (origin == drwav_seek_origin_current) ? SEEK_CUR : SEEK_SET) == 0;
-}
-
-DRWAV_API drwav_bool32 drwav_init_file(drwav* pWav, const char* filename, const drwav_allocation_callbacks* pAllocationCallbacks)
-{
-    return drwav_init_file_ex(pWav, filename, NULL, NULL, 0, pAllocationCallbacks);
-}
-
-
-static drwav_bool32 drwav_init_file__internal_FILE(drwav* pWav, FILE* pFile, drwav_chunk_proc onChunk, void* pChunkUserData, drwav_uint32 flags, const drwav_allocation_callbacks* pAllocationCallbacks)
-{
-    drwav_bool32 result;
-
-    result = drwav_preinit(pWav, drwav__on_read_stdio, drwav__on_seek_stdio, (void*)pFile, pAllocationCallbacks);
-    if (result != DRWAV_TRUE) {
-        fclose(pFile);
-        return result;
-    }
-
-    result = drwav_init__internal(pWav, onChunk, pChunkUserData, flags);
-    if (result != DRWAV_TRUE) {
-        fclose(pFile);
-        return result;
-    }
-
-    return DRWAV_TRUE;
-}
-
-DRWAV_API drwav_bool32 drwav_init_file_ex(drwav* pWav, const char* filename, drwav_chunk_proc onChunk, void* pChunkUserData, drwav_uint32 flags, const drwav_allocation_callbacks* pAllocationCallbacks)
-{
-    FILE* pFile;
-    if (drwav_fopen(&pFile, filename, "rb") != DRWAV_SUCCESS) {
-        return DRWAV_FALSE;
-    }
-
-    /* This takes ownership of the FILE* object. */
-    return drwav_init_file__internal_FILE(pWav, pFile, onChunk, pChunkUserData, flags, pAllocationCallbacks);
-}
-
-DRWAV_API drwav_bool32 drwav_init_file_w(drwav* pWav, const wchar_t* filename, const drwav_allocation_callbacks* pAllocationCallbacks)
-{
-    return drwav_init_file_ex_w(pWav, filename, NULL, NULL, 0, pAllocationCallbacks);
-}
-
-DRWAV_API drwav_bool32 drwav_init_file_ex_w(drwav* pWav, const wchar_t* filename, drwav_chunk_proc onChunk, void* pChunkUserData, drwav_uint32 flags, const drwav_allocation_callbacks* pAllocationCallbacks)
-{
-    FILE* pFile;
-    if (drwav_wfopen(&pFile, filename, L"rb", pAllocationCallbacks) != DRWAV_SUCCESS) {
-        return DRWAV_FALSE;
-    }
-
-    /* This takes ownership of the FILE* object. */
-    return drwav_init_file__internal_FILE(pWav, pFile, onChunk, pChunkUserData, flags, pAllocationCallbacks);
-}
-
-
-static drwav_bool32 drwav_init_file_write__internal_FILE(drwav* pWav, FILE* pFile, const drwav_data_format* pFormat, drwav_uint64 totalSampleCount, drwav_bool32 isSequential, const drwav_allocation_callbacks* pAllocationCallbacks)
-{
-    drwav_bool32 result;
-
-    result = drwav_preinit_write(pWav, pFormat, isSequential, drwav__on_write_stdio, drwav__on_seek_stdio, (void*)pFile, pAllocationCallbacks);
-    if (result != DRWAV_TRUE) {
-        fclose(pFile);
-        return result;
-    }
-
-    result = drwav_init_write__internal(pWav, pFormat, totalSampleCount);
-    if (result != DRWAV_TRUE) {
-        fclose(pFile);
-        return result;
-    }
-
-    return DRWAV_TRUE;
-}
-
-static drwav_bool32 drwav_init_file_write__internal(drwav* pWav, const char* filename, const drwav_data_format* pFormat, drwav_uint64 totalSampleCount, drwav_bool32 isSequential, const drwav_allocation_callbacks* pAllocationCallbacks)
-{
-    FILE* pFile;
-    if (drwav_fopen(&pFile, filename, "wb") != DRWAV_SUCCESS) {
-        return DRWAV_FALSE;
-    }
-
-    /* This takes ownership of the FILE* object. */
-    return drwav_init_file_write__internal_FILE(pWav, pFile, pFormat, totalSampleCount, isSequential, pAllocationCallbacks);
-}
-
-static drwav_bool32 drwav_init_file_write_w__internal(drwav* pWav, const wchar_t* filename, const drwav_data_format* pFormat, drwav_uint64 totalSampleCount, drwav_bool32 isSequential, const drwav_allocation_callbacks* pAllocationCallbacks)
-{
-    FILE* pFile;
-    if (drwav_wfopen(&pFile, filename, L"wb", pAllocationCallbacks) != DRWAV_SUCCESS) {
-        return DRWAV_FALSE;
-    }
-
-    /* This takes ownership of the FILE* object. */
-    return drwav_init_file_write__internal_FILE(pWav, pFile, pFormat, totalSampleCount, isSequential, pAllocationCallbacks);
-}
-
-DRWAV_API drwav_bool32 drwav_init_file_write(drwav* pWav, const char* filename, const drwav_data_format* pFormat, const drwav_allocation_callbacks* pAllocationCallbacks)
-{
-    return drwav_init_file_write__internal(pWav, filename, pFormat, 0, DRWAV_FALSE, pAllocationCallbacks);
-}
-
-DRWAV_API drwav_bool32 drwav_init_file_write_sequential(drwav* pWav, const char* filename, const drwav_data_format* pFormat, drwav_uint64 totalSampleCount, const drwav_allocation_callbacks* pAllocationCallbacks)
-{
-    return drwav_init_file_write__internal(pWav, filename, pFormat, totalSampleCount, DRWAV_TRUE, pAllocationCallbacks);
-}
-
-DRWAV_API drwav_bool32 drwav_init_file_write_sequential_pcm_frames(drwav* pWav, const char* filename, const drwav_data_format* pFormat, drwav_uint64 totalPCMFrameCount, const drwav_allocation_callbacks* pAllocationCallbacks)
-{
-    if (pFormat == NULL) {
-        return DRWAV_FALSE;
-    }
-
-    return drwav_init_file_write_sequential(pWav, filename, pFormat, totalPCMFrameCount*pFormat->channels, pAllocationCallbacks);
-}
-
-DRWAV_API drwav_bool32 drwav_init_file_write_w(drwav* pWav, const wchar_t* filename, const drwav_data_format* pFormat, const drwav_allocation_callbacks* pAllocationCallbacks)
-{
-    return drwav_init_file_write_w__internal(pWav, filename, pFormat, 0, DRWAV_FALSE, pAllocationCallbacks);
-}
-
-DRWAV_API drwav_bool32 drwav_init_file_write_sequential_w(drwav* pWav, const wchar_t* filename, const drwav_data_format* pFormat, drwav_uint64 totalSampleCount, const drwav_allocation_callbacks* pAllocationCallbacks)
-{
-    return drwav_init_file_write_w__internal(pWav, filename, pFormat, totalSampleCount, DRWAV_TRUE, pAllocationCallbacks);
-}
-
-DRWAV_API drwav_bool32 drwav_init_file_write_sequential_pcm_frames_w(drwav* pWav, const wchar_t* filename, const drwav_data_format* pFormat, drwav_uint64 totalPCMFrameCount, const drwav_allocation_callbacks* pAllocationCallbacks)
-{
-    if (pFormat == NULL) {
-        return DRWAV_FALSE;
-    }
-
-    return drwav_init_file_write_sequential_w(pWav, filename, pFormat, totalPCMFrameCount*pFormat->channels, pAllocationCallbacks);
-}
-#endif  /* DR_WAV_NO_STDIO */
-
-
-static size_t drwav__on_read_memory(void* pUserData, void* pBufferOut, size_t bytesToRead)
-{
-    drwav* pWav = (drwav*)pUserData;
-    size_t bytesRemaining;
-
-    DRWAV_ASSERT(pWav != NULL);
-    DRWAV_ASSERT(pWav->memoryStream.dataSize >= pWav->memoryStream.currentReadPos);
-
-    bytesRemaining = pWav->memoryStream.dataSize - pWav->memoryStream.currentReadPos;
-    if (bytesToRead > bytesRemaining) {
-        bytesToRead = bytesRemaining;
-    }
-
-    if (bytesToRead > 0) {
-        DRWAV_COPY_MEMORY(pBufferOut, pWav->memoryStream.data + pWav->memoryStream.currentReadPos, bytesToRead);
-        pWav->memoryStream.currentReadPos += bytesToRead;
-    }
-
-    return bytesToRead;
-}
-
-static drwav_bool32 drwav__on_seek_memory(void* pUserData, int offset, drwav_seek_origin origin)
-{
-    drwav* pWav = (drwav*)pUserData;
-    DRWAV_ASSERT(pWav != NULL);
-
-    if (origin == drwav_seek_origin_current) {
-        if (offset > 0) {
-            if (pWav->memoryStream.currentReadPos + offset > pWav->memoryStream.dataSize) {
-                return DRWAV_FALSE; /* Trying to seek too far forward. */
-            }
-        } else {
-            if (pWav->memoryStream.currentReadPos < (size_t)-offset) {
-                return DRWAV_FALSE; /* Trying to seek too far backwards. */
-            }
-        }
-
-        /* This will never underflow thanks to the clamps above. */
-        pWav->memoryStream.currentReadPos += offset;
-    } else {
-        if ((drwav_uint32)offset <= pWav->memoryStream.dataSize) {
-            pWav->memoryStream.currentReadPos = offset;
-        } else {
-            return DRWAV_FALSE; /* Trying to seek too far forward. */
-        }
-    }
-    
-    return DRWAV_TRUE;
-}
-
-static size_t drwav__on_write_memory(void* pUserData, const void* pDataIn, size_t bytesToWrite)
-{
-    drwav* pWav = (drwav*)pUserData;
-    size_t bytesRemaining;
-
-    DRWAV_ASSERT(pWav != NULL);
-    DRWAV_ASSERT(pWav->memoryStreamWrite.dataCapacity >= pWav->memoryStreamWrite.currentWritePos);
-
-    bytesRemaining = pWav->memoryStreamWrite.dataCapacity - pWav->memoryStreamWrite.currentWritePos;
-    if (bytesRemaining < bytesToWrite) {
-        /* Need to reallocate. */
-        void* pNewData;
-        size_t newDataCapacity = (pWav->memoryStreamWrite.dataCapacity == 0) ? 256 : pWav->memoryStreamWrite.dataCapacity * 2;
-
-        /* If doubling wasn't enough, just make it the minimum required size to write the data. */
-        if ((newDataCapacity - pWav->memoryStreamWrite.currentWritePos) < bytesToWrite) {
-            newDataCapacity = pWav->memoryStreamWrite.currentWritePos + bytesToWrite;
-        }
-
-        pNewData = drwav__realloc_from_callbacks(*pWav->memoryStreamWrite.ppData, newDataCapacity, pWav->memoryStreamWrite.dataCapacity, &pWav->allocationCallbacks);
-        if (pNewData == NULL) {
-            return 0;
-        }
-
-        *pWav->memoryStreamWrite.ppData = pNewData;
-        pWav->memoryStreamWrite.dataCapacity = newDataCapacity;
-    }
-
-    DRWAV_COPY_MEMORY(((drwav_uint8*)(*pWav->memoryStreamWrite.ppData)) + pWav->memoryStreamWrite.currentWritePos, pDataIn, bytesToWrite);
-
-    pWav->memoryStreamWrite.currentWritePos += bytesToWrite;
-    if (pWav->memoryStreamWrite.dataSize < pWav->memoryStreamWrite.currentWritePos) {
-        pWav->memoryStreamWrite.dataSize = pWav->memoryStreamWrite.currentWritePos;
-    }
-
-    *pWav->memoryStreamWrite.pDataSize = pWav->memoryStreamWrite.dataSize;
-
-    return bytesToWrite;
-}
-
-static drwav_bool32 drwav__on_seek_memory_write(void* pUserData, int offset, drwav_seek_origin origin)
-{
-    drwav* pWav = (drwav*)pUserData;
-    DRWAV_ASSERT(pWav != NULL);
-
-    if (origin == drwav_seek_origin_current) {
-        if (offset > 0) {
-            if (pWav->memoryStreamWrite.currentWritePos + offset > pWav->memoryStreamWrite.dataSize) {
-                offset = (int)(pWav->memoryStreamWrite.dataSize - pWav->memoryStreamWrite.currentWritePos);  /* Trying to seek too far forward. */
-            }
-        } else {
-            if (pWav->memoryStreamWrite.currentWritePos < (size_t)-offset) {
-                offset = -(int)pWav->memoryStreamWrite.currentWritePos;  /* Trying to seek too far backwards. */
-            }
-        }
-
-        /* This will never underflow thanks to the clamps above. */
-        pWav->memoryStreamWrite.currentWritePos += offset;
-    } else {
-        if ((drwav_uint32)offset <= pWav->memoryStreamWrite.dataSize) {
-            pWav->memoryStreamWrite.currentWritePos = offset;
-        } else {
-            pWav->memoryStreamWrite.currentWritePos = pWav->memoryStreamWrite.dataSize;  /* Trying to seek too far forward. */
-        }
-    }
-    
-    return DRWAV_TRUE;
-}
-
-DRWAV_API drwav_bool32 drwav_init_memory(drwav* pWav, const void* data, size_t dataSize, const drwav_allocation_callbacks* pAllocationCallbacks)
-{
-    return drwav_init_memory_ex(pWav, data, dataSize, NULL, NULL, 0, pAllocationCallbacks);
-}
-
-DRWAV_API drwav_bool32 drwav_init_memory_ex(drwav* pWav, const void* data, size_t dataSize, drwav_chunk_proc onChunk, void* pChunkUserData, drwav_uint32 flags, const drwav_allocation_callbacks* pAllocationCallbacks)
-{
-    if (data == NULL || dataSize == 0) {
-        return DRWAV_FALSE;
-    }
-
-    if (!drwav_preinit(pWav, drwav__on_read_memory, drwav__on_seek_memory, pWav, pAllocationCallbacks)) {
-        return DRWAV_FALSE;
-    }
-
-    pWav->memoryStream.data = (const drwav_uint8*)data;
-    pWav->memoryStream.dataSize = dataSize;
-    pWav->memoryStream.currentReadPos = 0;
-
-    return drwav_init__internal(pWav, onChunk, pChunkUserData, flags);
-}
-
-
-static drwav_bool32 drwav_init_memory_write__internal(drwav* pWav, void** ppData, size_t* pDataSize, const drwav_data_format* pFormat, drwav_uint64 totalSampleCount, drwav_bool32 isSequential, const drwav_allocation_callbacks* pAllocationCallbacks)
-{
-    if (ppData == NULL || pDataSize == NULL) {
-        return DRWAV_FALSE;
-    }
-
-    *ppData = NULL; /* Important because we're using realloc()! */
-    *pDataSize = 0;
-
-    if (!drwav_preinit_write(pWav, pFormat, isSequential, drwav__on_write_memory, drwav__on_seek_memory_write, pWav, pAllocationCallbacks)) {
-        return DRWAV_FALSE;
-    }
-
-    pWav->memoryStreamWrite.ppData = ppData;
-    pWav->memoryStreamWrite.pDataSize = pDataSize;
-    pWav->memoryStreamWrite.dataSize = 0;
-    pWav->memoryStreamWrite.dataCapacity = 0;
-    pWav->memoryStreamWrite.currentWritePos = 0;
-
-    return drwav_init_write__internal(pWav, pFormat, totalSampleCount);
-}
-
-DRWAV_API drwav_bool32 drwav_init_memory_write(drwav* pWav, void** ppData, size_t* pDataSize, const drwav_data_format* pFormat, const drwav_allocation_callbacks* pAllocationCallbacks)
-{
-    return drwav_init_memory_write__internal(pWav, ppData, pDataSize, pFormat, 0, DRWAV_FALSE, pAllocationCallbacks);
-}
-
-DRWAV_API drwav_bool32 drwav_init_memory_write_sequential(drwav* pWav, void** ppData, size_t* pDataSize, const drwav_data_format* pFormat, drwav_uint64 totalSampleCount, const drwav_allocation_callbacks* pAllocationCallbacks)
-{
-    return drwav_init_memory_write__internal(pWav, ppData, pDataSize, pFormat, totalSampleCount, DRWAV_TRUE, pAllocationCallbacks);
-}
-
-DRWAV_API drwav_bool32 drwav_init_memory_write_sequential_pcm_frames(drwav* pWav, void** ppData, size_t* pDataSize, const drwav_data_format* pFormat, drwav_uint64 totalPCMFrameCount, const drwav_allocation_callbacks* pAllocationCallbacks)
-{
-    if (pFormat == NULL) {
-        return DRWAV_FALSE;
-    }
-
-    return drwav_init_memory_write_sequential(pWav, ppData, pDataSize, pFormat, totalPCMFrameCount*pFormat->channels, pAllocationCallbacks);
-}
-
-
-
-DRWAV_API drwav_result drwav_uninit(drwav* pWav)
-{
-    drwav_result result = DRWAV_SUCCESS;
-
-    if (pWav == NULL) {
-        return DRWAV_INVALID_ARGS;
-    }
-
-    /*
-    If the drwav object was opened in write mode we'll need to finalize a few things:
-      - Make sure the "data" chunk is aligned to 16-bits for RIFF containers, or 64 bits for W64 containers.
-      - Set the size of the "data" chunk.
-    */
-    if (pWav->onWrite != NULL) {
-        drwav_uint32 paddingSize = 0;
-
-        /* Padding. Do not adjust pWav->dataChunkDataSize - this should not include the padding. */
-        if (pWav->container == drwav_container_riff || pWav->container == drwav_container_rf64) {
-            paddingSize = drwav__chunk_padding_size_riff(pWav->dataChunkDataSize);
-        } else {
-            paddingSize = drwav__chunk_padding_size_w64(pWav->dataChunkDataSize);
-        }
-        
-        if (paddingSize > 0) {
-            drwav_uint64 paddingData = 0;
-            drwav__write(pWav, &paddingData, paddingSize);  /* Byte order does not matter for this. */
-        }
-
-        /*
-        Chunk sizes. When using sequential mode, these will have been filled in at initialization time. We only need
-        to do this when using non-sequential mode.
-        */
-        if (pWav->onSeek && !pWav->isSequentialWrite) {
-            if (pWav->container == drwav_container_riff) {
-                /* The "RIFF" chunk size. */
-                if (pWav->onSeek(pWav->pUserData, 4, drwav_seek_origin_start)) {
-                    drwav_uint32 riffChunkSize = drwav__riff_chunk_size_riff(pWav->dataChunkDataSize);
-                    drwav__write_u32ne_to_le(pWav, riffChunkSize);
-                }
-
-                /* the "data" chunk size. */
-                if (pWav->onSeek(pWav->pUserData, (int)pWav->dataChunkDataPos + 4, drwav_seek_origin_start)) {
-                    drwav_uint32 dataChunkSize = drwav__data_chunk_size_riff(pWav->dataChunkDataSize);
-                    drwav__write_u32ne_to_le(pWav, dataChunkSize);
-                }
-            } else if (pWav->container == drwav_container_w64) {
-                /* The "RIFF" chunk size. */
-                if (pWav->onSeek(pWav->pUserData, 16, drwav_seek_origin_start)) {
-                    drwav_uint64 riffChunkSize = drwav__riff_chunk_size_w64(pWav->dataChunkDataSize);
-                    drwav__write_u64ne_to_le(pWav, riffChunkSize);
-                }
-
-                /* The "data" chunk size. */
-                if (pWav->onSeek(pWav->pUserData, (int)pWav->dataChunkDataPos + 16, drwav_seek_origin_start)) {
-                    drwav_uint64 dataChunkSize = drwav__data_chunk_size_w64(pWav->dataChunkDataSize);
-                    drwav__write_u64ne_to_le(pWav, dataChunkSize);
-                }
-            } else if (pWav->container == drwav_container_rf64) {
-                /* We only need to update the ds64 chunk. The "RIFF" and "data" chunks always have their sizes set to 0xFFFFFFFF for RF64. */
-                int ds64BodyPos = 12 + 8;
-
-                /* The "RIFF" chunk size. */
-                if (pWav->onSeek(pWav->pUserData, ds64BodyPos + 0, drwav_seek_origin_start)) {
-                    drwav_uint64 riffChunkSize = drwav__riff_chunk_size_rf64(pWav->dataChunkDataSize);
-                    drwav__write_u64ne_to_le(pWav, riffChunkSize);
-                }
-
-                /* The "data" chunk size. */
-                if (pWav->onSeek(pWav->pUserData, ds64BodyPos + 8, drwav_seek_origin_start)) {
-                    drwav_uint64 dataChunkSize = drwav__data_chunk_size_rf64(pWav->dataChunkDataSize);
-                    drwav__write_u64ne_to_le(pWav, dataChunkSize);
-                }
-            }
-        }
-
-        /* Validation for sequential mode. */
-        if (pWav->isSequentialWrite) {
-            if (pWav->dataChunkDataSize != pWav->dataChunkDataSizeTargetWrite) {
-                result = DRWAV_INVALID_FILE;
-            }
-        }
-    }
-
-#ifndef DR_WAV_NO_STDIO
-    /*
-    If we opened the file with drwav_open_file() we will want to close the file handle. We can know whether or not drwav_open_file()
-    was used by looking at the onRead and onSeek callbacks.
-    */
-    if (pWav->onRead == drwav__on_read_stdio || pWav->onWrite == drwav__on_write_stdio) {
-        fclose((FILE*)pWav->pUserData);
-    }
-#endif
-
-    return result;
-}
-
-
-
-DRWAV_API size_t drwav_read_raw(drwav* pWav, size_t bytesToRead, void* pBufferOut)
-{
-    size_t bytesRead;
-
-    if (pWav == NULL || bytesToRead == 0) {
-        return 0;
-    }
-
-    if (bytesToRead > pWav->bytesRemaining) {
-        bytesToRead = (size_t)pWav->bytesRemaining;
-    }
-
-    if (pBufferOut != NULL) {
-        bytesRead = pWav->onRead(pWav->pUserData, pBufferOut, bytesToRead);
-    } else {
-        /* We need to seek. If we fail, we need to read-and-discard to make sure we get a good byte count. */
-        bytesRead = 0;
-        while (bytesRead < bytesToRead) {
-            size_t bytesToSeek = (bytesToRead - bytesRead);
-            if (bytesToSeek > 0x7FFFFFFF) {
-                bytesToSeek = 0x7FFFFFFF;
-            }
-
-            if (pWav->onSeek(pWav->pUserData, (int)bytesToSeek, drwav_seek_origin_current) == DRWAV_FALSE) {
-                break;
-            }
-
-            bytesRead += bytesToSeek;
-        }
-
-        /* When we get here we may need to read-and-discard some data. */
-        while (bytesRead < bytesToRead) {
-            drwav_uint8 buffer[4096];
-            size_t bytesSeeked;
-            size_t bytesToSeek = (bytesToRead - bytesRead);
-            if (bytesToSeek > sizeof(buffer)) {
-                bytesToSeek = sizeof(buffer);
-            }
-
-            bytesSeeked = pWav->onRead(pWav->pUserData, buffer, bytesToSeek);
-            bytesRead += bytesSeeked;
-
-            if (bytesSeeked < bytesToSeek) {
-                break;  /* Reached the end. */
-            }
-        }
-    }
-
-    pWav->bytesRemaining -= bytesRead;
-    return bytesRead;
-}
-
-
-
-DRWAV_API drwav_uint64 drwav_read_pcm_frames_le(drwav* pWav, drwav_uint64 framesToRead, void* pBufferOut)
-{
-    drwav_uint32 bytesPerFrame;
-    drwav_uint64 bytesToRead;   /* Intentionally uint64 instead of size_t so we can do a check that we're not reading too much on 32-bit builds. */
-
-    if (pWav == NULL || framesToRead == 0) {
-        return 0;
-    }
-
-    /* Cannot use this function for compressed formats. */
-    if (drwav__is_compressed_format_tag(pWav->translatedFormatTag)) {
-        return 0;
-    }
-
-    bytesPerFrame = drwav_get_bytes_per_pcm_frame(pWav);
-    if (bytesPerFrame == 0) {
-        return 0;
-    }
-
-    /* Don't try to read more samples than can potentially fit in the output buffer. */
-    bytesToRead = framesToRead * bytesPerFrame;
-    if (bytesToRead > DRWAV_SIZE_MAX) {
-        bytesToRead = (DRWAV_SIZE_MAX / bytesPerFrame) * bytesPerFrame; /* Round the number of bytes to read to a clean frame boundary. */
-    }
-
-    /*
-    Doing an explicit check here just to make it clear that we don't want to be attempt to read anything if there's no bytes to read. There
-    *could* be a time where it evaluates to 0 due to overflowing.
-    */
-    if (bytesToRead == 0) {
-        return 0;
-    }
-
-    return drwav_read_raw(pWav, (size_t)bytesToRead, pBufferOut) / bytesPerFrame;
-}
-
-DRWAV_API drwav_uint64 drwav_read_pcm_frames_be(drwav* pWav, drwav_uint64 framesToRead, void* pBufferOut)
-{
-    drwav_uint64 framesRead = drwav_read_pcm_frames_le(pWav, framesToRead, pBufferOut);
-
-    if (pBufferOut != NULL) {
-        drwav__bswap_samples(pBufferOut, framesRead*pWav->channels, drwav_get_bytes_per_pcm_frame(pWav)/pWav->channels, pWav->translatedFormatTag);
-    }
-
-    return framesRead;
-}
-
-DRWAV_API drwav_uint64 drwav_read_pcm_frames(drwav* pWav, drwav_uint64 framesToRead, void* pBufferOut)
-{
-    if (drwav__is_little_endian()) {
-        return drwav_read_pcm_frames_le(pWav, framesToRead, pBufferOut);
-    } else {
-        return drwav_read_pcm_frames_be(pWav, framesToRead, pBufferOut);
-    }
-}
-
-
-
-DRWAV_API drwav_bool32 drwav_seek_to_first_pcm_frame(drwav* pWav)
-{
-    if (pWav->onWrite != NULL) {
-        return DRWAV_FALSE; /* No seeking in write mode. */
-    }
-
-    if (!pWav->onSeek(pWav->pUserData, (int)pWav->dataChunkDataPos, drwav_seek_origin_start)) {
-        return DRWAV_FALSE;
-    }
-
-    if (drwav__is_compressed_format_tag(pWav->translatedFormatTag)) {
-        pWav->compressed.iCurrentPCMFrame = 0;
-
-        /* Cached data needs to be cleared for compressed formats. */
-        if (pWav->translatedFormatTag == DR_WAVE_FORMAT_ADPCM) {
-            DRWAV_ZERO_OBJECT(&pWav->msadpcm);
-        } else if (pWav->translatedFormatTag == DR_WAVE_FORMAT_DVI_ADPCM) {
-            DRWAV_ZERO_OBJECT(&pWav->ima);
-        } else {
-            DRWAV_ASSERT(DRWAV_FALSE);  /* If this assertion is triggered it means I've implemented a new compressed format but forgot to add a branch for it here. */
-        }
-    }
-    
-    pWav->bytesRemaining = pWav->dataChunkDataSize;
-    return DRWAV_TRUE;
-}
-
-DRWAV_API drwav_bool32 drwav_seek_to_pcm_frame(drwav* pWav, drwav_uint64 targetFrameIndex)
-{
-    /* Seeking should be compatible with wave files > 2GB. */
-
-    if (pWav == NULL || pWav->onSeek == NULL) {
-        return DRWAV_FALSE;
-    }
-
-    /* No seeking in write mode. */
-    if (pWav->onWrite != NULL) {
-        return DRWAV_FALSE;
-    }
-
-    /* If there are no samples, just return DRWAV_TRUE without doing anything. */
-    if (pWav->totalPCMFrameCount == 0) {
-        return DRWAV_TRUE;
-    }
-
-    /* Make sure the sample is clamped. */
-    if (targetFrameIndex >= pWav->totalPCMFrameCount) {
-        targetFrameIndex  = pWav->totalPCMFrameCount - 1;
-    }
-
-    /*
-    For compressed formats we just use a slow generic seek. If we are seeking forward we just seek forward. If we are going backwards we need
-    to seek back to the start.
-    */
-    if (drwav__is_compressed_format_tag(pWav->translatedFormatTag)) {
-        /* TODO: This can be optimized. */
-        
-        /*
-        If we're seeking forward it's simple - just keep reading samples until we hit the sample we're requesting. If we're seeking backwards,
-        we first need to seek back to the start and then just do the same thing as a forward seek.
-        */
-        if (targetFrameIndex < pWav->compressed.iCurrentPCMFrame) {
-            if (!drwav_seek_to_first_pcm_frame(pWav)) {
-                return DRWAV_FALSE;
-            }
-        }
-
-        if (targetFrameIndex > pWav->compressed.iCurrentPCMFrame) {
-            drwav_uint64 offsetInFrames = targetFrameIndex - pWav->compressed.iCurrentPCMFrame;
-
-            drwav_int16 devnull[2048];
-            while (offsetInFrames > 0) {
-                drwav_uint64 framesRead = 0;
-                drwav_uint64 framesToRead = offsetInFrames;
-                if (framesToRead > drwav_countof(devnull)/pWav->channels) {
-                    framesToRead = drwav_countof(devnull)/pWav->channels;
-                }
-
-                if (pWav->translatedFormatTag == DR_WAVE_FORMAT_ADPCM) {
-                    framesRead = drwav_read_pcm_frames_s16__msadpcm(pWav, framesToRead, devnull);
-                } else if (pWav->translatedFormatTag == DR_WAVE_FORMAT_DVI_ADPCM) {
-                    framesRead = drwav_read_pcm_frames_s16__ima(pWav, framesToRead, devnull);
-                } else {
-                    DRWAV_ASSERT(DRWAV_FALSE);  /* If this assertion is triggered it means I've implemented a new compressed format but forgot to add a branch for it here. */
-                }
-
-                if (framesRead != framesToRead) {
-                    return DRWAV_FALSE;
-                }
-
-                offsetInFrames -= framesRead;
-            }
-        }
-    } else {
-        drwav_uint64 totalSizeInBytes;
-        drwav_uint64 currentBytePos;
-        drwav_uint64 targetBytePos;
-        drwav_uint64 offset;
-
-        totalSizeInBytes = pWav->totalPCMFrameCount * drwav_get_bytes_per_pcm_frame(pWav);
-        DRWAV_ASSERT(totalSizeInBytes >= pWav->bytesRemaining);
-
-        currentBytePos = totalSizeInBytes - pWav->bytesRemaining;
-        targetBytePos  = targetFrameIndex * drwav_get_bytes_per_pcm_frame(pWav);
-
-        if (currentBytePos < targetBytePos) {
-            /* Offset forwards. */
-            offset = (targetBytePos - currentBytePos);
-        } else {
-            /* Offset backwards. */
-            if (!drwav_seek_to_first_pcm_frame(pWav)) {
-                return DRWAV_FALSE;
-            }
-            offset = targetBytePos;
-        }
-
-        while (offset > 0) {
-            int offset32 = ((offset > INT_MAX) ? INT_MAX : (int)offset);
-            if (!pWav->onSeek(pWav->pUserData, offset32, drwav_seek_origin_current)) {
-                return DRWAV_FALSE;
-            }
-
-            pWav->bytesRemaining -= offset32;
-            offset -= offset32;
-        }
-    }
-
-    return DRWAV_TRUE;
-}
-
-
-DRWAV_API size_t drwav_write_raw(drwav* pWav, size_t bytesToWrite, const void* pData)
-{
-    size_t bytesWritten;
-
-    if (pWav == NULL || bytesToWrite == 0 || pData == NULL) {
-        return 0;
-    }
-
-    bytesWritten = pWav->onWrite(pWav->pUserData, pData, bytesToWrite);
-    pWav->dataChunkDataSize += bytesWritten;
-
-    return bytesWritten;
-}
-
-
-DRWAV_API drwav_uint64 drwav_write_pcm_frames_le(drwav* pWav, drwav_uint64 framesToWrite, const void* pData)
-{
-    drwav_uint64 bytesToWrite;
-    drwav_uint64 bytesWritten;
-    const drwav_uint8* pRunningData;
-
-    if (pWav == NULL || framesToWrite == 0 || pData == NULL) {
-        return 0;
-    }
-
-    bytesToWrite = ((framesToWrite * pWav->channels * pWav->bitsPerSample) / 8);
-    if (bytesToWrite > DRWAV_SIZE_MAX) {
-        return 0;
-    }
-
-    bytesWritten = 0;
-    pRunningData = (const drwav_uint8*)pData;
-
-    while (bytesToWrite > 0) {
-        size_t bytesJustWritten;
-        drwav_uint64 bytesToWriteThisIteration;
-
-        bytesToWriteThisIteration = bytesToWrite;
-        DRWAV_ASSERT(bytesToWriteThisIteration <= DRWAV_SIZE_MAX);  /* <-- This is checked above. */
-
-        bytesJustWritten = drwav_write_raw(pWav, (size_t)bytesToWriteThisIteration, pRunningData);
-        if (bytesJustWritten == 0) {
-            break;
-        }
-
-        bytesToWrite -= bytesJustWritten;
-        bytesWritten += bytesJustWritten;
-        pRunningData += bytesJustWritten;
-    }
-
-    return (bytesWritten * 8) / pWav->bitsPerSample / pWav->channels;
-}
-
-DRWAV_API drwav_uint64 drwav_write_pcm_frames_be(drwav* pWav, drwav_uint64 framesToWrite, const void* pData)
-{
-    drwav_uint64 bytesToWrite;
-    drwav_uint64 bytesWritten;
-    drwav_uint32 bytesPerSample;
-    const drwav_uint8* pRunningData;
-
-    if (pWav == NULL || framesToWrite == 0 || pData == NULL) {
-        return 0;
-    }
-
-    bytesToWrite = ((framesToWrite * pWav->channels * pWav->bitsPerSample) / 8);
-    if (bytesToWrite > DRWAV_SIZE_MAX) {
-        return 0;
-    }
-
-    bytesWritten = 0;
-    pRunningData = (const drwav_uint8*)pData;
-
-    bytesPerSample = drwav_get_bytes_per_pcm_frame(pWav) / pWav->channels;
-    
-    while (bytesToWrite > 0) {
-        drwav_uint8 temp[4096];
-        drwav_uint32 sampleCount;
-        size_t bytesJustWritten;
-        drwav_uint64 bytesToWriteThisIteration;
-
-        bytesToWriteThisIteration = bytesToWrite;
-        DRWAV_ASSERT(bytesToWriteThisIteration <= DRWAV_SIZE_MAX);  /* <-- This is checked above. */
-
-        /*
-        WAV files are always little-endian. We need to byte swap on big-endian architectures. Since our input buffer is read-only we need
-        to use an intermediary buffer for the conversion.
-        */
-        sampleCount = sizeof(temp)/bytesPerSample;
-
-        if (bytesToWriteThisIteration > ((drwav_uint64)sampleCount)*bytesPerSample) {
-            bytesToWriteThisIteration = ((drwav_uint64)sampleCount)*bytesPerSample;
-        }
-
-        DRWAV_COPY_MEMORY(temp, pRunningData, (size_t)bytesToWriteThisIteration);
-        drwav__bswap_samples(temp, sampleCount, bytesPerSample, pWav->translatedFormatTag);
-
-        bytesJustWritten = drwav_write_raw(pWav, (size_t)bytesToWriteThisIteration, temp);
-        if (bytesJustWritten == 0) {
-            break;
-        }
-
-        bytesToWrite -= bytesJustWritten;
-        bytesWritten += bytesJustWritten;
-        pRunningData += bytesJustWritten;
-    }
-
-    return (bytesWritten * 8) / pWav->bitsPerSample / pWav->channels;
-}
-
-DRWAV_API drwav_uint64 drwav_write_pcm_frames(drwav* pWav, drwav_uint64 framesToWrite, const void* pData)
-{
-    if (drwav__is_little_endian()) {
-        return drwav_write_pcm_frames_le(pWav, framesToWrite, pData);
-    } else {
-        return drwav_write_pcm_frames_be(pWav, framesToWrite, pData);
-    }
-}
-
-
-static drwav_uint64 drwav_read_pcm_frames_s16__msadpcm(drwav* pWav, drwav_uint64 framesToRead, drwav_int16* pBufferOut)
-{
-    drwav_uint64 totalFramesRead = 0;
-
-    DRWAV_ASSERT(pWav != NULL);
-    DRWAV_ASSERT(framesToRead > 0);
-
-    /* TODO: Lots of room for optimization here. */
-
-    while (framesToRead > 0 && pWav->compressed.iCurrentPCMFrame < pWav->totalPCMFrameCount) {
-        /* If there are no cached frames we need to load a new block. */
-        if (pWav->msadpcm.cachedFrameCount == 0 && pWav->msadpcm.bytesRemainingInBlock == 0) {
-            if (pWav->channels == 1) {
-                /* Mono. */
-                drwav_uint8 header[7];
-                if (pWav->onRead(pWav->pUserData, header, sizeof(header)) != sizeof(header)) {
-                    return totalFramesRead;
-                }
-                pWav->msadpcm.bytesRemainingInBlock = pWav->fmt.blockAlign - sizeof(header);
-
-                pWav->msadpcm.predictor[0]     = header[0];
-                pWav->msadpcm.delta[0]         = drwav__bytes_to_s16(header + 1);
-                pWav->msadpcm.prevFrames[0][1] = (drwav_int32)drwav__bytes_to_s16(header + 3);
-                pWav->msadpcm.prevFrames[0][0] = (drwav_int32)drwav__bytes_to_s16(header + 5);
-                pWav->msadpcm.cachedFrames[2]  = pWav->msadpcm.prevFrames[0][0];
-                pWav->msadpcm.cachedFrames[3]  = pWav->msadpcm.prevFrames[0][1];
-                pWav->msadpcm.cachedFrameCount = 2;
-            } else {
-                /* Stereo. */
-                drwav_uint8 header[14];
-                if (pWav->onRead(pWav->pUserData, header, sizeof(header)) != sizeof(header)) {
-                    return totalFramesRead;
-                }
-                pWav->msadpcm.bytesRemainingInBlock = pWav->fmt.blockAlign - sizeof(header);
-
-                pWav->msadpcm.predictor[0] = header[0];
-                pWav->msadpcm.predictor[1] = header[1];
-                pWav->msadpcm.delta[0] = drwav__bytes_to_s16(header + 2);
-                pWav->msadpcm.delta[1] = drwav__bytes_to_s16(header + 4);
-                pWav->msadpcm.prevFrames[0][1] = (drwav_int32)drwav__bytes_to_s16(header + 6);
-                pWav->msadpcm.prevFrames[1][1] = (drwav_int32)drwav__bytes_to_s16(header + 8);
-                pWav->msadpcm.prevFrames[0][0] = (drwav_int32)drwav__bytes_to_s16(header + 10);
-                pWav->msadpcm.prevFrames[1][0] = (drwav_int32)drwav__bytes_to_s16(header + 12);
-
-                pWav->msadpcm.cachedFrames[0] = pWav->msadpcm.prevFrames[0][0];
-                pWav->msadpcm.cachedFrames[1] = pWav->msadpcm.prevFrames[1][0];
-                pWav->msadpcm.cachedFrames[2] = pWav->msadpcm.prevFrames[0][1];
-                pWav->msadpcm.cachedFrames[3] = pWav->msadpcm.prevFrames[1][1];
-                pWav->msadpcm.cachedFrameCount = 2;
-            }
-        }
-
-        /* Output anything that's cached. */
-        while (framesToRead > 0 && pWav->msadpcm.cachedFrameCount > 0 && pWav->compressed.iCurrentPCMFrame < pWav->totalPCMFrameCount) {
-            if (pBufferOut != NULL) {
-                drwav_uint32 iSample = 0;
-                for (iSample = 0; iSample < pWav->channels; iSample += 1) {
-                    pBufferOut[iSample] = (drwav_int16)pWav->msadpcm.cachedFrames[(drwav_countof(pWav->msadpcm.cachedFrames) - (pWav->msadpcm.cachedFrameCount*pWav->channels)) + iSample];
-                }
-
-                pBufferOut += pWav->channels;
-            }
-
-            framesToRead    -= 1;
-            totalFramesRead += 1;
-            pWav->compressed.iCurrentPCMFrame += 1;
-            pWav->msadpcm.cachedFrameCount -= 1;
-        }
-
-        if (framesToRead == 0) {
-            return totalFramesRead;
-        }
-
-
-        /*
-        If there's nothing left in the cache, just go ahead and load more. If there's nothing left to load in the current block we just continue to the next
-        loop iteration which will trigger the loading of a new block.
-        */
-        if (pWav->msadpcm.cachedFrameCount == 0) {
-            if (pWav->msadpcm.bytesRemainingInBlock == 0) {
-                continue;
-            } else {
-                static drwav_int32 adaptationTable[] = { 
-                    230, 230, 230, 230, 307, 409, 512, 614, 
-                    768, 614, 512, 409, 307, 230, 230, 230 
-                };
-                static drwav_int32 coeff1Table[] = { 256, 512, 0, 192, 240, 460,  392 };
-                static drwav_int32 coeff2Table[] = { 0,  -256, 0, 64,  0,  -208, -232 };
-
-                drwav_uint8 nibbles;
-                drwav_int32 nibble0;
-                drwav_int32 nibble1;
-
-                if (pWav->onRead(pWav->pUserData, &nibbles, 1) != 1) {
-                    return totalFramesRead;
-                }
-                pWav->msadpcm.bytesRemainingInBlock -= 1;
-
-                /* TODO: Optimize away these if statements. */
-                nibble0 = ((nibbles & 0xF0) >> 4); if ((nibbles & 0x80)) { nibble0 |= 0xFFFFFFF0UL; }
-                nibble1 = ((nibbles & 0x0F) >> 0); if ((nibbles & 0x08)) { nibble1 |= 0xFFFFFFF0UL; }
-
-                if (pWav->channels == 1) {
-                    /* Mono. */
-                    drwav_int32 newSample0;
-                    drwav_int32 newSample1;
-
-                    newSample0  = ((pWav->msadpcm.prevFrames[0][1] * coeff1Table[pWav->msadpcm.predictor[0]]) + (pWav->msadpcm.prevFrames[0][0] * coeff2Table[pWav->msadpcm.predictor[0]])) >> 8;
-                    newSample0 += nibble0 * pWav->msadpcm.delta[0];
-                    newSample0  = drwav_clamp(newSample0, -32768, 32767);
-
-                    pWav->msadpcm.delta[0] = (adaptationTable[((nibbles & 0xF0) >> 4)] * pWav->msadpcm.delta[0]) >> 8;
-                    if (pWav->msadpcm.delta[0] < 16) {
-                        pWav->msadpcm.delta[0] = 16;
-                    }
-
-                    pWav->msadpcm.prevFrames[0][0] = pWav->msadpcm.prevFrames[0][1];
-                    pWav->msadpcm.prevFrames[0][1] = newSample0;
-
-
-                    newSample1  = ((pWav->msadpcm.prevFrames[0][1] * coeff1Table[pWav->msadpcm.predictor[0]]) + (pWav->msadpcm.prevFrames[0][0] * coeff2Table[pWav->msadpcm.predictor[0]])) >> 8;
-                    newSample1 += nibble1 * pWav->msadpcm.delta[0];
-                    newSample1  = drwav_clamp(newSample1, -32768, 32767);
-
-                    pWav->msadpcm.delta[0] = (adaptationTable[((nibbles & 0x0F) >> 0)] * pWav->msadpcm.delta[0]) >> 8;
-                    if (pWav->msadpcm.delta[0] < 16) {
-                        pWav->msadpcm.delta[0] = 16;
-                    }
-
-                    pWav->msadpcm.prevFrames[0][0] = pWav->msadpcm.prevFrames[0][1];
-                    pWav->msadpcm.prevFrames[0][1] = newSample1;
-
-
-                    pWav->msadpcm.cachedFrames[2] = newSample0;
-                    pWav->msadpcm.cachedFrames[3] = newSample1;
-                    pWav->msadpcm.cachedFrameCount = 2;
-                } else {
-                    /* Stereo. */
-                    drwav_int32 newSample0;
-                    drwav_int32 newSample1;
-
-                    /* Left. */
-                    newSample0  = ((pWav->msadpcm.prevFrames[0][1] * coeff1Table[pWav->msadpcm.predictor[0]]) + (pWav->msadpcm.prevFrames[0][0] * coeff2Table[pWav->msadpcm.predictor[0]])) >> 8;
-                    newSample0 += nibble0 * pWav->msadpcm.delta[0];
-                    newSample0  = drwav_clamp(newSample0, -32768, 32767);
-
-                    pWav->msadpcm.delta[0] = (adaptationTable[((nibbles & 0xF0) >> 4)] * pWav->msadpcm.delta[0]) >> 8;
-                    if (pWav->msadpcm.delta[0] < 16) {
-                        pWav->msadpcm.delta[0] = 16;
-                    }
-
-                    pWav->msadpcm.prevFrames[0][0] = pWav->msadpcm.prevFrames[0][1];
-                    pWav->msadpcm.prevFrames[0][1] = newSample0;
-
-
-                    /* Right. */
-                    newSample1  = ((pWav->msadpcm.prevFrames[1][1] * coeff1Table[pWav->msadpcm.predictor[1]]) + (pWav->msadpcm.prevFrames[1][0] * coeff2Table[pWav->msadpcm.predictor[1]])) >> 8;
-                    newSample1 += nibble1 * pWav->msadpcm.delta[1];
-                    newSample1  = drwav_clamp(newSample1, -32768, 32767);
-
-                    pWav->msadpcm.delta[1] = (adaptationTable[((nibbles & 0x0F) >> 0)] * pWav->msadpcm.delta[1]) >> 8;
-                    if (pWav->msadpcm.delta[1] < 16) {
-                        pWav->msadpcm.delta[1] = 16;
-                    }
-
-                    pWav->msadpcm.prevFrames[1][0] = pWav->msadpcm.prevFrames[1][1];
-                    pWav->msadpcm.prevFrames[1][1] = newSample1;
-
-                    pWav->msadpcm.cachedFrames[2] = newSample0;
-                    pWav->msadpcm.cachedFrames[3] = newSample1;
-                    pWav->msadpcm.cachedFrameCount = 1;
-                }
-            }
-        }
-    }
-
-    return totalFramesRead;
-}
-
-
-static drwav_uint64 drwav_read_pcm_frames_s16__ima(drwav* pWav, drwav_uint64 framesToRead, drwav_int16* pBufferOut)
-{
-    drwav_uint64 totalFramesRead = 0;
-    drwav_uint32 iChannel;
-
-    static drwav_int32 indexTable[16] = {
-        -1, -1, -1, -1, 2, 4, 6, 8,
-        -1, -1, -1, -1, 2, 4, 6, 8
-    };
-
-    static drwav_int32 stepTable[89] = {
-        7,     8,     9,     10,    11,    12,    13,    14,    16,    17, 
-        19,    21,    23,    25,    28,    31,    34,    37,    41,    45, 
-        50,    55,    60,    66,    73,    80,    88,    97,    107,   118, 
-        130,   143,   157,   173,   190,   209,   230,   253,   279,   307,
-        337,   371,   408,   449,   494,   544,   598,   658,   724,   796,
-        876,   963,   1060,  1166,  1282,  1411,  1552,  1707,  1878,  2066, 
-        2272,  2499,  2749,  3024,  3327,  3660,  4026,  4428,  4871,  5358,
-        5894,  6484,  7132,  7845,  8630,  9493,  10442, 11487, 12635, 13899, 
-        15289, 16818, 18500, 20350, 22385, 24623, 27086, 29794, 32767 
-    };
-
-    DRWAV_ASSERT(pWav != NULL);
-    DRWAV_ASSERT(framesToRead > 0);
-
-    /* TODO: Lots of room for optimization here. */
-
-    while (framesToRead > 0 && pWav->compressed.iCurrentPCMFrame < pWav->totalPCMFrameCount) {
-        /* If there are no cached samples we need to load a new block. */
-        if (pWav->ima.cachedFrameCount == 0 && pWav->ima.bytesRemainingInBlock == 0) {
-            if (pWav->channels == 1) {
-                /* Mono. */
-                drwav_uint8 header[4];
-                if (pWav->onRead(pWav->pUserData, header, sizeof(header)) != sizeof(header)) {
-                    return totalFramesRead;
-                }
-                pWav->ima.bytesRemainingInBlock = pWav->fmt.blockAlign - sizeof(header);
-
-                if (header[2] >= drwav_countof(stepTable)) {
-                    pWav->onSeek(pWav->pUserData, pWav->ima.bytesRemainingInBlock, drwav_seek_origin_current);
-                    pWav->ima.bytesRemainingInBlock = 0;
-                    return totalFramesRead; /* Invalid data. */
-                }
-
-                pWav->ima.predictor[0] = drwav__bytes_to_s16(header + 0);
-                pWav->ima.stepIndex[0] = header[2];
-                pWav->ima.cachedFrames[drwav_countof(pWav->ima.cachedFrames) - 1] = pWav->ima.predictor[0];
-                pWav->ima.cachedFrameCount = 1;
-            } else {
-                /* Stereo. */
-                drwav_uint8 header[8];
-                if (pWav->onRead(pWav->pUserData, header, sizeof(header)) != sizeof(header)) {
-                    return totalFramesRead;
-                }
-                pWav->ima.bytesRemainingInBlock = pWav->fmt.blockAlign - sizeof(header);
-
-                if (header[2] >= drwav_countof(stepTable) || header[6] >= drwav_countof(stepTable)) {
-                    pWav->onSeek(pWav->pUserData, pWav->ima.bytesRemainingInBlock, drwav_seek_origin_current);
-                    pWav->ima.bytesRemainingInBlock = 0;
-                    return totalFramesRead; /* Invalid data. */
-                }
-
-                pWav->ima.predictor[0] = drwav__bytes_to_s16(header + 0);
-                pWav->ima.stepIndex[0] = header[2];
-                pWav->ima.predictor[1] = drwav__bytes_to_s16(header + 4);
-                pWav->ima.stepIndex[1] = header[6];
-
-                pWav->ima.cachedFrames[drwav_countof(pWav->ima.cachedFrames) - 2] = pWav->ima.predictor[0];
-                pWav->ima.cachedFrames[drwav_countof(pWav->ima.cachedFrames) - 1] = pWav->ima.predictor[1];
-                pWav->ima.cachedFrameCount = 1;
-            }
-        }
-
-        /* Output anything that's cached. */
-        while (framesToRead > 0 && pWav->ima.cachedFrameCount > 0 && pWav->compressed.iCurrentPCMFrame < pWav->totalPCMFrameCount) {
-            if (pBufferOut != NULL) {
-                drwav_uint32 iSample;
-                for (iSample = 0; iSample < pWav->channels; iSample += 1) {
-                    pBufferOut[iSample] = (drwav_int16)pWav->ima.cachedFrames[(drwav_countof(pWav->ima.cachedFrames) - (pWav->ima.cachedFrameCount*pWav->channels)) + iSample];
-                }
-                pBufferOut += pWav->channels;
-            }
-
-            framesToRead    -= 1;
-            totalFramesRead += 1;
-            pWav->compressed.iCurrentPCMFrame += 1;
-            pWav->ima.cachedFrameCount -= 1;
-        }
-
-        if (framesToRead == 0) {
-            return totalFramesRead;
-        }
-
-        /*
-        If there's nothing left in the cache, just go ahead and load more. If there's nothing left to load in the current block we just continue to the next
-        loop iteration which will trigger the loading of a new block.
-        */
-        if (pWav->ima.cachedFrameCount == 0) {
-            if (pWav->ima.bytesRemainingInBlock == 0) {
-                continue;
-            } else {
-                /*
-                From what I can tell with stereo streams, it looks like every 4 bytes (8 samples) is for one channel. So it goes 4 bytes for the
-                left channel, 4 bytes for the right channel.
-                */
-                pWav->ima.cachedFrameCount = 8;
-                for (iChannel = 0; iChannel < pWav->channels; ++iChannel) {
-                    drwav_uint32 iByte;
-                    drwav_uint8 nibbles[4];
-                    if (pWav->onRead(pWav->pUserData, &nibbles, 4) != 4) {
-                        pWav->ima.cachedFrameCount = 0;
-                        return totalFramesRead;
-                    }
-                    pWav->ima.bytesRemainingInBlock -= 4;
-
-                    for (iByte = 0; iByte < 4; ++iByte) {
-                        drwav_uint8 nibble0 = ((nibbles[iByte] & 0x0F) >> 0);
-                        drwav_uint8 nibble1 = ((nibbles[iByte] & 0xF0) >> 4);
-
-                        drwav_int32 step      = stepTable[pWav->ima.stepIndex[iChannel]];
-                        drwav_int32 predictor = pWav->ima.predictor[iChannel];
-
-                        drwav_int32      diff  = step >> 3;
-                        if (nibble0 & 1) diff += step >> 2;
-                        if (nibble0 & 2) diff += step >> 1;
-                        if (nibble0 & 4) diff += step;
-                        if (nibble0 & 8) diff  = -diff;
-
-                        predictor = drwav_clamp(predictor + diff, -32768, 32767);
-                        pWav->ima.predictor[iChannel] = predictor;
-                        pWav->ima.stepIndex[iChannel] = drwav_clamp(pWav->ima.stepIndex[iChannel] + indexTable[nibble0], 0, (drwav_int32)drwav_countof(stepTable)-1);
-                        pWav->ima.cachedFrames[(drwav_countof(pWav->ima.cachedFrames) - (pWav->ima.cachedFrameCount*pWav->channels)) + (iByte*2+0)*pWav->channels + iChannel] = predictor;
-
-
-                        step      = stepTable[pWav->ima.stepIndex[iChannel]];
-                        predictor = pWav->ima.predictor[iChannel];
-
-                                         diff  = step >> 3;
-                        if (nibble1 & 1) diff += step >> 2;
-                        if (nibble1 & 2) diff += step >> 1;
-                        if (nibble1 & 4) diff += step;
-                        if (nibble1 & 8) diff  = -diff;
-
-                        predictor = drwav_clamp(predictor + diff, -32768, 32767);
-                        pWav->ima.predictor[iChannel] = predictor;
-                        pWav->ima.stepIndex[iChannel] = drwav_clamp(pWav->ima.stepIndex[iChannel] + indexTable[nibble1], 0, (drwav_int32)drwav_countof(stepTable)-1);
-                        pWav->ima.cachedFrames[(drwav_countof(pWav->ima.cachedFrames) - (pWav->ima.cachedFrameCount*pWav->channels)) + (iByte*2+1)*pWav->channels + iChannel] = predictor;
-                    }
-                }
-            }
-        }
-    }
-
-    return totalFramesRead;
-}
-
-
-#ifndef DR_WAV_NO_CONVERSION_API
-static unsigned short g_drwavAlawTable[256] = {
-    0xEA80, 0xEB80, 0xE880, 0xE980, 0xEE80, 0xEF80, 0xEC80, 0xED80, 0xE280, 0xE380, 0xE080, 0xE180, 0xE680, 0xE780, 0xE480, 0xE580, 
-    0xF540, 0xF5C0, 0xF440, 0xF4C0, 0xF740, 0xF7C0, 0xF640, 0xF6C0, 0xF140, 0xF1C0, 0xF040, 0xF0C0, 0xF340, 0xF3C0, 0xF240, 0xF2C0, 
-    0xAA00, 0xAE00, 0xA200, 0xA600, 0xBA00, 0xBE00, 0xB200, 0xB600, 0x8A00, 0x8E00, 0x8200, 0x8600, 0x9A00, 0x9E00, 0x9200, 0x9600, 
-    0xD500, 0xD700, 0xD100, 0xD300, 0xDD00, 0xDF00, 0xD900, 0xDB00, 0xC500, 0xC700, 0xC100, 0xC300, 0xCD00, 0xCF00, 0xC900, 0xCB00, 
-    0xFEA8, 0xFEB8, 0xFE88, 0xFE98, 0xFEE8, 0xFEF8, 0xFEC8, 0xFED8, 0xFE28, 0xFE38, 0xFE08, 0xFE18, 0xFE68, 0xFE78, 0xFE48, 0xFE58, 
-    0xFFA8, 0xFFB8, 0xFF88, 0xFF98, 0xFFE8, 0xFFF8, 0xFFC8, 0xFFD8, 0xFF28, 0xFF38, 0xFF08, 0xFF18, 0xFF68, 0xFF78, 0xFF48, 0xFF58, 
-    0xFAA0, 0xFAE0, 0xFA20, 0xFA60, 0xFBA0, 0xFBE0, 0xFB20, 0xFB60, 0xF8A0, 0xF8E0, 0xF820, 0xF860, 0xF9A0, 0xF9E0, 0xF920, 0xF960, 
-    0xFD50, 0xFD70, 0xFD10, 0xFD30, 0xFDD0, 0xFDF0, 0xFD90, 0xFDB0, 0xFC50, 0xFC70, 0xFC10, 0xFC30, 0xFCD0, 0xFCF0, 0xFC90, 0xFCB0, 
-    0x1580, 0x1480, 0x1780, 0x1680, 0x1180, 0x1080, 0x1380, 0x1280, 0x1D80, 0x1C80, 0x1F80, 0x1E80, 0x1980, 0x1880, 0x1B80, 0x1A80, 
-    0x0AC0, 0x0A40, 0x0BC0, 0x0B40, 0x08C0, 0x0840, 0x09C0, 0x0940, 0x0EC0, 0x0E40, 0x0FC0, 0x0F40, 0x0CC0, 0x0C40, 0x0DC0, 0x0D40, 
-    0x5600, 0x5200, 0x5E00, 0x5A00, 0x4600, 0x4200, 0x4E00, 0x4A00, 0x7600, 0x7200, 0x7E00, 0x7A00, 0x6600, 0x6200, 0x6E00, 0x6A00, 
-    0x2B00, 0x2900, 0x2F00, 0x2D00, 0x2300, 0x2100, 0x2700, 0x2500, 0x3B00, 0x3900, 0x3F00, 0x3D00, 0x3300, 0x3100, 0x3700, 0x3500, 
-    0x0158, 0x0148, 0x0178, 0x0168, 0x0118, 0x0108, 0x0138, 0x0128, 0x01D8, 0x01C8, 0x01F8, 0x01E8, 0x0198, 0x0188, 0x01B8, 0x01A8, 
-    0x0058, 0x0048, 0x0078, 0x0068, 0x0018, 0x0008, 0x0038, 0x0028, 0x00D8, 0x00C8, 0x00F8, 0x00E8, 0x0098, 0x0088, 0x00B8, 0x00A8, 
-    0x0560, 0x0520, 0x05E0, 0x05A0, 0x0460, 0x0420, 0x04E0, 0x04A0, 0x0760, 0x0720, 0x07E0, 0x07A0, 0x0660, 0x0620, 0x06E0, 0x06A0, 
-    0x02B0, 0x0290, 0x02F0, 0x02D0, 0x0230, 0x0210, 0x0270, 0x0250, 0x03B0, 0x0390, 0x03F0, 0x03D0, 0x0330, 0x0310, 0x0370, 0x0350
-};
-
-static unsigned short g_drwavMulawTable[256] = {
-    0x8284, 0x8684, 0x8A84, 0x8E84, 0x9284, 0x9684, 0x9A84, 0x9E84, 0xA284, 0xA684, 0xAA84, 0xAE84, 0xB284, 0xB684, 0xBA84, 0xBE84, 
-    0xC184, 0xC384, 0xC584, 0xC784, 0xC984, 0xCB84, 0xCD84, 0xCF84, 0xD184, 0xD384, 0xD584, 0xD784, 0xD984, 0xDB84, 0xDD84, 0xDF84, 
-    0xE104, 0xE204, 0xE304, 0xE404, 0xE504, 0xE604, 0xE704, 0xE804, 0xE904, 0xEA04, 0xEB04, 0xEC04, 0xED04, 0xEE04, 0xEF04, 0xF004, 
-    0xF0C4, 0xF144, 0xF1C4, 0xF244, 0xF2C4, 0xF344, 0xF3C4, 0xF444, 0xF4C4, 0xF544, 0xF5C4, 0xF644, 0xF6C4, 0xF744, 0xF7C4, 0xF844, 
-    0xF8A4, 0xF8E4, 0xF924, 0xF964, 0xF9A4, 0xF9E4, 0xFA24, 0xFA64, 0xFAA4, 0xFAE4, 0xFB24, 0xFB64, 0xFBA4, 0xFBE4, 0xFC24, 0xFC64, 
-    0xFC94, 0xFCB4, 0xFCD4, 0xFCF4, 0xFD14, 0xFD34, 0xFD54, 0xFD74, 0xFD94, 0xFDB4, 0xFDD4, 0xFDF4, 0xFE14, 0xFE34, 0xFE54, 0xFE74, 
-    0xFE8C, 0xFE9C, 0xFEAC, 0xFEBC, 0xFECC, 0xFEDC, 0xFEEC, 0xFEFC, 0xFF0C, 0xFF1C, 0xFF2C, 0xFF3C, 0xFF4C, 0xFF5C, 0xFF6C, 0xFF7C, 
-    0xFF88, 0xFF90, 0xFF98, 0xFFA0, 0xFFA8, 0xFFB0, 0xFFB8, 0xFFC0, 0xFFC8, 0xFFD0, 0xFFD8, 0xFFE0, 0xFFE8, 0xFFF0, 0xFFF8, 0x0000, 
-    0x7D7C, 0x797C, 0x757C, 0x717C, 0x6D7C, 0x697C, 0x657C, 0x617C, 0x5D7C, 0x597C, 0x557C, 0x517C, 0x4D7C, 0x497C, 0x457C, 0x417C, 
-    0x3E7C, 0x3C7C, 0x3A7C, 0x387C, 0x367C, 0x347C, 0x327C, 0x307C, 0x2E7C, 0x2C7C, 0x2A7C, 0x287C, 0x267C, 0x247C, 0x227C, 0x207C, 
-    0x1EFC, 0x1DFC, 0x1CFC, 0x1BFC, 0x1AFC, 0x19FC, 0x18FC, 0x17FC, 0x16FC, 0x15FC, 0x14FC, 0x13FC, 0x12FC, 0x11FC, 0x10FC, 0x0FFC, 
-    0x0F3C, 0x0EBC, 0x0E3C, 0x0DBC, 0x0D3C, 0x0CBC, 0x0C3C, 0x0BBC, 0x0B3C, 0x0ABC, 0x0A3C, 0x09BC, 0x093C, 0x08BC, 0x083C, 0x07BC, 
-    0x075C, 0x071C, 0x06DC, 0x069C, 0x065C, 0x061C, 0x05DC, 0x059C, 0x055C, 0x051C, 0x04DC, 0x049C, 0x045C, 0x041C, 0x03DC, 0x039C, 
-    0x036C, 0x034C, 0x032C, 0x030C, 0x02EC, 0x02CC, 0x02AC, 0x028C, 0x026C, 0x024C, 0x022C, 0x020C, 0x01EC, 0x01CC, 0x01AC, 0x018C, 
-    0x0174, 0x0164, 0x0154, 0x0144, 0x0134, 0x0124, 0x0114, 0x0104, 0x00F4, 0x00E4, 0x00D4, 0x00C4, 0x00B4, 0x00A4, 0x0094, 0x0084, 
-    0x0078, 0x0070, 0x0068, 0x0060, 0x0058, 0x0050, 0x0048, 0x0040, 0x0038, 0x0030, 0x0028, 0x0020, 0x0018, 0x0010, 0x0008, 0x0000
-};
-
-static DRWAV_INLINE drwav_int16 drwav__alaw_to_s16(drwav_uint8 sampleIn)
-{
-    return (short)g_drwavAlawTable[sampleIn];
-}
-
-static DRWAV_INLINE drwav_int16 drwav__mulaw_to_s16(drwav_uint8 sampleIn)
-{
-    return (short)g_drwavMulawTable[sampleIn];
-}
-
-
-
-static void drwav__pcm_to_s16(drwav_int16* pOut, const drwav_uint8* pIn, size_t totalSampleCount, unsigned int bytesPerSample)
-{
-    unsigned int i;
-
-    /* Special case for 8-bit sample data because it's treated as unsigned. */
-    if (bytesPerSample == 1) {
-        drwav_u8_to_s16(pOut, pIn, totalSampleCount);
-        return;
-    }
-
-
-    /* Slightly more optimal implementation for common formats. */
-    if (bytesPerSample == 2) {
-        for (i = 0; i < totalSampleCount; ++i) {
-           *pOut++ = ((const drwav_int16*)pIn)[i];
-        }
-        return;
-    }
-    if (bytesPerSample == 3) {
-        drwav_s24_to_s16(pOut, pIn, totalSampleCount);
-        return;
-    }
-    if (bytesPerSample == 4) {
-        drwav_s32_to_s16(pOut, (const drwav_int32*)pIn, totalSampleCount);
-        return;
-    }
-
-
-    /* Anything more than 64 bits per sample is not supported. */
-    if (bytesPerSample > 8) {
-        DRWAV_ZERO_MEMORY(pOut, totalSampleCount * sizeof(*pOut));
-        return;
-    }
-
-
-    /* Generic, slow converter. */
-    for (i = 0; i < totalSampleCount; ++i) {
-        drwav_uint64 sample = 0;
-        unsigned int shift  = (8 - bytesPerSample) * 8;
-
-        unsigned int j;
-        for (j = 0; j < bytesPerSample; j += 1) {
-            DRWAV_ASSERT(j < 8);
-            sample |= (drwav_uint64)(pIn[j]) << shift;
-            shift  += 8;
-        }
-
-        pIn += j;
-        *pOut++ = (drwav_int16)((drwav_int64)sample >> 48);
-    }
-}
-
-static void drwav__ieee_to_s16(drwav_int16* pOut, const drwav_uint8* pIn, size_t totalSampleCount, unsigned int bytesPerSample)
-{
-    if (bytesPerSample == 4) {
-        drwav_f32_to_s16(pOut, (const float*)pIn, totalSampleCount);
-        return;
-    } else if (bytesPerSample == 8) {
-        drwav_f64_to_s16(pOut, (const double*)pIn, totalSampleCount);
-        return;
-    } else {
-        /* Only supporting 32- and 64-bit float. Output silence in all other cases. Contributions welcome for 16-bit float. */
-        DRWAV_ZERO_MEMORY(pOut, totalSampleCount * sizeof(*pOut));
-        return;
-    }
-}
-
-static drwav_uint64 drwav_read_pcm_frames_s16__pcm(drwav* pWav, drwav_uint64 framesToRead, drwav_int16* pBufferOut)
-{
-    drwav_uint32 bytesPerFrame;
-    drwav_uint64 totalFramesRead;
-    drwav_uint8 sampleData[4096];
-
-    /* Fast path. */
-    if ((pWav->translatedFormatTag == DR_WAVE_FORMAT_PCM && pWav->bitsPerSample == 16) || pBufferOut == NULL) {
-        return drwav_read_pcm_frames(pWav, framesToRead, pBufferOut);
-    }
-    
-    bytesPerFrame = drwav_get_bytes_per_pcm_frame(pWav);
-    if (bytesPerFrame == 0) {
-        return 0;
-    }
-
-    totalFramesRead = 0;
-    
-    while (framesToRead > 0) {
-        drwav_uint64 framesRead = drwav_read_pcm_frames(pWav, drwav_min(framesToRead, sizeof(sampleData)/bytesPerFrame), sampleData);
-        if (framesRead == 0) {
-            break;
-        }
-
-        drwav__pcm_to_s16(pBufferOut, sampleData, (size_t)(framesRead*pWav->channels), bytesPerFrame/pWav->channels);
-
-        pBufferOut      += framesRead*pWav->channels;
-        framesToRead    -= framesRead;
-        totalFramesRead += framesRead;
-    }
-
-    return totalFramesRead;
-}
-
-static drwav_uint64 drwav_read_pcm_frames_s16__ieee(drwav* pWav, drwav_uint64 framesToRead, drwav_int16* pBufferOut)
-{
-    drwav_uint64 totalFramesRead;
-    drwav_uint8 sampleData[4096];
-    drwav_uint32 bytesPerFrame;
-
-    if (pBufferOut == NULL) {
-        return drwav_read_pcm_frames(pWav, framesToRead, NULL);
-    }
-
-    bytesPerFrame = drwav_get_bytes_per_pcm_frame(pWav);
-    if (bytesPerFrame == 0) {
-        return 0;
-    }
-
-    totalFramesRead = 0;
-    
-    while (framesToRead > 0) {
-        drwav_uint64 framesRead = drwav_read_pcm_frames(pWav, drwav_min(framesToRead, sizeof(sampleData)/bytesPerFrame), sampleData);
-        if (framesRead == 0) {
-            break;
-        }
-
-        drwav__ieee_to_s16(pBufferOut, sampleData, (size_t)(framesRead*pWav->channels), bytesPerFrame/pWav->channels);
-
-        pBufferOut      += framesRead*pWav->channels;
-        framesToRead    -= framesRead;
-        totalFramesRead += framesRead;
-    }
-
-    return totalFramesRead;
-}
-
-static drwav_uint64 drwav_read_pcm_frames_s16__alaw(drwav* pWav, drwav_uint64 framesToRead, drwav_int16* pBufferOut)
-{
-    drwav_uint64 totalFramesRead;
-    drwav_uint8 sampleData[4096];
-    drwav_uint32 bytesPerFrame;
-
-    if (pBufferOut == NULL) {
-        return drwav_read_pcm_frames(pWav, framesToRead, NULL);
-    }
-
-    bytesPerFrame = drwav_get_bytes_per_pcm_frame(pWav);
-    if (bytesPerFrame == 0) {
-        return 0;
-    }
-
-    totalFramesRead = 0;
-    
-    while (framesToRead > 0) {
-        drwav_uint64 framesRead = drwav_read_pcm_frames(pWav, drwav_min(framesToRead, sizeof(sampleData)/bytesPerFrame), sampleData);
-        if (framesRead == 0) {
-            break;
-        }
-
-        drwav_alaw_to_s16(pBufferOut, sampleData, (size_t)(framesRead*pWav->channels));
-
-        pBufferOut      += framesRead*pWav->channels;
-        framesToRead    -= framesRead;
-        totalFramesRead += framesRead;
-    }
-
-    return totalFramesRead;
-}
-
-static drwav_uint64 drwav_read_pcm_frames_s16__mulaw(drwav* pWav, drwav_uint64 framesToRead, drwav_int16* pBufferOut)
-{
-    drwav_uint64 totalFramesRead;
-    drwav_uint8 sampleData[4096];
-    drwav_uint32 bytesPerFrame;
-
-    if (pBufferOut == NULL) {
-        return drwav_read_pcm_frames(pWav, framesToRead, NULL);
-    }
-
-    bytesPerFrame = drwav_get_bytes_per_pcm_frame(pWav);
-    if (bytesPerFrame == 0) {
-        return 0;
-    }
-
-    totalFramesRead = 0;
-
-    while (framesToRead > 0) {
-        drwav_uint64 framesRead = drwav_read_pcm_frames(pWav, drwav_min(framesToRead, sizeof(sampleData)/bytesPerFrame), sampleData);
-        if (framesRead == 0) {
-            break;
-        }
-
-        drwav_mulaw_to_s16(pBufferOut, sampleData, (size_t)(framesRead*pWav->channels));
-
-        pBufferOut      += framesRead*pWav->channels;
-        framesToRead    -= framesRead;
-        totalFramesRead += framesRead;
-    }
-
-    return totalFramesRead;
-}
-
-DRWAV_API drwav_uint64 drwav_read_pcm_frames_s16(drwav* pWav, drwav_uint64 framesToRead, drwav_int16* pBufferOut)
-{
-    if (pWav == NULL || framesToRead == 0) {
-        return 0;
-    }
-
-    if (pBufferOut == NULL) {
-        return drwav_read_pcm_frames(pWav, framesToRead, NULL);
-    }
-
-    /* Don't try to read more samples than can potentially fit in the output buffer. */
-    if (framesToRead * pWav->channels * sizeof(drwav_int16) > DRWAV_SIZE_MAX) {
-        framesToRead = DRWAV_SIZE_MAX / sizeof(drwav_int16) / pWav->channels;
-    }
-
-    if (pWav->translatedFormatTag == DR_WAVE_FORMAT_PCM) {
-        return drwav_read_pcm_frames_s16__pcm(pWav, framesToRead, pBufferOut);
-    }
-
-    if (pWav->translatedFormatTag == DR_WAVE_FORMAT_IEEE_FLOAT) {
-        return drwav_read_pcm_frames_s16__ieee(pWav, framesToRead, pBufferOut);
-    }
-
-    if (pWav->translatedFormatTag == DR_WAVE_FORMAT_ALAW) {
-        return drwav_read_pcm_frames_s16__alaw(pWav, framesToRead, pBufferOut);
-    }
-
-    if (pWav->translatedFormatTag == DR_WAVE_FORMAT_MULAW) {
-        return drwav_read_pcm_frames_s16__mulaw(pWav, framesToRead, pBufferOut);
-    }
-
-    if (pWav->translatedFormatTag == DR_WAVE_FORMAT_ADPCM) {
-        return drwav_read_pcm_frames_s16__msadpcm(pWav, framesToRead, pBufferOut);
-    }
-
-    if (pWav->translatedFormatTag == DR_WAVE_FORMAT_DVI_ADPCM) {
-        return drwav_read_pcm_frames_s16__ima(pWav, framesToRead, pBufferOut);
-    }
-
-    return 0;
-}
-
-DRWAV_API drwav_uint64 drwav_read_pcm_frames_s16le(drwav* pWav, drwav_uint64 framesToRead, drwav_int16* pBufferOut)
-{
-    drwav_uint64 framesRead = drwav_read_pcm_frames_s16(pWav, framesToRead, pBufferOut);
-    if (pBufferOut != NULL && drwav__is_little_endian() == DRWAV_FALSE) {
-        drwav__bswap_samples_s16(pBufferOut, framesRead*pWav->channels);
-    }
-
-    return framesRead;
-}
-
-DRWAV_API drwav_uint64 drwav_read_pcm_frames_s16be(drwav* pWav, drwav_uint64 framesToRead, drwav_int16* pBufferOut)
-{
-    drwav_uint64 framesRead = drwav_read_pcm_frames_s16(pWav, framesToRead, pBufferOut);
-    if (pBufferOut != NULL && drwav__is_little_endian() == DRWAV_TRUE) {
-        drwav__bswap_samples_s16(pBufferOut, framesRead*pWav->channels);
-    }
-
-    return framesRead;
-}
-
-
-DRWAV_API void drwav_u8_to_s16(drwav_int16* pOut, const drwav_uint8* pIn, size_t sampleCount)
-{
-    int r;
-    size_t i;
-    for (i = 0; i < sampleCount; ++i) {
-        int x = pIn[i];
-        r = x << 8;
-        r = r - 32768;
-        pOut[i] = (short)r;
-    }
-}
-
-DRWAV_API void drwav_s24_to_s16(drwav_int16* pOut, const drwav_uint8* pIn, size_t sampleCount)
-{
-    int r;
-    size_t i;
-    for (i = 0; i < sampleCount; ++i) {
-        int x = ((int)(((unsigned int)(((const drwav_uint8*)pIn)[i*3+0]) << 8) | ((unsigned int)(((const drwav_uint8*)pIn)[i*3+1]) << 16) | ((unsigned int)(((const drwav_uint8*)pIn)[i*3+2])) << 24)) >> 8;
-        r = x >> 8;
-        pOut[i] = (short)r;
-    }
-}
-
-DRWAV_API void drwav_s32_to_s16(drwav_int16* pOut, const drwav_int32* pIn, size_t sampleCount)
-{
-    int r;
-    size_t i;
-    for (i = 0; i < sampleCount; ++i) {
-        int x = pIn[i];
-        r = x >> 16;
-        pOut[i] = (short)r;
-    }
-}
-
-DRWAV_API void drwav_f32_to_s16(drwav_int16* pOut, const float* pIn, size_t sampleCount)
-{
-    int r;
-    size_t i;
-    for (i = 0; i < sampleCount; ++i) {
-        float x = pIn[i];
-        float c;
-        c = ((x < -1) ? -1 : ((x > 1) ? 1 : x));
-        c = c + 1;
-        r = (int)(c * 32767.5f);
-        r = r - 32768;
-        pOut[i] = (short)r;
-    }
-}
-
-DRWAV_API void drwav_f64_to_s16(drwav_int16* pOut, const double* pIn, size_t sampleCount)
-{
-    int r;
-    size_t i;
-    for (i = 0; i < sampleCount; ++i) {
-        double x = pIn[i];
-        double c;
-        c = ((x < -1) ? -1 : ((x > 1) ? 1 : x));
-        c = c + 1;
-        r = (int)(c * 32767.5);
-        r = r - 32768;
-        pOut[i] = (short)r;
-    }
-}
-
-DRWAV_API void drwav_alaw_to_s16(drwav_int16* pOut, const drwav_uint8* pIn, size_t sampleCount)
-{
-    size_t i;
-    for (i = 0; i < sampleCount; ++i) {
-        pOut[i] = drwav__alaw_to_s16(pIn[i]);
-    }
-}
-
-DRWAV_API void drwav_mulaw_to_s16(drwav_int16* pOut, const drwav_uint8* pIn, size_t sampleCount)
-{
-    size_t i;
-    for (i = 0; i < sampleCount; ++i) {
-        pOut[i] = drwav__mulaw_to_s16(pIn[i]);
-    }
-}
-
-
-
-static void drwav__pcm_to_f32(float* pOut, const drwav_uint8* pIn, size_t sampleCount, unsigned int bytesPerSample)
-{
-    unsigned int i;
-
-    /* Special case for 8-bit sample data because it's treated as unsigned. */
-    if (bytesPerSample == 1) {
-        drwav_u8_to_f32(pOut, pIn, sampleCount);
-        return;
-    }
-
-    /* Slightly more optimal implementation for common formats. */
-    if (bytesPerSample == 2) {
-        drwav_s16_to_f32(pOut, (const drwav_int16*)pIn, sampleCount);
-        return;
-    }
-    if (bytesPerSample == 3) {
-        drwav_s24_to_f32(pOut, pIn, sampleCount);
-        return;
-    }
-    if (bytesPerSample == 4) {
-        drwav_s32_to_f32(pOut, (const drwav_int32*)pIn, sampleCount);
-        return;
-    }
-
-
-    /* Anything more than 64 bits per sample is not supported. */
-    if (bytesPerSample > 8) {
-        DRWAV_ZERO_MEMORY(pOut, sampleCount * sizeof(*pOut));
-        return;
-    }
-
-
-    /* Generic, slow converter. */
-    for (i = 0; i < sampleCount; ++i) {
-        drwav_uint64 sample = 0;
-        unsigned int shift  = (8 - bytesPerSample) * 8;
-
-        unsigned int j;
-        for (j = 0; j < bytesPerSample; j += 1) {
-            DRWAV_ASSERT(j < 8);
-            sample |= (drwav_uint64)(pIn[j]) << shift;
-            shift  += 8;
-        }
-
-        pIn += j;
-        *pOut++ = (float)((drwav_int64)sample / 9223372036854775807.0);
-    }
-}
-
-static void drwav__ieee_to_f32(float* pOut, const drwav_uint8* pIn, size_t sampleCount, unsigned int bytesPerSample)
-{
-    if (bytesPerSample == 4) {
-        unsigned int i;
-        for (i = 0; i < sampleCount; ++i) {
-            *pOut++ = ((const float*)pIn)[i];
-        }
-        return;
-    } else if (bytesPerSample == 8) {
-        drwav_f64_to_f32(pOut, (const double*)pIn, sampleCount);
-        return;
-    } else {
-        /* Only supporting 32- and 64-bit float. Output silence in all other cases. Contributions welcome for 16-bit float. */
-        DRWAV_ZERO_MEMORY(pOut, sampleCount * sizeof(*pOut));
-        return;
-    }
-}
-
-
-static drwav_uint64 drwav_read_pcm_frames_f32__pcm(drwav* pWav, drwav_uint64 framesToRead, float* pBufferOut)
-{
-    drwav_uint64 totalFramesRead;
-    drwav_uint8 sampleData[4096];
-
-    drwav_uint32 bytesPerFrame = drwav_get_bytes_per_pcm_frame(pWav);
-    if (bytesPerFrame == 0) {
-        return 0;
-    }
-
-    totalFramesRead = 0;
-
-    while (framesToRead > 0) {
-        drwav_uint64 framesRead = drwav_read_pcm_frames(pWav, drwav_min(framesToRead, sizeof(sampleData)/bytesPerFrame), sampleData);
-        if (framesRead == 0) {
-            break;
-        }
-
-        drwav__pcm_to_f32(pBufferOut, sampleData, (size_t)framesRead*pWav->channels, bytesPerFrame/pWav->channels);
-
-        pBufferOut      += framesRead*pWav->channels;
-        framesToRead    -= framesRead;
-        totalFramesRead += framesRead;
-    }
-
-    return totalFramesRead;
-}
-
-static drwav_uint64 drwav_read_pcm_frames_f32__msadpcm(drwav* pWav, drwav_uint64 framesToRead, float* pBufferOut)
-{
-    /*
-    We're just going to borrow the implementation from the drwav_read_s16() since ADPCM is a little bit more complicated than other formats and I don't
-    want to duplicate that code.
-    */
-    drwav_uint64 totalFramesRead = 0;
-    drwav_int16 samples16[2048];
-    while (framesToRead > 0) {
-        drwav_uint64 framesRead = drwav_read_pcm_frames_s16(pWav, drwav_min(framesToRead, drwav_countof(samples16)/pWav->channels), samples16);
-        if (framesRead == 0) {
-            break;
-        }
-
-        drwav_s16_to_f32(pBufferOut, samples16, (size_t)(framesRead*pWav->channels));   /* <-- Safe cast because we're clamping to 2048. */
-
-        pBufferOut      += framesRead*pWav->channels;
-        framesToRead    -= framesRead;
-        totalFramesRead += framesRead;
-    }
-
-    return totalFramesRead;
-}
-
-static drwav_uint64 drwav_read_pcm_frames_f32__ima(drwav* pWav, drwav_uint64 framesToRead, float* pBufferOut)
-{
-    /*
-    We're just going to borrow the implementation from the drwav_read_s16() since IMA-ADPCM is a little bit more complicated than other formats and I don't
-    want to duplicate that code.
-    */
-    drwav_uint64 totalFramesRead = 0;
-    drwav_int16 samples16[2048];
-    while (framesToRead > 0) {
-        drwav_uint64 framesRead = drwav_read_pcm_frames_s16(pWav, drwav_min(framesToRead, drwav_countof(samples16)/pWav->channels), samples16);
-        if (framesRead == 0) {
-            break;
-        }
-
-        drwav_s16_to_f32(pBufferOut, samples16, (size_t)(framesRead*pWav->channels));   /* <-- Safe cast because we're clamping to 2048. */
-
-        pBufferOut      += framesRead*pWav->channels;
-        framesToRead    -= framesRead;
-        totalFramesRead += framesRead;
-    }
-
-    return totalFramesRead;
-}
-
-static drwav_uint64 drwav_read_pcm_frames_f32__ieee(drwav* pWav, drwav_uint64 framesToRead, float* pBufferOut)
-{
-    drwav_uint64 totalFramesRead;
-    drwav_uint8 sampleData[4096];
-    drwav_uint32 bytesPerFrame;
-
-    /* Fast path. */
-    if (pWav->translatedFormatTag == DR_WAVE_FORMAT_IEEE_FLOAT && pWav->bitsPerSample == 32) {
-        return drwav_read_pcm_frames(pWav, framesToRead, pBufferOut);
-    }
-    
-    bytesPerFrame = drwav_get_bytes_per_pcm_frame(pWav);
-    if (bytesPerFrame == 0) {
-        return 0;
-    }
-
-    totalFramesRead = 0;
-
-    while (framesToRead > 0) {
-        drwav_uint64 framesRead = drwav_read_pcm_frames(pWav, drwav_min(framesToRead, sizeof(sampleData)/bytesPerFrame), sampleData);
-        if (framesRead == 0) {
-            break;
-        }
-
-        drwav__ieee_to_f32(pBufferOut, sampleData, (size_t)(framesRead*pWav->channels), bytesPerFrame/pWav->channels);
-
-        pBufferOut      += framesRead*pWav->channels;
-        framesToRead    -= framesRead;
-        totalFramesRead += framesRead;
-    }
-
-    return totalFramesRead;
-}
-
-static drwav_uint64 drwav_read_pcm_frames_f32__alaw(drwav* pWav, drwav_uint64 framesToRead, float* pBufferOut)
-{
-    drwav_uint64 totalFramesRead;
-    drwav_uint8 sampleData[4096];
-    drwav_uint32 bytesPerFrame = drwav_get_bytes_per_pcm_frame(pWav);
-    if (bytesPerFrame == 0) {
-        return 0;
-    }
-
-    totalFramesRead = 0;
-
-    while (framesToRead > 0) {
-        drwav_uint64 framesRead = drwav_read_pcm_frames(pWav, drwav_min(framesToRead, sizeof(sampleData)/bytesPerFrame), sampleData);
-        if (framesRead == 0) {
-            break;
-        }
-
-        drwav_alaw_to_f32(pBufferOut, sampleData, (size_t)(framesRead*pWav->channels));
-
-        pBufferOut      += framesRead*pWav->channels;
-        framesToRead    -= framesRead;
-        totalFramesRead += framesRead;
-    }
-
-    return totalFramesRead;
-}
-
-static drwav_uint64 drwav_read_pcm_frames_f32__mulaw(drwav* pWav, drwav_uint64 framesToRead, float* pBufferOut)
-{
-    drwav_uint64 totalFramesRead;
-    drwav_uint8 sampleData[4096];
-
-    drwav_uint32 bytesPerFrame = drwav_get_bytes_per_pcm_frame(pWav);
-    if (bytesPerFrame == 0) {
-        return 0;
-    }
-
-    totalFramesRead = 0;
-
-    while (framesToRead > 0) {
-        drwav_uint64 framesRead = drwav_read_pcm_frames(pWav, drwav_min(framesToRead, sizeof(sampleData)/bytesPerFrame), sampleData);
-        if (framesRead == 0) {
-            break;
-        }
-
-        drwav_mulaw_to_f32(pBufferOut, sampleData, (size_t)(framesRead*pWav->channels));
-
-        pBufferOut      += framesRead*pWav->channels;
-        framesToRead    -= framesRead;
-        totalFramesRead += framesRead;
-    }
-
-    return totalFramesRead;
-}
-
-DRWAV_API drwav_uint64 drwav_read_pcm_frames_f32(drwav* pWav, drwav_uint64 framesToRead, float* pBufferOut)
-{
-    if (pWav == NULL || framesToRead == 0) {
-        return 0;
-    }
-
-    if (pBufferOut == NULL) {
-        return drwav_read_pcm_frames(pWav, framesToRead, NULL);
-    }
-
-    /* Don't try to read more samples than can potentially fit in the output buffer. */
-    if (framesToRead * pWav->channels * sizeof(float) > DRWAV_SIZE_MAX) {
-        framesToRead = DRWAV_SIZE_MAX / sizeof(float) / pWav->channels;
-    }
-
-    if (pWav->translatedFormatTag == DR_WAVE_FORMAT_PCM) {
-        return drwav_read_pcm_frames_f32__pcm(pWav, framesToRead, pBufferOut);
-    }
-
-    if (pWav->translatedFormatTag == DR_WAVE_FORMAT_ADPCM) {
-        return drwav_read_pcm_frames_f32__msadpcm(pWav, framesToRead, pBufferOut);
-    }
-
-    if (pWav->translatedFormatTag == DR_WAVE_FORMAT_IEEE_FLOAT) {
-        return drwav_read_pcm_frames_f32__ieee(pWav, framesToRead, pBufferOut);
-    }
-
-    if (pWav->translatedFormatTag == DR_WAVE_FORMAT_ALAW) {
-        return drwav_read_pcm_frames_f32__alaw(pWav, framesToRead, pBufferOut);
-    }
-
-    if (pWav->translatedFormatTag == DR_WAVE_FORMAT_MULAW) {
-        return drwav_read_pcm_frames_f32__mulaw(pWav, framesToRead, pBufferOut);
-    }
-
-    if (pWav->translatedFormatTag == DR_WAVE_FORMAT_DVI_ADPCM) {
-        return drwav_read_pcm_frames_f32__ima(pWav, framesToRead, pBufferOut);
-    }
-
-    return 0;
-}
-
-DRWAV_API drwav_uint64 drwav_read_pcm_frames_f32le(drwav* pWav, drwav_uint64 framesToRead, float* pBufferOut)
-{
-    drwav_uint64 framesRead = drwav_read_pcm_frames_f32(pWav, framesToRead, pBufferOut);
-    if (pBufferOut != NULL && drwav__is_little_endian() == DRWAV_FALSE) {
-        drwav__bswap_samples_f32(pBufferOut, framesRead*pWav->channels);
-    }
-
-    return framesRead;
-}
-
-DRWAV_API drwav_uint64 drwav_read_pcm_frames_f32be(drwav* pWav, drwav_uint64 framesToRead, float* pBufferOut)
-{
-    drwav_uint64 framesRead = drwav_read_pcm_frames_f32(pWav, framesToRead, pBufferOut);
-    if (pBufferOut != NULL && drwav__is_little_endian() == DRWAV_TRUE) {
-        drwav__bswap_samples_f32(pBufferOut, framesRead*pWav->channels);
-    }
-
-    return framesRead;
-}
-
-
-DRWAV_API void drwav_u8_to_f32(float* pOut, const drwav_uint8* pIn, size_t sampleCount)
-{
-    size_t i;
-
-    if (pOut == NULL || pIn == NULL) {
-        return;
-    }
-
-#ifdef DR_WAV_LIBSNDFILE_COMPAT
-    /*
-    It appears libsndfile uses slightly different logic for the u8 -> f32 conversion to dr_wav, which in my opinion is incorrect. It appears
-    libsndfile performs the conversion something like "f32 = (u8 / 256) * 2 - 1", however I think it should be "f32 = (u8 / 255) * 2 - 1" (note
-    the divisor of 256 vs 255). I use libsndfile as a benchmark for testing, so I'm therefore leaving this block here just for my automated
-    correctness testing. This is disabled by default.
-    */
-    for (i = 0; i < sampleCount; ++i) {
-        *pOut++ = (pIn[i] / 256.0f) * 2 - 1;
-    }
-#else
-    for (i = 0; i < sampleCount; ++i) {
-        float x = pIn[i];
-        x = x * 0.00784313725490196078f;    /* 0..255 to 0..2 */
-        x = x - 1;                          /* 0..2 to -1..1 */
-
-        *pOut++ = x;
-    }
-#endif
-}
-
-DRWAV_API void drwav_s16_to_f32(float* pOut, const drwav_int16* pIn, size_t sampleCount)
-{
-    size_t i;
-
-    if (pOut == NULL || pIn == NULL) {
-        return;
-    }
-
-    for (i = 0; i < sampleCount; ++i) {
-        *pOut++ = pIn[i] * 0.000030517578125f;
-    }
-}
-
-DRWAV_API void drwav_s24_to_f32(float* pOut, const drwav_uint8* pIn, size_t sampleCount)
-{
-    size_t i;
-
-    if (pOut == NULL || pIn == NULL) {
-        return;
-    }
-
-    for (i = 0; i < sampleCount; ++i) {
-        double x;
-        drwav_uint32 a = ((drwav_uint32)(pIn[i*3+0]) <<  8);
-        drwav_uint32 b = ((drwav_uint32)(pIn[i*3+1]) << 16);
-        drwav_uint32 c = ((drwav_uint32)(pIn[i*3+2]) << 24);
-
-        x = (double)((drwav_int32)(a | b | c) >> 8);
-        *pOut++ = (float)(x * 0.00000011920928955078125);
-    }
-}
-
-DRWAV_API void drwav_s32_to_f32(float* pOut, const drwav_int32* pIn, size_t sampleCount)
-{
-    size_t i;
-    if (pOut == NULL || pIn == NULL) {
-        return;
-    }
-
-    for (i = 0; i < sampleCount; ++i) {
-        *pOut++ = (float)(pIn[i] / 2147483648.0);
-    }
-}
-
-DRWAV_API void drwav_f64_to_f32(float* pOut, const double* pIn, size_t sampleCount)
-{
-    size_t i;
-
-    if (pOut == NULL || pIn == NULL) {
-        return;
-    }
-
-    for (i = 0; i < sampleCount; ++i) {
-        *pOut++ = (float)pIn[i];
-    }
-}
-
-DRWAV_API void drwav_alaw_to_f32(float* pOut, const drwav_uint8* pIn, size_t sampleCount)
-{
-    size_t i;
-
-    if (pOut == NULL || pIn == NULL) {
-        return;
-    }
-
-    for (i = 0; i < sampleCount; ++i) {
-        *pOut++ = drwav__alaw_to_s16(pIn[i]) / 32768.0f;
-    }
-}
-
-DRWAV_API void drwav_mulaw_to_f32(float* pOut, const drwav_uint8* pIn, size_t sampleCount)
-{
-    size_t i;
-
-    if (pOut == NULL || pIn == NULL) {
-        return;
-    }
-
-    for (i = 0; i < sampleCount; ++i) {
-        *pOut++ = drwav__mulaw_to_s16(pIn[i]) / 32768.0f;
-    }
-}
-
-
-
-static void drwav__pcm_to_s32(drwav_int32* pOut, const drwav_uint8* pIn, size_t totalSampleCount, unsigned int bytesPerSample)
-{
-    unsigned int i;
-
-    /* Special case for 8-bit sample data because it's treated as unsigned. */
-    if (bytesPerSample == 1) {
-        drwav_u8_to_s32(pOut, pIn, totalSampleCount);
-        return;
-    }
-
-    /* Slightly more optimal implementation for common formats. */
-    if (bytesPerSample == 2) {
-        drwav_s16_to_s32(pOut, (const drwav_int16*)pIn, totalSampleCount);
-        return;
-    }
-    if (bytesPerSample == 3) {
-        drwav_s24_to_s32(pOut, pIn, totalSampleCount);
-        return;
-    }
-    if (bytesPerSample == 4) {
-        for (i = 0; i < totalSampleCount; ++i) {
-           *pOut++ = ((const drwav_int32*)pIn)[i];
-        }
-        return;
-    }
-
-
-    /* Anything more than 64 bits per sample is not supported. */
-    if (bytesPerSample > 8) {
-        DRWAV_ZERO_MEMORY(pOut, totalSampleCount * sizeof(*pOut));
-        return;
-    }
-
-
-    /* Generic, slow converter. */
-    for (i = 0; i < totalSampleCount; ++i) {
-        drwav_uint64 sample = 0;
-        unsigned int shift  = (8 - bytesPerSample) * 8;
-
-        unsigned int j;
-        for (j = 0; j < bytesPerSample; j += 1) {
-            DRWAV_ASSERT(j < 8);
-            sample |= (drwav_uint64)(pIn[j]) << shift;
-            shift  += 8;
-        }
-
-        pIn += j;
-        *pOut++ = (drwav_int32)((drwav_int64)sample >> 32);
-    }
-}
-
-static void drwav__ieee_to_s32(drwav_int32* pOut, const drwav_uint8* pIn, size_t totalSampleCount, unsigned int bytesPerSample)
-{
-    if (bytesPerSample == 4) {
-        drwav_f32_to_s32(pOut, (const float*)pIn, totalSampleCount);
-        return;
-    } else if (bytesPerSample == 8) {
-        drwav_f64_to_s32(pOut, (const double*)pIn, totalSampleCount);
-        return;
-    } else {
-        /* Only supporting 32- and 64-bit float. Output silence in all other cases. Contributions welcome for 16-bit float. */
-        DRWAV_ZERO_MEMORY(pOut, totalSampleCount * sizeof(*pOut));
-        return;
-    }
-}
-
-
-static drwav_uint64 drwav_read_pcm_frames_s32__pcm(drwav* pWav, drwav_uint64 framesToRead, drwav_int32* pBufferOut)
-{
-    drwav_uint64 totalFramesRead;
-    drwav_uint8 sampleData[4096];
-    drwav_uint32 bytesPerFrame;
-
-    /* Fast path. */
-    if (pWav->translatedFormatTag == DR_WAVE_FORMAT_PCM && pWav->bitsPerSample == 32) {
-        return drwav_read_pcm_frames(pWav, framesToRead, pBufferOut);
-    }
-    
-    bytesPerFrame = drwav_get_bytes_per_pcm_frame(pWav);
-    if (bytesPerFrame == 0) {
-        return 0;
-    }
-
-    totalFramesRead = 0;
-
-    while (framesToRead > 0) {
-        drwav_uint64 framesRead = drwav_read_pcm_frames(pWav, drwav_min(framesToRead, sizeof(sampleData)/bytesPerFrame), sampleData);
-        if (framesRead == 0) {
-            break;
-        }
-
-        drwav__pcm_to_s32(pBufferOut, sampleData, (size_t)(framesRead*pWav->channels), bytesPerFrame/pWav->channels);
-
-        pBufferOut      += framesRead*pWav->channels;
-        framesToRead    -= framesRead;
-        totalFramesRead += framesRead;
-    }
-
-    return totalFramesRead;
-}
-
-static drwav_uint64 drwav_read_pcm_frames_s32__msadpcm(drwav* pWav, drwav_uint64 framesToRead, drwav_int32* pBufferOut)
-{
-    /*
-    We're just going to borrow the implementation from the drwav_read_s16() since ADPCM is a little bit more complicated than other formats and I don't
-    want to duplicate that code.
-    */
-    drwav_uint64 totalFramesRead = 0;
-    drwav_int16 samples16[2048];
-    while (framesToRead > 0) {
-        drwav_uint64 framesRead = drwav_read_pcm_frames_s16(pWav, drwav_min(framesToRead, drwav_countof(samples16)/pWav->channels), samples16);
-        if (framesRead == 0) {
-            break;
-        }
-
-        drwav_s16_to_s32(pBufferOut, samples16, (size_t)(framesRead*pWav->channels));   /* <-- Safe cast because we're clamping to 2048. */
-
-        pBufferOut      += framesRead*pWav->channels;
-        framesToRead    -= framesRead;
-        totalFramesRead += framesRead;
-    }
-
-    return totalFramesRead;
-}
-
-static drwav_uint64 drwav_read_pcm_frames_s32__ima(drwav* pWav, drwav_uint64 framesToRead, drwav_int32* pBufferOut)
-{
-    /*
-    We're just going to borrow the implementation from the drwav_read_s16() since IMA-ADPCM is a little bit more complicated than other formats and I don't
-    want to duplicate that code.
-    */
-    drwav_uint64 totalFramesRead = 0;
-    drwav_int16 samples16[2048];
-    while (framesToRead > 0) {
-        drwav_uint64 framesRead = drwav_read_pcm_frames_s16(pWav, drwav_min(framesToRead, drwav_countof(samples16)/pWav->channels), samples16);
-        if (framesRead == 0) {
-            break;
-        }
-
-        drwav_s16_to_s32(pBufferOut, samples16, (size_t)(framesRead*pWav->channels));   /* <-- Safe cast because we're clamping to 2048. */
-
-        pBufferOut      += framesRead*pWav->channels;
-        framesToRead    -= framesRead;
-        totalFramesRead += framesRead;
-    }
-
-    return totalFramesRead;
-}
-
-static drwav_uint64 drwav_read_pcm_frames_s32__ieee(drwav* pWav, drwav_uint64 framesToRead, drwav_int32* pBufferOut)
-{
-    drwav_uint64 totalFramesRead;
-    drwav_uint8 sampleData[4096];
-
-    drwav_uint32 bytesPerFrame = drwav_get_bytes_per_pcm_frame(pWav);
-    if (bytesPerFrame == 0) {
-        return 0;
-    }
-
-    totalFramesRead = 0;
-
-    while (framesToRead > 0) {
-        drwav_uint64 framesRead = drwav_read_pcm_frames(pWav, drwav_min(framesToRead, sizeof(sampleData)/bytesPerFrame), sampleData);
-        if (framesRead == 0) {
-            break;
-        }
-
-        drwav__ieee_to_s32(pBufferOut, sampleData, (size_t)(framesRead*pWav->channels), bytesPerFrame/pWav->channels);
-
-        pBufferOut      += framesRead*pWav->channels;
-        framesToRead    -= framesRead;
-        totalFramesRead += framesRead;
-    }
-
-    return totalFramesRead;
-}
-
-static drwav_uint64 drwav_read_pcm_frames_s32__alaw(drwav* pWav, drwav_uint64 framesToRead, drwav_int32* pBufferOut)
-{
-    drwav_uint64 totalFramesRead;
-    drwav_uint8 sampleData[4096];
-
-    drwav_uint32 bytesPerFrame = drwav_get_bytes_per_pcm_frame(pWav);
-    if (bytesPerFrame == 0) {
-        return 0;
-    }
-
-    totalFramesRead = 0;
-
-    while (framesToRead > 0) {
-        drwav_uint64 framesRead = drwav_read_pcm_frames(pWav, drwav_min(framesToRead, sizeof(sampleData)/bytesPerFrame), sampleData);
-        if (framesRead == 0) {
-            break;
-        }
-
-        drwav_alaw_to_s32(pBufferOut, sampleData, (size_t)(framesRead*pWav->channels));
-
-        pBufferOut      += framesRead*pWav->channels;
-        framesToRead    -= framesRead;
-        totalFramesRead += framesRead;
-    }
-
-    return totalFramesRead;
-}
-
-static drwav_uint64 drwav_read_pcm_frames_s32__mulaw(drwav* pWav, drwav_uint64 framesToRead, drwav_int32* pBufferOut)
-{
-    drwav_uint64 totalFramesRead;
-    drwav_uint8 sampleData[4096];
-
-    drwav_uint32 bytesPerFrame = drwav_get_bytes_per_pcm_frame(pWav);
-    if (bytesPerFrame == 0) {
-        return 0;
-    }
-
-    totalFramesRead = 0;
-
-    while (framesToRead > 0) {
-        drwav_uint64 framesRead = drwav_read_pcm_frames(pWav, drwav_min(framesToRead, sizeof(sampleData)/bytesPerFrame), sampleData);
-        if (framesRead == 0) {
-            break;
-        }
-
-        drwav_mulaw_to_s32(pBufferOut, sampleData, (size_t)(framesRead*pWav->channels));
-
-        pBufferOut      += framesRead*pWav->channels;
-        framesToRead    -= framesRead;
-        totalFramesRead += framesRead;
-    }
-
-    return totalFramesRead;
-}
-
-DRWAV_API drwav_uint64 drwav_read_pcm_frames_s32(drwav* pWav, drwav_uint64 framesToRead, drwav_int32* pBufferOut)
-{
-    if (pWav == NULL || framesToRead == 0) {
-        return 0;
-    }
-
-    if (pBufferOut == NULL) {
-        return drwav_read_pcm_frames(pWav, framesToRead, NULL);
-    }
-
-    /* Don't try to read more samples than can potentially fit in the output buffer. */
-    if (framesToRead * pWav->channels * sizeof(drwav_int32) > DRWAV_SIZE_MAX) {
-        framesToRead = DRWAV_SIZE_MAX / sizeof(drwav_int32) / pWav->channels;
-    }
-
-    if (pWav->translatedFormatTag == DR_WAVE_FORMAT_PCM) {
-        return drwav_read_pcm_frames_s32__pcm(pWav, framesToRead, pBufferOut);
-    }
-
-    if (pWav->translatedFormatTag == DR_WAVE_FORMAT_ADPCM) {
-        return drwav_read_pcm_frames_s32__msadpcm(pWav, framesToRead, pBufferOut);
-    }
-
-    if (pWav->translatedFormatTag == DR_WAVE_FORMAT_IEEE_FLOAT) {
-        return drwav_read_pcm_frames_s32__ieee(pWav, framesToRead, pBufferOut);
-    }
-
-    if (pWav->translatedFormatTag == DR_WAVE_FORMAT_ALAW) {
-        return drwav_read_pcm_frames_s32__alaw(pWav, framesToRead, pBufferOut);
-    }
-
-    if (pWav->translatedFormatTag == DR_WAVE_FORMAT_MULAW) {
-        return drwav_read_pcm_frames_s32__mulaw(pWav, framesToRead, pBufferOut);
-    }
-
-    if (pWav->translatedFormatTag == DR_WAVE_FORMAT_DVI_ADPCM) {
-        return drwav_read_pcm_frames_s32__ima(pWav, framesToRead, pBufferOut);
-    }
-
-    return 0;
-}
-
-DRWAV_API drwav_uint64 drwav_read_pcm_frames_s32le(drwav* pWav, drwav_uint64 framesToRead, drwav_int32* pBufferOut)
-{
-    drwav_uint64 framesRead = drwav_read_pcm_frames_s32(pWav, framesToRead, pBufferOut);
-    if (pBufferOut != NULL && drwav__is_little_endian() == DRWAV_FALSE) {
-        drwav__bswap_samples_s32(pBufferOut, framesRead*pWav->channels);
-    }
-
-    return framesRead;
-}
-
-DRWAV_API drwav_uint64 drwav_read_pcm_frames_s32be(drwav* pWav, drwav_uint64 framesToRead, drwav_int32* pBufferOut)
-{
-    drwav_uint64 framesRead = drwav_read_pcm_frames_s32(pWav, framesToRead, pBufferOut);
-    if (pBufferOut != NULL && drwav__is_little_endian() == DRWAV_TRUE) {
-        drwav__bswap_samples_s32(pBufferOut, framesRead*pWav->channels);
-    }
-
-    return framesRead;
-}
-
-
-DRWAV_API void drwav_u8_to_s32(drwav_int32* pOut, const drwav_uint8* pIn, size_t sampleCount)
-{
-    size_t i;
-
-    if (pOut == NULL || pIn == NULL) {
-        return;
-    }
-
-    for (i = 0; i < sampleCount; ++i) {
-        *pOut++ = ((int)pIn[i] - 128) << 24;
-    }
-}
-
-DRWAV_API void drwav_s16_to_s32(drwav_int32* pOut, const drwav_int16* pIn, size_t sampleCount)
-{
-    size_t i;
-
-    if (pOut == NULL || pIn == NULL) {
-        return;
-    }
-
-    for (i = 0; i < sampleCount; ++i) {
-        *pOut++ = pIn[i] << 16;
-    }
-}
-
-DRWAV_API void drwav_s24_to_s32(drwav_int32* pOut, const drwav_uint8* pIn, size_t sampleCount)
-{
-    size_t i;
-
-    if (pOut == NULL || pIn == NULL) {
-        return;
-    }
-
-    for (i = 0; i < sampleCount; ++i) {
-        unsigned int s0 = pIn[i*3 + 0];
-        unsigned int s1 = pIn[i*3 + 1];
-        unsigned int s2 = pIn[i*3 + 2];
-
-        drwav_int32 sample32 = (drwav_int32)((s0 << 8) | (s1 << 16) | (s2 << 24));
-        *pOut++ = sample32;
-    }
-}
-
-DRWAV_API void drwav_f32_to_s32(drwav_int32* pOut, const float* pIn, size_t sampleCount)
-{
-    size_t i;
-
-    if (pOut == NULL || pIn == NULL) {
-        return;
-    }
-
-    for (i = 0; i < sampleCount; ++i) {
-        *pOut++ = (drwav_int32)(2147483648.0 * pIn[i]);
-    }
-}
-
-DRWAV_API void drwav_f64_to_s32(drwav_int32* pOut, const double* pIn, size_t sampleCount)
-{
-    size_t i;
-
-    if (pOut == NULL || pIn == NULL) {
-        return;
-    }
-
-    for (i = 0; i < sampleCount; ++i) {
-        *pOut++ = (drwav_int32)(2147483648.0 * pIn[i]);
-    }
-}
-
-DRWAV_API void drwav_alaw_to_s32(drwav_int32* pOut, const drwav_uint8* pIn, size_t sampleCount)
-{
-    size_t i;
-
-    if (pOut == NULL || pIn == NULL) {
-        return;
-    }
-
-    for (i = 0; i < sampleCount; ++i) {
-        *pOut++ = ((drwav_int32)drwav__alaw_to_s16(pIn[i])) << 16;
-    }
-}
-
-DRWAV_API void drwav_mulaw_to_s32(drwav_int32* pOut, const drwav_uint8* pIn, size_t sampleCount)
-{
-    size_t i;
-
-    if (pOut == NULL || pIn == NULL) {
-        return;
-    }
-
-    for (i= 0; i < sampleCount; ++i) {
-        *pOut++ = ((drwav_int32)drwav__mulaw_to_s16(pIn[i])) << 16;
-    }
-}
-
-
-
-static drwav_int16* drwav__read_pcm_frames_and_close_s16(drwav* pWav, unsigned int* channels, unsigned int* sampleRate, drwav_uint64* totalFrameCount)
-{
-    drwav_uint64 sampleDataSize;
-    drwav_int16* pSampleData;
-    drwav_uint64 framesRead;
-
-    DRWAV_ASSERT(pWav != NULL);
-
-    sampleDataSize = pWav->totalPCMFrameCount * pWav->channels * sizeof(drwav_int16);
-    if (sampleDataSize > DRWAV_SIZE_MAX) {
-        drwav_uninit(pWav);
-        return NULL;    /* File's too big. */
-    }
-
-    pSampleData = (drwav_int16*)drwav__malloc_from_callbacks((size_t)sampleDataSize, &pWav->allocationCallbacks); /* <-- Safe cast due to the check above. */
-    if (pSampleData == NULL) {
-        drwav_uninit(pWav);
-        return NULL;    /* Failed to allocate memory. */
-    }
-
-    framesRead = drwav_read_pcm_frames_s16(pWav, (size_t)pWav->totalPCMFrameCount, pSampleData);
-    if (framesRead != pWav->totalPCMFrameCount) {
-        drwav__free_from_callbacks(pSampleData, &pWav->allocationCallbacks);
-        drwav_uninit(pWav);
-        return NULL;    /* There was an error reading the samples. */
-    }
-
-    drwav_uninit(pWav);
-
-    if (sampleRate) {
-        *sampleRate = pWav->sampleRate;
-    }
-    if (channels) {
-        *channels = pWav->channels;
-    }
-    if (totalFrameCount) {
-        *totalFrameCount = pWav->totalPCMFrameCount;
-    }
-
-    return pSampleData;
-}
-
-static float* drwav__read_pcm_frames_and_close_f32(drwav* pWav, unsigned int* channels, unsigned int* sampleRate, drwav_uint64* totalFrameCount)
-{
-    drwav_uint64 sampleDataSize;
-    float* pSampleData;
-    drwav_uint64 framesRead;
-
-    DRWAV_ASSERT(pWav != NULL);
-
-    sampleDataSize = pWav->totalPCMFrameCount * pWav->channels * sizeof(float);
-    if (sampleDataSize > DRWAV_SIZE_MAX) {
-        drwav_uninit(pWav);
-        return NULL;    /* File's too big. */
-    }
-
-    pSampleData = (float*)drwav__malloc_from_callbacks((size_t)sampleDataSize, &pWav->allocationCallbacks); /* <-- Safe cast due to the check above. */
-    if (pSampleData == NULL) {
-        drwav_uninit(pWav);
-        return NULL;    /* Failed to allocate memory. */
-    }
-
-    framesRead = drwav_read_pcm_frames_f32(pWav, (size_t)pWav->totalPCMFrameCount, pSampleData);
-    if (framesRead != pWav->totalPCMFrameCount) {
-        drwav__free_from_callbacks(pSampleData, &pWav->allocationCallbacks);
-        drwav_uninit(pWav);
-        return NULL;    /* There was an error reading the samples. */
-    }
-
-    drwav_uninit(pWav);
-
-    if (sampleRate) {
-        *sampleRate = pWav->sampleRate;
-    }
-    if (channels) {
-        *channels = pWav->channels;
-    }
-    if (totalFrameCount) {
-        *totalFrameCount = pWav->totalPCMFrameCount;
-    }
-
-    return pSampleData;
-}
-
-static drwav_int32* drwav__read_pcm_frames_and_close_s32(drwav* pWav, unsigned int* channels, unsigned int* sampleRate, drwav_uint64* totalFrameCount)
-{
-    drwav_uint64 sampleDataSize;
-    drwav_int32* pSampleData;
-    drwav_uint64 framesRead;
-
-    DRWAV_ASSERT(pWav != NULL);
-
-    sampleDataSize = pWav->totalPCMFrameCount * pWav->channels * sizeof(drwav_int32);
-    if (sampleDataSize > DRWAV_SIZE_MAX) {
-        drwav_uninit(pWav);
-        return NULL;    /* File's too big. */
-    }
-
-    pSampleData = (drwav_int32*)drwav__malloc_from_callbacks((size_t)sampleDataSize, &pWav->allocationCallbacks); /* <-- Safe cast due to the check above. */
-    if (pSampleData == NULL) {
-        drwav_uninit(pWav);
-        return NULL;    /* Failed to allocate memory. */
-    }
-
-    framesRead = drwav_read_pcm_frames_s32(pWav, (size_t)pWav->totalPCMFrameCount, pSampleData);
-    if (framesRead != pWav->totalPCMFrameCount) {
-        drwav__free_from_callbacks(pSampleData, &pWav->allocationCallbacks);
-        drwav_uninit(pWav);
-        return NULL;    /* There was an error reading the samples. */
-    }
-
-    drwav_uninit(pWav);
-
-    if (sampleRate) {
-        *sampleRate = pWav->sampleRate;
-    }
-    if (channels) {
-        *channels = pWav->channels;
-    }
-    if (totalFrameCount) {
-        *totalFrameCount = pWav->totalPCMFrameCount;
-    }
-
-    return pSampleData;
-}
-
-
-
-DRWAV_API drwav_int16* drwav_open_and_read_pcm_frames_s16(drwav_read_proc onRead, drwav_seek_proc onSeek, void* pUserData, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks)
-{
-    drwav wav;
-
-    if (channelsOut) {
-        *channelsOut = 0;
-    }
-    if (sampleRateOut) {
-        *sampleRateOut = 0;
-    }
-    if (totalFrameCountOut) {
-        *totalFrameCountOut = 0;
-    }
-
-    if (!drwav_init(&wav, onRead, onSeek, pUserData, pAllocationCallbacks)) {
-        return NULL;
-    }
-
-    return drwav__read_pcm_frames_and_close_s16(&wav, channelsOut, sampleRateOut, totalFrameCountOut);
-}
-
-DRWAV_API float* drwav_open_and_read_pcm_frames_f32(drwav_read_proc onRead, drwav_seek_proc onSeek, void* pUserData, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks)
-{
-    drwav wav;
-
-    if (channelsOut) {
-        *channelsOut = 0;
-    }
-    if (sampleRateOut) {
-        *sampleRateOut = 0;
-    }
-    if (totalFrameCountOut) {
-        *totalFrameCountOut = 0;
-    }
-
-    if (!drwav_init(&wav, onRead, onSeek, pUserData, pAllocationCallbacks)) {
-        return NULL;
-    }
-
-    return drwav__read_pcm_frames_and_close_f32(&wav, channelsOut, sampleRateOut, totalFrameCountOut);
-}
-
-DRWAV_API drwav_int32* drwav_open_and_read_pcm_frames_s32(drwav_read_proc onRead, drwav_seek_proc onSeek, void* pUserData, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks)
-{
-    drwav wav;
-
-    if (channelsOut) {
-        *channelsOut = 0;
-    }
-    if (sampleRateOut) {
-        *sampleRateOut = 0;
-    }
-    if (totalFrameCountOut) {
-        *totalFrameCountOut = 0;
-    }
-
-    if (!drwav_init(&wav, onRead, onSeek, pUserData, pAllocationCallbacks)) {
-        return NULL;
-    }
-
-    return drwav__read_pcm_frames_and_close_s32(&wav, channelsOut, sampleRateOut, totalFrameCountOut);
-}
-
-#ifndef DR_WAV_NO_STDIO
-DRWAV_API drwav_int16* drwav_open_file_and_read_pcm_frames_s16(const char* filename, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks)
-{
-    drwav wav;
-
-    if (channelsOut) {
-        *channelsOut = 0;
-    }
-    if (sampleRateOut) {
-        *sampleRateOut = 0;
-    }
-    if (totalFrameCountOut) {
-        *totalFrameCountOut = 0;
-    }
-
-    if (!drwav_init_file(&wav, filename, pAllocationCallbacks)) {
-        return NULL;
-    }
-
-    return drwav__read_pcm_frames_and_close_s16(&wav, channelsOut, sampleRateOut, totalFrameCountOut);
-}
-
-DRWAV_API float* drwav_open_file_and_read_pcm_frames_f32(const char* filename, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks)
-{
-    drwav wav;
-
-    if (channelsOut) {
-        *channelsOut = 0;
-    }
-    if (sampleRateOut) {
-        *sampleRateOut = 0;
-    }
-    if (totalFrameCountOut) {
-        *totalFrameCountOut = 0;
-    }
-
-    if (!drwav_init_file(&wav, filename, pAllocationCallbacks)) {
-        return NULL;
-    }
-
-    return drwav__read_pcm_frames_and_close_f32(&wav, channelsOut, sampleRateOut, totalFrameCountOut);
-}
-
-DRWAV_API drwav_int32* drwav_open_file_and_read_pcm_frames_s32(const char* filename, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks)
-{
-    drwav wav;
-
-    if (channelsOut) {
-        *channelsOut = 0;
-    }
-    if (sampleRateOut) {
-        *sampleRateOut = 0;
-    }
-    if (totalFrameCountOut) {
-        *totalFrameCountOut = 0;
-    }
-
-    if (!drwav_init_file(&wav, filename, pAllocationCallbacks)) {
-        return NULL;
-    }
-
-    return drwav__read_pcm_frames_and_close_s32(&wav, channelsOut, sampleRateOut, totalFrameCountOut);
-}
-
-
-DRWAV_API drwav_int16* drwav_open_file_and_read_pcm_frames_s16_w(const wchar_t* filename, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks)
-{
-    drwav wav;
-
-    if (sampleRateOut) {
-        *sampleRateOut = 0;
-    }
-    if (channelsOut) {
-        *channelsOut = 0;
-    }
-    if (totalFrameCountOut) {
-        *totalFrameCountOut = 0;
-    }
-
-    if (!drwav_init_file_w(&wav, filename, pAllocationCallbacks)) {
-        return NULL;
-    }
-
-    return drwav__read_pcm_frames_and_close_s16(&wav, channelsOut, sampleRateOut, totalFrameCountOut);
-}
-
-DRWAV_API float* drwav_open_file_and_read_pcm_frames_f32_w(const wchar_t* filename, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks)
-{
-    drwav wav;
-
-    if (sampleRateOut) {
-        *sampleRateOut = 0;
-    }
-    if (channelsOut) {
-        *channelsOut = 0;
-    }
-    if (totalFrameCountOut) {
-        *totalFrameCountOut = 0;
-    }
-
-    if (!drwav_init_file_w(&wav, filename, pAllocationCallbacks)) {
-        return NULL;
-    }
-
-    return drwav__read_pcm_frames_and_close_f32(&wav, channelsOut, sampleRateOut, totalFrameCountOut);
-}
-
-DRWAV_API drwav_int32* drwav_open_file_and_read_pcm_frames_s32_w(const wchar_t* filename, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks)
-{
-    drwav wav;
-
-    if (sampleRateOut) {
-        *sampleRateOut = 0;
-    }
-    if (channelsOut) {
-        *channelsOut = 0;
-    }
-    if (totalFrameCountOut) {
-        *totalFrameCountOut = 0;
-    }
-
-    if (!drwav_init_file_w(&wav, filename, pAllocationCallbacks)) {
-        return NULL;
-    }
-
-    return drwav__read_pcm_frames_and_close_s32(&wav, channelsOut, sampleRateOut, totalFrameCountOut);
-}
-#endif
-
-DRWAV_API drwav_int16* drwav_open_memory_and_read_pcm_frames_s16(const void* data, size_t dataSize, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks)
-{
-    drwav wav;
-
-    if (channelsOut) {
-        *channelsOut = 0;
-    }
-    if (sampleRateOut) {
-        *sampleRateOut = 0;
-    }
-    if (totalFrameCountOut) {
-        *totalFrameCountOut = 0;
-    }
-
-    if (!drwav_init_memory(&wav, data, dataSize, pAllocationCallbacks)) {
-        return NULL;
-    }
-
-    return drwav__read_pcm_frames_and_close_s16(&wav, channelsOut, sampleRateOut, totalFrameCountOut);
-}
-
-DRWAV_API float* drwav_open_memory_and_read_pcm_frames_f32(const void* data, size_t dataSize, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks)
-{
-    drwav wav;
-
-    if (channelsOut) {
-        *channelsOut = 0;
-    }
-    if (sampleRateOut) {
-        *sampleRateOut = 0;
-    }
-    if (totalFrameCountOut) {
-        *totalFrameCountOut = 0;
-    }
-
-    if (!drwav_init_memory(&wav, data, dataSize, pAllocationCallbacks)) {
-        return NULL;
-    }
-
-    return drwav__read_pcm_frames_and_close_f32(&wav, channelsOut, sampleRateOut, totalFrameCountOut);
-}
-
-DRWAV_API drwav_int32* drwav_open_memory_and_read_pcm_frames_s32(const void* data, size_t dataSize, unsigned int* channelsOut, unsigned int* sampleRateOut, drwav_uint64* totalFrameCountOut, const drwav_allocation_callbacks* pAllocationCallbacks)
-{
-    drwav wav;
-
-    if (channelsOut) {
-        *channelsOut = 0;
-    }
-    if (sampleRateOut) {
-        *sampleRateOut = 0;
-    }
-    if (totalFrameCountOut) {
-        *totalFrameCountOut = 0;
-    }
-
-    if (!drwav_init_memory(&wav, data, dataSize, pAllocationCallbacks)) {
-        return NULL;
-    }
-
-    return drwav__read_pcm_frames_and_close_s32(&wav, channelsOut, sampleRateOut, totalFrameCountOut);
-}
-#endif  /* DR_WAV_NO_CONVERSION_API */
-
-
-DRWAV_API void drwav_free(void* p, const drwav_allocation_callbacks* pAllocationCallbacks)
-{
-    if (pAllocationCallbacks != NULL) {
-        drwav__free_from_callbacks(p, pAllocationCallbacks);
-    } else {
-        drwav__free_default(p, NULL);
-    }
-}
-
-DRWAV_API drwav_uint16 drwav_bytes_to_u16(const drwav_uint8* data)
-{
-    return drwav__bytes_to_u16(data);
-}
-
-DRWAV_API drwav_int16 drwav_bytes_to_s16(const drwav_uint8* data)
-{
-    return drwav__bytes_to_s16(data);
-}
-
-DRWAV_API drwav_uint32 drwav_bytes_to_u32(const drwav_uint8* data)
-{
-    return drwav__bytes_to_u32(data);
-}
-
-DRWAV_API drwav_int32 drwav_bytes_to_s32(const drwav_uint8* data)
-{
-    return drwav__bytes_to_s32(data);
-}
-
-DRWAV_API drwav_uint64 drwav_bytes_to_u64(const drwav_uint8* data)
-{
-    return drwav__bytes_to_u64(data);
-}
-
-DRWAV_API drwav_int64 drwav_bytes_to_s64(const drwav_uint8* data)
-{
-    return drwav__bytes_to_s64(data);
-}
-
-
-DRWAV_API drwav_bool32 drwav_guid_equal(const drwav_uint8 a[16], const drwav_uint8 b[16])
-{
-    return drwav__guid_equal(a, b);
-}
-
-DRWAV_API drwav_bool32 drwav_fourcc_equal(const drwav_uint8* a, const char* b)
-{
-    return drwav__fourcc_equal(a, b);
-}
-
-#endif  /* dr_wav_c */
-#endif  /* DR_WAV_IMPLEMENTATION */
-
-/*
-RELEASE NOTES - v0.11.0
-=======================
-Version 0.11.0 has breaking API changes.
-
-Improved Client-Defined Memory Allocation
------------------------------------------
-The main change with this release is the addition of a more flexible way of implementing custom memory allocation routines. The
-existing system of DRWAV_MALLOC, DRWAV_REALLOC and DRWAV_FREE are still in place and will be used by default when no custom
-allocation callbacks are specified.
-
-To use the new system, you pass in a pointer to a drwav_allocation_callbacks object to drwav_init() and family, like this:
-
-    void* my_malloc(size_t sz, void* pUserData)
-    {
-        return malloc(sz);
-    }
-    void* my_realloc(void* p, size_t sz, void* pUserData)
-    {
-        return realloc(p, sz);
-    }
-    void my_free(void* p, void* pUserData)
-    {
-        free(p);
-    }
-
-    ...
-
-    drwav_allocation_callbacks allocationCallbacks;
-    allocationCallbacks.pUserData = &myData;
-    allocationCallbacks.onMalloc  = my_malloc;
-    allocationCallbacks.onRealloc = my_realloc;
-    allocationCallbacks.onFree    = my_free;
-    drwav_init_file(&wav, "my_file.wav", &allocationCallbacks);
-
-The advantage of this new system is that it allows you to specify user data which will be passed in to the allocation routines.
-
-Passing in null for the allocation callbacks object will cause dr_wav to use defaults which is the same as DRWAV_MALLOC,
-DRWAV_REALLOC and DRWAV_FREE and the equivalent of how it worked in previous versions.
-
-Every API that opens a drwav object now takes this extra parameter. These include the following:
-
-    drwav_init()
-    drwav_init_ex()
-    drwav_init_file()
-    drwav_init_file_ex()
-    drwav_init_file_w()
-    drwav_init_file_w_ex()
-    drwav_init_memory()
-    drwav_init_memory_ex()
-    drwav_init_write()
-    drwav_init_write_sequential()
-    drwav_init_write_sequential_pcm_frames()
-    drwav_init_file_write()
-    drwav_init_file_write_sequential()
-    drwav_init_file_write_sequential_pcm_frames()
-    drwav_init_file_write_w()
-    drwav_init_file_write_sequential_w()
-    drwav_init_file_write_sequential_pcm_frames_w()
-    drwav_init_memory_write()
-    drwav_init_memory_write_sequential()
-    drwav_init_memory_write_sequential_pcm_frames()
-    drwav_open_and_read_pcm_frames_s16()
-    drwav_open_and_read_pcm_frames_f32()
-    drwav_open_and_read_pcm_frames_s32()
-    drwav_open_file_and_read_pcm_frames_s16()
-    drwav_open_file_and_read_pcm_frames_f32()
-    drwav_open_file_and_read_pcm_frames_s32()
-    drwav_open_file_and_read_pcm_frames_s16_w()
-    drwav_open_file_and_read_pcm_frames_f32_w()
-    drwav_open_file_and_read_pcm_frames_s32_w()
-    drwav_open_memory_and_read_pcm_frames_s16()
-    drwav_open_memory_and_read_pcm_frames_f32()
-    drwav_open_memory_and_read_pcm_frames_s32()
-
-Endian Improvements
--------------------
-Previously, the following APIs returned little-endian audio data. These now return native-endian data. This improves compatibility
-on big-endian architectures.
-
-    drwav_read_pcm_frames()
-    drwav_read_pcm_frames_s16()
-    drwav_read_pcm_frames_s32()
-    drwav_read_pcm_frames_f32()
-    drwav_open_and_read_pcm_frames_s16()
-    drwav_open_and_read_pcm_frames_s32()
-    drwav_open_and_read_pcm_frames_f32()
-    drwav_open_file_and_read_pcm_frames_s16()
-    drwav_open_file_and_read_pcm_frames_s32()
-    drwav_open_file_and_read_pcm_frames_f32()
-    drwav_open_file_and_read_pcm_frames_s16_w()
-    drwav_open_file_and_read_pcm_frames_s32_w()
-    drwav_open_file_and_read_pcm_frames_f32_w()
-    drwav_open_memory_and_read_pcm_frames_s16()
-    drwav_open_memory_and_read_pcm_frames_s32()
-    drwav_open_memory_and_read_pcm_frames_f32()
-
-APIs have been added to give you explicit control over whether or not audio data is read or written in big- or little-endian byte
-order:
-
-    drwav_read_pcm_frames_le()
-    drwav_read_pcm_frames_be()
-    drwav_read_pcm_frames_s16le()
-    drwav_read_pcm_frames_s16be()
-    drwav_read_pcm_frames_f32le()
-    drwav_read_pcm_frames_f32be()
-    drwav_read_pcm_frames_s32le()
-    drwav_read_pcm_frames_s32be()
-    drwav_write_pcm_frames_le()
-    drwav_write_pcm_frames_be()
-
-Removed APIs
-------------
-The following APIs were deprecated in version 0.10.0 and have now been removed:
-
-    drwav_open()
-    drwav_open_ex()
-    drwav_open_write()
-    drwav_open_write_sequential()
-    drwav_open_file()
-    drwav_open_file_ex()
-    drwav_open_file_write()
-    drwav_open_file_write_sequential()
-    drwav_open_memory()
-    drwav_open_memory_ex()
-    drwav_open_memory_write()
-    drwav_open_memory_write_sequential()
-    drwav_close()
-
-
-
-RELEASE NOTES - v0.10.0
-=======================
-Version 0.10.0 has breaking API changes. There are no significant bug fixes in this release, so if you are affected you do
-not need to upgrade.
-
-Removed APIs
-------------
-The following APIs were deprecated in version 0.9.0 and have been completely removed in version 0.10.0:
-
-    drwav_read()
-    drwav_read_s16()
-    drwav_read_f32()
-    drwav_read_s32()
-    drwav_seek_to_sample()
-    drwav_write()
-    drwav_open_and_read_s16()
-    drwav_open_and_read_f32()
-    drwav_open_and_read_s32()
-    drwav_open_file_and_read_s16()
-    drwav_open_file_and_read_f32()
-    drwav_open_file_and_read_s32()
-    drwav_open_memory_and_read_s16()
-    drwav_open_memory_and_read_f32()
-    drwav_open_memory_and_read_s32()
-    drwav::totalSampleCount
-
-See release notes for version 0.9.0 at the bottom of this file for replacement APIs.
-
-Deprecated APIs
----------------
-The following APIs have been deprecated. There is a confusing and completely arbitrary difference between drwav_init*() and
-drwav_open*(), where drwav_init*() initializes a pre-allocated drwav object, whereas drwav_open*() will first allocated a
-drwav object on the heap and then initialize it. drwav_open*() has been deprecated which means you must now use a pre-
-allocated drwav object with drwav_init*(). If you need the previous functionality, you can just do a malloc() followed by
-a called to one of the drwav_init*() APIs.
-
-    drwav_open()
-    drwav_open_ex()
-    drwav_open_write()
-    drwav_open_write_sequential()
-    drwav_open_file()
-    drwav_open_file_ex()
-    drwav_open_file_write()
-    drwav_open_file_write_sequential()
-    drwav_open_memory()
-    drwav_open_memory_ex()
-    drwav_open_memory_write()
-    drwav_open_memory_write_sequential()
-    drwav_close()
-
-These APIs will be removed completely in a future version. The rationale for this change is to remove confusion between the
-two different ways to initialize a drwav object.
-*/
-
-/*
-REVISION HISTORY
-================
-v0.12.16 - 2020-12-02
-  - Fix a bug when trying to read more bytes than can fit in a size_t.
-
-v0.12.15 - 2020-11-21
-  - Fix compilation with OpenWatcom.
-
-v0.12.14 - 2020-11-13
-  - Minor code clean up.
-
-v0.12.13 - 2020-11-01
-  - Improve compiler support for older versions of GCC.
-
-v0.12.12 - 2020-09-28
-  - Add support for RF64.
-  - Fix a bug in writing mode where the size of the RIFF chunk incorrectly includes the header section.
-
-v0.12.11 - 2020-09-08
-  - Fix a compilation error on older compilers.
-
-v0.12.10 - 2020-08-24
-  - Fix a bug when seeking with ADPCM formats.
-
-v0.12.9 - 2020-08-02
-  - Simplify sized types.
-
-v0.12.8 - 2020-07-25
-  - Fix a compilation warning.
-
-v0.12.7 - 2020-07-15
-  - Fix some bugs on big-endian architectures.
-  - Fix an error in s24 to f32 conversion.
-
-v0.12.6 - 2020-06-23
-  - Change drwav_read_*() to allow NULL to be passed in as the output buffer which is equivalent to a forward seek.
-  - Fix a buffer overflow when trying to decode invalid IMA-ADPCM files.
-  - Add include guard for the implementation section.
-
-v0.12.5 - 2020-05-27
-  - Minor documentation fix.
-
-v0.12.4 - 2020-05-16
-  - Replace assert() with DRWAV_ASSERT().
-  - Add compile-time and run-time version querying.
-    - DRWAV_VERSION_MINOR
-    - DRWAV_VERSION_MAJOR
-    - DRWAV_VERSION_REVISION
-    - DRWAV_VERSION_STRING
-    - drwav_version()
-    - drwav_version_string()
-
-v0.12.3 - 2020-04-30
-  - Fix compilation errors with VC6.
-
-v0.12.2 - 2020-04-21
-  - Fix a bug where drwav_init_file() does not close the file handle after attempting to load an erroneous file.
-
-v0.12.1 - 2020-04-13
-  - Fix some pedantic warnings.
-
-v0.12.0 - 2020-04-04
-  - API CHANGE: Add container and format parameters to the chunk callback.
-  - Minor documentation updates.
-
-v0.11.5 - 2020-03-07
-  - Fix compilation error with Visual Studio .NET 2003.
-
-v0.11.4 - 2020-01-29
-  - Fix some static analysis warnings.
-  - Fix a bug when reading f32 samples from an A-law encoded stream.
-
-v0.11.3 - 2020-01-12
-  - Minor changes to some f32 format conversion routines.
-  - Minor bug fix for ADPCM conversion when end of file is reached.
-
-v0.11.2 - 2019-12-02
-  - Fix a possible crash when using custom memory allocators without a custom realloc() implementation.
-  - Fix an integer overflow bug.
-  - Fix a null pointer dereference bug.
-  - Add limits to sample rate, channels and bits per sample to tighten up some validation.
-
-v0.11.1 - 2019-10-07
-  - Internal code clean up.
-
-v0.11.0 - 2019-10-06
-  - API CHANGE: Add support for user defined memory allocation routines. This system allows the program to specify their own memory allocation
-    routines with a user data pointer for client-specific contextual data. This adds an extra parameter to the end of the following APIs:
-    - drwav_init()
-    - drwav_init_ex()
-    - drwav_init_file()
-    - drwav_init_file_ex()
-    - drwav_init_file_w()
-    - drwav_init_file_w_ex()
-    - drwav_init_memory()
-    - drwav_init_memory_ex()
-    - drwav_init_write()
-    - drwav_init_write_sequential()
-    - drwav_init_write_sequential_pcm_frames()
-    - drwav_init_file_write()
-    - drwav_init_file_write_sequential()
-    - drwav_init_file_write_sequential_pcm_frames()
-    - drwav_init_file_write_w()
-    - drwav_init_file_write_sequential_w()
-    - drwav_init_file_write_sequential_pcm_frames_w()
-    - drwav_init_memory_write()
-    - drwav_init_memory_write_sequential()
-    - drwav_init_memory_write_sequential_pcm_frames()
-    - drwav_open_and_read_pcm_frames_s16()
-    - drwav_open_and_read_pcm_frames_f32()
-    - drwav_open_and_read_pcm_frames_s32()
-    - drwav_open_file_and_read_pcm_frames_s16()
-    - drwav_open_file_and_read_pcm_frames_f32()
-    - drwav_open_file_and_read_pcm_frames_s32()
-    - drwav_open_file_and_read_pcm_frames_s16_w()
-    - drwav_open_file_and_read_pcm_frames_f32_w()
-    - drwav_open_file_and_read_pcm_frames_s32_w()
-    - drwav_open_memory_and_read_pcm_frames_s16()
-    - drwav_open_memory_and_read_pcm_frames_f32()
-    - drwav_open_memory_and_read_pcm_frames_s32()
-    Set this extra parameter to NULL to use defaults which is the same as the previous behaviour. Setting this NULL will use
-    DRWAV_MALLOC, DRWAV_REALLOC and DRWAV_FREE.
-  - Add support for reading and writing PCM frames in an explicit endianness. New APIs:
-    - drwav_read_pcm_frames_le()
-    - drwav_read_pcm_frames_be()
-    - drwav_read_pcm_frames_s16le()
-    - drwav_read_pcm_frames_s16be()
-    - drwav_read_pcm_frames_f32le()
-    - drwav_read_pcm_frames_f32be()
-    - drwav_read_pcm_frames_s32le()
-    - drwav_read_pcm_frames_s32be()
-    - drwav_write_pcm_frames_le()
-    - drwav_write_pcm_frames_be()
-  - Remove deprecated APIs.
-  - API CHANGE: The following APIs now return native-endian data. Previously they returned little-endian data.
-    - drwav_read_pcm_frames()
-    - drwav_read_pcm_frames_s16()
-    - drwav_read_pcm_frames_s32()
-    - drwav_read_pcm_frames_f32()
-    - drwav_open_and_read_pcm_frames_s16()
-    - drwav_open_and_read_pcm_frames_s32()
-    - drwav_open_and_read_pcm_frames_f32()
-    - drwav_open_file_and_read_pcm_frames_s16()
-    - drwav_open_file_and_read_pcm_frames_s32()
-    - drwav_open_file_and_read_pcm_frames_f32()
-    - drwav_open_file_and_read_pcm_frames_s16_w()
-    - drwav_open_file_and_read_pcm_frames_s32_w()
-    - drwav_open_file_and_read_pcm_frames_f32_w()
-    - drwav_open_memory_and_read_pcm_frames_s16()
-    - drwav_open_memory_and_read_pcm_frames_s32()
-    - drwav_open_memory_and_read_pcm_frames_f32()
-
-v0.10.1 - 2019-08-31
-  - Correctly handle partial trailing ADPCM blocks.
-
-v0.10.0 - 2019-08-04
-  - Remove deprecated APIs.
-  - Add wchar_t variants for file loading APIs:
-      drwav_init_file_w()
-      drwav_init_file_ex_w()
-      drwav_init_file_write_w()
-      drwav_init_file_write_sequential_w()
-  - Add drwav_target_write_size_bytes() which calculates the total size in bytes of a WAV file given a format and sample count.
-  - Add APIs for specifying the PCM frame count instead of the sample count when opening in sequential write mode:
-      drwav_init_write_sequential_pcm_frames()
-      drwav_init_file_write_sequential_pcm_frames()
-      drwav_init_file_write_sequential_pcm_frames_w()
-      drwav_init_memory_write_sequential_pcm_frames()
-  - Deprecate drwav_open*() and drwav_close():
-      drwav_open()
-      drwav_open_ex()
-      drwav_open_write()
-      drwav_open_write_sequential()
-      drwav_open_file()
-      drwav_open_file_ex()
-      drwav_open_file_write()
-      drwav_open_file_write_sequential()
-      drwav_open_memory()
-      drwav_open_memory_ex()
-      drwav_open_memory_write()
-      drwav_open_memory_write_sequential()
-      drwav_close()
-  - Minor documentation updates.
-
-v0.9.2 - 2019-05-21
-  - Fix warnings.
-
-v0.9.1 - 2019-05-05
-  - Add support for C89.
-  - Change license to choice of public domain or MIT-0.
-
-v0.9.0 - 2018-12-16
-  - API CHANGE: Add new reading APIs for reading by PCM frames instead of samples. Old APIs have been deprecated and
-    will be removed in v0.10.0. Deprecated APIs and their replacements:
-      drwav_read()                     -> drwav_read_pcm_frames()
-      drwav_read_s16()                 -> drwav_read_pcm_frames_s16()
-      drwav_read_f32()                 -> drwav_read_pcm_frames_f32()
-      drwav_read_s32()                 -> drwav_read_pcm_frames_s32()
-      drwav_seek_to_sample()           -> drwav_seek_to_pcm_frame()
-      drwav_write()                    -> drwav_write_pcm_frames()
-      drwav_open_and_read_s16()        -> drwav_open_and_read_pcm_frames_s16()
-      drwav_open_and_read_f32()        -> drwav_open_and_read_pcm_frames_f32()
-      drwav_open_and_read_s32()        -> drwav_open_and_read_pcm_frames_s32()
-      drwav_open_file_and_read_s16()   -> drwav_open_file_and_read_pcm_frames_s16()
-      drwav_open_file_and_read_f32()   -> drwav_open_file_and_read_pcm_frames_f32()
-      drwav_open_file_and_read_s32()   -> drwav_open_file_and_read_pcm_frames_s32()
-      drwav_open_memory_and_read_s16() -> drwav_open_memory_and_read_pcm_frames_s16()
-      drwav_open_memory_and_read_f32() -> drwav_open_memory_and_read_pcm_frames_f32()
-      drwav_open_memory_and_read_s32() -> drwav_open_memory_and_read_pcm_frames_s32()
-      drwav::totalSampleCount          -> drwav::totalPCMFrameCount
-  - API CHANGE: Rename drwav_open_and_read_file_*() to drwav_open_file_and_read_*().
-  - API CHANGE: Rename drwav_open_and_read_memory_*() to drwav_open_memory_and_read_*().
-  - Add built-in support for smpl chunks.
-  - Add support for firing a callback for each chunk in the file at initialization time.
-    - This is enabled through the drwav_init_ex(), etc. family of APIs.
-  - Handle invalid FMT chunks more robustly.
-
-v0.8.5 - 2018-09-11
-  - Const correctness.
-  - Fix a potential stack overflow.
-
-v0.8.4 - 2018-08-07
-  - Improve 64-bit detection.
-
-v0.8.3 - 2018-08-05
-  - Fix C++ build on older versions of GCC.
-
-v0.8.2 - 2018-08-02
-  - Fix some big-endian bugs.
-
-v0.8.1 - 2018-06-29
-  - Add support for sequential writing APIs.
-  - Disable seeking in write mode.
-  - Fix bugs with Wave64.
-  - Fix typos.
-
-v0.8 - 2018-04-27
-  - Bug fix.
-  - Start using major.minor.revision versioning.
-
-v0.7f - 2018-02-05
-  - Restrict ADPCM formats to a maximum of 2 channels.
-
-v0.7e - 2018-02-02
-  - Fix a crash.
-
-v0.7d - 2018-02-01
-  - Fix a crash.
-
-v0.7c - 2018-02-01
-  - Set drwav.bytesPerSample to 0 for all compressed formats.
-  - Fix a crash when reading 16-bit floating point WAV files. In this case dr_wav will output silence for
-    all format conversion reading APIs (*_s16, *_s32, *_f32 APIs).
-  - Fix some divide-by-zero errors.
-
-v0.7b - 2018-01-22
-  - Fix errors with seeking of compressed formats.
-  - Fix compilation error when DR_WAV_NO_CONVERSION_API
-
-v0.7a - 2017-11-17
-  - Fix some GCC warnings.
-
-v0.7 - 2017-11-04
-  - Add writing APIs.
-
-v0.6 - 2017-08-16
-  - API CHANGE: Rename dr_* types to drwav_*.
-  - Add support for custom implementations of malloc(), realloc(), etc.
-  - Add support for Microsoft ADPCM.
-  - Add support for IMA ADPCM (DVI, format code 0x11).
-  - Optimizations to drwav_read_s16().
-  - Bug fixes.
-
-v0.5g - 2017-07-16
-  - Change underlying type for booleans to unsigned.
-
-v0.5f - 2017-04-04
-  - Fix a minor bug with drwav_open_and_read_s16() and family.
-
-v0.5e - 2016-12-29
-  - Added support for reading samples as signed 16-bit integers. Use the _s16() family of APIs for this.
-  - Minor fixes to documentation.
-
-v0.5d - 2016-12-28
-  - Use drwav_int* and drwav_uint* sized types to improve compiler support.
-
-v0.5c - 2016-11-11
-  - Properly handle JUNK chunks that come before the FMT chunk.
-
-v0.5b - 2016-10-23
-  - A minor change to drwav_bool8 and drwav_bool32 types.
-
-v0.5a - 2016-10-11
-  - Fixed a bug with drwav_open_and_read() and family due to incorrect argument ordering.
-  - Improve A-law and mu-law efficiency.
-
-v0.5 - 2016-09-29
-  - API CHANGE. Swap the order of "channels" and "sampleRate" parameters in drwav_open_and_read*(). Rationale for this is to
-    keep it consistent with dr_audio and dr_flac.
-
-v0.4b - 2016-09-18
-  - Fixed a typo in documentation.
-
-v0.4a - 2016-09-18
-  - Fixed a typo.
-  - Change date format to ISO 8601 (YYYY-MM-DD)
-
-v0.4 - 2016-07-13
-  - API CHANGE. Make onSeek consistent with dr_flac.
-  - API CHANGE. Rename drwav_seek() to drwav_seek_to_sample() for clarity and consistency with dr_flac.
-  - Added support for Sony Wave64.
-
-v0.3a - 2016-05-28
-  - API CHANGE. Return drwav_bool32 instead of int in onSeek callback.
-  - Fixed a memory leak.
-
-v0.3 - 2016-05-22
-  - Lots of API changes for consistency.
-
-v0.2a - 2016-05-16
-  - Fixed Linux/GCC build.
-
-v0.2 - 2016-05-11
-  - Added support for reading data as signed 32-bit PCM for consistency with dr_flac.
-
-v0.1a - 2016-05-07
-  - Fixed a bug in drwav_open_file() where the file handle would not be closed if the loader failed to initialize.
-
-v0.1 - 2016-05-04
-  - Initial versioned release.
-*/
-
-/*
-This software is available as a choice of the following licenses. Choose
-whichever you prefer.
-
-===============================================================================
-ALTERNATIVE 1 - Public Domain (www.unlicense.org)
-===============================================================================
-This is free and unencumbered software released into the public domain.
-
-Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
-software, either in source code form or as a compiled binary, for any purpose,
-commercial or non-commercial, and by any means.
-
-In jurisdictions that recognize copyright laws, the author or authors of this
-software dedicate any and all copyright interest in the software to the public
-domain. We make this dedication for the benefit of the public at large and to
-the detriment of our heirs and successors. We intend this dedication to be an
-overt act of relinquishment in perpetuity of all present and future rights to
-this software under copyright law.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-For more information, please refer to <http://unlicense.org/>
-
-===============================================================================
-ALTERNATIVE 2 - MIT No Attribution
-===============================================================================
-Copyright 2020 David Reid
-
-Permission is hereby granted, free of charge, to any person obtaining a copy of
-this software and associated documentation files (the "Software"), to deal in
-the Software without restriction, including without limitation the rights to
-use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
-of the Software, and to permit persons to whom the Software is furnished to do
-so.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-*/
diff --git a/ggml/examples/gpt-2/CMakeLists.txt b/ggml/examples/gpt-2/CMakeLists.txt
deleted file mode 100644
index 91f15f0..0000000
--- a/ggml/examples/gpt-2/CMakeLists.txt
+++ /dev/null
@@ -1,48 +0,0 @@
-#
-# gpt-2
-
-set(TEST_TARGET gpt-2-ctx)
-add_executable(${TEST_TARGET} main-ctx.cpp)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)
-
-set(TEST_TARGET gpt-2-alloc)
-add_executable(${TEST_TARGET} main-alloc.cpp)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)
-
-set(TEST_TARGET gpt-2-backend)
-add_executable(${TEST_TARGET} main-backend.cpp)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)
-
-set(TEST_TARGET gpt-2-backend2)
-add_executable(${TEST_TARGET} main.cpp)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)
-
-#
-# gpt-2-quantize
-
-set(TEST_TARGET gpt-2-quantize)
-add_executable(${TEST_TARGET} quantize.cpp)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)
-
-#
-# gpt-2-batched
-
-set(TEST_TARGET gpt-2-batched)
-add_executable(${TEST_TARGET} main-batched.cpp)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)
-
-
-#
-# For GPU offloading
-
-if (GGML_CUBLAS)
-    add_compile_definitions(GGML_USE_CUBLAS)
-endif()
-
-if (GGML_CLBLAST)
-    add_compile_definitions(GGML_USE_CLBLAST)
-endif()
-
-if (GGML_METAL)
-    add_compile_definitions(GGML_USE_METAL)
-endif()
diff --git a/ggml/examples/gpt-2/README.md b/ggml/examples/gpt-2/README.md
deleted file mode 100644
index 45c932c..0000000
--- a/ggml/examples/gpt-2/README.md
+++ /dev/null
@@ -1,225 +0,0 @@
-# gpt-2
-
-This is a C++ example running GPT-2 inference using the [ggml](https://github.com/ggerganov/ggml) library.
-
-The program runs on the CPU - no video card is required.
-
-The [Cerebras-GPT](https://huggingface.co/cerebras) models are also supported.
-
-The example supports the following GPT-2 models:
-
-| Model | Description  | Disk Size |
-| ---   | ---          | ---       |
-| 117M  | Small model  | 240 MB    |
-| 345M  | Medium model | 680 MB    |
-| 774M  | Large model  | 1.5 GB    |
-| 1558M | XL model     | 3.0 GB    |
-
-Sample performance on MacBook M1 Pro:
-
-| Model | Size  | Time / Token |
-| ---   | ---   | ---    |
-| GPT-2 |  117M |   5 ms |
-| GPT-2 |  345M |  12 ms |
-| GPT-2 |  774M |  23 ms |
-| GPT-2 | 1558M |  42 ms |
-
-*TODO: add tables for Cerebras-GPT models*
-
-Sample output:
-
-```
-$ ./bin/gpt-2 -h
-usage: ./bin/gpt-2 [options]
-
-options:
-  -h, --help            show this help message and exit
-  -s SEED, --seed SEED  RNG seed (default: -1)
-  -t N, --threads N     number of threads to use during computation (default: 8)
-  -p PROMPT, --prompt PROMPT
-                        prompt to start generation with (default: random)
-  -n N, --n_predict N   number of tokens to predict (default: 200)
-  --top_k N             top-k sampling (default: 40)
-  --top_p N             top-p sampling (default: 0.9)
-  --temp N              temperature (default: 1.0)
-  -b N, --batch_size N  batch size for prompt processing (default: 8)
-  -m FNAME, --model FNAME
-                        model path (default: models/gpt-2-117M/ggml-model.bin)
-
-$ ./bin/gpt-2
-gpt2_model_load: loading model from 'models/gpt-2-117M/ggml-model.bin'
-gpt2_model_load: n_vocab = 50257
-gpt2_model_load: n_ctx   = 1024
-gpt2_model_load: n_embd  = 768
-gpt2_model_load: n_head  = 12
-gpt2_model_load: n_layer = 12
-gpt2_model_load: f16     = 1
-gpt2_model_load: ggml ctx size = 311.12 MB
-gpt2_model_load: memory size =    72.00 MB, n_mem = 12288
-gpt2_model_load: model size  =   239.08 MB
-main: number of tokens in prompt = 1
-
-So this is going to be the end of the line for us.
-
-If the Dolphins continue to do their business, it's possible that the team could make a bid to bring in new defensive coordinator Scott Linehan.
-
-Linehan's job is a little daunting, but he's a great coach and an excellent coach. I don't believe we're going to make the playoffs.
-
-We're going to have to work hard to keep our heads down and get ready to go.<|endoftext|>
-
-main: mem per token =  2048612 bytes
-main:     load time =   106.32 ms
-main:   sample time =     7.10 ms
-main:  predict time =   506.40 ms / 5.06 ms per token
-main:    total time =   629.84 ms
-```
-
-## Downloading and converting the original models (GPT-2)
-
-You can download the original model files using the [download-model.sh](download-model.sh) Bash script. The models are
-in Tensorflow format, so in order to use them with ggml, you need to convert them to appropriate format. This is done
-via the [convert-ckpt-to-ggml.py](convert-ckpt-to-ggml.py) python script.
-
-Here is the entire process for the GPT-2 117M model (download from official site + conversion):
-
-```
-cd ggml/build
-../examples/gpt-2/download-model.sh 117M
-
-Downloading model 117M ...
-models/gpt-2-117M/checkpoint                      100%[=============================>]      77  --.-KB/s    in 0s
-models/gpt-2-117M/encoder.json                    100%[=============================>]   1018K  1.20MB/s    in 0.8s
-models/gpt-2-117M/hparams.json                    100%[=============================>]      90  --.-KB/s    in 0s
-models/gpt-2-117M/model.ckpt.data-00000-of-00001  100%[=============================>] 474.70M  1.21MB/s    in 8m 39s
-models/gpt-2-117M/model.ckpt.index                100%[=============================>]   5.09K  --.-KB/s    in 0s
-models/gpt-2-117M/model.ckpt.meta                 100%[=============================>] 460.11K   806KB/s    in 0.6s
-models/gpt-2-117M/vocab.bpe                       100%[=============================>] 445.62K   799KB/s    in 0.6s
-Done! Model '117M' saved in 'models/gpt-2-117M/'
-
-Run the convert-ckpt-to-ggml.py script to convert the model to ggml format.
-
-  python /Users/john/ggml/examples/gpt-2/convert-ckpt-to-ggml.py models/gpt-2-117M/ 1
-
-```
-
-This conversion requires that you have python and Tensorflow installed on your computer. Still, if you want to avoid
-this, you can download the already converted ggml models as described below.
-
-## Downloading and converting the original models (Cerebras-GPT)
-
-Clone the respective repository from here: https://huggingface.co/cerebras
-
-Use the [convert-cerebras-to-ggml.py](convert-cerebras-to-ggml.py) script to convert the model to `ggml` format:
-
-```
-cd ggml/build
-git clone https://huggingface.co/cerebras/Cerebras-GPT-111M models/
-python ../examples/gpt-2/convert-cerebras-to-ggml.py models/Cerebras-GPT-111M/
-
-```
-
-## Downloading the ggml model directly (GPT-2)
-
-For convenience, I will be hosting the converted ggml model files in order to make it easier to run the examples. This
-way, you can directly download a single binary file and start using it. No python or Tensorflow is required.
-
-Here is how to get the 117M ggml model:
-
-```
-cd ggml/build
-../examples/gpt-2/download-ggml-model.sh 117M
-
-Downloading ggml model 117M ...
-models/gpt-2-117M/ggml-model.bin         100%[===============================>] 239.58M  8.52MB/s    in 28s
-Done! Model '117M' saved in 'models/gpt-2-117M/ggml-model.bin'
-You can now use it like this:
-
-  $ ./bin/gpt-2 -m models/gpt-2-117M/ggml-model.bin -p "This is an example"
-
-```
-
-At some point, I might decide to stop hosting these models. So in that case, simply revert to the manual process above.
-
-## Quantizing the models
-
-You can also try to quantize the `ggml` models via 4-bit integer quantization.
-Keep in mind that for smaller models, this will render them completely useless.
-You generally want to quantize larger models.
-
-```
-# quantize GPT-2 F16 to Q4_0 (faster but less precise)
-./bin/gpt-2-quantize models/gpt-2-1558M/ggml-model-f16.bin models/gpt-2-1558M/ggml-model-q4_0.bin 2
-./bin/gpt-2 -m models/gpt-2-1558M/ggml-model-q4_0.bin -p "This is an example"
-
-# quantize Cerebras F16 to Q4_1 (slower but more precise)
-./bin/gpt-2-quantize models/Cerebras-GPT-6.7B/ggml-model-f16.bin models/Cerebras-GPT-6.7B/ggml-model-q4_1.bin 3
-./bin/gpt-2 -m models/Cerebras-GPT-6.7B/ggml-model-q4_1.bin -p "This is an example"
-
-```
-
-## Batched generation example
-
-You can try the batched generation from a given prompt using the gpt-2-batched binary.
-
-Sample output:
-
-```
-$ gpt-2-batched -np 5 -m models/gpt-2-117M/ggml-model.bin -p "Hello my name is" -n 50
-
-main: seed = 1697037431
-gpt2_model_load: loading model from 'models/gpt-2-117M/ggml-model.bin'
-gpt2_model_load: n_vocab = 50257
-gpt2_model_load: n_ctx   = 1024
-gpt2_model_load: n_embd  = 768
-gpt2_model_load: n_head  = 12
-gpt2_model_load: n_layer = 12
-gpt2_model_load: ftype   = 1
-gpt2_model_load: qntvr   = 0
-gpt2_model_load: ggml tensor size    = 320 bytes
-gpt2_model_load: backend buffer size = 312.72 MB
-ggml_init_cublas: found 1 CUDA devices:
-  Device 0: NVIDIA GeForce GTX 1660, compute capability 7.5
-gpt2_model_load: using CPU backend
-gpt2_model_load: memory size =    72.00 MB, n_mem = 12288
-gpt2_model_load: model size  =   239.08 MB
-extract_tests_from_file : No test file found.
-test_gpt_tokenizer : 0 tests failed out of 0 tests.
-main: compute buffer size: 3.26 MB
-
-
-main: generating 5 sequences ...
-main: prompt: 'Hello my name is'
-main: number of tokens in prompt = 4, first 8 tokens: 15496 616 1438 318
-
-
-sequence 0:
-
-Hello my name is John. You can call me any way you want, if you want, but for my very first date, I will be on the phone with you. We're both in our early 20s, but I feel like it's all
-
-sequence 1:
-
-Hello my name is Robert, and I want to say that we're proud to have your company here on the world's largest platform for sharing your stories with us. This is a huge opportunity for our community. We have hundreds of people on this team and
-
-sequence 2:
-
-Hello my name is Jack. I'm the one who created you.
-
-Jack is a boy with a big smile and a big heart. He is a handsome guy. He loves the outdoors and loves the people he meets. He wants to be a
-
-sequence 3:
-
-Hello my name is John. I am a Canadian citizen with a large number of family in Quebec and I am interested in studying. My aim is to take up a post in the Journal of the International Academy of Sciences of Canada which I am currently finishing.
-
-sequence 4:
-
-Hello my name is Dan. I am an entrepreneur. I am a great father. I am a great husband. I am a great husband. I am a great dad. And I am a great husband.
-
-I love my life. I love
-
-
-
-main:     load time =   880.80 ms
-main:   sample time =    91.43 ms
-main:  predict time =  2518.29 ms
-main:    total time =  3544.32 ms
-```
diff --git a/ggml/examples/gpt-2/convert-cerebras-to-ggml.py b/ggml/examples/gpt-2/convert-cerebras-to-ggml.py
deleted file mode 100644
index 6057f81..0000000
--- a/ggml/examples/gpt-2/convert-cerebras-to-ggml.py
+++ /dev/null
@@ -1,183 +0,0 @@
-# Convert Cerebras models to ggml format
-#
-# ref: https://www.cerebras.net/blog/cerebras-gpt-a-family-of-open-compute-efficient-large-language-models/
-#
-
-import sys
-import struct
-import json
-import torch
-import numpy as np
-import re
-
-from transformers import AutoModelForCausalLM
-
-# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
-def bytes_to_unicode():
-    """
-    Returns list of utf-8 byte and a corresponding list of unicode strings.
-    The reversible bpe codes work on unicode strings.
-    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
-    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
-    This is a signficant percentage of your normal, say, 32K bpe vocab.
-    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
-    And avoids mapping to whitespace/control characters the bpe code barfs on.
-    """
-    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
-    cs = bs[:]
-    n = 0
-    for b in range(2**8):
-        if b not in bs:
-            bs.append(b)
-            cs.append(2**8+n)
-            n += 1
-    cs = [chr(n) for n in cs]
-    return dict(zip(bs, cs))
-
-if len(sys.argv) < 2:
-    print("Usage: convert-cerebras-to-ggml.py dir-model [use-f32]\n")
-    sys.exit(1)
-
-# output in the same directory as the model
-dir_model = sys.argv[1]
-fname_out = sys.argv[1] + "/ggml-model-f16.bin"
-
-with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
-    encoder = json.load(f)
-
-with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
-    hparams = json.load(f)
-
-# use 16-bit or 32-bit floats
-use_f16 = True
-if len(sys.argv) > 2:
-    use_f16 = False
-    fname_out = sys.argv[1] + "/ggml-model-f32.bin"
-
-model = AutoModelForCausalLM.from_pretrained(dir_model, low_cpu_mem_usage=True)
-#print (model)
-
-list_vars = model.state_dict()
-#print (list_vars)
-
-print(hparams)
-
-fout = open(fname_out, "wb")
-
-fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
-fout.write(struct.pack("i", hparams["vocab_size"]))
-fout.write(struct.pack("i", hparams["n_positions"]))
-fout.write(struct.pack("i", hparams["n_embd"]))
-fout.write(struct.pack("i", hparams["n_head"]))
-fout.write(struct.pack("i", hparams["n_layer"]))
-fout.write(struct.pack("i", use_f16))
-
-byte_encoder = bytes_to_unicode()
-byte_decoder = {v:k for k, v in byte_encoder.items()}
-
-fout.write(struct.pack("i", len(encoder)))
-
-for key in encoder:
-    text = bytearray([byte_decoder[c] for c in key])
-    fout.write(struct.pack("i", len(text)))
-    fout.write(text)
-
-for name in list_vars.keys():
-    data = list_vars[name].squeeze().numpy()
-    print("Processing variable: " + name + " with shape: ", data.shape)
-
-    # rename headers to keep compatibility
-    if name == "transformer.ln_f.weight":
-        name = "model/ln_f/g"
-    elif name == "transformer.ln_f.bias":
-        name = "model/ln_f/b"
-    elif name == "transformer.wte.weight":
-        name = "model/wte"
-    elif name == "transformer.wpe.weight":
-        name = "model/wpe"
-    elif name == "lm_head.weight":
-        name = "model/lm_head"
-    elif re.match(r"transformer.h\.\d+\.ln_1\.weight", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/ln_1/g"
-    elif re.match(r"transformer.h\.\d+\.ln_1\.bias", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/ln_1/b"
-    elif re.match(r"transformer.h\.\d+\.attn\.c_attn\.weight", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/attn/c_attn/w"
-    elif re.match(r"transformer.h\.\d+\.attn\.c_attn\.bias", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/attn/c_attn/b"
-    elif re.match(r"transformer.h\.\d+\.attn\.c_proj\.weight", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/attn/c_proj/w"
-    elif re.match(r"transformer.h.\d+.attn.c_proj.bias", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/attn/c_proj/b"
-    elif re.match(r"transformer.h.\d+.ln_2.weight", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/ln_2/g"
-    elif re.match(r"transformer.h.\d+.ln_2.bias", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/ln_2/b"
-    elif re.match(r"transformer.h.\d+.mlp.c_fc.weight", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/mlp/c_fc/w"
-    elif re.match(r"transformer.h.\d+.mlp.c_fc.bias", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/mlp/c_fc/b"
-    elif re.match(r"transformer.h.\d+.mlp.c_proj.weight", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/mlp/c_proj/w"
-    elif re.match(r"transformer.h.\d+.mlp.c_proj.bias", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/mlp/c_proj/b"
-    else:
-        print("Unrecognized variable name. %s", name)
-
-    # we don't need these
-    if name.endswith("attn.masked_bias") or name.endswith(".attn.bias"):
-        print("  Skipping variable: " + name)
-        continue
-
-    n_dims = len(data.shape);
-
-    # ftype == 0 -> float32, ftype == 1 -> float16
-    ftype = 0;
-    if use_f16:
-        if (name == "model/wte" or name == "model/lm_head" or name[-2:] == "/g" or name[-2:] == "/w") and n_dims == 2:
-            print("  Converting to float16")
-            data = data.astype(np.float16)
-            ftype = 1
-        else:
-            print("  Converting to float32")
-            data = data.astype(np.float32)
-            ftype = 0
-
-    # for efficiency - transpose the projection matrices
-    # "model/h.*/attn/c_attn/w"
-    # "model/h.*/attn/c_proj/w"
-    # "model/h.*/mlp/c_fc/w"
-    # "model/h.*/mlp/c_proj/w"
-    if name[-14:] == "/attn/c_attn/w" or \
-       name[-14:] == "/attn/c_proj/w" or \
-       name[-11:] == "/mlp/c_fc/w" or \
-       name[-13:] == "/mlp/c_proj/w":
-        print("  Transposing")
-        data = data.transpose()
-
-    # header
-    str = name.encode('utf-8')
-    fout.write(struct.pack("iii", n_dims, len(str), ftype))
-    for i in range(n_dims):
-        fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
-    fout.write(str);
-
-    # data
-    data.tofile(fout)
-
-fout.close()
-
-print("Done. Output file: " + fname_out)
-print("")
diff --git a/ggml/examples/gpt-2/convert-ckpt-to-ggml.py b/ggml/examples/gpt-2/convert-ckpt-to-ggml.py
deleted file mode 100644
index 9113141..0000000
--- a/ggml/examples/gpt-2/convert-ckpt-to-ggml.py
+++ /dev/null
@@ -1,159 +0,0 @@
-# Convert a model checkpoint to a ggml compatible file
-#
-# Load the model using TensorFlow.
-# Iterate over all variables and write them to a binary file.
-#
-# For each variable, write the following:
-#   - Number of dimensions (int)
-#   - Name length (int)
-#   - Dimensions (int[n_dims])
-#   - Name (char[name_length])
-#   - Data (float[n_dims])
-#
-# By default, the bigger matrices are converted to 16-bit floats.
-# This can be disabled by adding the "use-f32" CLI argument.
-#
-# At the start of the ggml file we write the model parameters
-# and vocabulary.
-#
-
-import sys
-import json
-import struct
-import numpy as np
-import tensorflow as tf
-
-# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
-def bytes_to_unicode():
-    """
-    Returns list of utf-8 byte and a corresponding list of unicode strings.
-    The reversible bpe codes work on unicode strings.
-    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
-    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
-    This is a signficant percentage of your normal, say, 32K bpe vocab.
-    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
-    And avoids mapping to whitespace/control characters the bpe code barfs on.
-    """
-    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
-    cs = bs[:]
-    n = 0
-    for b in range(2**8):
-        if b not in bs:
-            bs.append(b)
-            cs.append(2**8+n)
-            n += 1
-    cs = [chr(n) for n in cs]
-    return dict(zip(bs, cs))
-
-# helper method to convert a numpy array to different float types
-def convert_to_ftype(data, ftype):
-    # fp16
-    if ftype == 1:
-        return data.astype(np.float16)
-
-    assert False, "Invalid ftype: " + str(ftype)
-
-if len(sys.argv) < 3:
-    print("Usage: convert-ckpt-to-ggml.py dir-model ftype\n")
-    print("  ftype == 0 -> float32")
-    print("  ftype == 1 -> float16")
-    sys.exit(1)
-
-# output in the same directory as the model
-dir_model = sys.argv[1]
-fname_out = sys.argv[1] + "/ggml-model.bin"
-
-with open(dir_model + "/encoder.json", "r", encoding="utf-8") as f:
-    encoder = json.load(f)
-
-with open(dir_model + "/hparams.json", "r", encoding="utf-8") as f:
-    hparams = json.load(f)
-
-# possible data types
-#   ftype == 0 -> float32
-#   ftype == 1 -> float16
-#
-# map from ftype to string
-ftype_str = ["f32", "f16"]
-
-ftype = 1
-if len(sys.argv) > 2:
-    ftype = int(sys.argv[2])
-    if ftype < 0 or ftype > 1:
-        print("Invalid ftype: " + str(ftype))
-        sys.exit(1)
-    fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin"
-
-list_vars = tf.train.list_variables(dir_model)
-
-fout = open(fname_out, "wb")
-
-fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
-fout.write(struct.pack("i", hparams["n_vocab"]))
-fout.write(struct.pack("i", hparams["n_ctx"]))
-fout.write(struct.pack("i", hparams["n_embd"]))
-fout.write(struct.pack("i", hparams["n_head"]))
-fout.write(struct.pack("i", hparams["n_layer"]))
-fout.write(struct.pack("i", ftype))
-
-byte_encoder = bytes_to_unicode()
-byte_decoder = {v:k for k, v in byte_encoder.items()}
-
-fout.write(struct.pack("i", len(encoder)))
-
-for key in encoder:
-    text = bytearray([byte_decoder[c] for c in key])
-    fout.write(struct.pack("i", len(text)))
-    fout.write(text)
-
-for name, shape in list_vars:
-    print("Processing variable: " + name + " with shape: ", shape)
-
-    data = tf.train.load_variable(dir_model, name).squeeze()
-    n_dims = len(data.shape);
-
-    # for efficiency - transpose the projection matrices
-    # "model/h.*/attn/c_attn/w"
-    # "model/h.*/attn/c_proj/w"
-    # "model/h.*/mlp/c_fc/w"
-    # "model/h.*/mlp/c_proj/w"
-    if name[-14:] == "/attn/c_attn/w" or \
-       name[-14:] == "/attn/c_proj/w" or \
-       name[-11:] == "/mlp/c_fc/w" or \
-       name[-13:] == "/mlp/c_proj/w":
-        print("  Transposing")
-        data = data.transpose()
-
-    dshape = data.shape
-
-    ftype_cur = 0
-    if ftype != 0:
-        # match name:
-        #  "model/wte"
-        #  "model/h.*/attn/c_attn/w"
-        #  "model/h.*/attn/c_proj/w"
-        #  "model/h.*/mlp/c_fc/w"
-        #  "model/h.*/mlp/c_proj/w"
-        if name == "model/wte" or name[-2:] == "/w":
-            print("  Converting to " + ftype_str[ftype])
-            data = convert_to_ftype(data, ftype)
-            ftype_cur = ftype
-        else:
-            print("  Converting to float32")
-            data = data.astype(np.float32)
-            ftype_cur = 0
-
-    # header
-    str = name.encode('utf-8')
-    fout.write(struct.pack("iii", n_dims, len(str), ftype_cur))
-    for i in range(n_dims):
-        fout.write(struct.pack("i", dshape[n_dims - 1 - i]))
-    fout.write(str);
-
-    # data
-    data.tofile(fout)
-
-fout.close()
-
-print("Done. Output file: " + fname_out)
-print("")
diff --git a/ggml/examples/gpt-2/convert-h5-to-ggml.py b/ggml/examples/gpt-2/convert-h5-to-ggml.py
deleted file mode 100644
index 6a2b865..0000000
--- a/ggml/examples/gpt-2/convert-h5-to-ggml.py
+++ /dev/null
@@ -1,195 +0,0 @@
-# Convert GPT-2 h5 transformer model to ggml format
-#
-# Load the model using GPT2Model.
-# Iterate over all variables and write them to a binary file.
-#
-# For each variable, write the following:
-#   - Number of dimensions (int)
-#   - Name length (int)
-#   - Dimensions (int[n_dims])
-#   - Name (char[name_length])
-#   - Data (float[n_dims])
-#
-# By default, the bigger matrices are converted to 16-bit floats.
-# This can be disabled by adding the "use-f32" CLI argument.
-#
-# At the start of the ggml file we write the model parameters
-# and vocabulary.
-#
-
-import sys
-import struct
-import json
-import numpy as np
-import re
-
-from transformers import GPT2Model
-
-# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
-def bytes_to_unicode():
-    """
-    Returns list of utf-8 byte and a corresponding list of unicode strings.
-    The reversible bpe codes work on unicode strings.
-    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
-    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
-    This is a signficant percentage of your normal, say, 32K bpe vocab.
-    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
-    And avoids mapping to whitespace/control characters the bpe code barfs on.
-    """
-    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
-    cs = bs[:]
-    n = 0
-    for b in range(2**8):
-        if b not in bs:
-            bs.append(b)
-            cs.append(2**8+n)
-            n += 1
-    cs = [chr(n) for n in cs]
-    return dict(zip(bs, cs))
-
-if len(sys.argv) < 2:
-    print("Usage: convert-h5-to-ggml.py dir-model [use-f32]\n")
-    sys.exit(1)
-
-# output in the same directory as the model
-dir_model = sys.argv[1]
-fname_out = sys.argv[1] + "/ggml-model.bin"
-
-with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
-    encoder = json.load(f)
-
-with open(dir_model + "/added_tokens.json", "r", encoding="utf-8") as f:
-    encoder_added = json.load(f)
-
-with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
-    hparams = json.load(f)
-
-# use 16-bit or 32-bit floats
-use_f16 = True
-if len(sys.argv) > 2:
-    use_f16 = False
-    fname_out = sys.argv[1] + "/ggml-model-f32.bin"
-
-model = GPT2Model.from_pretrained(dir_model, low_cpu_mem_usage=True)
-#print (model)
-
-list_vars = model.state_dict()
-#print (list_vars)
-
-fout = open(fname_out, "wb")
-
-fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
-fout.write(struct.pack("i", hparams["vocab_size"]))
-fout.write(struct.pack("i", hparams["n_positions"]))
-fout.write(struct.pack("i", hparams["n_embd"]))
-fout.write(struct.pack("i", hparams["n_head"]))
-fout.write(struct.pack("i", hparams["n_layer"]))
-#fout.write(struct.pack("i", hparams["rotary_dim"]))
-fout.write(struct.pack("i", use_f16))
-
-byte_encoder = bytes_to_unicode()
-byte_decoder = {v:k for k, v in byte_encoder.items()}
-
-fout.write(struct.pack("i", len(encoder) + len(encoder_added)))
-
-for key in encoder:
-    text = bytearray([byte_decoder[c] for c in key])
-    fout.write(struct.pack("i", len(text)))
-    fout.write(text)
-
-for key in encoder_added:
-    text = bytearray([byte_decoder[c] for c in key])
-    fout.write(struct.pack("i", len(text)))
-    fout.write(text)
-
-for name in list_vars.keys():
-    data = list_vars[name].squeeze().numpy()
-    print("Processing variable: " + name + " with shape: ", data.shape)
-
-    # we don't need these
-    if name.endswith("attn.masked_bias") or name.endswith(".attn.bias"):
-        print("  Skipping variable: " + name)
-        continue
-
-    n_dims = len(data.shape);
-
-    # ftype == 0 -> float32, ftype == 1 -> float16
-    ftype = 0;
-    if use_f16:
-        if name[-7:] == ".weight" and n_dims == 2:
-            print("  Converting to float16")
-            data = data.astype(np.float16)
-            ftype = 1
-        else:
-            print("  Converting to float32")
-            data = data.astype(np.float32)
-            ftype = 0
-
-    # for efficiency - transpose these matrices:
-    #  "transformer.h.*.mlp.c_proj.weight
-    if name.endswith(".mlp.c_proj.weight"):
-        print("  Transposing")
-        data = data.transpose()
-
-    # rename headers to keep compatibility
-    if name == "ln_f.weight":
-        name = "model/ln_f/g"
-    elif name == "ln_f.bias":
-        name = "model/ln_f/b"
-    elif name == "wte.weight":
-        name = "model/wte"
-    elif name == "wpe.weight":
-        name = "model/wpe"
-    elif re.match(r"h\.\d+\.ln_1\.weight", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/ln_1/g"
-    elif re.match(r"h\.\d+\.ln_1\.bias", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/ln_1/b"
-    elif re.match(r"h\.\d+\.attn\.c_attn\.weight", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/attn/c_attn/w"
-    elif re.match(r"h\.\d+\.attn\.c_attn\.bias", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/attn/c_attn/b"
-    elif re.match(r"h\.\d+\.attn\.c_proj\.weight", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/attn/c_proj/w"
-    elif re.match(r"h.\d+.attn.c_proj.bias", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/attn/c_proj/b"
-    elif re.match(r"h.\d+.ln_2.weight", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/ln_2/g"
-    elif re.match(r"h.\d+.ln_2.bias", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/ln_2/b"
-    elif re.match(r"h.\d+.mlp.c_fc.weight", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/mlp/c_fc/w"
-    elif re.match(r"h.\d+.mlp.c_fc.bias", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/mlp/c_fc/b"
-    elif re.match(r"h.\d+.mlp.c_proj.weight", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/mlp/c_proj/w"
-    elif re.match(r"h.\d+.mlp.c_proj.bias", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/mlp/c_proj/b"
-    else:
-        print("Unrecognized variable name. %s", name)
-
-    str = name.encode('utf-8')
-
-    fout.write(struct.pack("iii", n_dims, len(str), ftype))
-    for i in range(n_dims):
-        fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
-    fout.write(str);
-
-    # data
-    data.tofile(fout)
-
-fout.close()
-
-print("Done. Output file: " + fname_out)
-print("")
diff --git a/ggml/examples/gpt-2/download-ggml-model.sh b/ggml/examples/gpt-2/download-ggml-model.sh
deleted file mode 100644
index 3aae015..0000000
--- a/ggml/examples/gpt-2/download-ggml-model.sh
+++ /dev/null
@@ -1,69 +0,0 @@
-#!/bin/bash
-
-# This script downloads GPT-2 model files that have already been converted to ggml format.
-# This way you don't have to convert them yourself.
-#
-# If you want to download the original GPT-2 model files, use the "download-model.sh" script instead.
-
-#src="https://ggml.ggerganov.com"
-#pfx="ggml-model-gpt-2"
-
-src="https://huggingface.co/ggerganov/ggml"
-pfx="resolve/main/ggml-model-gpt-2"
-
-ggml_path=$(dirname $(realpath $0))
-
-# GPT-2 models
-models=( "117M" "345M" "774M" "1558M" )
-
-# list available models
-function list_models {
-    printf "\n"
-    printf "  Available models:"
-    for model in "${models[@]}"; do
-        printf " $model"
-    done
-    printf "\n\n"
-}
-
-if [ "$#" -ne 1 ]; then
-    printf "Usage: $0 <model>\n"
-    list_models
-
-    exit 1
-fi
-
-model=$1
-
-if [[ ! " ${models[@]} " =~ " ${model} " ]]; then
-    printf "Invalid model: $model\n"
-    list_models
-
-    exit 1
-fi
-
-# download ggml model
-
-printf "Downloading ggml model $model ...\n"
-
-mkdir -p models/gpt-2-$model
-
-if [ -x "$(command -v wget)" ]; then
-    wget --quiet --show-progress -O models/gpt-2-$model/ggml-model.bin $src/$pfx-$model.bin
-elif [ -x "$(command -v curl)" ]; then
-    curl -L --output models/gpt-2-$model/ggml-model.bin $src/$pfx-$model.bin
-else
-    printf "Either wget or curl is required to download models.\n"
-    exit 1
-fi
-
-if [ $? -ne 0 ]; then
-    printf "Failed to download ggml model $model \n"
-    printf "Please try again later or download the original GPT-2 model files and convert them yourself.\n"
-    exit 1
-fi
-
-printf "Done! Model '$model' saved in 'models/gpt-2-$model/ggml-model.bin'\n"
-printf "You can now use it like this:\n\n"
-printf "  $ ./bin/gpt-2 -m models/gpt-2-$model/ggml-model.bin -p \"This is an example\"\n"
-printf "\n"
diff --git a/ggml/examples/gpt-2/download-model.sh b/ggml/examples/gpt-2/download-model.sh
deleted file mode 100644
index f0c62f4..0000000
--- a/ggml/examples/gpt-2/download-model.sh
+++ /dev/null
@@ -1,48 +0,0 @@
-#!/bin/bash
-
-ggml_path=$(dirname $(realpath $0))
-
-# GPT-2 models
-models=( "117M" "345M" "774M" "1558M" )
-
-# list available models
-function list_models {
-    printf "\n"
-    printf "  Available models:"
-    for model in "${models[@]}"; do
-        printf " $model"
-    done
-    printf "\n\n"
-}
-
-if [ "$#" -ne 1 ]; then
-    printf "Usage: $0 <model>\n"
-    list_models
-
-    exit 1
-fi
-
-model=$1
-
-if [[ ! " ${models[@]} " =~ " ${model} " ]]; then
-    printf "Invalid model: $model\n"
-    list_models
-
-    exit 1
-fi
-
-# download model
-
-printf "Downloading model $model ...\n"
-
-mkdir -p models/gpt-2-$model
-
-for file in checkpoint encoder.json hparams.json model.ckpt.data-00000-of-00001 model.ckpt.index model.ckpt.meta vocab.bpe; do
-    wget --quiet --show-progress -O models/gpt-2-$model/$file https://openaipublic.blob.core.windows.net/gpt-2/models/$model/$file
-done
-
-printf "Done! Model '$model' saved in 'models/gpt-2-$model/'\n\n"
-printf "Run the convert-ckpt-to-ggml.py script to convert the model to ggml format.\n"
-printf "\n"
-printf "  python $ggml_path/convert-ckpt-to-ggml.py models/gpt-2-$model/\n"
-printf "\n"
diff --git a/ggml/examples/gpt-2/main-alloc.cpp b/ggml/examples/gpt-2/main-alloc.cpp
deleted file mode 100644
index c0a6846..0000000
--- a/ggml/examples/gpt-2/main-alloc.cpp
+++ /dev/null
@@ -1,886 +0,0 @@
-#include "ggml/ggml.h"
-#include "ggml/ggml-alloc.h"
-
-#include "common.h"
-#include "common-ggml.h"
-
-#include <cassert>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <fstream>
-#include <map>
-#include <string>
-#include <vector>
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-// default hparams (GPT-2 117M)
-struct gpt2_hparams {
-    int32_t n_vocab = 50257;
-    int32_t n_ctx   = 1024;
-    int32_t n_embd  = 768;
-    int32_t n_head  = 12;
-    int32_t n_layer = 12;
-    int32_t ftype   = 1;
-    float   eps     = 1e-5f;
-};
-
-struct gpt2_layer {
-    // normalization
-    struct ggml_tensor * ln_1_g;
-    struct ggml_tensor * ln_1_b;
-
-    struct ggml_tensor * ln_2_g;
-    struct ggml_tensor * ln_2_b;
-
-    // attention
-    struct ggml_tensor * c_attn_attn_w;
-    struct ggml_tensor * c_attn_attn_b;
-
-    struct ggml_tensor * c_attn_proj_w;
-    struct ggml_tensor * c_attn_proj_b;
-
-    // mlp
-    struct ggml_tensor * c_mlp_fc_w;
-    struct ggml_tensor * c_mlp_fc_b;
-
-    struct ggml_tensor * c_mlp_proj_w;
-    struct ggml_tensor * c_mlp_proj_b;
-};
-
-struct gpt2_model {
-    gpt2_hparams hparams;
-
-    // normalization
-    struct ggml_tensor * ln_f_g;
-    struct ggml_tensor * ln_f_b;
-
-    struct ggml_tensor * wte;     // position embedding
-    struct ggml_tensor * wpe;     //    token embedding
-    struct ggml_tensor * lm_head; // language model head
-
-    std::vector<gpt2_layer> layers;
-
-    // key + value memory
-    struct ggml_tensor * memory_k;
-    struct ggml_tensor * memory_v;
-
-    //
-    struct ggml_context * ctx;
-    std::map<std::string, struct ggml_tensor *> tensors;
-};
-
-// load the model's weights from a file
-bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab) {
-    printf("%s: loading model from '%s'\n", __func__, fname.c_str());
-
-    auto fin = std::ifstream(fname, std::ios::binary);
-    if (!fin) {
-        fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
-        return false;
-    }
-
-    // verify magic
-    {
-        uint32_t magic;
-        fin.read((char *) &magic, sizeof(magic));
-        if (magic != GGML_FILE_MAGIC) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
-            return false;
-        }
-    }
-
-    // load hparams
-    {
-        auto & hparams = model.hparams;
-
-        fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
-        fin.read((char *) &hparams.n_ctx,   sizeof(hparams.n_ctx));
-        fin.read((char *) &hparams.n_embd,  sizeof(hparams.n_embd));
-        fin.read((char *) &hparams.n_head,  sizeof(hparams.n_head));
-        fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-        fin.read((char *) &hparams.ftype,   sizeof(hparams.ftype));
-
-        const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
-
-        printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
-        printf("%s: n_ctx   = %d\n", __func__, hparams.n_ctx);
-        printf("%s: n_embd  = %d\n", __func__, hparams.n_embd);
-        printf("%s: n_head  = %d\n", __func__, hparams.n_head);
-        printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
-        printf("%s: ftype   = %d\n", __func__, hparams.ftype);
-        printf("%s: qntvr   = %d\n", __func__, qntvr);
-
-        hparams.ftype %= GGML_QNT_VERSION_FACTOR;
-    }
-
-    // load vocab
-    {
-        int32_t n_vocab = 0;
-        fin.read((char *) &n_vocab, sizeof(n_vocab));
-
-        if (n_vocab != model.hparams.n_vocab) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
-                    __func__, fname.c_str(), n_vocab, model.hparams.n_vocab);
-            return false;
-        }
-
-        std::string word;
-        std::vector<char> buf(128);
-
-        for (int i = 0; i < n_vocab; i++) {
-            uint32_t len;
-            fin.read((char *) &len, sizeof(len));
-
-            buf.resize(len);
-            fin.read((char *) buf.data(), len);
-            word.assign(buf.data(), len);
-
-            vocab.token_to_id[word] = i;
-            vocab.id_to_token[i] = word;
-        }
-    }
-
-    // for the big tensors, we have the option to store the data in 16-bit floats or quantized
-    // in order to save memory and also to speed up the computation
-    ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
-    if (wtype == GGML_TYPE_COUNT) {
-        fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
-                __func__, fname.c_str(), model.hparams.ftype);
-        return false;
-    }
-
-    auto & ctx = model.ctx;
-
-    size_t ctx_size = 0;
-
-    {
-        const auto & hparams = model.hparams;
-
-        const int n_embd  = hparams.n_embd;
-        const int n_layer = hparams.n_layer;
-        const int n_ctx   = hparams.n_ctx;
-        const int n_vocab = hparams.n_vocab;
-
-        ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_g
-        ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_b
-
-        ctx_size += ggml_row_size(wtype,         n_vocab*n_embd); // wte
-        ctx_size += ggml_row_size(GGML_TYPE_F32  , n_ctx*n_embd); // wpe
-        ctx_size += ggml_row_size(wtype,         n_vocab*n_embd); // lm_head
-
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_g
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_b
-
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_g
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_b
-
-        ctx_size += n_layer*(ggml_row_size(wtype,         3*n_embd*n_embd)); // c_attn_attn_w
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 3*n_embd));        // c_attn_attn_b
-
-        ctx_size += n_layer*(ggml_row_size(wtype,         n_embd*n_embd));   // c_attn_proj_w
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd));          // c_attn_proj_b
-
-        ctx_size += n_layer*(ggml_row_size(wtype,         4*n_embd*n_embd)); // c_mlp_fc_w
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 4*n_embd));        // c_mlp_fc_b
-
-        ctx_size += n_layer*(ggml_row_size(wtype,         4*n_embd*n_embd)); // c_mlp_proj_w
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 4*n_embd));        // c_mlp_proj_b
-
-        ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_k
-        ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_v
-
-        ctx_size += (6 + 12*n_layer)*512; // object overhead
-
-        printf("%s: ggml tensor size = %d bytes\n", __func__, (int) sizeof(ggml_tensor));
-        printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
-    }
-
-    // create the ggml context
-    {
-        struct ggml_init_params params = {
-            /*.mem_size   =*/ ctx_size,
-            /*.mem_buffer =*/ NULL,
-            /*.no_alloc   =*/ false,
-        };
-
-        model.ctx = ggml_init(params);
-        if (!model.ctx) {
-            fprintf(stderr, "%s: ggml_init() failed\n", __func__);
-            return false;
-        }
-    }
-
-    // prepare memory for the weights
-    {
-        const auto & hparams = model.hparams;
-
-        const int n_embd  = hparams.n_embd;
-        const int n_layer = hparams.n_layer;
-        const int n_ctx   = hparams.n_ctx;
-        const int n_vocab = hparams.n_vocab;
-
-        model.layers.resize(n_layer);
-
-        model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-        model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-
-        model.wte     = ggml_new_tensor_2d(ctx, wtype,         n_embd, n_vocab);
-        model.wpe     = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx);
-        model.lm_head = ggml_new_tensor_2d(ctx, wtype,         n_embd, n_vocab);
-
-        // map by name
-        model.tensors["model/ln_f/g"] = model.ln_f_g;
-        model.tensors["model/ln_f/b"] = model.ln_f_b;
-
-        model.tensors["model/wte"]     = model.wte;
-        model.tensors["model/wpe"]     = model.wpe;
-        model.tensors["model/lm_head"] = model.lm_head;
-
-        for (int i = 0; i < n_layer; ++i) {
-            auto & layer = model.layers[i];
-
-            layer.ln_1_g        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-            layer.ln_1_b        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            layer.ln_2_g        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-            layer.ln_2_b        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype,           n_embd, 3*n_embd);
-            layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd);
-
-            layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype,           n_embd, n_embd);
-            layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            layer.c_mlp_fc_w    = ggml_new_tensor_2d(ctx, wtype,           n_embd, 4*n_embd);
-            layer.c_mlp_fc_b    = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
-
-            layer.c_mlp_proj_w  = ggml_new_tensor_2d(ctx, wtype,         4*n_embd, n_embd);
-            layer.c_mlp_proj_b  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            // map by name
-            model.tensors["model/h" + std::to_string(i) + "/ln_1/g"]        = layer.ln_1_g;
-            model.tensors["model/h" + std::to_string(i) + "/ln_1/b"]        = layer.ln_1_b;
-
-            model.tensors["model/h" + std::to_string(i) + "/ln_2/g"]        = layer.ln_2_g;
-            model.tensors["model/h" + std::to_string(i) + "/ln_2/b"]        = layer.ln_2_b;
-
-            model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/w"] = layer.c_attn_attn_w;
-            model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/b"] = layer.c_attn_attn_b;
-
-            model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/w"] = layer.c_attn_proj_w;
-            model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/b"] = layer.c_attn_proj_b;
-
-            model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"]    = layer.c_mlp_fc_w;
-            model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"]    = layer.c_mlp_fc_b;
-
-            model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"]  = layer.c_mlp_proj_w;
-            model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"]  = layer.c_mlp_proj_b;
-        }
-    }
-
-    // key + value memory
-    {
-        const auto & hparams = model.hparams;
-
-        const int n_embd  = hparams.n_embd;
-        const int n_layer = hparams.n_layer;
-        const int n_ctx   = hparams.n_ctx;
-
-        const int n_mem      = n_layer*n_ctx;
-        const int n_elements = n_embd*n_mem;
-
-        model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
-        model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
-
-        const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
-
-        printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem);
-    }
-
-    // load weights
-    {
-        size_t total_size = 0;
-
-        bool has_lm_head = false;
-
-        while (true) {
-            int32_t n_dims;
-            int32_t length;
-            int32_t ttype;
-
-            fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
-            fin.read(reinterpret_cast<char *>(&length), sizeof(length));
-            fin.read(reinterpret_cast<char *>(&ttype),  sizeof(ttype));
-
-            if (fin.eof()) {
-                break;
-            }
-
-            int32_t nelements = 1;
-            int32_t ne[2] = { 1, 1 };
-            for (int i = 0; i < n_dims; ++i) {
-                fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
-                nelements *= ne[i];
-            }
-
-            std::string name(length, 0);
-            fin.read(&name[0], length);
-
-            if (model.tensors.find(name) == model.tensors.end()) {
-                fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str());
-                return false;
-            }
-
-            auto tensor = model.tensors[name];
-            if (ggml_nelements(tensor) != nelements) {
-                fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.c_str());
-                return false;
-            }
-
-            if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
-                fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
-                        __func__, name.c_str(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]);
-                return false;
-            }
-
-            // for debugging
-            if (0) {
-                printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
-            }
-
-            const size_t bpe = ggml_type_size(ggml_type(ttype));
-
-            if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
-                fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
-                        __func__, name.c_str(), ggml_nbytes(tensor), nelements*bpe);
-                return false;
-            }
-
-            fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
-
-            // GPT-2 models share the WTE tensor as the LM head
-            if (name == "model/wte" && has_lm_head == false) {
-                memcpy(model.lm_head->data, tensor->data, ggml_nbytes(tensor));
-            }
-
-            if (name == "model/lm_head") {
-                has_lm_head = true;
-            }
-
-            total_size += ggml_nbytes(tensor);
-        }
-
-        printf("%s: model size  = %8.2f MB\n", __func__, total_size/1024.0/1024.0);
-    }
-
-    fin.close();
-
-    return true;
-}
-
-// build the computation graph
-struct ggml_cgraph * gpt2_graph(
-        const gpt2_model & model,
-        struct ggml_allocr * allocr,
-        const int n_past,
-        const std::vector<gpt_vocab::id> & embd_inp) {
-    const int N = embd_inp.size();
-
-    const auto & hparams = model.hparams;
-
-    const int n_embd  = hparams.n_embd;
-    const int n_layer = hparams.n_layer;
-    const int n_ctx   = hparams.n_ctx;
-    const int n_head  = hparams.n_head;
-
-    // since we are using ggml-alloc, this buffer only needs enough space to hold the ggml_tensor and ggml_cgraph structs, but not the tensor data
-    static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
-    static std::vector<uint8_t> buf(buf_size);
-
-    struct ggml_init_params params = {
-        /*.mem_size   =*/ buf_size,
-        /*.mem_buffer =*/ buf.data(),
-        /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph()
-    };
-
-    struct ggml_context * ctx0 = ggml_init(params);
-
-    struct ggml_cgraph  * gf = ggml_new_graph(ctx0);
-
-    struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
-    ggml_allocr_alloc(allocr, embd);
-
-    // avoid writing to tensors if we are only measuring the memory usage
-    if (!ggml_allocr_is_measure(allocr)) {
-        memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
-    }
-
-    struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
-    ggml_allocr_alloc(allocr, position);
-    if (!ggml_allocr_is_measure(allocr)) {
-        for (int i = 0; i < N; ++i) {
-            ((int32_t *) position->data)[i] = n_past + i;
-        }
-    }
-
-    // wte + wpe
-    struct ggml_tensor * inpL =
-        ggml_add(ctx0,
-                ggml_get_rows(ctx0, model.wte, embd),
-                ggml_get_rows(ctx0, model.wpe, position));
-
-    for (int il = 0; il < n_layer; ++il) {
-        struct ggml_tensor * cur;
-
-        // norm
-        {
-            // [ 768, N]
-            cur = ggml_norm(ctx0, inpL, hparams.eps);
-
-            // cur = ln_1_g*cur + ln_1_b
-            // [ 768, N]
-            cur = ggml_add(ctx0,
-                    ggml_mul(ctx0,
-                        ggml_repeat(ctx0, model.layers[il].ln_1_g, cur),
-                        cur),
-                    ggml_repeat(ctx0, model.layers[il].ln_1_b, cur));
-        }
-
-        // attn
-        // [2304, 768] - model.layers[il].c_attn_attn_w
-        // [2304,   1] - model.layers[il].c_attn_attn_b
-        // [ 768,   N] - cur (in)
-        // [2304,   N] - cur (out)
-        //
-        // cur = attn_w*cur + attn_b
-        // [2304, N]
-        {
-            cur = ggml_mul_mat(ctx0,
-                    model.layers[il].c_attn_attn_w,
-                    cur);
-
-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, model.layers[il].c_attn_attn_b, cur),
-                    cur);
-        }
-
-        // self-attention
-        {
-            struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
-            struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd);
-            struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd);
-
-            // store key and value to memory
-            if (N >= 1) {
-                struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
-                struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past));
-
-                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
-                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
-            }
-
-            // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
-            // [64, N, 12]
-            struct ggml_tensor * Q =
-                ggml_permute(ctx0,
-                        ggml_cpy(ctx0,
-                            Qcur,
-                            ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
-                        0, 2, 1, 3);
-
-            // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
-            // [64, n_past + N, 12]
-            struct ggml_tensor * K =
-                ggml_permute(ctx0,
-                        ggml_reshape_3d(ctx0,
-                            ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
-                            n_embd/n_head, n_head, n_past + N),
-                        0, 2, 1, 3);
-
-            // GG: flash attention
-            //struct ggml_tensor * V =
-            //    ggml_cpy(ctx0,
-            //            ggml_permute(ctx0,
-            //                ggml_reshape_3d(ctx0,
-            //                    ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
-            //                    n_embd/n_head, n_head, n_past + N),
-            //                1, 2, 0, 3),
-            //            ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head));
-
-            //struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true);
-
-            // K * Q
-            // [n_past + N, N, 12]
-            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
-
-            // KQ_scaled = KQ / sqrt(n_embd/n_head)
-            // [n_past + N, N, 12]
-            struct ggml_tensor * KQ_scaled =
-                ggml_scale(ctx0,
-                        KQ,
-                        1.0f/sqrtf(float(n_embd)/n_head));
-
-            // KQ_masked = mask_past(KQ_scaled)
-            // [n_past + N, N, 12]
-            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
-
-            // KQ = soft_max(KQ_masked)
-            // [n_past + N, N, 12]
-            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
-
-            // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
-            // [n_past + N, 64, 12]
-            struct ggml_tensor * V_trans =
-                ggml_cpy(ctx0,
-                        ggml_permute(ctx0,
-                            ggml_reshape_3d(ctx0,
-                                ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
-                                n_embd/n_head, n_head, n_past + N),
-                            1, 2, 0, 3),
-                        ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd/n_head, n_head));
-
-            // KQV = transpose(V) * KQ_soft_max
-            // [64, N, 12]
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
-
-            // KQV_merged = KQV.permute(0, 2, 1, 3)
-            // [64, 12, N]
-            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
-
-            // cur = KQV_merged.contiguous().view(n_embd, N)
-            // [768, N]
-            cur = ggml_cpy(ctx0,
-                    KQV_merged,
-                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
-        }
-
-        // projection
-        // [ 768, 768] - model.layers[il].c_attn_proj_w
-        // [ 768,   1] - model.layers[il].c_attn_proj_b
-        // [ 768,   N] - cur (in)
-        // [ 768,   N] - cur (out)
-        //
-        // cur = proj_w*cur + proj_b
-        // [768, N]
-        {
-            cur = ggml_mul_mat(ctx0,
-                    model.layers[il].c_attn_proj_w,
-                    cur);
-
-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, model.layers[il].c_attn_proj_b, cur),
-                    cur);
-        }
-
-        // add the input
-        cur = ggml_add(ctx0, cur, inpL);
-
-        struct ggml_tensor * inpFF = cur;
-
-        // feed-forward network
-        {
-            // norm
-            {
-                cur = ggml_norm(ctx0, inpFF, hparams.eps);
-
-                // cur = ln_2_g*cur + ln_2_b
-                // [ 768, N]
-                cur = ggml_add(ctx0,
-                        ggml_mul(ctx0,
-                            ggml_repeat(ctx0, model.layers[il].ln_2_g, cur),
-                            cur),
-                        ggml_repeat(ctx0, model.layers[il].ln_2_b, cur));
-            }
-
-            // fully connected
-            // [3072, 768] - model.layers[il].c_mlp_fc_w
-            // [3072,   1] - model.layers[il].c_mlp_fc_b
-            // [ 768,   N] - cur (in)
-            // [3072,   N] - cur (out)
-            //
-            // cur = fc_w*cur + fc_b
-            // [3072, N]
-            cur = ggml_mul_mat(ctx0,
-                    model.layers[il].c_mlp_fc_w,
-                    cur);
-
-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur),
-                    cur);
-
-            // GELU activation
-            // [3072, N]
-            cur = ggml_gelu(ctx0, cur);
-
-            // projection
-            // [ 768, 3072] - model.layers[il].c_mlp_proj_w
-            // [ 768,    1] - model.layers[il].c_mlp_proj_b
-            // [3072,    N] - cur (in)
-            // [ 768,    N] - cur (out)
-            //
-            // cur = proj_w*cur + proj_b
-            // [768, N]
-            cur = ggml_mul_mat(ctx0,
-                    model.layers[il].c_mlp_proj_w,
-                    cur);
-
-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur),
-                    cur);
-        }
-
-        // input for next layer
-        inpL = ggml_add(ctx0, cur, inpFF);
-    }
-
-    // norm
-    {
-        // [ 768, N]
-        inpL = ggml_norm(ctx0, inpL, hparams.eps);
-
-        // inpL = ln_f_g*inpL + ln_f_b
-        // [ 768, N]
-        inpL = ggml_add(ctx0,
-                ggml_mul(ctx0,
-                    ggml_repeat(ctx0, model.ln_f_g, inpL),
-                    inpL),
-                ggml_repeat(ctx0, model.ln_f_b, inpL));
-    }
-
-    // inpL = WTE * inpL
-    // [ 768, 50257] - model.lm_head
-    // [ 768, N]     - inpL
-    inpL = ggml_mul_mat(ctx0, model.lm_head, inpL);
-
-    // logits -> probs
-    //inpL = ggml_soft_max(ctx0, inpL);
-
-    ggml_build_forward_expand(gf, inpL);
-
-    ggml_free(ctx0);
-
-    return gf;
-}
-
-// evaluate the transformer
-//
-//   - model:     the model
-//   - allocr:    ggml_allocr to use to allocate the compute buffer
-//   - n_threads: number of threads to use
-//   - n_past:    the context size so far
-//   - embd_inp:  the embeddings of the tokens in the context
-//   - embd_w:    the predicted logits for the next token
-//
-bool gpt2_eval(
-        const gpt2_model & model,
-        struct ggml_allocr * allocr,
-        const int n_threads,
-        const int n_past,
-        const std::vector<gpt_vocab::id> & embd_inp,
-              std::vector<float>         & embd_w) {
-    const int N = embd_inp.size();
-
-    const auto & hparams = model.hparams;
-
-    const int n_vocab = hparams.n_vocab;
-
-    // reset the allocator to free all the memory allocated during the previous inference
-    ggml_allocr_reset(allocr);
-
-    struct ggml_cgraph * gf = gpt2_graph(model, allocr, n_past, embd_inp);
-
-    // allocate tensors
-    ggml_allocr_alloc_graph(allocr, gf);
-
-    // run the computation
-    struct ggml_cplan plan = ggml_graph_plan(gf, n_threads);
-    static std::vector<uint8_t> work_buffer;
-    work_buffer.resize(plan.work_size);
-    plan.work_data = work_buffer.data();
-    ggml_graph_compute(gf, &plan);
-
-    //if (n_past%100 == 0) {
-    //    ggml_graph_print   (&gf);
-    //    ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
-    //}
-
-    // in this case, the output tensor is the last one in the graph
-    struct ggml_tensor * inpL = gf->nodes[gf->n_nodes - 1];
-
-    //embd_w.resize(n_vocab*N);
-    //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
-
-    // return result just for the last token
-    embd_w.resize(n_vocab);
-    memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
-
-    return true;
-}
-
-int main(int argc, char ** argv) {
-    ggml_time_init();
-
-    const int64_t t_main_start_us = ggml_time_us();
-
-    gpt_params params;
-    params.model = "models/gpt-2-117M/ggml-model.bin";
-
-    if (gpt_params_parse(argc, argv, params) == false) {
-        return 1;
-    }
-
-    if (params.seed < 0) {
-        params.seed = time(NULL);
-    }
-
-    printf("%s: seed = %d\n", __func__, params.seed);
-
-    std::mt19937 rng(params.seed);
-    if (params.prompt.empty()) {
-        params.prompt = gpt_random_prompt(rng);
-    }
-
-    int64_t t_load_us = 0;
-
-    gpt_vocab vocab;
-    gpt2_model model;
-
-    // load the model
-    {
-        const int64_t t_start_us = ggml_time_us();
-
-        if (!gpt2_model_load(params.model, model, vocab)) {
-            fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
-            return 1;
-        }
-
-        t_load_us = ggml_time_us() - t_start_us;
-
-        test_gpt_tokenizer(vocab, params.token_test);
-    }
-
-    // keep this buffer alive while evaluating the model
-    std::vector<uint8_t> compute_buffer;
-
-    struct ggml_allocr * allocr = NULL;
-    // allocate the compute buffer
-    {
-        allocr = ggml_allocr_new_measure(GGML_MEM_ALIGN);
-
-        // create the worst case graph for memory usage estimation
-        int n_tokens = std::min(model.hparams.n_ctx, params.n_batch);
-        int n_past = model.hparams.n_ctx - n_tokens;
-        struct ggml_cgraph * gf = gpt2_graph(model, allocr, n_past, std::vector<gpt_vocab::id>(n_tokens, 0));
-
-        // compute the required memory
-        size_t mem_size = ggml_allocr_alloc_graph(allocr, gf) + GGML_MEM_ALIGN;
-
-        // recreate the allocator with the required memory
-        ggml_allocr_free(allocr);
-        compute_buffer.resize(mem_size);
-        allocr = ggml_allocr_new(compute_buffer.data(), mem_size, GGML_MEM_ALIGN);
-
-        fprintf(stderr, "%s: compute buffer size: %.2f MB\n", __func__, mem_size/1024.0/1024.0);
-    }
-
-    int n_past = 0;
-
-    int64_t t_sample_us  = 0;
-    int64_t t_predict_us = 0;
-
-    std::vector<float> logits;
-
-    // tokenize the prompt
-    std::vector<gpt_vocab::id> embd_inp = ::gpt_tokenize(vocab, params.prompt);
-
-    params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());
-
-    printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
-    printf("%s: number of tokens in prompt = %zu, first 8 tokens: ", __func__, embd_inp.size());
-    for (int i = 0; i < std::min(8, (int) embd_inp.size()); i++) {
-        printf("%d ", embd_inp[i]);
-    }
-    printf("\n\n");
-
-    // submit the input prompt token-by-token
-    // this reduces the memory usage during inference, at the cost of a bit of speed at the beginning
-    std::vector<gpt_vocab::id> embd;
-
-    for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) {
-        // predict
-        if (embd.size() > 0) {
-            const int64_t t_start_us = ggml_time_us();
-
-            if (!gpt2_eval(model, allocr, params.n_threads, n_past, embd, logits)) {
-                printf("Failed to predict\n");
-                return 1;
-            }
-
-            t_predict_us += ggml_time_us() - t_start_us;
-        }
-
-        n_past += embd.size();
-        embd.clear();
-
-        if (i >= embd_inp.size()) {
-            // sample next token
-            const int   top_k = params.top_k;
-            const float top_p = params.top_p;
-            const float temp  = params.temp;
-
-            const int n_vocab = model.hparams.n_vocab;
-
-            gpt_vocab::id id = 0;
-
-            {
-                const int64_t t_start_sample_us = ggml_time_us();
-
-                id = gpt_sample_top_k_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng);
-
-                t_sample_us += ggml_time_us() - t_start_sample_us;
-            }
-
-            // add it to the context
-            embd.push_back(id);
-        } else {
-            // if here, it means we are still processing the input prompt
-            for (size_t k = i; k < embd_inp.size(); k++) {
-                embd.push_back(embd_inp[k]);
-                if (int32_t(embd.size()) >= params.n_batch) {
-                    break;
-                }
-            }
-            i += embd.size() - 1;
-        }
-
-        // display text
-        for (auto id : embd) {
-            printf("%s", vocab.id_to_token[id].c_str());
-        }
-        fflush(stdout);
-
-        // end of text token
-        if (embd.back() == 50256) {
-            break;
-        }
-    }
-
-    // report timing
-    {
-        const int64_t t_main_end_us = ggml_time_us();
-
-        printf("\n\n");
-        printf("%s:     load time = %8.2f ms\n", __func__, t_load_us/1000.0f);
-        printf("%s:   sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f);
-        printf("%s:  predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past);
-        printf("%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
-    }
-
-    ggml_free(model.ctx);
-
-    return 0;
-}
diff --git a/ggml/examples/gpt-2/main-backend.cpp b/ggml/examples/gpt-2/main-backend.cpp
deleted file mode 100644
index 2759161..0000000
--- a/ggml/examples/gpt-2/main-backend.cpp
+++ /dev/null
@@ -1,993 +0,0 @@
-#include "ggml/ggml.h"
-#include "ggml/ggml-alloc.h"
-#include "ggml/ggml-backend.h"
-
-#ifdef GGML_USE_CUBLAS
-#include "ggml-cuda.h"
-#endif
-
-#ifdef GGML_USE_METAL
-#include "ggml-metal.h"
-#endif
-
-#include "common.h"
-#include "common-ggml.h"
-
-#include <cassert>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <fstream>
-#include <map>
-#include <string>
-#include <vector>
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-#define GPT2_MAX_NODES 4096
-
-static void ggml_log_callback_default(ggml_log_level level, const char * text, void * user_data) {
-    (void) level;
-    (void) user_data;
-    fputs(text, stderr);
-    fflush(stderr);
-}
-
-// default hparams (GPT-2 117M)
-struct gpt2_hparams {
-    int32_t n_vocab = 50257;
-    int32_t n_ctx   = 1024;
-    int32_t n_embd  = 768;
-    int32_t n_head  = 12;
-    int32_t n_layer = 12;
-    int32_t ftype   = 1;
-    float   eps     = 1e-5f;
-};
-
-struct gpt2_layer {
-    // normalization
-    struct ggml_tensor * ln_1_g;
-    struct ggml_tensor * ln_1_b;
-
-    struct ggml_tensor * ln_2_g;
-    struct ggml_tensor * ln_2_b;
-
-    // attention
-    struct ggml_tensor * c_attn_attn_w;
-    struct ggml_tensor * c_attn_attn_b;
-
-    struct ggml_tensor * c_attn_proj_w;
-    struct ggml_tensor * c_attn_proj_b;
-
-    // mlp
-    struct ggml_tensor * c_mlp_fc_w;
-    struct ggml_tensor * c_mlp_fc_b;
-
-    struct ggml_tensor * c_mlp_proj_w;
-    struct ggml_tensor * c_mlp_proj_b;
-};
-
-struct gpt2_model {
-    gpt2_hparams hparams;
-
-    // normalization
-    struct ggml_tensor * ln_f_g;
-    struct ggml_tensor * ln_f_b;
-
-    struct ggml_tensor * wte;     // position embedding
-    struct ggml_tensor * wpe;     //    token embedding
-    struct ggml_tensor * lm_head; // language model head
-
-    std::vector<gpt2_layer> layers;
-
-    // key + value memory
-    struct ggml_tensor * memory_k;
-    struct ggml_tensor * memory_v;
-
-    //
-    struct ggml_context * ctx;
-
-    ggml_backend_t backend = NULL;
-
-    ggml_backend_buffer_t buffer_w;
-    ggml_backend_buffer_t buffer_kv;
-
-    std::map<std::string, struct ggml_tensor *> tensors;
-};
-
-// load the model's weights from a file
-bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab, int n_ctx, int n_gpu_layers) {
-    printf("%s: loading model from '%s'\n", __func__, fname.c_str());
-
-    auto fin = std::ifstream(fname, std::ios::binary);
-    if (!fin) {
-        fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
-        return false;
-    }
-
-    // verify magic
-    {
-        uint32_t magic;
-        fin.read((char *) &magic, sizeof(magic));
-        if (magic != GGML_FILE_MAGIC) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
-            return false;
-        }
-    }
-
-    // load hparams
-    {
-        auto & hparams = model.hparams;
-
-        fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
-        fin.read((char *) &hparams.n_ctx,   sizeof(hparams.n_ctx));
-        fin.read((char *) &hparams.n_embd,  sizeof(hparams.n_embd));
-        fin.read((char *) &hparams.n_head,  sizeof(hparams.n_head));
-        fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-        fin.read((char *) &hparams.ftype,   sizeof(hparams.ftype));
-
-        const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
-
-        printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
-        printf("%s: n_ctx   = %d\n", __func__, hparams.n_ctx);
-        printf("%s: n_embd  = %d\n", __func__, hparams.n_embd);
-        printf("%s: n_head  = %d\n", __func__, hparams.n_head);
-        printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
-        printf("%s: ftype   = %d\n", __func__, hparams.ftype);
-        printf("%s: qntvr   = %d\n", __func__, qntvr);
-
-        hparams.ftype %= GGML_QNT_VERSION_FACTOR;
-    }
-
-    // load vocab
-    {
-        int32_t n_vocab = 0;
-        fin.read((char *) &n_vocab, sizeof(n_vocab));
-
-        if (n_vocab != model.hparams.n_vocab) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
-                    __func__, fname.c_str(), n_vocab, model.hparams.n_vocab);
-            return false;
-        }
-
-        std::string word;
-        std::vector<char> buf(128);
-
-        for (int i = 0; i < n_vocab; i++) {
-            uint32_t len;
-            fin.read((char *) &len, sizeof(len));
-
-            buf.resize(len);
-            fin.read((char *) buf.data(), len);
-            word.assign(buf.data(), len);
-
-            vocab.token_to_id[word] = i;
-            vocab.id_to_token[i] = word;
-        }
-    }
-
-    // for the big tensors, we have the option to store the data in 16-bit floats or quantized
-    // in order to save memory and also to speed up the computation
-    ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
-    if (wtype == GGML_TYPE_COUNT) {
-        fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
-                __func__, fname.c_str(), model.hparams.ftype);
-        return false;
-    }
-
-    auto & ctx = model.ctx;
-
-    // create the ggml context
-    {
-        size_t n_tensors = 2 + 6 + 12*model.hparams.n_layer;
-        struct ggml_init_params params = {
-            /*.mem_size   =*/ ggml_tensor_overhead() * n_tensors,
-            /*.mem_buffer =*/ NULL,
-            /*.no_alloc   =*/ true,
-        };
-
-        ctx = ggml_init(params);
-        if (!ctx) {
-            fprintf(stderr, "%s: ggml_init() failed\n", __func__);
-            return false;
-        }
-    }
-
-    // initialize the backend
-#ifdef GGML_USE_CUBLAS
-    if (n_gpu_layers > 0) {
-        fprintf(stderr, "%s: using CUDA backend\n", __func__);
-        model.backend = ggml_backend_cuda_init(0);
-        if (!model.backend) {
-            fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
-        }
-    }
-#endif
-
-#ifdef GGML_USE_METAL
-    if (n_gpu_layers > 0) {
-        fprintf(stderr, "%s: using Metal backend\n", __func__);
-        ggml_backend_metal_log_set_callback(ggml_log_callback_default, nullptr);
-        model.backend = ggml_backend_metal_init();
-        if (!model.backend) {
-            fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
-        }
-    }
-#endif
-
-    if (!model.backend) {
-        // fallback to CPU backend
-        fprintf(stderr, "%s: using CPU backend\n", __func__);
-        model.backend = ggml_backend_cpu_init();
-    }
-
-    if (!model.backend) {
-        fprintf(stderr, "%s: ggml_backend_cpu_init() failed\n", __func__);
-        return false;
-    }
-
-    // create the tensors for the model
-    {
-        const auto & hparams = model.hparams;
-
-        const int n_embd  = hparams.n_embd;
-        const int n_layer = hparams.n_layer;
-        const int n_ctx   = hparams.n_ctx;
-        const int n_vocab = hparams.n_vocab;
-
-        model.layers.resize(n_layer);
-
-        model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-        model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-
-        model.wte     = ggml_new_tensor_2d(ctx, wtype,         n_embd, n_vocab);
-        model.wpe     = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx);
-        model.lm_head = ggml_new_tensor_2d(ctx, wtype,         n_embd, n_vocab);
-
-        // map by name
-        model.tensors["model/ln_f/g"] = model.ln_f_g;
-        model.tensors["model/ln_f/b"] = model.ln_f_b;
-
-        model.tensors["model/wte"]     = model.wte;
-        model.tensors["model/wpe"]     = model.wpe;
-        model.tensors["model/lm_head"] = model.lm_head;
-
-        for (int i = 0; i < n_layer; ++i) {
-            auto & layer = model.layers[i];
-
-            layer.ln_1_g        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-            layer.ln_1_b        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            layer.ln_2_g        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-            layer.ln_2_b        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype,           n_embd, 3*n_embd);
-            layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd);
-
-            layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype,           n_embd, n_embd);
-            layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            layer.c_mlp_fc_w    = ggml_new_tensor_2d(ctx, wtype,           n_embd, 4*n_embd);
-            layer.c_mlp_fc_b    = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
-
-            layer.c_mlp_proj_w  = ggml_new_tensor_2d(ctx, wtype,         4*n_embd, n_embd);
-            layer.c_mlp_proj_b  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            // map by name
-            model.tensors["model/h" + std::to_string(i) + "/ln_1/g"]        = layer.ln_1_g;
-            model.tensors["model/h" + std::to_string(i) + "/ln_1/b"]        = layer.ln_1_b;
-
-            model.tensors["model/h" + std::to_string(i) + "/ln_2/g"]        = layer.ln_2_g;
-            model.tensors["model/h" + std::to_string(i) + "/ln_2/b"]        = layer.ln_2_b;
-
-            model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/w"] = layer.c_attn_attn_w;
-            model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/b"] = layer.c_attn_attn_b;
-
-            model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/w"] = layer.c_attn_proj_w;
-            model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/b"] = layer.c_attn_proj_b;
-
-            model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"]    = layer.c_mlp_fc_w;
-            model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"]    = layer.c_mlp_fc_b;
-
-            model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"]  = layer.c_mlp_proj_w;
-            model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"]  = layer.c_mlp_proj_b;
-        }
-    }
-
-    // allocate the model tensors in a backend buffer
-    model.buffer_w = ggml_backend_alloc_ctx_tensors(ctx, model.backend);
-
-    printf("%s: ggml tensor size    = %d bytes\n", __func__, (int) sizeof(ggml_tensor));
-    printf("%s: backend buffer size = %6.2f MB\n", __func__, ggml_backend_buffer_get_size(model.buffer_w)/(1024.0*1024.0));
-
-    // override the default training context with the user-provided
-    model.hparams.n_ctx = n_ctx;
-
-    // key + value memory
-    {
-        const auto & hparams = model.hparams;
-
-        const int n_embd  = hparams.n_embd;
-        const int n_layer = hparams.n_layer;
-        const int n_ctx   = hparams.n_ctx;
-
-        const int n_mem      = n_layer*n_ctx;
-        const int n_elements = n_embd*n_mem;
-
-        model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
-        model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
-
-        const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
-
-        printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem);
-
-        // create a backend buffer (can be in host or device memory)
-        model.buffer_kv = ggml_backend_alloc_buffer(model.backend, memory_size + 256);
-
-        // allocate the tensors into the backend buffer
-        {
-            ggml_allocr * alloc = ggml_allocr_new_from_buffer(model.buffer_kv);
-
-            // this updates the pointers in the tensors to point to the correct location in the buffer
-            // this is necessary since the ggml_context is .no_alloc == true
-            // note that the buffer can actually be a device buffer, depending on the backend
-            ggml_allocr_alloc(alloc, model.memory_k);
-            ggml_allocr_alloc(alloc, model.memory_v);
-
-            ggml_allocr_free(alloc);
-        }
-    }
-
-    // load weights
-    {
-        size_t total_size = 0;
-
-        bool has_lm_head = false;
-
-        std::vector<char> read_buf;
-
-        while (true) {
-            int32_t n_dims;
-            int32_t length;
-            int32_t ttype;
-
-            fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
-            fin.read(reinterpret_cast<char *>(&length), sizeof(length));
-            fin.read(reinterpret_cast<char *>(&ttype),  sizeof(ttype));
-
-            if (fin.eof()) {
-                break;
-            }
-
-            int32_t nelements = 1;
-            int32_t ne[2] = { 1, 1 };
-            for (int i = 0; i < n_dims; ++i) {
-                fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
-                nelements *= ne[i];
-            }
-
-            std::string name(length, 0);
-            fin.read(&name[0], length);
-
-            if (model.tensors.find(name) == model.tensors.end()) {
-                fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str());
-                return false;
-            }
-
-            auto tensor = model.tensors[name];
-            ggml_set_name(tensor, name.c_str());
-            if (ggml_nelements(tensor) != nelements) {
-                fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.c_str());
-                return false;
-            }
-
-            if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
-                fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
-                        __func__, name.c_str(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]);
-                return false;
-            }
-
-            // for debugging
-            if (0) {
-                printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
-            }
-
-            const size_t bpe = ggml_type_size(ggml_type(ttype));
-
-            if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
-                fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
-                        __func__, name.c_str(), ggml_nbytes(tensor), nelements*bpe);
-                return false;
-            }
-
-            if (ggml_backend_is_cpu  (model.backend)
-#ifdef GGML_USE_METAL
-                || ggml_backend_is_metal(model.backend)
-#endif
-                ) {
-                // for the CPU and Metal backend, we can read directly into the tensor
-                fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
-            } else {
-                // read into a temporary buffer first, then copy to device memory
-                read_buf.resize(ggml_nbytes(tensor));
-                fin.read(read_buf.data(), ggml_nbytes(tensor));
-                ggml_backend_tensor_set(tensor, read_buf.data(), 0, ggml_nbytes(tensor));
-            }
-
-            // GPT-2 models share the WTE tensor as the LM head
-            if (name == "model/wte" && has_lm_head == false) {
-                //ggml_allocr_alloc(alloc, model.lm_head);
-                //ggml_backend_tensor_copy(tensor, model.lm_head);
-                model.lm_head = tensor;
-            }
-
-            if (name == "model/lm_head") {
-                has_lm_head = true;
-            }
-
-            total_size += ggml_nbytes(tensor);
-        }
-
-        printf("%s: model size  = %8.2f MB\n", __func__, total_size/1024.0/1024.0);
-    }
-
-    fin.close();
-
-    return true;
-}
-
-// build the computation graph
-struct ggml_cgraph * gpt2_graph(
-        const gpt2_model & model,
-        struct ggml_allocr * allocr,
-        const int n_past,
-        const std::vector<gpt_vocab::id> & embd_inp) {
-    const int N = embd_inp.size();
-
-    const auto & hparams = model.hparams;
-
-    const int n_embd  = hparams.n_embd;
-    const int n_layer = hparams.n_layer;
-    const int n_ctx   = hparams.n_ctx;
-    const int n_head  = hparams.n_head;
-
-    // since we are using ggml-alloc, this buffer only needs enough space to hold the ggml_tensor and ggml_cgraph structs, but not the tensor data
-    static size_t buf_size = ggml_tensor_overhead()*GPT2_MAX_NODES + ggml_graph_overhead_custom(GPT2_MAX_NODES, false);
-    static std::vector<uint8_t> buf(buf_size);
-
-    struct ggml_init_params params = {
-        /*.mem_size   =*/ buf_size,
-        /*.mem_buffer =*/ buf.data(),
-        /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph()
-    };
-
-    struct ggml_context * ctx0 = ggml_init(params);
-
-    struct ggml_cgraph  * gf = ggml_new_graph_custom(ctx0, GPT2_MAX_NODES, false);
-
-    struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
-    ggml_allocr_alloc(allocr, embd);
-
-    // avoid writing to tensors if we are only measuring the memory usage
-    if (!ggml_allocr_is_measure(allocr)) {
-        ggml_backend_tensor_set(embd, embd_inp.data(), 0, N*ggml_element_size(embd));
-    }
-
-    struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
-    ggml_allocr_alloc(allocr, position);
-    if (!ggml_allocr_is_measure(allocr)) {
-        for (int i = 0; i < N; ++i) {
-            int32_t v = n_past + i;
-            ggml_backend_tensor_set(position, &v, i*sizeof(int32_t), sizeof(v));
-        }
-    }
-
-    // wte + wpe
-    struct ggml_tensor * inpL =
-        ggml_add(ctx0,
-                ggml_get_rows(ctx0, model.wte, embd),
-                ggml_get_rows(ctx0, model.wpe, position));
-
-    for (int il = 0; il < n_layer; ++il) {
-        struct ggml_tensor * cur;
-
-        // norm
-        {
-            // [ 768, N]
-            cur = ggml_norm(ctx0, inpL, hparams.eps);
-
-            // cur = ln_1_g*cur + ln_1_b
-            // [ 768, N]
-            cur = ggml_add(ctx0,
-                    ggml_mul(ctx0,
-                        cur,
-                        model.layers[il].ln_1_g),
-                    model.layers[il].ln_1_b);
-        }
-
-        // attn
-        // [2304, 768] - model.layers[il].c_attn_attn_w
-        // [2304,   1] - model.layers[il].c_attn_attn_b
-        // [ 768,   N] - cur (in)
-        // [2304,   N] - cur (out)
-        //
-        // cur = attn_w*cur + attn_b
-        // [2304, N]
-        {
-            cur = ggml_mul_mat(ctx0,
-                    model.layers[il].c_attn_attn_w,
-                    cur);
-
-            cur = ggml_add(ctx0,
-                    cur,
-                    model.layers[il].c_attn_attn_b);
-        }
-
-        // self-attention
-        {
-            struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
-            struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd);
-            struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd);
-
-            // store key and value to memory
-            if (N >= 1) {
-                struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
-                struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past));
-
-                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
-                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
-            }
-
-            // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
-            // [64, N, 12]
-            struct ggml_tensor * Q =
-                ggml_permute(ctx0,
-                        ggml_cpy(ctx0,
-                            Qcur,
-                            ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
-                        0, 2, 1, 3);
-
-            // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
-            // [64, n_past + N, 12]
-            struct ggml_tensor * K =
-                ggml_permute(ctx0,
-                        ggml_reshape_3d(ctx0,
-                            ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
-                            n_embd/n_head, n_head, n_past + N),
-                        0, 2, 1, 3);
-
-            // GG: flash attention
-            //struct ggml_tensor * V =
-            //    ggml_cpy(ctx0,
-            //            ggml_permute(ctx0,
-            //                ggml_reshape_3d(ctx0,
-            //                    ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
-            //                    n_embd/n_head, n_head, n_past + N),
-            //                1, 2, 0, 3),
-            //            ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head));
-
-            //struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true);
-
-            // K * Q
-            // [n_past + N, N, 12]
-            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
-
-            // KQ_scaled = KQ / sqrt(n_embd/n_head)
-            // [n_past + N, N, 12]
-            struct ggml_tensor * KQ_scaled =
-                ggml_scale(ctx0,
-                        KQ,
-                        1.0f/sqrtf(float(n_embd)/n_head));
-
-            // KQ_masked = mask_past(KQ_scaled)
-            // [n_past + N, N, 12]
-            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
-
-            // KQ = soft_max(KQ_masked)
-            // [n_past + N, N, 12]
-            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
-
-            // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
-            // [n_past + N, 64, 12]
-            struct ggml_tensor * V_trans =
-                ggml_cpy(ctx0,
-                        ggml_permute(ctx0,
-                            ggml_reshape_3d(ctx0,
-                                ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
-                                n_embd/n_head, n_head, n_past + N),
-                            1, 2, 0, 3),
-                        ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd/n_head, n_head));
-
-            // KQV = transpose(V) * KQ_soft_max
-            // [64, N, 12]
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
-
-            // KQV_merged = KQV.permute(0, 2, 1, 3)
-            // [64, 12, N]
-            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
-
-            // cur = KQV_merged.contiguous().view(n_embd, N)
-            // [768, N]
-            cur = ggml_cpy(ctx0,
-                    KQV_merged,
-                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
-        }
-
-        // projection
-        // [ 768, 768] - model.layers[il].c_attn_proj_w
-        // [ 768,   1] - model.layers[il].c_attn_proj_b
-        // [ 768,   N] - cur (in)
-        // [ 768,   N] - cur (out)
-        //
-        // cur = proj_w*cur + proj_b
-        // [768, N]
-        {
-            cur = ggml_mul_mat(ctx0,
-                    model.layers[il].c_attn_proj_w,
-                    cur);
-
-            cur = ggml_add(ctx0,
-                    cur,
-                    model.layers[il].c_attn_proj_b);
-        }
-
-        // add the input
-        cur = ggml_add(ctx0, cur, inpL);
-
-        struct ggml_tensor * inpFF = cur;
-
-        // feed-forward network
-        {
-            // norm
-            {
-                cur = ggml_norm(ctx0, inpFF, hparams.eps);
-
-                // cur = ln_2_g*cur + ln_2_b
-                // [ 768, N]
-                cur = ggml_add(ctx0,
-                        ggml_mul(ctx0,
-                            cur,
-                            model.layers[il].ln_2_g),
-                        model.layers[il].ln_2_b);
-            }
-
-            // fully connected
-            // [3072, 768] - model.layers[il].c_mlp_fc_w
-            // [3072,   1] - model.layers[il].c_mlp_fc_b
-            // [ 768,   N] - cur (in)
-            // [3072,   N] - cur (out)
-            //
-            // cur = fc_w*cur + fc_b
-            // [3072, N]
-            cur = ggml_mul_mat(ctx0,
-                    model.layers[il].c_mlp_fc_w,
-                    cur);
-
-            cur = ggml_add(ctx0,
-                    cur,
-                    model.layers[il].c_mlp_fc_b);
-
-            // GELU activation
-            // [3072, N]
-            cur = ggml_gelu(ctx0, cur);
-
-            // projection
-            // [ 768, 3072] - model.layers[il].c_mlp_proj_w
-            // [ 768,    1] - model.layers[il].c_mlp_proj_b
-            // [3072,    N] - cur (in)
-            // [ 768,    N] - cur (out)
-            //
-            // cur = proj_w*cur + proj_b
-            // [768, N]
-            cur = ggml_mul_mat(ctx0,
-                    model.layers[il].c_mlp_proj_w,
-                    cur);
-
-            cur = ggml_add(ctx0,
-                    cur,
-                    model.layers[il].c_mlp_proj_b);
-        }
-
-        // input for next layer
-        inpL = ggml_add(ctx0, cur, inpFF);
-    }
-
-    // norm
-    {
-        // [ 768, N]
-        inpL = ggml_norm(ctx0, inpL, hparams.eps);
-
-        // inpL = ln_f_g*inpL + ln_f_b
-        // [ 768, N]
-        inpL = ggml_add(ctx0,
-                ggml_mul(ctx0,
-                    inpL,
-                    model.ln_f_g),
-                model.ln_f_b);
-    }
-
-    // inpL = WTE * inpL
-    // [ 768, 50257] - model.lm_head
-    // [ 768, N]     - inpL
-    inpL = ggml_mul_mat(ctx0, model.lm_head, inpL);
-
-    // logits -> probs
-    //inpL = ggml_soft_max(ctx0, inpL);
-
-    ggml_build_forward_expand(gf, inpL);
-
-    ggml_free(ctx0);
-
-    return gf;
-}
-
-// evaluate the transformer
-//
-//   - model:     the model
-//   - allocr:    ggml_allocr to use to allocate the compute buffer
-//   - n_threads: number of threads to use
-//   - n_past:    the context size so far
-//   - embd_inp:  the embeddings of the tokens in the context
-//   - embd_w:    the predicted logits for the next token
-//
-bool gpt2_eval(
-        const gpt2_model & model,
-        struct ggml_allocr * allocr,
-        const int n_threads,
-        const int n_past,
-        const std::vector<gpt_vocab::id> & embd_inp,
-              std::vector<float>         & embd_w) {
-    const int N = embd_inp.size();
-
-    const auto & hparams = model.hparams;
-
-    const int n_vocab = hparams.n_vocab;
-
-    // reset the allocator to free all the memory allocated during the previous inference
-    ggml_allocr_reset(allocr);
-
-    struct ggml_cgraph * gf = gpt2_graph(model, allocr, n_past, embd_inp);
-
-    // allocate tensors
-    ggml_allocr_alloc_graph(allocr, gf);
-
-    // set backend options
-    if (ggml_backend_is_cpu(model.backend)) {
-        ggml_backend_cpu_set_n_threads(model.backend, n_threads);
-    }
-
-#ifdef GGML_USE_METAL
-    if (ggml_backend_is_metal(model.backend)) {
-        ggml_backend_metal_set_n_cb(model.backend, n_threads);
-    }
-#endif
-
-    // test
-#if 0 && defined(GGML_USE_CUBLAS)
-    if (ggml_backend_is_cuda(model.backend)) {
-        auto eval_callback = [](int index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data) {
-            auto tv1 = tensor_to_float(t1);
-            auto tv2 = tensor_to_float(t2);
-
-#if 1
-            float sim = cosine_similarity(tv1, tv2);
-            float len1 = vec_len(tv1);
-            float len2 = vec_len(tv2);
-            float lenr = len1/len2;
-            float lenrd = std::abs(1.0f-lenr);
-
-            float angle = acosf(sim)*180.0f/M_PI;
-
-            if (angle > 0.5f || lenrd > 0.05f) {
-                printf("%3d [%15s] %s: sim = %f, a = %f, lenrd = %f\n", index, ggml_op_desc(t1), t1->name, sim, angle, lenrd);
-            }
-            assert(sim > 0.90f);
-#else
-            float dist = distance(tv1, tv2) / vec_len(tv1);
-            if (dist > 0.01f) {
-                printf("%3d [%15s] %s: distance = %f\n", index, ggml_op_desc(t1), t1->name, dist);
-            }
-#endif
-
-            return true;
-        };
-        ggml_backend_t backend_cpu = ggml_backend_cpu_init();
-        ggml_backend_compare_graph_backend(model.backend, backend_cpu, gf, eval_callback, nullptr);
-        ggml_backend_free(backend_cpu);
-        //printf("done\n");
-    } else
-#endif
-    {
-        // run the computation
-        ggml_backend_graph_compute(model.backend, gf);
-    }
-
-    //if (n_past%100 == 0) {
-    //    ggml_graph_print   (&gf);
-    //    ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
-    //}
-
-    // in this case, the output tensor is the last one in the graph
-    struct ggml_tensor * inpL = gf->nodes[gf->n_nodes - 1];
-
-    //embd_w.resize(n_vocab*N);
-    //ggml_backend_tensor_get(inpL, embd_w.data(), 0, sizeof(float)*n_vocab*N);
-
-    // return result just for the last token
-    embd_w.resize(n_vocab);
-    ggml_backend_tensor_get(inpL, embd_w.data(), (n_vocab*(N-1))*sizeof(float), sizeof(float)*n_vocab);
-
-    return true;
-}
-
-int main(int argc, char ** argv) {
-    ggml_time_init();
-
-    const int64_t t_main_start_us = ggml_time_us();
-
-    gpt_params params;
-    params.model = "models/gpt-2-117M/ggml-model.bin";
-
-    if (gpt_params_parse(argc, argv, params) == false) {
-        return 1;
-    }
-
-    if (params.seed < 0) {
-        params.seed = time(NULL);
-    }
-
-    printf("%s: seed = %d\n", __func__, params.seed);
-
-    std::mt19937 rng(params.seed);
-    if (params.prompt.empty()) {
-        params.prompt = gpt_random_prompt(rng);
-    }
-
-    int64_t t_load_us = 0;
-
-    gpt_vocab vocab;
-    gpt2_model model;
-
-    // load the model
-    {
-        const int64_t t_start_us = ggml_time_us();
-
-        if (!gpt2_model_load(params.model, model, vocab, params.n_ctx, params.n_gpu_layers)) {
-            fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
-            return 1;
-        }
-
-        t_load_us = ggml_time_us() - t_start_us;
-
-        test_gpt_tokenizer(vocab, params.token_test);
-    }
-
-    // keep this buffer alive while evaluating the model
-    ggml_backend_buffer_t buf_compute;
-
-    struct ggml_allocr * allocr = NULL;
-    // allocate the compute buffer
-    {
-        // create an allocator to measure the memory usage
-        allocr = ggml_allocr_new_measure_from_backend(model.backend);
-
-        // create the worst case graph for memory usage estimation
-        int n_tokens = std::min(model.hparams.n_ctx, params.n_batch);
-        int n_past = model.hparams.n_ctx - n_tokens;
-        struct ggml_cgraph * gf = gpt2_graph(model, allocr, n_past, std::vector<gpt_vocab::id>(n_tokens, 0));
-
-        // compute the required memory
-        size_t mem_size = ggml_allocr_alloc_graph(allocr, gf);
-
-        // recreate the allocator with the required memory
-        ggml_allocr_free(allocr);
-        buf_compute = ggml_backend_alloc_buffer(model.backend, mem_size);
-        allocr = ggml_allocr_new_from_buffer(buf_compute);
-
-        fprintf(stderr, "%s: compute buffer size: %.2f MB\n", __func__, mem_size/1024.0/1024.0);
-    }
-
-    int n_past = 0;
-
-    int64_t t_sample_us  = 0;
-    int64_t t_predict_us = 0;
-
-    std::vector<float> logits;
-
-    // tokenize the prompt
-    std::vector<gpt_vocab::id> embd_inp = ::gpt_tokenize(vocab, params.prompt);
-
-    params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());
-
-    printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
-    printf("%s: number of tokens in prompt = %zu, first 8 tokens: ", __func__, embd_inp.size());
-    for (int i = 0; i < std::min(8, (int) embd_inp.size()); i++) {
-        printf("%d ", embd_inp[i]);
-    }
-    printf("\n\n");
-
-    // submit the input prompt token-by-token
-    // this reduces the memory usage during inference, at the cost of a bit of speed at the beginning
-    std::vector<gpt_vocab::id> embd;
-
-    for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) {
-        // predict
-        if (embd.size() > 0) {
-            const int64_t t_start_us = ggml_time_us();
-
-            if (!gpt2_eval(model, allocr, params.n_threads, n_past, embd, logits)) {
-                printf("Failed to predict\n");
-                return 1;
-            }
-
-            t_predict_us += ggml_time_us() - t_start_us;
-        }
-
-        n_past += embd.size();
-        embd.clear();
-
-        if (i >= embd_inp.size()) {
-            // sample next token
-            const int   top_k = params.top_k;
-            const float top_p = params.top_p;
-            const float temp  = params.temp;
-
-            const int n_vocab = model.hparams.n_vocab;
-
-            gpt_vocab::id id = 0;
-
-            {
-                const int64_t t_start_sample_us = ggml_time_us();
-
-                id = gpt_sample_top_k_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng);
-
-                t_sample_us += ggml_time_us() - t_start_sample_us;
-            }
-
-            // add it to the context
-            embd.push_back(id);
-        } else {
-            // if here, it means we are still processing the input prompt
-            for (size_t k = i; k < embd_inp.size(); k++) {
-                embd.push_back(embd_inp[k]);
-                if (int32_t(embd.size()) >= params.n_batch) {
-                    break;
-                }
-            }
-            i += embd.size() - 1;
-        }
-
-        // display text
-        for (auto id : embd) {
-            printf("%s", vocab.id_to_token[id].c_str());
-        }
-        fflush(stdout);
-
-        // end of text token
-        if (!params.ignore_eos && embd.back() == 50256) {
-            break;
-        }
-    }
-
-    // report timing
-    {
-        const int64_t t_main_end_us = ggml_time_us();
-
-        printf("\n\n");
-        printf("%s:     load time = %8.2f ms\n", __func__, t_load_us/1000.0f);
-        printf("%s:   sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f);
-        printf("%s:  predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past);
-        printf("%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
-    }
-
-    ggml_free(model.ctx);
-
-    ggml_backend_buffer_free(model.buffer_w);
-    ggml_backend_buffer_free(model.buffer_kv);
-    ggml_backend_buffer_free(buf_compute);
-    ggml_backend_free(model.backend);
-
-    return 0;
-}
diff --git a/ggml/examples/gpt-2/main-batched.cpp b/ggml/examples/gpt-2/main-batched.cpp
deleted file mode 100644
index 02b7076..0000000
--- a/ggml/examples/gpt-2/main-batched.cpp
+++ /dev/null
@@ -1,1218 +0,0 @@
-#include "ggml/ggml.h"
-#include "ggml/ggml-alloc.h"
-#include "ggml/ggml-backend.h"
-
-#ifdef GGML_USE_CUBLAS
-#include "ggml-cuda.h"
-#endif
-
-#ifdef GGML_USE_METAL
-#include "ggml-metal.h"
-#endif
-
-#include "common.h"
-#include "common-ggml.h"
-
-#include <cassert>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <fstream>
-#include <map>
-#include <set>
-#include <string>
-#include <vector>
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-#define GPT2_MAX_NODES 4096
-
-static void ggml_log_callback_default(ggml_log_level level, const char * text, void * user_data) {
-    (void) level;
-    (void) user_data;
-    fputs(text, stderr);
-    fflush(stderr);
-}
-
-typedef int32_t gpt2_pos;
-typedef int32_t gpt2_seq_id;
-
-// default hparams (GPT-2 117M)
-struct gpt2_hparams {
-    int32_t n_vocab = 50257;
-    int32_t n_ctx   = 1024;
-    int32_t n_embd  = 768;
-    int32_t n_head  = 12;
-    int32_t n_layer = 12;
-    int32_t ftype   = 1;
-    float   eps     = 1e-5f;
-};
-
-struct gpt2_layer {
-    // normalization
-    struct ggml_tensor * ln_1_g;
-    struct ggml_tensor * ln_1_b;
-
-    struct ggml_tensor * ln_2_g;
-    struct ggml_tensor * ln_2_b;
-
-    // attention
-    struct ggml_tensor * c_attn_attn_w;
-    struct ggml_tensor * c_attn_attn_b;
-
-    struct ggml_tensor * c_attn_proj_w;
-    struct ggml_tensor * c_attn_proj_b;
-
-    // mlp
-    struct ggml_tensor * c_mlp_fc_w;
-    struct ggml_tensor * c_mlp_fc_b;
-
-    struct ggml_tensor * c_mlp_proj_w;
-    struct ggml_tensor * c_mlp_proj_b;
-};
-
-struct gpt2_kv_cell {
-    gpt2_pos pos   = -1;
-    gpt2_pos delta = 0;
-
-    std::set<gpt2_seq_id> seq_id;
-
-    bool has_seq_id(const gpt2_seq_id & id) const {
-        return seq_id.find(id) != seq_id.end();
-    }
-};
-
-struct gpt2_kv_cache {
-    // key + value memory
-    struct ggml_tensor * k;
-    struct ggml_tensor * v;
-    //
-
-    uint32_t head = 0;
-    uint32_t size = 0;
-
-    // computed before each graph build
-    uint32_t n = 0;
-
-    std::vector<gpt2_kv_cell> cells;
-
-    ggml_backend_buffer_t buffer;
-};
-
-struct gpt2_model {
-    gpt2_hparams hparams;
-
-    // normalization
-    struct ggml_tensor * ln_f_g;
-    struct ggml_tensor * ln_f_b;
-
-    struct ggml_tensor * wte;     // position embedding
-    struct ggml_tensor * wpe;     //    token embedding
-    struct ggml_tensor * lm_head; // language model head
-
-    std::vector<gpt2_layer> layers;
-
-    gpt2_kv_cache kv_cache;
-
-    struct ggml_context * ctx;
-
-    ggml_backend_t backend = NULL;
-
-    ggml_backend_buffer_t buffer_w;
-
-    std::map<std::string, struct ggml_tensor *> tensors;
-};
-
-// Input data for gpt2_decode
-// A gpt2_batch object can contain input about one or many sequences
-// The provided arrays (i.e. token, embd, pos, etc.) must have size of n_tokens
-//
-// - token  : the token ids of the input (used when embd is NULL)
-// - embd   : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
-// - pos    : the positions of the respective token in the sequence
-// - seq_id : the sequence to which the respective token belongs
-// - logits : if zero, the logits for the respective token will not be output
-//
-struct gpt2_batch {
-    int32_t n_tokens = -1;
-
-    gpt_vocab::id  * token  = {};
-    float          * embd   = {};
-    gpt2_pos       * pos    = {};
-    gpt2_seq_id    * seq_id = {};
-    int8_t         * logits = {};
-};
-
-// load the model's weights from a file
-bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab, int n_ctx, int n_gpu_layers) {
-    printf("%s: loading model from '%s'\n", __func__, fname.c_str());
-
-    auto fin = std::ifstream(fname, std::ios::binary);
-    if (!fin) {
-        fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
-        return false;
-    }
-
-    // verify magic
-    {
-        uint32_t magic;
-        fin.read((char *) &magic, sizeof(magic));
-        if (magic != GGML_FILE_MAGIC) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
-            return false;
-        }
-    }
-
-    // load hparams
-    {
-        auto & hparams = model.hparams;
-
-        fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
-        fin.read((char *) &hparams.n_ctx,   sizeof(hparams.n_ctx));
-        fin.read((char *) &hparams.n_embd,  sizeof(hparams.n_embd));
-        fin.read((char *) &hparams.n_head,  sizeof(hparams.n_head));
-        fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-        fin.read((char *) &hparams.ftype,   sizeof(hparams.ftype));
-
-        const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
-
-        printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
-        printf("%s: n_ctx   = %d\n", __func__, hparams.n_ctx);
-        printf("%s: n_embd  = %d\n", __func__, hparams.n_embd);
-        printf("%s: n_head  = %d\n", __func__, hparams.n_head);
-        printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
-        printf("%s: ftype   = %d\n", __func__, hparams.ftype);
-        printf("%s: qntvr   = %d\n", __func__, qntvr);
-
-        hparams.ftype %= GGML_QNT_VERSION_FACTOR;
-    }
-
-    // load vocab
-    {
-        int32_t n_vocab = 0;
-        fin.read((char *) &n_vocab, sizeof(n_vocab));
-
-        if (n_vocab != model.hparams.n_vocab) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
-                    __func__, fname.c_str(), n_vocab, model.hparams.n_vocab);
-            return false;
-        }
-
-        std::string word;
-        std::vector<char> buf(128);
-
-        for (int i = 0; i < n_vocab; i++) {
-            uint32_t len;
-            fin.read((char *) &len, sizeof(len));
-
-            buf.resize(len);
-            fin.read((char *) buf.data(), len);
-            word.assign(buf.data(), len);
-
-            vocab.token_to_id[word] = i;
-            vocab.id_to_token[i] = word;
-        }
-    }
-
-    // for the big tensors, we have the option to store the data in 16-bit floats or quantized
-    // in order to save memory and also to speed up the computation
-    ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
-    if (wtype == GGML_TYPE_COUNT) {
-        fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
-                __func__, fname.c_str(), model.hparams.ftype);
-        return false;
-    }
-
-    auto & ctx = model.ctx;
-
-    size_t buffer_size = 0;
-
-    {
-        const auto & hparams = model.hparams;
-
-        const int n_embd  = hparams.n_embd;
-        const int n_layer = hparams.n_layer;
-        const int n_ctx   = hparams.n_ctx;
-        const int n_vocab = hparams.n_vocab;
-
-        buffer_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_g
-        buffer_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_b
-
-        buffer_size += ggml_row_size(wtype,         n_vocab*n_embd); // wte
-        buffer_size += ggml_row_size(GGML_TYPE_F32,   n_ctx*n_embd); // wpe
-        buffer_size += ggml_row_size(wtype,         n_vocab*n_embd); // lm_head
-
-        buffer_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_g
-        buffer_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_b
-
-        buffer_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_g
-        buffer_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_b
-
-        buffer_size += n_layer*(ggml_row_size(wtype,         3*n_embd*n_embd)); // c_attn_attn_w
-        buffer_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 3*n_embd));        // c_attn_attn_b
-
-        buffer_size += n_layer*(ggml_row_size(wtype,         n_embd*n_embd));   // c_attn_proj_w
-        buffer_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd));          // c_attn_proj_b
-
-        buffer_size += n_layer*(ggml_row_size(wtype,         4*n_embd*n_embd)); // c_mlp_fc_w
-        buffer_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 4*n_embd));        // c_mlp_fc_b
-
-        buffer_size += n_layer*(ggml_row_size(wtype,         4*n_embd*n_embd)); // c_mlp_proj_w
-        buffer_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 4*n_embd));        // c_mlp_proj_b
-
-        buffer_size += (6 + 12*n_layer)*128; // alignment overhead
-
-        printf("%s: ggml tensor size    = %d bytes\n", __func__, (int) sizeof(ggml_tensor));
-        printf("%s: backend buffer size = %6.2f MB\n", __func__, buffer_size/(1024.0*1024.0));
-    }
-
-    // create the ggml context
-    {
-        size_t n_tensors = 2 + 6 + 12*model.hparams.n_layer;
-        struct ggml_init_params params = {
-            /*.mem_size   =*/ ggml_tensor_overhead() * n_tensors,
-            /*.mem_buffer =*/ NULL,
-            /*.no_alloc   =*/ true,
-        };
-
-        model.ctx = ggml_init(params);
-        if (!model.ctx) {
-            fprintf(stderr, "%s: ggml_init() failed\n", __func__);
-            return false;
-        }
-    }
-
-    // initialize the backend
-#ifdef GGML_USE_CUBLAS
-    if (n_gpu_layers > 0) {
-        fprintf(stderr, "%s: using CUDA backend\n", __func__);
-        model.backend = ggml_backend_cuda_init(0);
-        if (!model.backend) {
-            fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
-        }
-    }
-#endif
-
-#ifdef GGML_USE_METAL
-    if (n_gpu_layers > 0) {
-        fprintf(stderr, "%s: using Metal backend\n", __func__);
-        ggml_backend_metal_log_set_callback(ggml_log_callback_default, nullptr);
-        model.backend = ggml_backend_metal_init();
-        if (!model.backend) {
-            fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
-        }
-    }
-#endif
-
-    if (!model.backend) {
-        // fallback to CPU backend
-        fprintf(stderr, "%s: using CPU backend\n", __func__);
-        model.backend = ggml_backend_cpu_init();
-    }
-
-    if (!model.backend) {
-        fprintf(stderr, "%s: ggml_backend_cpu_init() failed\n", __func__);
-        return false;
-    }
-
-    // allocate weights buffer
-    model.buffer_w = ggml_backend_alloc_buffer(model.backend, buffer_size);
-
-    // prepare memory for the weights
-    {
-        const auto & hparams = model.hparams;
-
-        const int n_embd  = hparams.n_embd;
-        const int n_layer = hparams.n_layer;
-        const int n_ctx   = hparams.n_ctx;
-        const int n_vocab = hparams.n_vocab;
-
-        model.layers.resize(n_layer);
-
-        model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-        model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-
-        model.wte     = ggml_new_tensor_2d(ctx, wtype,         n_embd, n_vocab);
-        model.wpe     = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx);
-        model.lm_head = ggml_new_tensor_2d(ctx, wtype,         n_embd, n_vocab);
-
-        // map by name
-        model.tensors["model/ln_f/g"] = model.ln_f_g;
-        model.tensors["model/ln_f/b"] = model.ln_f_b;
-
-        model.tensors["model/wte"]     = model.wte;
-        model.tensors["model/wpe"]     = model.wpe;
-        model.tensors["model/lm_head"] = model.lm_head;
-
-        for (int i = 0; i < n_layer; ++i) {
-            auto & layer = model.layers[i];
-
-            layer.ln_1_g        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-            layer.ln_1_b        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            layer.ln_2_g        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-            layer.ln_2_b        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype,           n_embd, 3*n_embd);
-            layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd);
-
-            layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype,           n_embd, n_embd);
-            layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            layer.c_mlp_fc_w    = ggml_new_tensor_2d(ctx, wtype,           n_embd, 4*n_embd);
-            layer.c_mlp_fc_b    = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
-
-            layer.c_mlp_proj_w  = ggml_new_tensor_2d(ctx, wtype,         4*n_embd, n_embd);
-            layer.c_mlp_proj_b  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            // map by name
-            model.tensors["model/h" + std::to_string(i) + "/ln_1/g"]        = layer.ln_1_g;
-            model.tensors["model/h" + std::to_string(i) + "/ln_1/b"]        = layer.ln_1_b;
-
-            model.tensors["model/h" + std::to_string(i) + "/ln_2/g"]        = layer.ln_2_g;
-            model.tensors["model/h" + std::to_string(i) + "/ln_2/b"]        = layer.ln_2_b;
-
-            model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/w"] = layer.c_attn_attn_w;
-            model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/b"] = layer.c_attn_attn_b;
-
-            model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/w"] = layer.c_attn_proj_w;
-            model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/b"] = layer.c_attn_proj_b;
-
-            model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"]    = layer.c_mlp_fc_w;
-            model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"]    = layer.c_mlp_fc_b;
-
-            model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"]  = layer.c_mlp_proj_w;
-            model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"]  = layer.c_mlp_proj_b;
-        }
-    }
-
-    // override the default training context with the user-provided
-    model.hparams.n_ctx = n_ctx;
-
-    // key + value memory
-    {
-        const auto & hparams = model.hparams;
-
-        const int n_embd  = hparams.n_embd;
-        const int n_layer = hparams.n_layer;
-        const int n_ctx   = hparams.n_ctx;
-
-        const int n_mem      = n_layer*n_ctx;
-        const int n_elements = n_embd*n_mem;
-
-        model.kv_cache.k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
-        model.kv_cache.v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
-
-        model.kv_cache.head      = 0;
-        model.kv_cache.size      = n_ctx;
-
-        model.kv_cache.cells.resize(n_ctx);
-
-        const size_t memory_size = ggml_nbytes(model.kv_cache.k) + ggml_nbytes(model.kv_cache.v);
-
-        printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem);
-
-        // create a backend buffer (can be in host or device memory)
-        model.kv_cache.buffer = ggml_backend_alloc_buffer(model.backend, memory_size + 256);
-
-        // allocate the tensors into the backend buffer
-        {
-            ggml_allocr * alloc = ggml_allocr_new_from_buffer(model.kv_cache.buffer);
-
-            // this updates the pointers in the tensors to point to the correct location in the buffer
-            // this is necessary since the ggml_context is .no_alloc == true
-            // note that the buffer can actually be a device buffer, depending on the backend
-            ggml_allocr_alloc(alloc, model.kv_cache.k);
-            ggml_allocr_alloc(alloc, model.kv_cache.v);
-
-            ggml_allocr_free(alloc);
-        }
-    }
-
-    // load weights
-    {
-        ggml_allocr * alloc = ggml_allocr_new_from_buffer(model.buffer_w);
-
-        size_t total_size = 0;
-
-        bool has_lm_head = false;
-
-        std::vector<char> read_buf;
-
-        while (true) {
-            int32_t n_dims;
-            int32_t length;
-            int32_t ttype;
-
-            fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
-            fin.read(reinterpret_cast<char *>(&length), sizeof(length));
-            fin.read(reinterpret_cast<char *>(&ttype),  sizeof(ttype));
-
-            if (fin.eof()) {
-                break;
-            }
-
-            int32_t nelements = 1;
-            int32_t ne[2] = { 1, 1 };
-            for (int i = 0; i < n_dims; ++i) {
-                fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
-                nelements *= ne[i];
-            }
-
-            std::string name(length, 0);
-            fin.read(&name[0], length);
-
-            if (model.tensors.find(name) == model.tensors.end()) {
-                fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str());
-                return false;
-            }
-
-            auto tensor = model.tensors[name];
-            ggml_set_name(tensor, name.c_str());
-            if (ggml_nelements(tensor) != nelements) {
-                fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.c_str());
-                return false;
-            }
-
-            if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
-                fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
-                        __func__, name.c_str(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]);
-                return false;
-            }
-
-            // for debugging
-            if (0) {
-                printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
-            }
-
-            const size_t bpe = ggml_type_size(ggml_type(ttype));
-
-            if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
-                fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
-                        __func__, name.c_str(), ggml_nbytes(tensor), nelements*bpe);
-                return false;
-            }
-
-            ggml_allocr_alloc(alloc, tensor);
-
-            if (ggml_backend_is_cpu  (model.backend)
-#ifdef GGML_USE_METAL
-                || ggml_backend_is_metal(model.backend)
-#endif
-                ) {
-                // for the CPU and Metal backend, we can read directly into the tensor
-                fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
-            } else {
-                // read into a temporary buffer first, then copy to device memory
-                read_buf.resize(ggml_nbytes(tensor));
-                fin.read(read_buf.data(), ggml_nbytes(tensor));
-                ggml_backend_tensor_set(tensor, read_buf.data(), 0, ggml_nbytes(tensor));
-            }
-
-            // GPT-2 models share the WTE tensor as the LM head
-            if (name == "model/wte" && has_lm_head == false) {
-                //ggml_allocr_alloc(alloc, model.lm_head);
-                //ggml_backend_tensor_copy(tensor, model.lm_head);
-                model.lm_head = tensor;
-            }
-
-            if (name == "model/lm_head") {
-                has_lm_head = true;
-            }
-
-            total_size += ggml_nbytes(tensor);
-        }
-
-        ggml_allocr_free(alloc);
-        printf("%s: model size  = %8.2f MB\n", __func__, total_size/1024.0/1024.0);
-    }
-
-    fin.close();
-
-    return true;
-}
-
-// build the computation graph
-struct ggml_cgraph * gpt2_graph(
-        const  gpt2_model  & model,
-        struct ggml_allocr * allocr,
-        const  gpt2_batch  & batch) {
-    const auto & hparams = model.hparams;
-
-    const int n_embd  = hparams.n_embd;
-    const int n_layer = hparams.n_layer;
-    const int n_ctx   = hparams.n_ctx;
-    const int n_head  = hparams.n_head;
-
-    const auto & kv_cache = model.kv_cache;
-
-    const int32_t n_tokens = batch.n_tokens;
-    const int32_t n_kv     = ggml_allocr_is_measure(allocr) ? n_ctx            : kv_cache.n;
-    const int32_t kv_head  = ggml_allocr_is_measure(allocr) ? n_ctx - n_tokens : kv_cache.head;
-
-    // since we are using ggml-alloc, this buffer only needs enough space to hold the ggml_tensor and ggml_cgraph structs, but not the tensor data
-    static size_t buf_size = ggml_tensor_overhead()*GPT2_MAX_NODES + ggml_graph_overhead_custom(GPT2_MAX_NODES, false);
-    static std::vector<uint8_t> buf(buf_size);
-
-    struct ggml_init_params params = {
-        /*.mem_size   =*/ buf_size,
-        /*.mem_buffer =*/ buf.data(),
-        /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph()
-    };
-
-    struct ggml_context * ctx0 = ggml_init(params);
-
-    struct ggml_cgraph  * gf = ggml_new_graph_custom(ctx0, GPT2_MAX_NODES, false);
-
-    struct ggml_tensor * inpL;
-    if (batch.token) {
-        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
-        ggml_allocr_alloc(allocr, inp_tokens);
-        if (!ggml_allocr_is_measure(allocr)) {
-            ggml_backend_tensor_set(inp_tokens, batch.token, 0, n_tokens*ggml_element_size(inp_tokens));
-        }
-
-        struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
-        ggml_allocr_alloc(allocr, position);
-        if (!ggml_allocr_is_measure(allocr)) {
-            for (int i = 0; i < n_tokens; ++i) {
-                int32_t v = batch.pos[i];
-                ggml_backend_tensor_set(position, &v, i*sizeof(int32_t), sizeof(v));
-            }
-        }
-
-        // wte + wpe
-        inpL =
-            ggml_add(ctx0,
-                    ggml_get_rows(ctx0, model.wte, inp_tokens),
-                    ggml_get_rows(ctx0, model.wpe, position));
-    } else {
-        GGML_ASSERT(batch.embd);
-
-        inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
-
-        ggml_allocr_alloc(allocr, inpL);
-        if (!ggml_allocr_is_measure(allocr)) {
-            ggml_backend_tensor_set(inpL, batch.embd, 0, n_tokens * n_embd * ggml_element_size(inpL));
-        }
-    }
-
-    // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-    struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
-    ggml_set_name(KQ_mask, "KQ_mask");
-    ggml_allocr_alloc(allocr, KQ_mask);
-    if (!ggml_allocr_is_measure(allocr)) {
-        std::vector<float> data_buf(n_kv*n_tokens);
-        const float neg_inf_v = -INFINITY;
-
-        for (int h = 0; h < 1; ++h) {
-            int h_offset = h*(n_kv*n_tokens);
-            for (int j = 0; j < n_tokens; ++j) {
-                const gpt2_pos    pos    = batch.pos[j];
-                const gpt2_seq_id seq_id = batch.seq_id[j];
-
-                for (int i = 0; i < n_kv; ++i) {
-                    if (!kv_cache.cells[i].has_seq_id(seq_id) || kv_cache.cells[i].pos > pos) {
-                        data_buf[h_offset + j*n_kv + i] = neg_inf_v;
-                    }
-                }
-            }
-        }
-
-        ggml_backend_tensor_set(KQ_mask, data_buf.data(), 0, data_buf.size() * sizeof(float));
-    }
-
-    for (int il = 0; il < n_layer; ++il) {
-        struct ggml_tensor * cur;
-
-        // norm
-        {
-            // [ 768, N]
-            cur = ggml_norm(ctx0, inpL, hparams.eps);
-
-            // cur = ln_1_g*cur + ln_1_b
-            // [ 768, N]
-            cur = ggml_add(ctx0,
-                    ggml_mul(ctx0,
-                        cur,
-                        model.layers[il].ln_1_g),
-                    model.layers[il].ln_1_b);
-        }
-
-        // attn
-        // [2304,        768] - model.layers[il].c_attn_attn_w
-        // [2304,          1] - model.layers[il].c_attn_attn_b
-        // [ 768,   n_tokens] - cur (in)
-        // [2304,   n_tokens] - cur (out)
-        //
-        // cur = attn_w*cur + attn_b
-        // [2304, n_tokens]
-        {
-            cur = ggml_mul_mat(ctx0,
-                    model.layers[il].c_attn_attn_w,
-                    cur);
-
-            cur = ggml_add(ctx0,
-                    cur,
-                    model.layers[il].c_attn_attn_b);
-        }
-
-        // self-attention
-        {
-            struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd);
-            struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 1*sizeof(float)*n_embd);
-            struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*n_embd);
-
-            // store key and value to memory
-            if (n_tokens >= 1) {
-                struct ggml_tensor * k = ggml_view_1d(ctx0, model.kv_cache.k, n_tokens*n_embd, (ggml_element_size(model.kv_cache.k)*n_embd)*(il*n_ctx + kv_head));
-                struct ggml_tensor * v = ggml_view_1d(ctx0, model.kv_cache.v, n_tokens*n_embd, (ggml_element_size(model.kv_cache.v)*n_embd)*(il*n_ctx + kv_head));
-
-                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
-                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
-            }
-
-            // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
-            // [64, N, 12]
-            struct ggml_tensor * Q =
-                ggml_permute(ctx0,
-                        ggml_cpy(ctx0,
-                            Qcur,
-                            ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, n_tokens)),
-                        0, 2, 1, 3);
-
-            // K = Kmem.view(n_embd/n_head, n_head, n_kv).permute(0, 2, 1, 3)
-            // [64, n_kv, 12]
-            struct ggml_tensor * K =
-                ggml_permute(ctx0,
-                        ggml_reshape_3d(ctx0,
-                            ggml_view_1d(ctx0, model.kv_cache.k, n_kv*n_embd, il*n_ctx*ggml_element_size(model.kv_cache.k)*n_embd),
-                            n_embd/n_head, n_head, n_kv),
-                        0, 2, 1, 3);
-
-            // GG: flash attention
-            //struct ggml_tensor * V =
-            //    ggml_cpy(ctx0,
-            //            ggml_permute(ctx0,
-            //                ggml_reshape_3d(ctx0,
-            //                    ggml_view_1d(ctx0, model.kv_cache.v, n_kv*n_embd, il*n_ctx*ggml_element_size(model.kv_cache.v)*n_embd),
-            //                    n_embd/n_head, n_head, n_kv),
-            //                1, 2, 0, 3),
-            //            ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_embd/n_head, n_head));
-
-            //struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true);
-
-            // K * Q
-            // [n_kv, n_tokens, 12]
-            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
-
-            // KQ_scaled = KQ / sqrt(n_embd/n_head)
-            // [n_kv, n_tokens, 12]
-            struct ggml_tensor * KQ_scaled =
-                ggml_scale(ctx0,
-                        KQ,
-                        1.0f/sqrtf(float(n_embd)/n_head));
-
-            // KQ_masked = mask_past(KQ_scaled)
-            // [n_kv, n_tokens, 12]
-            struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
-
-            // KQ = soft_max(KQ_masked)
-            // [n_kv, N, 12]
-            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
-
-            // V_trans = Vmem.view(n_embd/n_head, n_head, n_kv).permute(1, 2, 0, 3).contiguous()
-            // [n_kv, 64, 12]
-            struct ggml_tensor * V_trans =
-                ggml_cpy(ctx0,
-                        ggml_permute(ctx0,
-                            ggml_reshape_3d(ctx0,
-                                ggml_view_1d(ctx0, model.kv_cache.v, n_kv*n_embd, il*n_ctx*ggml_element_size(model.kv_cache.v)*n_embd),
-                                n_embd/n_head, n_head, n_kv),
-                            1, 2, 0, 3),
-                        ggml_new_tensor_3d(ctx0, model.kv_cache.v->type, n_kv, n_embd/n_head, n_head));
-
-            // KQV = transpose(V) * KQ_soft_max
-            // [64, n_tokens, 12]
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
-
-            // KQV_merged = KQV.permute(0, 2, 1, 3)
-            // [64, 12, n_tokens]
-            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
-
-            // cur = KQV_merged.contiguous().view(n_embd, N)
-            // [768, n_tokens]
-            cur = ggml_cpy(ctx0,
-                    KQV_merged,
-                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens));
-        }
-
-        // projection
-        // [ 768, 768] - model.layers[il].c_attn_proj_w
-        // [ 768,   1] - model.layers[il].c_attn_proj_b
-        // [ 768,   N] - cur (in)
-        // [ 768,   N] - cur (out)
-        //
-        // cur = proj_w*cur + proj_b
-        // [768, N]
-        {
-            cur = ggml_mul_mat(ctx0,
-                    model.layers[il].c_attn_proj_w,
-                    cur);
-
-            cur = ggml_add(ctx0,
-                    cur,
-                    model.layers[il].c_attn_proj_b);
-        }
-
-        // add the input
-        cur = ggml_add(ctx0, cur, inpL);
-
-        struct ggml_tensor * inpFF = cur;
-
-        // feed-forward network
-        {
-            // norm
-            {
-                cur = ggml_norm(ctx0, inpFF, hparams.eps);
-
-                // cur = ln_2_g*cur + ln_2_b
-                // [ 768, N]
-                cur = ggml_add(ctx0,
-                        ggml_mul(ctx0,
-                            cur,
-                            model.layers[il].ln_2_g),
-                        model.layers[il].ln_2_b);
-            }
-
-            // fully connected
-            // [3072, 768] - model.layers[il].c_mlp_fc_w
-            // [3072,   1] - model.layers[il].c_mlp_fc_b
-            // [ 768,   N] - cur (in)
-            // [3072,   N] - cur (out)
-            //
-            // cur = fc_w*cur + fc_b
-            // [3072, N]
-            cur = ggml_mul_mat(ctx0,
-                    model.layers[il].c_mlp_fc_w,
-                    cur);
-
-            cur = ggml_add(ctx0,
-                    cur,
-                    model.layers[il].c_mlp_fc_b);
-
-            // GELU activation
-            // [3072, N]
-            cur = ggml_gelu(ctx0, cur);
-
-            // projection
-            // [ 768, 3072] - model.layers[il].c_mlp_proj_w
-            // [ 768,    1] - model.layers[il].c_mlp_proj_b
-            // [3072,    N] - cur (in)
-            // [ 768,    N] - cur (out)
-            //
-            // cur = proj_w*cur + proj_b
-            // [768, N]
-            cur = ggml_mul_mat(ctx0,
-                    model.layers[il].c_mlp_proj_w,
-                    cur);
-
-            cur = ggml_add(ctx0,
-                    cur,
-                    model.layers[il].c_mlp_proj_b);
-        }
-
-        // input for next layer
-        inpL = ggml_add(ctx0, cur, inpFF);
-    }
-
-    // norm
-    {
-        // [ 768, N]
-        inpL = ggml_norm(ctx0, inpL, hparams.eps);
-
-        // inpL = ln_f_g*inpL + ln_f_b
-        // [ 768, N]
-        inpL = ggml_add(ctx0,
-                ggml_mul(ctx0,
-                    inpL,
-                    model.ln_f_g),
-                model.ln_f_b);
-    }
-
-    // inpL = WTE * inpL
-    // [ 768, 50257] - model.lm_head
-    // [ 768, N]     - inpL
-    inpL = ggml_mul_mat(ctx0, model.lm_head, inpL);
-
-    // logits -> probs
-    //inpL = ggml_soft_max(ctx0, inpL);
-
-    ggml_build_forward_expand(gf, inpL);
-
-    ggml_free(ctx0);
-
-    return gf;
-}
-
-static void gpt2_kv_cache_seq_cp(
-        struct gpt2_kv_cache & cache,
-                 gpt2_seq_id   seq_id_src,
-                 gpt2_seq_id   seq_id_dst,
-                    gpt2_pos   p0,
-                    gpt2_pos   p1) {
-    if (p0 < 0) p0 = 0;
-    if (p1 < 0) p1 = std::numeric_limits<gpt2_pos>::max();
-
-    for (uint32_t i = 0; i < cache.size; ++i) {
-        if (cache.cells[i].has_seq_id(seq_id_src) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
-            cache.cells[i].seq_id.insert(seq_id_dst);
-        }
-    }
-}
-
-struct gpt2_batch gpt2_batch_init(int32_t n_tokens, int32_t embd) {
-    gpt2_batch batch;
-
-    if (embd) {
-        batch.embd = (float *) malloc(sizeof(float) * n_tokens * embd);
-    } else {
-        batch.token = (gpt_vocab::id *) malloc(sizeof(gpt_vocab::id) * n_tokens);
-    }
-
-    batch.pos    = (gpt2_pos *)    malloc(sizeof(gpt2_pos)    * n_tokens);
-    batch.seq_id = (gpt2_seq_id *) malloc(sizeof(gpt2_seq_id) * n_tokens);
-    batch.logits = (int8_t *)      malloc(sizeof(int8_t)      * n_tokens);
-
-    return batch;
-}
-
-void gpt2_batch_free(struct gpt2_batch batch) {
-    if (batch.token)  free(batch.token);
-    if (batch.embd)   free(batch.embd);
-    if (batch.pos)    free(batch.pos);
-    if (batch.seq_id) free(batch.seq_id);
-    if (batch.logits) free(batch.logits);
-}
-
-// Positive return values does not mean a fatal error, but rather a warning.
-//   0 - success
-// < 0 - error
-int gpt2_decode(
-        struct gpt2_model  & model,
-        struct ggml_allocr * allocr,
-        struct gpt2_batch    batch,
-        int                  n_threads,
-        std::vector<float> & logits) {
-    const int32_t n_tokens = batch.n_tokens;
-    const auto &  hparams  = model.hparams;
-    const int     n_vocab  = hparams.n_vocab;
-
-    if (n_tokens == 0) {
-        printf("%s: n_tokens == 0", __func__);
-        return -1;
-    }
-
-    GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd));
-
-    auto & cache = model.kv_cache;
-
-    for (int i = 0; i < n_tokens; i++) {
-        cache.cells[cache.head + i].pos = batch.pos[i];
-        cache.cells[cache.head + i].seq_id.insert(batch.seq_id[i]);
-    }
-
-    cache.n = cache.head + n_tokens;
-
-    // reset the allocator to free all the memory allocated during the previous inference
-    ggml_allocr_reset(allocr);
-
-    struct ggml_cgraph * gf = gpt2_graph(model, allocr, batch);
-
-    // allocate tensors
-    ggml_allocr_alloc_graph(allocr, gf);
-
-    // run the computation
-    if (ggml_backend_is_cpu(model.backend)) {
-        ggml_backend_cpu_set_n_threads(model.backend, n_threads);
-    }
-#ifdef GGML_USE_METAL
-    if (ggml_backend_is_metal(model.backend)) {
-        ggml_backend_metal_set_n_cb(model.backend, n_threads);
-    }
-#endif
-    ggml_backend_graph_compute(model.backend, gf);
-
-    //if (n_past%100 == 0) {
-    //    ggml_graph_print   (&gf);
-    //    ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
-    //}
-
-    // in this case, the output tensor is the last one in the graph
-    struct ggml_tensor * inpL = gf->nodes[gf->n_nodes - 1];
-
-    if (batch.logits) {
-        // return logits for all tokens
-        logits.resize(n_vocab*n_tokens);
-        for (int32_t i = 0; i < n_tokens; i++) {
-            if (batch.logits[i] == 0) {
-                continue;
-            }
-            ggml_backend_tensor_get(inpL, logits.data() + n_vocab*i, n_vocab*i*sizeof(float), sizeof(float)*n_vocab);
-        }
-    } else {
-        // return result just for the last token
-        logits.resize(n_vocab);
-        ggml_backend_tensor_get(inpL, logits.data(), (n_vocab*(n_tokens-1))*sizeof(float), sizeof(float)*n_vocab);
-    }
-
-    // update the kv ring buffer
-    cache.head += n_tokens;
-
-    // ensure kv cache head points to a valid index.
-    if (cache.head >= cache.size) {
-        printf("%s: cache.head >= cache.size\n", __func__);
-        return -2;
-    }
-
-    return 0;
-}
-
-int main(int argc, char ** argv) {
-    ggml_time_init();
-
-    const int64_t t_main_start_us = ggml_time_us();
-
-    gpt_params params;
-
-    if (gpt_params_parse(argc, argv, params) == false) {
-        return 1;
-    }
-
-    if (params.seed < 0) {
-        params.seed = time(NULL);
-    }
-
-    printf("%s: seed = %d\n", __func__, params.seed);
-
-    std::mt19937 rng(params.seed);
-    if (params.prompt.empty()) {
-        params.prompt = gpt_random_prompt(rng);
-    }
-
-    int64_t t_load_us = 0;
-
-    gpt_vocab vocab;
-    gpt2_model model;
-
-    // load the model
-    {
-        const int64_t t_start_us = ggml_time_us();
-
-        if (!gpt2_model_load(params.model, model, vocab, params.n_ctx, params.n_gpu_layers)) {
-            fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
-            return 1;
-        }
-
-        t_load_us = ggml_time_us() - t_start_us;
-
-        test_gpt_tokenizer(vocab, params.token_test);
-    }
-
-    // tokenize the prompt
-    std::vector<gpt_vocab::id> embd_inp = ::gpt_tokenize(vocab, params.prompt);
-
-    // keep this buffer alive while evaluating the model
-    ggml_backend_buffer_t buf_compute;
-
-    const int n_parallel = params.n_parallel;
-    const int n_batch_max = std::max(embd_inp.size(), (size_t)n_parallel);
-
-    // create a gpt2_batch
-    // we use this object to submit token data for decoding
-    gpt2_batch batch = gpt2_batch_init(n_batch_max, 0);
-
-    // prepare required memory and allocate the compute buffer
-    struct ggml_allocr * allocr = NULL;
-    {
-        // create an allocator to measure the memory usage
-        allocr = ggml_allocr_new_measure_from_backend(model.backend);
-
-        batch.n_tokens = n_batch_max;
-
-        // create the worst case graph for memory usage estimation
-        struct ggml_cgraph * gf = gpt2_graph(model, allocr, batch);
-
-        // compute the required memory
-        size_t mem_size = ggml_allocr_alloc_graph(allocr, gf);
-
-        // recreate the allocator with the required memory
-        ggml_allocr_free(allocr);
-        buf_compute = ggml_backend_alloc_buffer(model.backend, mem_size);
-        allocr = ggml_allocr_new_from_buffer(buf_compute);
-
-        fprintf(stderr, "%s: compute buffer size: %.2f MB\n", __func__, mem_size/1024.0/1024.0);
-    }
-
-    int64_t t_sample_us  = 0;
-    int64_t t_predict_us = 0;
-
-    std::vector<float> logits;
-
-    // evaluate the initial prompt
-    batch.n_tokens = embd_inp.size();
-
-    for (int32_t i = 0; i < batch.n_tokens; i++) {
-        batch.token[i]  = embd_inp[i];
-        batch.pos[i]    = i;
-        batch.seq_id[i] = 0;
-        batch.logits[i] = false;
-    }
-
-    // gpt2_decode will output logits only for the last token of the prompt
-    batch.logits[batch.n_tokens - 1] = true;
-
-    if (gpt2_decode(model, allocr, batch, params.n_threads, logits) != 0) {
-        printf("%s: gpt2_decode() failed\n", __func__);
-        return 1;
-    }
-
-    // assign the system KV cache to all parallel sequences
-    // this way, the parallel sequences will "reuse" the prompt tokens without having to copy them
-    for (int32_t i = 1; i < n_parallel; ++i) {
-        gpt2_kv_cache_seq_cp(model.kv_cache, 0, i, 0, batch.n_tokens);
-    }
-
-    if (n_parallel > 1) {
-        printf("\n\n%s: generating %d sequences ...\n", __func__, n_parallel);
-    }
-
-    params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());
-
-    printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
-    printf("%s: number of tokens in prompt = %zu, first 8 tokens: ", __func__, embd_inp.size());
-    for (int i = 0; i < std::min(8, (int) embd_inp.size()); i++) {
-        printf("%d ", embd_inp[i]);
-    }
-    printf("\n\n");
-
-    std::vector<gpt_vocab::token> streams(n_parallel);
-
-    // remember the batch index of the last token for each parallel sequence
-    // we need this to determine which logits to sample from
-    std::vector<int32_t> i_batch(n_parallel, batch.n_tokens - 1);
-
-    int n_cur     = batch.n_tokens;
-    int n_len     = batch.n_tokens + params.n_predict;
-    int n_decoded = 0;
-
-    const int   n_vocab = model.hparams.n_vocab;
-    const int   top_k = params.top_k;
-    const float top_p = params.top_p;
-    const float temp  = params.temp;
-
-    while (n_cur < n_len) {
-        batch.n_tokens = 0;
-
-        for (int32_t i = 0; i < n_parallel; ++i) {
-            if (i_batch[i] < 0) {
-                // the stream has already finished
-                continue;
-            }
-
-            auto * logits_i = logits.data() + i_batch[i]*n_vocab;
-
-            gpt_vocab::id id = 0;
-            {
-                const int64_t t_start_sample_us = ggml_time_us();
-
-                id = gpt_sample_top_k_top_p(vocab, logits_i, top_k, top_p, temp, rng);
-
-                t_sample_us += ggml_time_us() - t_start_sample_us;
-            }
-
-            // is it an end of stream? -> mark the stream as finished
-            if ((!params.ignore_eos && id == 50256) || n_cur == n_len - 1) {
-                i_batch[i] = -1;
-                printf("\n");
-                if (n_parallel > 1) {
-                    printf("%s: stream %d finished at n_cur = %d", __func__, i, n_cur);
-                }
-
-                continue;
-            }
-
-            auto& token = vocab.id_to_token[id];
-            if (n_parallel == 1) {
-                printf("%s", token.c_str());
-                fflush(stdout);
-            }
-
-            streams[i] += token;
-
-            // push this new token for next evaluation
-            batch.token [batch.n_tokens] = id;
-            batch.pos   [batch.n_tokens] = n_cur;
-            batch.seq_id[batch.n_tokens] = i;
-            batch.logits[batch.n_tokens] = true;
-
-            i_batch[i] = batch.n_tokens;
-
-            batch.n_tokens += 1;
-
-            n_decoded += 1;
-        }
-
-        // all streams are finished
-        if (batch.n_tokens == 0) {
-            break;
-        }
-
-        n_cur += 1;
-
-        {
-            const int64_t t_start_us = ggml_time_us();
-
-            // evaluate the current batch with the transformer model
-            int ret_code = gpt2_decode(model, allocr, batch, params.n_threads, logits);
-            if (ret_code != 0) {
-                fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, ret_code);
-                return 1;
-            }
-
-            t_predict_us += ggml_time_us() - t_start_us;
-        }
-    }
-
-    if (n_parallel > 1) {
-        printf("\n");
-
-        for (int32_t i = 0; i < n_parallel; ++i) {
-            printf("sequence %d:\n\n%s%s\n\n", i, params.prompt.c_str(), streams[i].c_str());
-        }
-    }
-
-    // report timing
-    {
-        const int64_t t_main_end_us = ggml_time_us();
-
-        printf("\n\n");
-        printf("%s:     n_decoded = %8d\n",      __func__, n_decoded);
-        printf("%s:     load time = %8.2f ms\n", __func__, t_load_us/1000.0f);
-        printf("%s:   sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f);
-        printf("%s:  predict time = %8.2f ms\n", __func__, t_predict_us/1000.0f);
-        printf("%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
-    }
-
-    gpt2_batch_free(batch);
-    ggml_free(model.ctx);
-
-    ggml_backend_buffer_free(model.buffer_w);
-    ggml_backend_buffer_free(model.kv_cache.buffer);
-    ggml_backend_buffer_free(buf_compute);
-    ggml_backend_free(model.backend);
-
-    return 0;
-}
diff --git a/ggml/examples/gpt-2/main-ctx.cpp b/ggml/examples/gpt-2/main-ctx.cpp
deleted file mode 100644
index 2c075f3..0000000
--- a/ggml/examples/gpt-2/main-ctx.cpp
+++ /dev/null
@@ -1,840 +0,0 @@
-#include "ggml/ggml.h"
-
-#include "common.h"
-#include "common-ggml.h"
-
-#include <cassert>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <fstream>
-#include <map>
-#include <string>
-#include <vector>
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-// default hparams (GPT-2 117M)
-struct gpt2_hparams {
-    int32_t n_vocab = 50257;
-    int32_t n_ctx   = 1024;
-    int32_t n_embd  = 768;
-    int32_t n_head  = 12;
-    int32_t n_layer = 12;
-    int32_t ftype   = 1;
-    float   eps     = 1e-5f;
-};
-
-struct gpt2_layer {
-    // normalization
-    struct ggml_tensor * ln_1_g;
-    struct ggml_tensor * ln_1_b;
-
-    struct ggml_tensor * ln_2_g;
-    struct ggml_tensor * ln_2_b;
-
-    // attention
-    struct ggml_tensor * c_attn_attn_w;
-    struct ggml_tensor * c_attn_attn_b;
-
-    struct ggml_tensor * c_attn_proj_w;
-    struct ggml_tensor * c_attn_proj_b;
-
-    // mlp
-    struct ggml_tensor * c_mlp_fc_w;
-    struct ggml_tensor * c_mlp_fc_b;
-
-    struct ggml_tensor * c_mlp_proj_w;
-    struct ggml_tensor * c_mlp_proj_b;
-};
-
-struct gpt2_model {
-    gpt2_hparams hparams;
-
-    // normalization
-    struct ggml_tensor * ln_f_g;
-    struct ggml_tensor * ln_f_b;
-
-    struct ggml_tensor * wte;     // position embedding
-    struct ggml_tensor * wpe;     //    token embedding
-    struct ggml_tensor * lm_head; // language model head
-
-    std::vector<gpt2_layer> layers;
-
-    // key + value memory
-    struct ggml_tensor * memory_k;
-    struct ggml_tensor * memory_v;
-
-    //
-    struct ggml_context * ctx;
-    std::map<std::string, struct ggml_tensor *> tensors;
-};
-
-// load the model's weights from a file
-bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab) {
-    printf("%s: loading model from '%s'\n", __func__, fname.c_str());
-
-    auto fin = std::ifstream(fname, std::ios::binary);
-    if (!fin) {
-        fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
-        return false;
-    }
-
-    // verify magic
-    {
-        uint32_t magic;
-        fin.read((char *) &magic, sizeof(magic));
-        if (magic != GGML_FILE_MAGIC) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
-            return false;
-        }
-    }
-
-    // load hparams
-    {
-        auto & hparams = model.hparams;
-
-        fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
-        fin.read((char *) &hparams.n_ctx,   sizeof(hparams.n_ctx));
-        fin.read((char *) &hparams.n_embd,  sizeof(hparams.n_embd));
-        fin.read((char *) &hparams.n_head,  sizeof(hparams.n_head));
-        fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-        fin.read((char *) &hparams.ftype,   sizeof(hparams.ftype));
-
-        const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
-
-        printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
-        printf("%s: n_ctx   = %d\n", __func__, hparams.n_ctx);
-        printf("%s: n_embd  = %d\n", __func__, hparams.n_embd);
-        printf("%s: n_head  = %d\n", __func__, hparams.n_head);
-        printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
-        printf("%s: ftype   = %d\n", __func__, hparams.ftype);
-        printf("%s: qntvr   = %d\n", __func__, qntvr);
-
-        hparams.ftype %= GGML_QNT_VERSION_FACTOR;
-    }
-
-    // load vocab
-    {
-        int32_t n_vocab = 0;
-        fin.read((char *) &n_vocab, sizeof(n_vocab));
-
-        if (n_vocab != model.hparams.n_vocab) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
-                    __func__, fname.c_str(), n_vocab, model.hparams.n_vocab);
-            return false;
-        }
-
-        std::string word;
-        std::vector<char> buf(128);
-
-        for (int i = 0; i < n_vocab; i++) {
-            uint32_t len;
-            fin.read((char *) &len, sizeof(len));
-
-            buf.resize(len);
-            fin.read((char *) buf.data(), len);
-            word.assign(buf.data(), len);
-
-            vocab.token_to_id[word] = i;
-            vocab.id_to_token[i] = word;
-        }
-    }
-
-    // for the big tensors, we have the option to store the data in 16-bit floats or quantized
-    // in order to save memory and also to speed up the computation
-    ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
-    if (wtype == GGML_TYPE_COUNT) {
-        fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
-                __func__, fname.c_str(), model.hparams.ftype);
-        return false;
-    }
-
-    auto & ctx = model.ctx;
-
-    size_t ctx_size = 0;
-
-    {
-        const auto & hparams = model.hparams;
-
-        const int n_embd  = hparams.n_embd;
-        const int n_layer = hparams.n_layer;
-        const int n_ctx   = hparams.n_ctx;
-        const int n_vocab = hparams.n_vocab;
-
-        ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_g
-        ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_b
-
-        ctx_size += ggml_row_size(wtype,         n_vocab*n_embd); // wte
-        ctx_size += ggml_row_size(GGML_TYPE_F32,   n_ctx*n_embd); // wpe
-        ctx_size += ggml_row_size(wtype,         n_vocab*n_embd); // lm_head
-
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_g
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_b
-
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_g
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_b
-
-        ctx_size += n_layer*(ggml_row_size(wtype,         3*n_embd*n_embd)); // c_attn_attn_w
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 3*n_embd));        // c_attn_attn_b
-
-        ctx_size += n_layer*(ggml_row_size(wtype,         n_embd*n_embd));   // c_attn_proj_w
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd));          // c_attn_proj_b
-
-        ctx_size += n_layer*(ggml_row_size(wtype,         4*n_embd*n_embd)); // c_mlp_fc_w
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 4*n_embd));        // c_mlp_fc_b
-
-        ctx_size += n_layer*(ggml_row_size(wtype,         4*n_embd*n_embd)); // c_mlp_proj_w
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 4*n_embd));        // c_mlp_proj_b
-
-        ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_k
-        ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_v
-
-        ctx_size += (6 + 12*n_layer)*512; // object overhead
-
-        printf("%s: ggml tensor size = %d bytes\n", __func__, (int) sizeof(ggml_tensor));
-        printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
-    }
-
-    // create the ggml context
-    {
-        struct ggml_init_params params = {
-            /*.mem_size   =*/ ctx_size,
-            /*.mem_buffer =*/ NULL,
-            /*.no_alloc   =*/ false,
-        };
-
-        model.ctx = ggml_init(params);
-        if (!model.ctx) {
-            fprintf(stderr, "%s: ggml_init() failed\n", __func__);
-            return false;
-        }
-    }
-
-    // prepare memory for the weights
-    {
-        const auto & hparams = model.hparams;
-
-        const int n_embd  = hparams.n_embd;
-        const int n_layer = hparams.n_layer;
-        const int n_ctx   = hparams.n_ctx;
-        const int n_vocab = hparams.n_vocab;
-
-        model.layers.resize(n_layer);
-
-        model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-        model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-
-        model.wte     = ggml_new_tensor_2d(ctx, wtype,         n_embd, n_vocab);
-        model.wpe     = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx);
-        model.lm_head = ggml_new_tensor_2d(ctx, wtype,         n_embd, n_vocab);
-
-        // map by name
-        model.tensors["model/ln_f/g"] = model.ln_f_g;
-        model.tensors["model/ln_f/b"] = model.ln_f_b;
-
-        model.tensors["model/wte"]     = model.wte;
-        model.tensors["model/wpe"]     = model.wpe;
-        model.tensors["model/lm_head"] = model.lm_head;
-
-        for (int i = 0; i < n_layer; ++i) {
-            auto & layer = model.layers[i];
-
-            layer.ln_1_g        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-            layer.ln_1_b        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            layer.ln_2_g        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-            layer.ln_2_b        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype,           n_embd, 3*n_embd);
-            layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd);
-
-            layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype,           n_embd, n_embd);
-            layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            layer.c_mlp_fc_w    = ggml_new_tensor_2d(ctx, wtype,           n_embd, 4*n_embd);
-            layer.c_mlp_fc_b    = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
-
-            layer.c_mlp_proj_w  = ggml_new_tensor_2d(ctx, wtype,         4*n_embd, n_embd);
-            layer.c_mlp_proj_b  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            // map by name
-            model.tensors["model/h" + std::to_string(i) + "/ln_1/g"]        = layer.ln_1_g;
-            model.tensors["model/h" + std::to_string(i) + "/ln_1/b"]        = layer.ln_1_b;
-
-            model.tensors["model/h" + std::to_string(i) + "/ln_2/g"]        = layer.ln_2_g;
-            model.tensors["model/h" + std::to_string(i) + "/ln_2/b"]        = layer.ln_2_b;
-
-            model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/w"] = layer.c_attn_attn_w;
-            model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/b"] = layer.c_attn_attn_b;
-
-            model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/w"] = layer.c_attn_proj_w;
-            model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/b"] = layer.c_attn_proj_b;
-
-            model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"]    = layer.c_mlp_fc_w;
-            model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"]    = layer.c_mlp_fc_b;
-
-            model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"]  = layer.c_mlp_proj_w;
-            model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"]  = layer.c_mlp_proj_b;
-        }
-    }
-
-    // key + value memory
-    {
-        const auto & hparams = model.hparams;
-
-        const int n_embd  = hparams.n_embd;
-        const int n_layer = hparams.n_layer;
-        const int n_ctx   = hparams.n_ctx;
-
-        const int n_mem      = n_layer*n_ctx;
-        const int n_elements = n_embd*n_mem;
-
-        model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
-        model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
-
-        const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
-
-        printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem);
-    }
-
-    // load weights
-    {
-        size_t total_size = 0;
-
-        bool has_lm_head = false;
-
-        while (true) {
-            int32_t n_dims;
-            int32_t length;
-            int32_t ttype;
-
-            fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
-            fin.read(reinterpret_cast<char *>(&length), sizeof(length));
-            fin.read(reinterpret_cast<char *>(&ttype),  sizeof(ttype));
-
-            if (fin.eof()) {
-                break;
-            }
-
-            int32_t nelements = 1;
-            int32_t ne[2] = { 1, 1 };
-            for (int i = 0; i < n_dims; ++i) {
-                fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
-                nelements *= ne[i];
-            }
-
-            std::string name(length, 0);
-            fin.read(&name[0], length);
-
-            if (model.tensors.find(name) == model.tensors.end()) {
-                fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str());
-                return false;
-            }
-
-            auto tensor = model.tensors[name];
-            if (ggml_nelements(tensor) != nelements) {
-                fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.c_str());
-                return false;
-            }
-
-            if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
-                fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
-                        __func__, name.c_str(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]);
-                return false;
-            }
-
-            // for debugging
-            if (0) {
-                printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
-            }
-
-            const size_t bpe = ggml_type_size(ggml_type(ttype));
-
-            if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
-                fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
-                        __func__, name.c_str(), ggml_nbytes(tensor), nelements*bpe);
-                return false;
-            }
-
-            fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
-
-            // GPT-2 models share the WTE tensor as the LM head
-            if (name == "model/wte" && has_lm_head == false) {
-                memcpy(model.lm_head->data, tensor->data, ggml_nbytes(tensor));
-            }
-
-            if (name == "model/lm_head") {
-                has_lm_head = true;
-            }
-
-            total_size += ggml_nbytes(tensor);
-        }
-
-        printf("%s: model size  = %8.2f MB\n", __func__, total_size/1024.0/1024.0);
-    }
-
-    fin.close();
-
-    return true;
-}
-
-// evaluate the transformer
-//
-//   - model:     the model
-//   - n_threads: number of threads to use
-//   - n_past:    the context size so far
-//   - embd_inp:  the embeddings of the tokens in the context
-//   - embd_w:    the predicted logits for the next token
-//
-bool gpt2_eval(
-        const gpt2_model & model,
-        const int n_threads,
-        const int n_past,
-        const std::vector<gpt_vocab::id> & embd_inp,
-              std::vector<float>         & embd_w,
-              size_t                     & mem_per_token) {
-    const int N = embd_inp.size();
-
-    const auto & hparams = model.hparams;
-
-    const int n_embd  = hparams.n_embd;
-    const int n_layer = hparams.n_layer;
-    const int n_ctx   = hparams.n_ctx;
-    const int n_head  = hparams.n_head;
-    const int n_vocab = hparams.n_vocab;
-
-    static size_t buf_size = 256u*1024*1024;
-    static void * buf = malloc(buf_size);
-
-    if (mem_per_token > 0 && mem_per_token*N > buf_size) {
-        const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
-        //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
-
-        // reallocate
-        buf_size = buf_size_new;
-        buf = realloc(buf, buf_size);
-        if (buf == nullptr) {
-            fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
-            return false;
-        }
-    }
-
-    struct ggml_init_params params = {
-        /*.mem_size   =*/ buf_size,
-        /*.mem_buffer =*/ buf,
-        /*.no_alloc   =*/ false,
-    };
-
-    struct ggml_context * ctx0 = ggml_init(params);
-    struct ggml_cgraph * gf = ggml_new_graph(ctx0);
-
-    struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
-    memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
-
-    struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
-    for (int i = 0; i < N; ++i) {
-        ((int32_t *) position->data)[i] = n_past + i;
-    }
-
-    // wte + wpe
-    struct ggml_tensor * inpL =
-        ggml_add(ctx0,
-                ggml_get_rows(ctx0, model.wte, embd),
-                ggml_get_rows(ctx0, model.wpe, position));
-
-    for (int il = 0; il < n_layer; ++il) {
-        struct ggml_tensor * cur;
-
-        // norm
-        {
-            // [ 768, N]
-            cur = ggml_norm(ctx0, inpL, hparams.eps);
-
-            // cur = ln_1_g*cur + ln_1_b
-            // [ 768, N]
-            cur = ggml_add(ctx0,
-                    ggml_mul(ctx0,
-                        ggml_repeat(ctx0, model.layers[il].ln_1_g, cur),
-                        cur),
-                    ggml_repeat(ctx0, model.layers[il].ln_1_b, cur));
-        }
-
-        // attn
-        // [2304, 768] - model.layers[il].c_attn_attn_w
-        // [2304,   1] - model.layers[il].c_attn_attn_b
-        // [ 768,   N] - cur (in)
-        // [2304,   N] - cur (out)
-        //
-        // cur = attn_w*cur + attn_b
-        // [2304, N]
-        {
-            cur = ggml_mul_mat(ctx0,
-                    model.layers[il].c_attn_attn_w,
-                    cur);
-
-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, model.layers[il].c_attn_attn_b, cur),
-                    cur);
-        }
-
-        // self-attention
-        {
-            struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
-            struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd);
-            struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd);
-
-            // store key and value to memory
-            if (N >= 1) {
-                struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
-                struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past));
-
-                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
-                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
-            }
-
-            // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
-            // [64, N, 12]
-            struct ggml_tensor * Q =
-                ggml_permute(ctx0,
-                        ggml_cpy(ctx0,
-                            Qcur,
-                            ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
-                        0, 2, 1, 3);
-
-            // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
-            // [64, n_past + N, 12]
-            struct ggml_tensor * K =
-                ggml_permute(ctx0,
-                        ggml_reshape_3d(ctx0,
-                            ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
-                            n_embd/n_head, n_head, n_past + N),
-                        0, 2, 1, 3);
-
-            // GG: flash attention
-            //struct ggml_tensor * V =
-            //    ggml_cpy(ctx0,
-            //            ggml_permute(ctx0,
-            //                ggml_reshape_3d(ctx0,
-            //                    ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
-            //                    n_embd/n_head, n_head, n_past + N),
-            //                1, 2, 0, 3),
-            //            ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head));
-
-            //struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true);
-
-            // K * Q
-            // [n_past + N, N, 12]
-            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
-
-            // KQ_scaled = KQ / sqrt(n_embd/n_head)
-            // [n_past + N, N, 12]
-            struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, 1.0f/sqrt(float(n_embd)/n_head));
-
-            // KQ_masked = mask_past(KQ_scaled)
-            // [n_past + N, N, 12]
-            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
-
-            // KQ = soft_max(KQ_masked)
-            // [n_past + N, N, 12]
-            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
-
-            // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
-            // [n_past + N, 64, 12]
-            struct ggml_tensor * V_trans =
-                ggml_cpy(ctx0,
-                        ggml_permute(ctx0,
-                            ggml_reshape_3d(ctx0,
-                                ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
-                                n_embd/n_head, n_head, n_past + N),
-                            1, 2, 0, 3),
-                        ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd/n_head, n_head));
-
-            // KQV = transpose(V) * KQ_soft_max
-            // [64, N, 12]
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
-
-            // KQV_merged = KQV.permute(0, 2, 1, 3)
-            // [64, 12, N]
-            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
-
-            // cur = KQV_merged.contiguous().view(n_embd, N)
-            // [768, N]
-            cur = ggml_cpy(ctx0,
-                    KQV_merged,
-                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
-        }
-
-        // projection
-        // [ 768, 768] - model.layers[il].c_attn_proj_w
-        // [ 768,   1] - model.layers[il].c_attn_proj_b
-        // [ 768,   N] - cur (in)
-        // [ 768,   N] - cur (out)
-        //
-        // cur = proj_w*cur + proj_b
-        // [768, N]
-        {
-            cur = ggml_mul_mat(ctx0,
-                    model.layers[il].c_attn_proj_w,
-                    cur);
-
-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, model.layers[il].c_attn_proj_b, cur),
-                    cur);
-        }
-
-        // add the input
-        cur = ggml_add(ctx0, cur, inpL);
-
-        struct ggml_tensor * inpFF = cur;
-
-        // feed-forward network
-        {
-            // norm
-            {
-                cur = ggml_norm(ctx0, inpFF, hparams.eps);
-
-                // cur = ln_2_g*cur + ln_2_b
-                // [ 768, N]
-                cur = ggml_add(ctx0,
-                        ggml_mul(ctx0,
-                            ggml_repeat(ctx0, model.layers[il].ln_2_g, cur),
-                            cur),
-                        ggml_repeat(ctx0, model.layers[il].ln_2_b, cur));
-            }
-
-            // fully connected
-            // [3072, 768] - model.layers[il].c_mlp_fc_w
-            // [3072,   1] - model.layers[il].c_mlp_fc_b
-            // [ 768,   N] - cur (in)
-            // [3072,   N] - cur (out)
-            //
-            // cur = fc_w*cur + fc_b
-            // [3072, N]
-            cur = ggml_mul_mat(ctx0,
-                    model.layers[il].c_mlp_fc_w,
-                    cur);
-
-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur),
-                    cur);
-
-            // GELU activation
-            // [3072, N]
-            cur = ggml_gelu(ctx0, cur);
-
-            // projection
-            // [ 768, 3072] - model.layers[il].c_mlp_proj_w
-            // [ 768,    1] - model.layers[il].c_mlp_proj_b
-            // [3072,    N] - cur (in)
-            // [ 768,    N] - cur (out)
-            //
-            // cur = proj_w*cur + proj_b
-            // [768, N]
-            cur = ggml_mul_mat(ctx0,
-                    model.layers[il].c_mlp_proj_w,
-                    cur);
-
-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur),
-                    cur);
-        }
-
-        // input for next layer
-        inpL = ggml_add(ctx0, cur, inpFF);
-    }
-
-    // norm
-    {
-        // [ 768, N]
-        inpL = ggml_norm(ctx0, inpL, hparams.eps);
-
-        // inpL = ln_f_g*inpL + ln_f_b
-        // [ 768, N]
-        inpL = ggml_add(ctx0,
-                ggml_mul(ctx0,
-                    ggml_repeat(ctx0, model.ln_f_g, inpL),
-                    inpL),
-                ggml_repeat(ctx0, model.ln_f_b, inpL));
-    }
-
-    // inpL = WTE * inpL
-    // [ 768, 50257] - model.lm_head
-    // [ 768, N]     - inpL
-    inpL = ggml_mul_mat(ctx0, model.lm_head, inpL);
-
-    // logits -> probs
-    //inpL = ggml_soft_max_inplace(ctx0, inpL);
-
-    // run the computation
-    ggml_build_forward_expand(gf, inpL);
-    ggml_graph_compute_with_ctx(ctx0, gf, n_threads);
-
-    //if (n_past%100 == 0) {
-    //    ggml_graph_print   (&gf);
-    //    ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
-    //}
-
-    //embd_w.resize(n_vocab*N);
-    //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
-
-    // return result just for the last token
-    embd_w.resize(n_vocab);
-    memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
-
-    if (mem_per_token == 0) {
-        mem_per_token = ggml_used_mem(ctx0)/N;
-    }
-    //printf("used_mem = %zu\n", ggml_used_mem(ctx0));
-
-    ggml_free(ctx0);
-
-    return true;
-}
-
-int main(int argc, char ** argv) {
-    ggml_time_init();
-
-    const int64_t t_main_start_us = ggml_time_us();
-
-    gpt_params params;
-    params.model = "models/gpt-2-117M/ggml-model.bin";
-
-    if (gpt_params_parse(argc, argv, params) == false) {
-        return 1;
-    }
-
-    if (params.seed < 0) {
-        params.seed = time(NULL);
-    }
-
-    printf("%s: seed = %d\n", __func__, params.seed);
-
-    std::mt19937 rng(params.seed);
-    if (params.prompt.empty()) {
-        params.prompt = gpt_random_prompt(rng);
-    }
-
-    int64_t t_load_us = 0;
-
-    gpt_vocab vocab;
-    gpt2_model model;
-
-    // load the model
-    {
-        const int64_t t_start_us = ggml_time_us();
-
-        if (!gpt2_model_load(params.model, model, vocab)) {
-            fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
-            return 1;
-        }
-
-        t_load_us = ggml_time_us() - t_start_us;
-
-        test_gpt_tokenizer(vocab, params.token_test);
-    }
-
-    int n_past = 0;
-
-    int64_t t_sample_us  = 0;
-    int64_t t_predict_us = 0;
-
-    std::vector<float> logits;
-
-    // tokenize the prompt
-    std::vector<gpt_vocab::id> embd_inp = ::gpt_tokenize(vocab, params.prompt);
-
-    params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());
-
-    printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
-    printf("%s: number of tokens in prompt = %zu, first 8 tokens: ", __func__, embd_inp.size());
-    for (int i = 0; i < std::min(8, (int) embd_inp.size()); i++) {
-        printf("%d ", embd_inp[i]);
-    }
-    printf("\n\n");
-
-    // submit the input prompt token-by-token
-    // this reduces the memory usage during inference, at the cost of a bit of speed at the beginning
-    std::vector<gpt_vocab::id> embd;
-
-    // determine the required inference memory per token:
-    size_t mem_per_token = 0;
-    gpt2_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
-
-    for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) {
-        // predict
-        if (embd.size() > 0) {
-            const int64_t t_start_us = ggml_time_us();
-
-            if (!gpt2_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) {
-                printf("Failed to predict\n");
-                return 1;
-            }
-
-            t_predict_us += ggml_time_us() - t_start_us;
-        }
-
-        n_past += embd.size();
-        embd.clear();
-
-        if (i >= embd_inp.size()) {
-            // sample next token
-            const int   top_k = params.top_k;
-            const float top_p = params.top_p;
-            const float temp  = params.temp;
-
-            const int n_vocab = model.hparams.n_vocab;
-
-            gpt_vocab::id id = 0;
-
-            {
-                const int64_t t_start_sample_us = ggml_time_us();
-
-                id = gpt_sample_top_k_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng);
-
-                t_sample_us += ggml_time_us() - t_start_sample_us;
-            }
-
-            // add it to the context
-            embd.push_back(id);
-        } else {
-            // if here, it means we are still processing the input prompt
-            for (size_t k = i; k < embd_inp.size(); k++) {
-                embd.push_back(embd_inp[k]);
-                if (int32_t(embd.size()) >= params.n_batch) {
-                    break;
-                }
-            }
-            i += embd.size() - 1;
-        }
-
-        // display text
-        for (auto id : embd) {
-            printf("%s", vocab.id_to_token[id].c_str());
-        }
-        fflush(stdout);
-
-        // end of text token
-        if (embd.back() == 50256) {
-            break;
-        }
-    }
-
-    // report timing
-    {
-        const int64_t t_main_end_us = ggml_time_us();
-
-        printf("\n\n");
-        printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token);
-        printf("%s:     load time = %8.2f ms\n", __func__, t_load_us/1000.0f);
-        printf("%s:   sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f);
-        printf("%s:  predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past);
-        printf("%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
-    }
-
-    ggml_free(model.ctx);
-
-    return 0;
-}
diff --git a/ggml/examples/gpt-2/main.cpp b/ggml/examples/gpt-2/main.cpp
deleted file mode 100644
index 05ce370..0000000
--- a/ggml/examples/gpt-2/main.cpp
+++ /dev/null
@@ -1,1080 +0,0 @@
-#include "ggml/ggml.h"
-#include "ggml/ggml-alloc.h"
-#include "ggml/ggml-backend.h"
-
-#ifdef GGML_USE_CUBLAS
-#include "ggml-cuda.h"
-#endif
-
-#ifdef GGML_USE_METAL
-#include "ggml-metal.h"
-#endif
-
-#include "common.h"
-#include "common-ggml.h"
-
-#include <cassert>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <fstream>
-#include <map>
-#include <string>
-#include <vector>
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-#define GPT2_MAX_NODES 4096
-
-static void ggml_log_callback_default(ggml_log_level level, const char * text, void * user_data) {
-    (void) level;
-    (void) user_data;
-    fputs(text, stderr);
-    fflush(stderr);
-}
-
-// default hparams (GPT-2 117M)
-struct gpt2_hparams {
-    int32_t n_vocab = 50257;
-    int32_t n_ctx   = 1024;
-    int32_t n_embd  = 768;
-    int32_t n_head  = 12;
-    int32_t n_layer = 12;
-    int32_t ftype   = 1;
-    float   eps     = 1e-5f;
-};
-
-struct gpt2_layer {
-    // normalization
-    struct ggml_tensor * ln_1_g;
-    struct ggml_tensor * ln_1_b;
-
-    struct ggml_tensor * ln_2_g;
-    struct ggml_tensor * ln_2_b;
-
-    // attention
-    struct ggml_tensor * c_attn_attn_w;
-    struct ggml_tensor * c_attn_attn_b;
-
-    struct ggml_tensor * c_attn_proj_w;
-    struct ggml_tensor * c_attn_proj_b;
-
-    // mlp
-    struct ggml_tensor * c_mlp_fc_w;
-    struct ggml_tensor * c_mlp_fc_b;
-
-    struct ggml_tensor * c_mlp_proj_w;
-    struct ggml_tensor * c_mlp_proj_b;
-};
-
-struct gpt2_model {
-    gpt2_hparams hparams;
-
-    // normalization
-    struct ggml_tensor * ln_f_g;
-    struct ggml_tensor * ln_f_b;
-
-    struct ggml_tensor * wte;     // position embedding
-    struct ggml_tensor * wpe;     //    token embedding
-    struct ggml_tensor * lm_head; // language model head
-
-    std::vector<gpt2_layer> layers;
-
-    // key + value memory
-    struct ggml_tensor * memory_k;
-    struct ggml_tensor * memory_v;
-
-    //
-    struct ggml_context * ctx;
-
-    std::vector<ggml_backend_t> backends;
-    std::vector<ggml_backend_buffer_t> buffers_w;
-    ggml_backend_buffer_t buffer_kv;
-    ggml_backend_buffer_t buffer_input;
-
-    std::map<std::string, struct ggml_tensor *> tensors;
-
-    // inputs/constants
-    struct ggml_tensor * embd;
-    struct ggml_tensor * position;
-};
-
-void init_backends(gpt2_model & model, const gpt_params & params) {
-    ggml_backend_t gpu_backend = NULL;
-
-    // initialize the backends
-#ifdef GGML_USE_CUBLAS
-    if (params.n_gpu_layers > 0) {
-        fprintf(stderr, "%s: using CUDA backend\n", __func__);
-        gpu_backend = ggml_backend_cuda_init(0);
-        if (!gpu_backend) {
-            fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
-        }
-    }
-#endif
-
-#ifdef GGML_USE_METAL
-    if (params.n_gpu_layers > 0) {
-        fprintf(stderr, "%s: using Metal backend\n", __func__);
-        ggml_backend_metal_log_set_callback(ggml_log_callback_default, nullptr);
-        gpu_backend = ggml_backend_metal_init();
-        if (!gpu_backend) {
-            fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
-        } else {
-            ggml_backend_metal_set_n_cb(gpu_backend, params.n_threads);
-        }
-    }
-#endif
-    if (gpu_backend) {
-        model.backends.push_back(gpu_backend);
-    }
-
-    // always add the CPU backend as a fallback
-    ggml_backend_t cpu_backend = ggml_backend_cpu_init();
-    ggml_backend_cpu_set_n_threads(cpu_backend, params.n_threads);
-    model.backends.push_back(cpu_backend);
-}
-
-// load the model's weights from a file
-bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab, const gpt_params & params) {
-    printf("%s: loading model from '%s'\n", __func__, fname.c_str());
-
-    auto fin = std::ifstream(fname, std::ios::binary);
-    if (!fin) {
-        fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
-        return false;
-    }
-
-    // verify magic
-    {
-        uint32_t magic;
-        fin.read((char *) &magic, sizeof(magic));
-        if (magic != GGML_FILE_MAGIC) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
-            return false;
-        }
-    }
-
-    // load hparams
-    {
-        auto & hparams = model.hparams;
-
-        fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
-        fin.read((char *) &hparams.n_ctx,   sizeof(hparams.n_ctx));
-        fin.read((char *) &hparams.n_embd,  sizeof(hparams.n_embd));
-        fin.read((char *) &hparams.n_head,  sizeof(hparams.n_head));
-        fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-        fin.read((char *) &hparams.ftype,   sizeof(hparams.ftype));
-
-        const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
-
-        printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
-        printf("%s: n_ctx   = %d\n", __func__, hparams.n_ctx);
-        printf("%s: n_embd  = %d\n", __func__, hparams.n_embd);
-        printf("%s: n_head  = %d\n", __func__, hparams.n_head);
-        printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
-        printf("%s: ftype   = %d\n", __func__, hparams.ftype);
-        printf("%s: qntvr   = %d\n", __func__, qntvr);
-
-        hparams.ftype %= GGML_QNT_VERSION_FACTOR;
-    }
-
-    // load vocab
-    {
-        int32_t n_vocab = 0;
-        fin.read((char *) &n_vocab, sizeof(n_vocab));
-
-        if (n_vocab != model.hparams.n_vocab) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
-                    __func__, fname.c_str(), n_vocab, model.hparams.n_vocab);
-            return false;
-        }
-
-        std::string word;
-        std::vector<char> buf(128);
-
-        for (int i = 0; i < n_vocab; i++) {
-            uint32_t len;
-            fin.read((char *) &len, sizeof(len));
-
-            buf.resize(len);
-            fin.read((char *) buf.data(), len);
-            word.assign(buf.data(), len);
-
-            vocab.token_to_id[word] = i;
-            vocab.id_to_token[i] = word;
-        }
-    }
-
-    // for the big tensors, we have the option to store the data in 16-bit floats or quantized
-    // in order to save memory and also to speed up the computation
-    ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
-    if (wtype == GGML_TYPE_COUNT) {
-        fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
-                __func__, fname.c_str(), model.hparams.ftype);
-        return false;
-    }
-
-    auto & ctx = model.ctx;
-
-    // create the ggml context
-    {
-        size_t n_tensors = 3 /* input */ + 2 /* kv */ + 6 + 12*model.hparams.n_layer;
-        struct ggml_init_params params = {
-            /*.mem_size   =*/ ggml_tensor_overhead() * n_tensors,
-            /*.mem_buffer =*/ NULL,
-            /*.no_alloc   =*/ true,
-        };
-
-        model.ctx = ggml_init(params);
-        if (!model.ctx) {
-            fprintf(stderr, "%s: ggml_init() failed\n", __func__);
-            return false;
-        }
-    }
-
-    // create tensors for the weights
-    {
-        const auto & hparams = model.hparams;
-
-        const int n_embd  = hparams.n_embd;
-        const int n_layer = hparams.n_layer;
-        const int n_ctx   = hparams.n_ctx;
-        const int n_vocab = hparams.n_vocab;
-
-        model.layers.resize(n_layer);
-
-        model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-        model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-
-        model.wte     = ggml_new_tensor_2d(ctx, wtype,         n_embd, n_vocab);
-        model.wpe     = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx);
-        model.lm_head = ggml_new_tensor_2d(ctx, wtype,         n_embd, n_vocab);
-
-        // map by name
-        model.tensors["model/ln_f/g"] = model.ln_f_g;
-        model.tensors["model/ln_f/b"] = model.ln_f_b;
-
-        model.tensors["model/wte"]     = model.wte;
-        model.tensors["model/wpe"]     = model.wpe;
-        model.tensors["model/lm_head"] = model.lm_head;
-
-        for (int i = 0; i < n_layer; ++i) {
-            auto & layer = model.layers[i];
-
-            layer.ln_1_g        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-            layer.ln_1_b        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            layer.ln_2_g        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-            layer.ln_2_b        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype,           n_embd, 3*n_embd);
-            layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd);
-
-            layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype,           n_embd, n_embd);
-            layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            layer.c_mlp_fc_w    = ggml_new_tensor_2d(ctx, wtype,           n_embd, 4*n_embd);
-            layer.c_mlp_fc_b    = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
-
-            layer.c_mlp_proj_w  = ggml_new_tensor_2d(ctx, wtype,         4*n_embd, n_embd);
-            layer.c_mlp_proj_b  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            // map by name
-            model.tensors["model/h" + std::to_string(i) + "/ln_1/g"]        = layer.ln_1_g;
-            model.tensors["model/h" + std::to_string(i) + "/ln_1/b"]        = layer.ln_1_b;
-
-            model.tensors["model/h" + std::to_string(i) + "/ln_2/g"]        = layer.ln_2_g;
-            model.tensors["model/h" + std::to_string(i) + "/ln_2/b"]        = layer.ln_2_b;
-
-            model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/w"] = layer.c_attn_attn_w;
-            model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/b"] = layer.c_attn_attn_b;
-
-            model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/w"] = layer.c_attn_proj_w;
-            model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/b"] = layer.c_attn_proj_b;
-
-            model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"]    = layer.c_mlp_fc_w;
-            model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"]    = layer.c_mlp_fc_b;
-
-            model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"]  = layer.c_mlp_proj_w;
-            model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"]  = layer.c_mlp_proj_b;
-        }
-    }
-
-    // assign tensors to backends
-    init_backends(model, params);
-    ggml_backend_t backend_gpu = model.backends.front();
-    ggml_backend_t backend_cpu = model.backends.back();
-    std::map<std::string, ggml_backend_t> tensor_backends;
-    {
-        const int i_gpu_first_layer = model.hparams.n_layer - params.n_gpu_layers;
-        for (auto it : model.tensors) {
-            const std::string & name = it.first;
-            // input tensors
-            if (name == "model/wte" || name == "model/wpe") {
-                if (params.n_gpu_layers > model.hparams.n_layer) {
-                    tensor_backends[name] = backend_gpu;
-                } else {
-                    tensor_backends[name] = backend_cpu;
-                }
-            }
-            // output tensors
-            if (name == "model/ln_f/g" || name == "model/ln_f/b" || name == "model/lm_head") {
-                if (params.n_gpu_layers > 0) {
-                    tensor_backends[name] = backend_gpu;
-                } else {
-                    tensor_backends[name] = backend_cpu;
-                }
-            }
-            // layer tensors
-            if (name.substr(0, 7) == "model/h") {
-                // parse layer number
-                int layer = std::stoi(name.substr(7, 2));
-                if (layer >= i_gpu_first_layer) {
-                    tensor_backends[name] = backend_gpu;
-                } else {
-                    tensor_backends[name] = backend_cpu;
-                }
-            }
-        }
-    }
-
-    // allocate buffers
-    std::map<ggml_backend_t, std::unique_ptr<ggml_allocr, decltype(&ggml_allocr_free)>> backend_buffers;
-    for (auto backend : model.backends) {
-        // compute the size of the buffer
-        size_t size = 0;
-        for (auto it : model.tensors) {
-            if (tensor_backends[it.first] == backend) {
-                size += ggml_nbytes(it.second) + 512;
-            }
-        }
-        if (size > 0) {
-            printf("%s: %8s buffer size = %8.2f MB\n", __func__, ggml_backend_name(backend), size/1024.0/1024.0);
-            // allocate the buffer
-            ggml_backend_buffer_t buffer = ggml_backend_alloc_buffer(backend, size);
-            model.buffers_w.push_back(buffer);
-
-            // create an allocator for the buffer to allocate the tensors
-            auto alloc = std::unique_ptr<ggml_allocr, decltype(&ggml_allocr_free)>(ggml_allocr_new_from_buffer(buffer), ggml_allocr_free);
-            backend_buffers.insert(std::make_pair(backend, std::move(alloc)));
-        } else {
-            model.buffers_w.push_back(NULL);
-        }
-    }
-
-    // allocate key + value memory
-    {
-        const auto & hparams = model.hparams;
-
-        const int n_embd  = hparams.n_embd;
-        const int n_layer = hparams.n_layer;
-        const int n_ctx   = hparams.n_ctx;
-
-        const int n_mem      = n_layer*n_ctx;
-        const int n_elements = n_embd*n_mem;
-
-        model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
-        model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
-
-        ggml_set_name(model.memory_k, "model/memory_k");
-        ggml_set_name(model.memory_v, "model/memory_v");
-
-        const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
-
-        printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem);
-
-        // create a backend buffer (can be in host or device memory)
-        ggml_backend_t backend_kv = params.n_gpu_layers >= hparams.n_layer/2 ? backend_gpu : backend_cpu;
-        printf("%s: backend_kv = %s\n", __func__, ggml_backend_name(backend_kv));
-        model.buffer_kv = ggml_backend_alloc_buffer(backend_kv, memory_size + 512*2);
-
-        // allocate the tensors into the backend buffer
-        {
-            ggml_allocr * alloc = ggml_allocr_new_from_buffer(model.buffer_kv);
-
-            // this updates the pointers in the tensors to point to the correct location in the buffer
-            // this is necessary since the ggml_context is .no_alloc == true
-            // note that the buffer can actually be a device buffer, depending on the backend
-            ggml_allocr_alloc(alloc, model.memory_k);
-            ggml_allocr_alloc(alloc, model.memory_v);
-
-            ggml_allocr_free(alloc);
-        }
-    }
-
-    // load weights
-    {
-        size_t total_size = 0;
-
-        bool has_lm_head = false;
-
-        std::vector<char> read_buf;
-
-        while (true) {
-            int32_t n_dims;
-            int32_t length;
-            int32_t ttype;
-
-            fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
-            fin.read(reinterpret_cast<char *>(&length), sizeof(length));
-            fin.read(reinterpret_cast<char *>(&ttype),  sizeof(ttype));
-
-            if (fin.eof()) {
-                break;
-            }
-
-            int32_t nelements = 1;
-            int32_t ne[2] = { 1, 1 };
-            for (int i = 0; i < n_dims; ++i) {
-                fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
-                nelements *= ne[i];
-            }
-
-            std::string name(length, 0);
-            fin.read(&name[0], length);
-
-            if (model.tensors.find(name) == model.tensors.end()) {
-                fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str());
-                return false;
-            }
-
-            auto tensor = model.tensors[name];
-            ggml_set_name(tensor, name.c_str());
-            if (ggml_nelements(tensor) != nelements) {
-                fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.c_str());
-                return false;
-            }
-
-            if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
-                fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
-                        __func__, name.c_str(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]);
-                return false;
-            }
-
-            // for debugging
-            if (0) {
-                printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
-            }
-
-            const size_t bpe = ggml_type_size(ggml_type(ttype));
-
-            if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
-                fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
-                        __func__, name.c_str(), ggml_nbytes(tensor), nelements*bpe);
-                return false;
-            }
-
-            // allocate the tensor
-            ggml_backend_t backend = tensor_backends[name];
-            ggml_allocr * alloc = backend_buffers.find(backend)->second.get();
-            ggml_allocr_alloc(alloc, tensor);
-            //printf("%s: [%5.5s] %s\n", __func__, ggml_backend_name(backend), name.c_str());
-
-            if (ggml_backend_is_cpu(backend)
-#ifdef GGML_USE_METAL
-                || ggml_backend_is_metal(backend)
-#endif
-                ) {
-                // for the CPU and Metal backend, we can read directly into the tensor
-                fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
-            } else {
-                // read into a temporary buffer first, then copy to device memory
-                read_buf.resize(ggml_nbytes(tensor));
-                fin.read(read_buf.data(), ggml_nbytes(tensor));
-                ggml_backend_tensor_set(tensor, read_buf.data(), 0, ggml_nbytes(tensor));
-            }
-
-            // GPT-2 models share the WTE tensor as the LM head
-            if (name == "model/wte" && has_lm_head == false) {
-                ggml_allocr_alloc(backend_buffers.find(tensor_backends["model/lm_head"])->second.get(), model.lm_head);
-                //printf("%s: [%5.5s] %s (copied)\n", __func__, ggml_backend_name(tensor_backends["model/lm_head"]), "model/lm_head");
-                ggml_backend_tensor_copy(tensor, model.lm_head);
-                total_size += ggml_nbytes(model.lm_head);
-            }
-
-            if (name == "model/lm_head") {
-                has_lm_head = true;
-            }
-
-            total_size += ggml_nbytes(tensor);
-        }
-        printf("%s: model size  = %8.2f MB\n", __func__, total_size/1024.0/1024.0);
-    }
-
-    fin.close();
-
-    // allocate input tensors
-    {
-        model.embd = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, model.hparams.n_ctx);
-        model.position = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, model.hparams.n_ctx);
-
-        ggml_set_name(model.embd, "in/embd");
-        ggml_set_name(model.position, "in/position");
-
-        // add input tensors to cpu backend
-        size_t input_size = ggml_nbytes(model.embd) + ggml_nbytes(model.position);
-
-        // FIXME: use cpu backend after sched impl
-        ggml_backend_t backend_input = params.n_gpu_layers >= model.hparams.n_layer ? backend_gpu : backend_cpu;
-        model.buffer_input = ggml_backend_alloc_buffer(backend_input, input_size + 512*3);
-        printf("%s: backend_in = %s (%zu bytes)\n", __func__, ggml_backend_name(backend_input), input_size);
-
-        // allocate the tensors into the backend buffer
-        ggml_allocr * alloc = ggml_allocr_new_from_buffer(model.buffer_input);
-        ggml_allocr_alloc(alloc, model.embd);
-        ggml_allocr_alloc(alloc, model.position);
-        ggml_allocr_free(alloc);
-    }
-
-    return true;
-}
-
-// build the computation graph
-struct ggml_cgraph * gpt2_graph(
-        const gpt2_model & model,
-        const int n_past,
-        const std::vector<gpt_vocab::id> & embd_inp) {
-    const int N = embd_inp.size();
-
-    const auto & hparams = model.hparams;
-
-    const int n_embd  = hparams.n_embd;
-    const int n_layer = hparams.n_layer;
-    const int n_ctx   = hparams.n_ctx;
-    const int n_head  = hparams.n_head;
-
-    // since we are using ggml-alloc, this buffer only needs enough space to hold the ggml_tensor and ggml_cgraph structs, but not the tensor data
-    static size_t buf_size = ggml_tensor_overhead()*GPT2_MAX_NODES + ggml_graph_overhead_custom(GPT2_MAX_NODES, false);
-    static std::vector<uint8_t> buf(buf_size);
-
-    struct ggml_init_params params = {
-        /*.mem_size   =*/ buf_size,
-        /*.mem_buffer =*/ buf.data(),
-        /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph()
-    };
-
-    struct ggml_context * ctx0 = ggml_init(params);
-
-    struct ggml_cgraph  * gf = ggml_new_graph_custom(ctx0, GPT2_MAX_NODES, false);
-
-    struct ggml_tensor * embd = ggml_view_1d(ctx0, model.embd, N, 0);
-
-    // TODO: avoid writing to tensors if we are only measuring the memory usage
-    // not critical, just a minor optimization
-
-    //if (!ggml_allocr_is_measure(allocr)) {
-        //ggml_backend_tensor_set(embd, embd_inp.data(), 0, N*ggml_element_size(embd));
-        ggml_backend_tensor_set(model.embd, embd_inp.data(), 0, N*ggml_element_size(embd)); // FIXME: cannot use the view here because it's not initialized yet (buffer not set), but we should
-    //}
-    //memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
-
-    struct ggml_tensor * position = ggml_view_1d(ctx0, model.position, N, 0);
-    //if (!ggml_allocr_is_measure(allocr)) {
-        for (int i = 0; i < N; ++i) {
-            int32_t v = n_past + i;
-            ggml_backend_tensor_set(model.position, &v, i*sizeof(int32_t), sizeof(v)); // FIXME: same
-            //((int32_t *) position->data)[i] = n_past + i;
-        }
-    //}
-
-    const float KQ_scale = 1.0f/sqrtf(float(model.hparams.n_embd)/model.hparams.n_head);
-
-    // wte + wpe
-    struct ggml_tensor * inpL =
-        ggml_add(ctx0,
-                ggml_get_rows(ctx0, model.wte, embd),
-                ggml_get_rows(ctx0, model.wpe, position));
-    ggml_set_name(inpL, "inpL");
-    ggml_set_name(inpL->src[0], "wte");
-    ggml_set_name(inpL->src[1], "wpe");
-
-    for (int il = 0; il < n_layer; ++il) {
-        struct ggml_tensor * cur;
-
-        // norm
-        {
-            // [ 768, N]
-            cur = ggml_norm(ctx0, inpL, hparams.eps);
-            ggml_format_name(cur, "l%d.norm", il);
-
-            // cur = ln_1_g*cur + ln_1_b
-            // [ 768, N]
-            cur = ggml_add(ctx0,
-                    ggml_mul(ctx0,
-                        cur,
-                        model.layers[il].ln_1_g),
-                    model.layers[il].ln_1_b);
-            ggml_format_name(cur, "l%d.ln_1_b", il);
-            ggml_format_name(cur->src[0], "l%d.ln_1_g", il);
-        }
-
-        // attn
-        // [2304, 768] - model.layers[il].c_attn_attn_w
-        // [2304,   1] - model.layers[il].c_attn_attn_b
-        // [ 768,   N] - cur (in)
-        // [2304,   N] - cur (out)
-        //
-        // cur = attn_w*cur + attn_b
-        // [2304, N]
-        {
-            cur = ggml_mul_mat(ctx0,
-                    model.layers[il].c_attn_attn_w,
-                    cur);
-            ggml_format_name(cur, "l%d.attn_w", il);
-
-            cur = ggml_add(ctx0,
-                    cur,
-                    model.layers[il].c_attn_attn_b);
-            ggml_format_name(cur, "l%d.attn_b", il);
-        }
-
-        // self-attention
-        {
-            struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
-            struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd);
-            struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd);
-
-            ggml_format_name(Qcur, "l%d.Qcur", il);
-            ggml_format_name(Kcur, "l%d.Kcur", il);
-            ggml_format_name(Vcur, "l%d.Vcur", il);
-
-            // store key and value to memory
-            if (N >= 1) {
-                struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
-                struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past));
-
-                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
-                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
-            }
-
-            // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
-            // [64, N, 12]
-            struct ggml_tensor * Q =
-                ggml_permute(ctx0,
-                        ggml_cpy(ctx0,
-                            Qcur,
-                            ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
-                        0, 2, 1, 3);
-            ggml_format_name(Q, "l%d.Q", il);
-
-            // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
-            // [64, n_past + N, 12]
-            struct ggml_tensor * K =
-                ggml_permute(ctx0,
-                        ggml_reshape_3d(ctx0,
-                            ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
-                            n_embd/n_head, n_head, n_past + N),
-                        0, 2, 1, 3);
-            ggml_format_name(K, "l%d.K", il);
-
-            // GG: flash attention
-            //struct ggml_tensor * V =
-            //    ggml_cpy(ctx0,
-            //            ggml_permute(ctx0,
-            //                ggml_reshape_3d(ctx0,
-            //                    ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
-            //                    n_embd/n_head, n_head, n_past + N),
-            //                1, 2, 0, 3),
-            //            ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head));
-
-            //struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true);
-
-            // K * Q
-            // [n_past + N, N, 12]
-            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
-            ggml_format_name(KQ, "l%d.KQ", il);
-
-            // KQ_scaled = KQ / sqrt(n_embd/n_head)
-            // [n_past + N, N, 12]
-            struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
-            ggml_format_name(KQ_scaled, "l%d.KQ_scaled", il);
-
-            // KQ_masked = mask_past(KQ_scaled)
-            // [n_past + N, N, 12]
-            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
-            ggml_format_name(KQ_masked, "l%d.KQ_masked", il);
-
-            // KQ = soft_max(KQ_masked)
-            // [n_past + N, N, 12]
-            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
-            ggml_format_name(KQ_soft_max, "l%d.KQ_soft_max", il);
-
-            // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
-            // [n_past + N, 64, 12]
-            struct ggml_tensor * V_trans =
-                ggml_cpy(ctx0,
-                        ggml_permute(ctx0,
-                            ggml_reshape_3d(ctx0,
-                                ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
-                                n_embd/n_head, n_head, n_past + N),
-                            1, 2, 0, 3),
-                        ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd/n_head, n_head));
-            ggml_format_name(V_trans, "l%d.V_trans", il);
-
-            // KQV = transpose(V) * KQ_soft_max
-            // [64, N, 12]
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
-            ggml_format_name(KQV, "l%d.KQV", il);
-
-            // KQV_merged = KQV.permute(0, 2, 1, 3)
-            // [64, 12, N]
-            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
-            ggml_format_name(KQV_merged, "l%d.KQV_merged", il);
-
-            // cur = KQV_merged.contiguous().view(n_embd, N)
-            // [768, N]
-            cur = ggml_cpy(ctx0,
-                    KQV_merged,
-                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
-            ggml_format_name(cur, "l%d.KQV_merged_contiguous", il);
-        }
-
-        // projection
-        // [ 768, 768] - model.layers[il].c_attn_proj_w
-        // [ 768,   1] - model.layers[il].c_attn_proj_b
-        // [ 768,   N] - cur (in)
-        // [ 768,   N] - cur (out)
-        //
-        // cur = proj_w*cur + proj_b
-        // [768, N]
-        {
-            cur = ggml_mul_mat(ctx0,
-                    model.layers[il].c_attn_proj_w,
-                    cur);
-            ggml_format_name(cur, "l%d.attn_proj_w", il);
-
-            cur = ggml_add(ctx0,
-                    cur,
-                    model.layers[il].c_attn_proj_b);
-            ggml_format_name(cur, "l%d.attn_proj_b", il);
-        }
-
-        // add the input
-        cur = ggml_add(ctx0, cur, inpL);
-        ggml_format_name(cur, "l%d.add", il);
-
-        struct ggml_tensor * inpFF = cur;
-
-        // feed-forward network
-        {
-            // norm
-            {
-                cur = ggml_norm(ctx0, inpFF, hparams.eps);
-                ggml_format_name(cur, "l%d.FFnorm", il);
-
-                // cur = ln_2_g*cur + ln_2_b
-                // [ 768, N]
-                cur = ggml_add(ctx0,
-                        ggml_mul(ctx0,
-                            cur,
-                            model.layers[il].ln_2_g),
-                        model.layers[il].ln_2_b);
-                ggml_format_name(cur, "l%d.ln_2_b", il);
-                ggml_format_name(cur->src[0], "l%d.ln_2_g", il);
-            }
-
-            // fully connected
-            // [3072, 768] - model.layers[il].c_mlp_fc_w
-            // [3072,   1] - model.layers[il].c_mlp_fc_b
-            // [ 768,   N] - cur (in)
-            // [3072,   N] - cur (out)
-            //
-            // cur = fc_w*cur + fc_b
-            // [3072, N]
-            cur = ggml_mul_mat(ctx0,
-                    model.layers[il].c_mlp_fc_w,
-                    cur);
-            ggml_format_name(cur, "l%d.mlp_fc_w", il);
-
-            cur = ggml_add(ctx0,
-                    cur,
-                    model.layers[il].c_mlp_fc_b);
-            ggml_format_name(cur, "l%d.mlp_fc_b", il);
-
-            // GELU activation
-            // [3072, N]
-            cur = ggml_gelu(ctx0, cur);
-            ggml_format_name(cur, "l%d.gelu", il);
-
-            // projection
-            // [ 768, 3072] - model.layers[il].c_mlp_proj_w
-            // [ 768,    1] - model.layers[il].c_mlp_proj_b
-            // [3072,    N] - cur (in)
-            // [ 768,    N] - cur (out)
-            //
-            // cur = proj_w*cur + proj_b
-            // [768, N]
-            cur = ggml_mul_mat(ctx0,
-                    model.layers[il].c_mlp_proj_w,
-                    cur);
-            ggml_format_name(cur, "l%d.mlp_proj_w", il);
-
-            cur = ggml_add(ctx0,
-                    cur,
-                    model.layers[il].c_mlp_proj_b);
-            ggml_format_name(cur, "l%d.mlp_proj_b", il);
-        }
-
-        // input for next layer
-        inpL = ggml_add(ctx0, cur, inpFF);
-        ggml_format_name(inpL, "l%d.add2", il);
-    }
-
-    // norm
-    {
-        // [ 768, N]
-        inpL = ggml_norm(ctx0, inpL, hparams.eps);
-        ggml_format_name(inpL, "out_norm");
-
-        // inpL = ln_f_g*inpL + ln_f_b
-        // [ 768, N]
-        inpL = ggml_add(ctx0,
-                ggml_mul(ctx0,
-                    inpL,
-                    model.ln_f_g),
-                model.ln_f_b);
-        ggml_format_name(inpL, "out_ln_f_b");
-        ggml_format_name(inpL->src[0], "out_ln_f_g");
-    }
-
-    // inpL = WTE * inpL
-    // [ 768, 50257] - model.lm_head
-    // [ 768, N]     - inpL
-    inpL = ggml_mul_mat(ctx0, model.lm_head, inpL);
-    ggml_format_name(inpL, "out_lm_head");
-
-    // logits -> probs
-    //inpL = ggml_soft_max(ctx0, inpL);
-
-    ggml_build_forward_expand(gf, inpL);
-
-    ggml_free(ctx0);
-
-    return gf;
-}
-
-// evaluate the transformer
-//
-//   - model:     the model
-//   - allocr:    ggml_allocr to use to allocate the compute buffer
-//   - n_threads: number of threads to use
-//   - n_past:    the context size so far
-//   - embd_inp:  the embeddings of the tokens in the context
-//   - embd_w:    the predicted logits for the next token
-//
-bool gpt2_eval(
-        const gpt2_model & model,
-        ggml_backend_sched_t sched,
-        const int n_past,
-        const std::vector<gpt_vocab::id> & embd_inp,
-              std::vector<float>         & embd_w) {
-    const int N = embd_inp.size();
-
-    const auto & hparams = model.hparams;
-
-    const int n_vocab = hparams.n_vocab;
-
-    struct ggml_cgraph * gf = gpt2_graph(model, n_past, embd_inp);
-
-    // run the computation
-    ggml_backend_sched_graph_compute(sched, gf);
-
-    //if (n_past%100 == 0) {
-    //    ggml_graph_print   (&gf);
-    //    ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
-    //}
-
-    // in this case, the output tensor is the last one in the graph
-    struct ggml_tensor * inpL = gf->nodes[gf->n_nodes - 1];
-
-    //embd_w.resize(n_vocab*N);
-    //ggml_backend_tensor_get(inpL, embd_w.data(), 0, sizeof(float)*n_vocab*N);
-
-    // return result just for the last token
-    embd_w.resize(n_vocab);
-    ggml_backend_tensor_get(inpL, embd_w.data(), (n_vocab*(N-1))*sizeof(float), sizeof(float)*n_vocab);
-
-    return true;
-}
-
-int main(int argc, char ** argv) {
-    ggml_time_init();
-
-    const int64_t t_main_start_us = ggml_time_us();
-
-    gpt_params params;
-    params.model = "models/gpt-2-117M/ggml-model.bin";
-
-    if (gpt_params_parse(argc, argv, params) == false) {
-        return 1;
-    }
-
-    if (params.seed < 0) {
-        params.seed = time(NULL);
-    }
-
-    printf("%s: seed = %d\n", __func__, params.seed);
-
-    std::mt19937 rng(params.seed);
-    if (params.prompt.empty()) {
-        params.prompt = gpt_random_prompt(rng);
-    }
-
-    int64_t t_load_us = 0;
-
-    gpt_vocab vocab;
-    gpt2_model model;
-
-    // load the model
-    {
-        const int64_t t_start_us = ggml_time_us();
-
-        if (!gpt2_model_load(params.model, model, vocab, params)) {
-            fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
-            return 1;
-        }
-
-        t_load_us = ggml_time_us() - t_start_us;
-
-        test_gpt_tokenizer(vocab, params.token_test);
-    }
-
-    // create the backend scheduler
-    // the scheduler handles the allocation of the compute buffers and the scheduling of the computation between the different backends
-    ggml_backend_sched_t sched;
-    {
-        // initialize the scheduler
-        sched = ggml_backend_sched_new(model.backends.data(), NULL, model.backends.size(), GPT2_MAX_NODES);
-
-        // create the worst case graph for memory usage estimation
-        int n_tokens = std::min(model.hparams.n_ctx, params.n_batch);
-        int n_past = model.hparams.n_ctx - n_tokens;
-        struct ggml_cgraph * gf = gpt2_graph(model, n_past, std::vector<gpt_vocab::id>(n_tokens, 0));
-
-        ggml_backend_sched_init_measure(sched, gf);
-
-
-        // compute the required memory
-        size_t mem_size = 0;
-        for (size_t i = 0; i < model.backends.size(); i++) {
-            ggml_backend_buffer_t buf = ggml_backend_sched_get_buffer(sched, model.backends[i]);
-            size_t size = ggml_backend_buffer_get_size(buf);
-            if (size > 0) {
-                mem_size += size;
-                printf("%s: %8s compute buffer size = %8.2f MB\n", __func__, ggml_backend_name(model.backends[i]), size/1024.0/1024.0);
-                //printf("%s: %8s compute buffer size = %zu bytes\n", __func__, ggml_backend_name(model.backends[i]), size);
-            }
-        }
-
-        printf("%s: total compute buffer size: %.2f MB\n", __func__, mem_size/1024.0/1024.0);
-    }
-
-    int n_past = 0;
-
-    int64_t t_sample_us  = 0;
-    int64_t t_predict_us = 0;
-
-    std::vector<float> logits;
-
-    // tokenize the prompt
-    std::vector<gpt_vocab::id> embd_inp = ::gpt_tokenize(vocab, params.prompt);
-
-    params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());
-
-    printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
-    printf("%s: number of tokens in prompt = %zu, first 8 tokens: ", __func__, embd_inp.size());
-    for (int i = 0; i < std::min(8, (int) embd_inp.size()); i++) {
-        printf("%d ", embd_inp[i]);
-    }
-    printf("\n\n");
-
-    // submit the input prompt token-by-token
-    // this reduces the memory usage during inference, at the cost of a bit of speed at the beginning
-    std::vector<gpt_vocab::id> embd;
-
-    for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) {
-        // predict
-        if (embd.size() > 0) {
-            const int64_t t_start_us = ggml_time_us();
-
-            if (!gpt2_eval(model, sched, n_past, embd, logits)) {
-                printf("Failed to predict\n");
-                return 1;
-            }
-
-            t_predict_us += ggml_time_us() - t_start_us;
-        }
-
-        n_past += embd.size();
-        embd.clear();
-
-        if (i >= embd_inp.size()) {
-            // sample next token
-            const int   top_k = params.top_k;
-            const float top_p = params.top_p;
-            const float temp  = params.temp;
-
-            const int n_vocab = model.hparams.n_vocab;
-
-            gpt_vocab::id id = 0;
-
-            {
-                const int64_t t_start_sample_us = ggml_time_us();
-
-                id = gpt_sample_top_k_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng);
-
-                t_sample_us += ggml_time_us() - t_start_sample_us;
-            }
-
-            // add it to the context
-            embd.push_back(id);
-        } else {
-            // if here, it means we are still processing the input prompt
-            for (size_t k = i; k < embd_inp.size(); k++) {
-                embd.push_back(embd_inp[k]);
-                if (int32_t(embd.size()) >= params.n_batch) {
-                    break;
-                }
-            }
-            i += embd.size() - 1;
-        }
-
-        // display text
-        for (auto id : embd) {
-            printf("%s", vocab.id_to_token[id].c_str());
-        }
-        fflush(stdout);
-
-        // end of text token
-        if (embd.back() == 50256) {
-            break;
-        }
-    }
-
-    // report timing
-    {
-        const int64_t t_main_end_us = ggml_time_us();
-
-        printf("\n\n");
-        printf("%s:     load time = %8.2f ms\n", __func__, t_load_us/1000.0f);
-        printf("%s:   sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f);
-        printf("%s:  predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past);
-        printf("%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
-    }
-
-    ggml_free(model.ctx);
-
-    ggml_backend_sched_free(sched);
-    ggml_backend_buffer_free(model.buffer_kv);
-    for (auto & buf : model.buffers_w) {
-        ggml_backend_buffer_free(buf);
-    }
-    for (auto backend : model.backends) {
-        ggml_backend_free(backend);
-    }
-
-    return 0;
-}
diff --git a/ggml/examples/gpt-2/quantize.cpp b/ggml/examples/gpt-2/quantize.cpp
deleted file mode 100644
index 9d8d53a..0000000
--- a/ggml/examples/gpt-2/quantize.cpp
+++ /dev/null
@@ -1,184 +0,0 @@
-#include "ggml/ggml.h"
-
-#include "common.h"
-#include "common-ggml.h"
-
-#include <cassert>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <fstream>
-#include <map>
-#include <string>
-#include <vector>
-#include <regex>
-
-// default hparams (GPT-2 117M)
-struct gpt2_hparams {
-    int32_t n_vocab = 50257;
-    int32_t n_ctx   = 1024;
-    int32_t n_embd  = 768;
-    int32_t n_head  = 12;
-    int32_t n_layer = 12;
-    int32_t ftype   = 1;
-};
-
-// quantize a model
-bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) {
-    gpt_vocab vocab;
-
-    printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
-
-    auto finp = std::ifstream(fname_inp, std::ios::binary);
-    if (!finp) {
-        fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__, fname_inp.c_str());
-        return false;
-    }
-
-    auto fout = std::ofstream(fname_out, std::ios::binary);
-    if (!fout) {
-        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname_out.c_str());
-        return false;
-    }
-
-    // verify magic
-    {
-        uint32_t magic;
-        finp.read((char *) &magic, sizeof(magic));
-        if (magic != GGML_FILE_MAGIC) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str());
-            return false;
-        }
-
-        fout.write((char *) &magic, sizeof(magic));
-    }
-
-    gpt2_hparams hparams;
-
-    // load hparams
-    {
-        finp.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
-        finp.read((char *) &hparams.n_ctx,   sizeof(hparams.n_ctx));
-        finp.read((char *) &hparams.n_embd,  sizeof(hparams.n_embd));
-        finp.read((char *) &hparams.n_head,  sizeof(hparams.n_head));
-        finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-        finp.read((char *) &hparams.ftype,   sizeof(hparams.ftype));
-
-        const int32_t qntvr_src =    hparams.ftype / GGML_QNT_VERSION_FACTOR;
-        const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
-
-        printf("%s: n_vocab     = %d\n", __func__, hparams.n_vocab);
-        printf("%s: n_ctx       = %d\n", __func__, hparams.n_ctx);
-        printf("%s: n_embd      = %d\n", __func__, hparams.n_embd);
-        printf("%s: n_head      = %d\n", __func__, hparams.n_head);
-        printf("%s: n_layer     = %d\n", __func__, hparams.n_layer);
-        printf("%s: ftype (src) = %d\n", __func__, hparams.ftype);
-        printf("%s: qntvr (src) = %d\n", __func__, qntvr_src);
-        printf("%s: ftype (dst) = %d\n", __func__, ftype_dst);
-        printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
-
-        fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
-        fout.write((char *) &hparams.n_ctx,   sizeof(hparams.n_ctx));
-        fout.write((char *) &hparams.n_embd,  sizeof(hparams.n_embd));
-        fout.write((char *) &hparams.n_head,  sizeof(hparams.n_head));
-        fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-        fout.write((char *) &ftype_dst,       sizeof(ftype_dst));
-    }
-
-    // load vocab
-    {
-        int32_t n_vocab = 0;
-        finp.read ((char *) &n_vocab, sizeof(n_vocab));
-        fout.write((char *) &n_vocab, sizeof(n_vocab));
-
-        if (n_vocab != hparams.n_vocab) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
-                    __func__, fname_inp.c_str(), n_vocab, hparams.n_vocab);
-            return false;
-        }
-
-        std::string word;
-        for (int i = 0; i < n_vocab; i++) {
-            uint32_t len;
-            finp.read ((char *) &len, sizeof(len));
-            fout.write((char *) &len, sizeof(len));
-
-            word.resize(len);
-            finp.read ((char *) word.data(), len);
-            fout.write((char *) word.data(), len);
-
-            vocab.token_to_id[word] = i;
-            vocab.id_to_token[i] = word;
-        }
-    }
-
-    // regexes of tensor names to be quantized
-    const std::vector<std::string> to_quant = {
-        "model/wte",
-        "model/lm_head",
-        "model/h.*/attn/c_attn/w",
-        "model/h.*/attn/c_proj/w",
-        "model/h.*/mlp/c_fc/w",
-        "model/h.*/mlp/c_proj/w",
-    };
-
-    if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) {
-        fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str());
-        return false;
-    }
-
-    finp.close();
-    fout.close();
-
-    return true;
-}
-
-// usage:
-//  ./gpt-2-quantize models/gpt-2-117M/ggml-model.bin models/gpt-2-117M/ggml-model-quant.bin type
-//
-int main(int argc, char ** argv) {
-    if (argc != 4) {
-        fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
-        ggml_print_ftypes(stderr);
-        return 1;
-    }
-
-    // needed to initialize f16 tables
-    {
-        struct ggml_init_params params = { 0, NULL, false };
-        struct ggml_context * ctx = ggml_init(params);
-        ggml_free(ctx);
-    }
-
-    const std::string fname_inp = argv[1];
-    const std::string fname_out = argv[2];
-
-    const ggml_ftype ftype = ggml_parse_ftype(argv[3]);
-
-    const int64_t t_main_start_us = ggml_time_us();
-
-    int64_t t_quantize_us = 0;
-
-    // load the model
-    {
-        const int64_t t_start_us = ggml_time_us();
-
-        if (!gpt2_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
-            fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
-            return 1;
-        }
-
-        t_quantize_us = ggml_time_us() - t_start_us;
-    }
-
-    // report timing
-    {
-        const int64_t t_main_end_us = ggml_time_us();
-
-        printf("\n");
-        printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f);
-        printf("%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
-    }
-
-    return 0;
-}
diff --git a/ggml/examples/gpt-j/CMakeLists.txt b/ggml/examples/gpt-j/CMakeLists.txt
deleted file mode 100644
index 3675b7d..0000000
--- a/ggml/examples/gpt-j/CMakeLists.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-#
-# gpt-j
-
-set(TEST_TARGET gpt-j)
-add_executable(${TEST_TARGET} main.cpp)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)
-
-#
-# gpt-j-quantize
-
-set(TEST_TARGET gpt-j-quantize)
-add_executable(${TEST_TARGET} quantize.cpp)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)
diff --git a/ggml/examples/gpt-j/README.md b/ggml/examples/gpt-j/README.md
deleted file mode 100644
index e5cc795..0000000
--- a/ggml/examples/gpt-j/README.md
+++ /dev/null
@@ -1,246 +0,0 @@
-# gpt-j
-
-Local GPT-J inference on your computer using C/C++
-
-No video card required. You just need to have 16 GB of RAM.
-
-## Motivation
-
-The GPT-J 6B model is the open-source alternative to OpenAI's GPT-3. It's basically a neural network that allows you to
-generate coherent, human-like text given a certain context (prompt).
-
-The GPT-J model is quite big - the compact version of the model uses 16-bit floating point representation of the weights
-and is still 12 GB big. This means that in order to run inference on your computer, you would need to have a video card
-with at least 12 GB of video RAM. Alternatively, you can try to run the python implementations on the CPU, but that
-would probably not be very efficient as they are primarily optimized for running on a GPU (or at least this is my guess -
-I don't have much experience with python).
-
-I wanted to try and run the model on my MacBook, so I decided to implement the model inference from scratch using my own
-custom build tensor library. The tensor library (called [ggml](https://github.com/ggerganov/ggml), written in C) is in
-early development stage, but it already allows me to run the GPT-J model.
-
-On my 32GB MacBook M1 Pro, I achieve an inference speed of about `125 ms/token` or about ~6 words per second (1 word
-typically consists of 1 or 2 tokens).
-
-Here is a sample run with prompt `int main(int argc, char ** argv) {`:
-
-```
-$ time ./bin/gpt-j -p "int main(int argc, char ** argv) {"
-
-gptj_model_load: loading model from 'models/gpt-j-6B/ggml-model.bin' - please wait ...
-gptj_model_load: n_vocab = 50400
-gptj_model_load: n_ctx   = 2048
-gptj_model_load: n_embd  = 4096
-gptj_model_load: n_head  = 16
-gptj_model_load: n_layer = 28
-gptj_model_load: n_rot   = 64
-gptj_model_load: f16     = 1
-gptj_model_load: ggml ctx size = 13334.86 MB
-gptj_model_load: memory_size =  1792.00 MB, n_mem = 57344
-gptj_model_load: ................................... done
-gptj_model_load: model size = 11542.79 MB / num tensors = 285
-main: number of tokens in prompt = 13
-
-int main(int argc, char ** argv) {
-    (void)argc;
-    (void)argv;
-
-    {
-        struct sockaddr_in addr;
-        int addrlen;
-        char * ip = "192.168.1.4";
-        int i;
-
-        if ( (addrlen = sizeof(addr)) == -1 )
-            return -1;
-
-        for (i = 0; i < 10; ++i) {
-            addr.sin_family = AF_INET;
-            addr.sin_addr.s_addr = inet_addr(ip);
-
-main: mem per token = 16430420 bytes
-main:     load time =  6211.48 ms
-main:   sample time =    13.74 ms
-main:  predict time = 26420.34 ms / 124.62 ms per token
-main:    total time = 33035.37 ms
-
-real	0m33.171s
-user	3m32.269s
-sys      0m3.686s
-
-$
-```
-
-It took ~6.2 seconds to load the model to memory. After that, it took ~26.4 seconds to generate 200 tokens of what
-looks like to be the beginning of a networking program in C. Pretty cool!
-
-Here is another run, just for fun:
-
-```
-time ./bin/gpt-j -n 500 -t 8 -p "Ask HN: Inherited the worst code and tech team I have ever seen. How to fix it?
-"
-
-gptj_model_load: loading model from 'models/gpt-j-6B/ggml-model.bin' - please wait ...
-gptj_model_load: n_vocab = 50400
-gptj_model_load: n_ctx   = 2048
-gptj_model_load: n_embd  = 4096
-gptj_model_load: n_head  = 16
-gptj_model_load: n_layer = 28
-gptj_model_load: n_rot   = 64
-gptj_model_load: f16     = 1
-gptj_model_load: ggml ctx size = 13334.86 MB
-gptj_model_load: memory_size =  1792.00 MB, n_mem = 57344
-gptj_model_load: ................................... done
-gptj_model_load: model size = 11542.79 MB / num tensors = 285
-main: number of tokens in prompt = 24
-
-Ask HN: Inherited the worst code and tech team I have ever seen. How to fix it?
-
-I've inherited a team with some very strange and un-documented practices, one of them is that they use an old custom
-application with a very slow tech stack written in Python that the team doesn't want to touch but also doesn't want to
-throw away as it has some "legacy" code in it.
-
-The problem is, the tech stack is very very slow.
-
-They have a single web server on a VM that is slow.
-The server is a little bit busy (not very busy though) and they have a lot of processes (30+ that are constantly being
-spawned by the application)
-They have an application that is single threaded and was written in Python and the team don't want to touch this, and
-the application is very slow.
-
-My task as a new member of the team is to fix this.
-
-I'm a senior dev on the team (3 years on the project) and have been told that I will take the lead on this task. I know
-next to nothing about Python. So here is what I have so far.
-
-What I have done is I've been trying to debug the processes with the "ps" command. This way I can see what is running
-and where. From what I see, the application spawns 10 processes a minute and some of them are used for nothing.
-
-I have also started to look for the code. The application source is not in GitHub or any other repository, it is only on
-our internal GitLab.
-
-What I've found so far:
-
-The application uses a custom SQLAlchemy implementation to interact with the data. I've looked at the source, it looks
-like an object cache or something like that. But from what I've seen, the cache gets full every 20 minutes and then gets
-cleared with a special command.
-
-Another strange thing is that the application creates a file for every entry in the database (even if the entry already
-exists). I've looked at the file to see if it contains something, but it seems to be a JSON file with lots of records.
-
-The other strange thing is that I can only find the database tables in the GitLab repository and not the code. So I
-can't really understand how the application is supposed to interact with the database.
-
-I also found a "log" directory, but the code is encrypted with AES. From what I've found, it is in
-
-main: mem per token = 16430420 bytes
-main:     load time =  3900.10 ms
-main:   sample time =    32.58 ms
-main:  predict time = 68049.91 ms / 130.11 ms per token
-main:    total time = 73020.05 ms
-
-real	1m13.156s
-user	9m1.328s
-sys.    0m7.103s
-```
-
-## Implementation details
-
-The high level implementation of the model is contained in the [main.cpp](main.cpp) file. The core computations are
-performed by the [ggml](https://github.com/ggerganov/ggml/blob/master/include/ggml/ggml.h) library.
-
-
-#### Matrix multiplication
-
-The most performance critical part of the implementation is of course the matrix multiplication routine. 99% of the time
-is spent here, so it was important to optimize this as much as possible.
-
-On Arm64, I utilize the 128-bit NEON intrinsics for 16-bit floating point operations:
-
-https://github.com/ggerganov/ggml/blob/fb558f78d905f85c54813602649ddd628ffe0f3a/src/ggml.c#L187-L243
-
-These instructions allow each core to operate simultaneously on 64 16-bit floats. I'm no expert in SIMD, but after quite
-some trials this was the most efficient code for dot product of a row and column that I could come up with. Combined
-with the parallel computation on 8 CPU threads, I believe I'm close to the maximum performance that one could possibly
-get on the M1 CPU. Still, I'm curious to know if there is a more efficient way to implement this.
-
-
-#### Attempt to use the M1 GPU
-
-One interesting property of the GPT-J transformer architecture is that it allows you to perform part of the inference in
-parallel - i.e. the Feed-forward network can be computed in parallel to the Self-attention layer:
-
-https://github.com/ggerganov/ggml/blob/fb558f78d905f85c54813602649ddd628ffe0f3a/examples/gpt-j/main.cpp#L507-L531
-
-So I thought why not try and bring in the M1 GPU to compute half of the neural network in parallel to the CPU and
-potentially gain some extra performance. Thanks to the M1's shared memory model, it was relatively easy to offload part
-of the computation to the GPU using Apple's [Metal Performance
-Shaders](https://developer.apple.com/documentation/metalperformanceshaders). The GPU shares the host memory, so there is
-no need to copy the data back and forth as you would normally do with Cuda or OpenCL. The weight matrices are directly
-available to be used by the GPU.
-
-However, to my surprise, using MPS together with the CPU did not lead to any performance improvement at all. My
-conclusion was that the 8-thread NEON CPU computation is already saturating the memory bandwidth of the M1 and since
-the CPU and the GPU on the MacBook are sharing that bandwidth, it does not help to offload the computation to the GPU.
-Another observation was that the MPS GPU matrix multiplication using 16-bit floats had the same performance as the
-8-thread NEON CPU implementation. Again, I explain this with a saturated memory channel. But of course, my explanation
-could be totally wrong and somehow the implementation wasn't utilizing the resources correctly.
-
-In the end, I decided to not use MPS or the GPU all together.
-
-### Zero memory allocations
-
-Another property of my implementation is that it does not perform any memory allocations once the model is loaded into
-memory. All required memory is allocated at the start of the program with a single `malloc` (technically 2 calls, but
-that is not important).
-
-## Usage
-
-If you want to give this a try and you are on Linux or Mac OS, simply follow these instructions:
-
-```bash
-# Clone the ggml library and build the gpt-j example
-git clone https://github.com/ggerganov/ggml
-cd ggml
-mkdir build && cd build
-cmake ..
-make -j4 gpt-j
-
-# Download the ggml-compatible GPT-J 6B model (requires 12GB disk space)
-../examples/gpt-j/download-ggml-model.sh 6B
-
-# Run the inference (requires 16GB of CPU RAM)
-./bin/gpt-j -m models/gpt-j-6B/ggml-model.bin -p "This is an example"
-
-# Input prompt through pipe and run the inference.
-echo "This is an example" > prompt.txt
-cat prompt.txt | ./bin/gpt-j -m models/gpt-j-6B/ggml-model.bin
-```
-
-To run the `gpt-j` tool, you need the 12GB `ggml-model.bin` file which contains the GPT-J model in
-[ggml](https://github.com/ggerganov/ggml) compatible format. In the instructions above, the binary file
-is downloaded from my repository on Hugging Face using the [download-ggml-model.sh](download-ggml-model.sh) script.
-You can also, download the file manually from this link:
-
-https://huggingface.co/ggerganov/ggml/tree/main
-
----
-
-Alternatively, if you don't want to download the 12GB ggml model file, you can perform the conversion yourself using
-python.
-
-First, you need to download the full GPT-J model from here: https://huggingface.co/EleutherAI/gpt-j-6B
-
-Note that the full model is quite big - about 72 GB. After you download it, you need to convert it to ggml format using
-the [convert-h5-to-ggml.py](convert-h5-to-ggml.py) script. This will generate the `ggml-model.bin` file, which you can
-then use with the `gpt-j` program.
-
-
-## GPT-2
-
-I also implemented a tool for CPU inference using the smaller GPT-2 models. They have worse quality compared to GPT-J,
-but are much faster to execute.
-
-For example, the Small GPT-2 model is only 240 MB big and the inference speed on my MacBook is about 200 tokens/sec.
-
-For more details, checkout the GPT-2 example here: [gpt-2](https://github.com/ggerganov/ggml/tree/master/examples/gpt-2)
diff --git a/ggml/examples/gpt-j/convert-h5-to-ggml.py b/ggml/examples/gpt-j/convert-h5-to-ggml.py
deleted file mode 100644
index cb77317..0000000
--- a/ggml/examples/gpt-j/convert-h5-to-ggml.py
+++ /dev/null
@@ -1,173 +0,0 @@
-# Convert GPT-J-6B h5 transformer model to ggml format
-#
-# Load the model using GPTJForCausalLM.
-# Iterate over all variables and write them to a binary file.
-#
-# For each variable, write the following:
-#   - Number of dimensions (int)
-#   - Name length (int)
-#   - Dimensions (int[n_dims])
-#   - Name (char[name_length])
-#   - Data (float[n_dims])
-#
-# By default, the bigger matrices are converted to 16-bit floats.
-# This can be disabled by adding the "use-f32" CLI argument.
-#
-# At the start of the ggml file we write the model parameters
-# and vocabulary.
-#
-
-import sys
-import struct
-import json
-import torch
-import numpy as np
-
-from transformers import GPTJForCausalLM
-
-# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
-def bytes_to_unicode():
-    """
-    Returns list of utf-8 byte and a corresponding list of unicode strings.
-    The reversible bpe codes work on unicode strings.
-    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
-    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
-    This is a signficant percentage of your normal, say, 32K bpe vocab.
-    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
-    And avoids mapping to whitespace/control characters the bpe code barfs on.
-    """
-    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
-    cs = bs[:]
-    n = 0
-    for b in range(2**8):
-        if b not in bs:
-            bs.append(b)
-            cs.append(2**8+n)
-            n += 1
-    cs = [chr(n) for n in cs]
-    return dict(zip(bs, cs))
-
-if len(sys.argv) < 3:
-    print("Usage: convert-h5-to-ggml.py dir-model [use-f32]\n")
-    print("  ftype == 0 -> float32")
-    print("  ftype == 1 -> float16")
-    sys.exit(1)
-
-# output in the same directory as the model
-dir_model = sys.argv[1]
-fname_out = sys.argv[1] + "/ggml-model.bin"
-
-with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
-    encoder = json.load(f)
-
-with open(dir_model + "/added_tokens.json", "r", encoding="utf-8") as f:
-    encoder_added = json.load(f)
-
-with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
-    hparams = json.load(f)
-
-# possible data types
-#   ftype == 0 -> float32
-#   ftype == 1 -> float16
-#
-# map from ftype to string
-ftype_str = ["f32", "f16"]
-
-ftype = 1
-if len(sys.argv) > 2:
-    ftype = int(sys.argv[2])
-    if ftype < 0 or ftype > 1:
-        print("Invalid ftype: " + str(ftype))
-        sys.exit(1)
-    fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin"
-
-
-model = GPTJForCausalLM.from_pretrained(dir_model, low_cpu_mem_usage=True)
-#print (model)
-
-list_vars = model.state_dict()
-#print (list_vars)
-
-fout = open(fname_out, "wb")
-
-fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
-fout.write(struct.pack("i", hparams["vocab_size"]))
-fout.write(struct.pack("i", hparams["n_positions"]))
-fout.write(struct.pack("i", hparams["n_embd"]))
-fout.write(struct.pack("i", hparams["n_head"]))
-fout.write(struct.pack("i", hparams["n_layer"]))
-fout.write(struct.pack("i", hparams["rotary_dim"]))
-fout.write(struct.pack("i", ftype))
-
-byte_encoder = bytes_to_unicode()
-byte_decoder = {v:k for k, v in byte_encoder.items()}
-
-fout.write(struct.pack("i", len(encoder) + len(encoder_added)))
-
-for key in encoder:
-    text = bytearray([byte_decoder[c] for c in key])
-    fout.write(struct.pack("i", len(text)))
-    fout.write(text)
-
-for key in encoder_added:
-    text = bytearray([byte_decoder[c] for c in key])
-    fout.write(struct.pack("i", len(text)))
-    fout.write(text)
-
-for name in list_vars.keys():
-    data = list_vars[name].squeeze().numpy()
-    print("Processing variable: " + name + " with shape: ", data.shape)
-
-    # we don't need these
-    if name.endswith("attn.masked_bias") or name.endswith(".attn.bias"):
-        print("  Skipping variable: " + name)
-        continue
-
-    n_dims = len(data.shape);
-
-    # ftype == 0 -> float32, ftype == 1 -> float16
-    ftype_cur = 0;
-    if ftype != 0:
-        if name[-7:] == ".weight" and n_dims == 2:
-            print("  Converting to float16")
-            data = data.astype(np.float16)
-            ftype_cur = 1
-        else:
-            print("  Converting to float32")
-            data = data.astype(np.float32)
-            ftype_cur = 0
-    else:
-        if data.dtype != np.float32:
-            print("  Converting to float32")
-            data = data.astype(np.float32)
-            ftype_cur = 0
-
-    # for efficiency - transpose these matrices:
-    # (note - with latest ggml this is no longer more efficient, so disabling it)
-    #  "transformer.h.*.mlp.fc_in.weight"
-    #  "transformer.h.*.attn.out_proj.weight"
-    #  "transformer.h.*.attn.q_proj.weight"
-    #  "transformer.h.*.attn.k_proj.weight"
-    #  "transformer.h.*.attn.v_proj.weight"
-    #if name.endswith(".mlp.fc_in.weight")     or \
-    #   name.endswith(".attn.out_proj.weight") or \
-    #   name.endswith(".attn.q_proj.weight")   or \
-    #   name.endswith(".attn.k_proj.weight")   or \
-    #   name.endswith(".attn.v_proj.weight"):
-    #    print("  Transposing")
-    #    data = data.transpose()
-
-    # header
-    str = name.encode('utf-8')
-    fout.write(struct.pack("iii", n_dims, len(str), ftype_cur))
-    for i in range(n_dims):
-        fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
-    fout.write(str);
-
-    # data
-    data.tofile(fout)
-
-fout.close()
-
-print("Done. Output file: " + fname_out)
-print("")
diff --git a/ggml/examples/gpt-j/download-ggml-model.sh b/ggml/examples/gpt-j/download-ggml-model.sh
deleted file mode 100644
index a9e2aa5..0000000
--- a/ggml/examples/gpt-j/download-ggml-model.sh
+++ /dev/null
@@ -1,69 +0,0 @@
-#!/bin/bash
-
-# This script downloads GPT-J model files that have already been converted to ggml format.
-# This way you don't have to convert them yourself.
-#
-# If you want to download the original GPT-J model files, use the "download-model.sh" script instead.
-
-#src="https://ggml.ggerganov.com"
-#pfx="ggml-model-gpt-j"
-
-src="https://huggingface.co/ggerganov/ggml"
-pfx="resolve/main/ggml-model-gpt-j"
-
-ggml_path=$(dirname $(realpath $0))
-
-# GPT-J models
-models=( "6B" )
-
-# list available models
-function list_models {
-    printf "\n"
-    printf "  Available models:"
-    for model in "${models[@]}"; do
-        printf " $model"
-    done
-    printf "\n\n"
-}
-
-if [ "$#" -ne 1 ]; then
-    printf "Usage: $0 <model>\n"
-    list_models
-
-    exit 1
-fi
-
-model=$1
-
-if [[ ! " ${models[@]} " =~ " ${model} " ]]; then
-    printf "Invalid model: $model\n"
-    list_models
-
-    exit 1
-fi
-
-# download ggml model
-
-printf "Downloading ggml model $model ...\n"
-
-mkdir -p models/gpt-j-$model
-
-if [ -x "$(command -v wget)" ]; then
-    wget --quiet --show-progress -O models/gpt-j-$model/ggml-model.bin $src/$pfx-$model.bin
-elif [ -x "$(command -v curl)" ]; then
-    curl -L --output models/gpt-j-$model/ggml-model.bin $src/$pfx-$model.bin
-else
-    printf "Either wget or curl is required to download models.\n"
-    exit 1
-fi
-
-if [ $? -ne 0 ]; then
-    printf "Failed to download ggml model $model \n"
-    printf "Please try again later or download the original GPT-J model files and convert them yourself.\n"
-    exit 1
-fi
-
-printf "Done! Model '$model' saved in 'models/gpt-j-$model/ggml-model.bin'\n"
-printf "You can now use it like this:\n\n"
-printf "  $ ./bin/gpt-j -m models/gpt-j-$model/ggml-model.bin -p \"This is an example\"\n"
-printf "\n"
diff --git a/ggml/examples/gpt-j/download-model.sh b/ggml/examples/gpt-j/download-model.sh
deleted file mode 100644
index c773baf..0000000
--- a/ggml/examples/gpt-j/download-model.sh
+++ /dev/null
@@ -1,11 +0,0 @@
-#!/bin/bash
-
-printf "To obtain the GPT-J 6B model files, please visit: https://huggingface.co/EleutherAI/gpt-j-6B\n\n"
-
-printf "The model is very big. For example, the reposirory above is 72GB in size.\n"
-printf "If you are sure that you want to clone it, simply run the following command:\n\n"
-
-printf " $ git clone https://huggingface.co/EleutherAI/gpt-j-6B models/gpt-j-6B\n\n"
-
-printf "Alternatively, use the 'download-ggml-model.sh' script to download a 12GB ggml version of the model.\n"
-printf "This version is enough to run inference using the ggml library.\n\n"
diff --git a/ggml/examples/gpt-j/main.cpp b/ggml/examples/gpt-j/main.cpp
deleted file mode 100644
index f29c357..0000000
--- a/ggml/examples/gpt-j/main.cpp
+++ /dev/null
@@ -1,754 +0,0 @@
-#include "ggml/ggml.h"
-
-#include "common.h"
-#include "common-ggml.h"
-
-#include <cassert>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <fstream>
-#include <map>
-#include <string>
-#include <vector>
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-
-// default hparams (GPT-J 6B)
-struct gptj_hparams {
-    int32_t n_vocab = 50400;
-    int32_t n_ctx   = 2048;
-    int32_t n_embd  = 4096;
-    int32_t n_head  = 16;
-    int32_t n_layer = 28;
-    int32_t n_rot   = 64;
-    int32_t ftype   = 1;
-    float   eps     = 1e-5f;
-};
-
-struct gptj_layer {
-    // normalization
-    struct ggml_tensor * ln_1_g;
-    struct ggml_tensor * ln_1_b;
-
-    // attention
-    struct ggml_tensor * c_attn_q_proj_w;
-    struct ggml_tensor * c_attn_k_proj_w;
-    struct ggml_tensor * c_attn_v_proj_w;
-
-    struct ggml_tensor * c_attn_proj_w;
-
-    // ff
-    struct ggml_tensor * c_mlp_fc_w;
-    struct ggml_tensor * c_mlp_fc_b;
-
-    struct ggml_tensor * c_mlp_proj_w;
-    struct ggml_tensor * c_mlp_proj_b;
-};
-
-struct gptj_model {
-    gptj_hparams hparams;
-
-    // normalization
-    struct ggml_tensor * ln_f_g;
-    struct ggml_tensor * ln_f_b;
-
-    struct ggml_tensor * wte; // position embedding
-
-    struct ggml_tensor * lmh_g; // language model head
-    struct ggml_tensor * lmh_b; // language model bias
-
-    std::vector<gptj_layer> layers;
-
-    // key + value memory
-    struct ggml_tensor * memory_k;
-    struct ggml_tensor * memory_v;
-
-    //
-    struct ggml_context * ctx;
-    std::map<std::string, struct ggml_tensor *> tensors;
-};
-
-// load the model's weights from a file
-bool gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab & vocab) {
-    printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
-
-    auto fin = std::ifstream(fname, std::ios::binary);
-    if (!fin) {
-        fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
-        return false;
-    }
-
-    // verify magic
-    {
-        uint32_t magic;
-        fin.read((char *) &magic, sizeof(magic));
-        if (magic != GGML_FILE_MAGIC) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
-            return false;
-        }
-    }
-
-    // load hparams
-    {
-        auto & hparams = model.hparams;
-
-        fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
-        fin.read((char *) &hparams.n_ctx,   sizeof(hparams.n_ctx));
-        fin.read((char *) &hparams.n_embd,  sizeof(hparams.n_embd));
-        fin.read((char *) &hparams.n_head,  sizeof(hparams.n_head));
-        fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-        fin.read((char *) &hparams.n_rot,   sizeof(hparams.n_rot));
-        fin.read((char *) &hparams.ftype,   sizeof(hparams.ftype));
-
-        const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
-
-        printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
-        printf("%s: n_ctx   = %d\n", __func__, hparams.n_ctx);
-        printf("%s: n_embd  = %d\n", __func__, hparams.n_embd);
-        printf("%s: n_head  = %d\n", __func__, hparams.n_head);
-        printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
-        printf("%s: n_rot   = %d\n", __func__, hparams.n_rot);
-        printf("%s: ftype   = %d\n", __func__, hparams.ftype);
-        printf("%s: qntvr   = %d\n", __func__, qntvr);
-
-        hparams.ftype %= GGML_QNT_VERSION_FACTOR;
-    }
-
-    // load vocab
-    {
-        int32_t n_vocab = 0;
-        fin.read((char *) &n_vocab, sizeof(n_vocab));
-
-        if (n_vocab != model.hparams.n_vocab) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
-                    __func__, fname.c_str(), n_vocab, model.hparams.n_vocab);
-            return false;
-        }
-
-        std::string word;
-        std::vector<char> buf(128);
-
-        for (int i = 0; i < n_vocab; i++) {
-            uint32_t len;
-            fin.read((char *) &len, sizeof(len));
-
-            buf.resize(len);
-            fin.read((char *) buf.data(), len);
-            word.assign(buf.data(), len);
-
-            vocab.token_to_id[word] = i;
-            vocab.id_to_token[i] = word;
-        }
-    }
-
-    // for the big tensors, we have the option to store the data in 16-bit floats or quantized
-    // in order to save memory and also to speed up the computation
-    ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
-    if (wtype == GGML_TYPE_COUNT) {
-        fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
-                __func__, fname.c_str(), model.hparams.ftype);
-        return false;
-    }
-
-    auto & ctx = model.ctx;
-
-    size_t ctx_size = 0;
-
-    {
-        const auto & hparams = model.hparams;
-
-        const int n_embd  = hparams.n_embd;
-        const int n_layer = hparams.n_layer;
-        const int n_ctx   = hparams.n_ctx;
-        const int n_vocab = hparams.n_vocab;
-
-        ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_g
-        ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_b
-
-        ctx_size += ggml_row_size(wtype, n_embd*n_vocab); // wte
-
-        ctx_size += ggml_row_size(wtype,         n_embd*n_vocab); // lmh_g
-        ctx_size += ggml_row_size(GGML_TYPE_F32,        n_vocab); // lmh_b
-
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_g
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_b
-
-        ctx_size += n_layer*(ggml_row_size(wtype, n_embd*n_embd)); // c_attn_q_proj_w
-        ctx_size += n_layer*(ggml_row_size(wtype, n_embd*n_embd)); // c_attn_k_proj_w
-        ctx_size += n_layer*(ggml_row_size(wtype, n_embd*n_embd)); // c_attn_v_proj_w
-
-        ctx_size += n_layer*(ggml_row_size(wtype, n_embd*n_embd)); // c_attn_proj_w
-
-        ctx_size += n_layer*(ggml_row_size(wtype,         4*n_embd*n_embd)); // c_mlp_fc_w
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 4*n_embd));        // c_mlp_fc_b
-
-        ctx_size += n_layer*(ggml_row_size(wtype,         4*n_embd*n_embd)); // c_mlp_proj_w
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32,   n_embd));        // c_mlp_proj_b
-
-        ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F16, n_embd); // memory_k
-        ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F16, n_embd); // memory_v
-
-        ctx_size += (5 + 10*n_layer)*512; // object overhead
-
-        printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
-    }
-
-    // create the ggml context
-    {
-        struct ggml_init_params params = {
-            /*.mem_size   =*/ ctx_size,
-            /*.mem_buffer =*/ NULL,
-            /*.no_alloc   =*/ false,
-        };
-
-        model.ctx = ggml_init(params);
-        if (!model.ctx) {
-            fprintf(stderr, "%s: ggml_init() failed\n", __func__);
-            return false;
-        }
-    }
-
-    // prepare memory for the weights
-    {
-        const auto & hparams = model.hparams;
-
-        const int n_embd  = hparams.n_embd;
-        const int n_layer = hparams.n_layer;
-        const int n_vocab = hparams.n_vocab;
-
-        model.layers.resize(n_layer);
-
-        model.wte    = ggml_new_tensor_2d(ctx, wtype,         n_embd, n_vocab);
-
-        model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-        model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-
-        model.lmh_g  = ggml_new_tensor_2d(ctx, wtype,         n_embd, n_vocab);
-        model.lmh_b  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_vocab);
-
-        // map by name
-        model.tensors["transformer.wte.weight"] = model.wte;
-
-        model.tensors["transformer.ln_f.weight"] = model.ln_f_g;
-        model.tensors["transformer.ln_f.bias"]   = model.ln_f_b;
-
-        model.tensors["lm_head.weight"] = model.lmh_g;
-        model.tensors["lm_head.bias"]   = model.lmh_b;
-
-        for (int i = 0; i < n_layer; ++i) {
-            auto & layer = model.layers[i];
-
-            layer.ln_1_g          = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-            layer.ln_1_b          = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            layer.c_attn_q_proj_w = ggml_new_tensor_2d(ctx, wtype,           n_embd,   n_embd);
-            layer.c_attn_k_proj_w = ggml_new_tensor_2d(ctx, wtype,           n_embd,   n_embd);
-            layer.c_attn_v_proj_w = ggml_new_tensor_2d(ctx, wtype,           n_embd,   n_embd);
-
-            layer.c_attn_proj_w   = ggml_new_tensor_2d(ctx, wtype,           n_embd,   n_embd);
-
-            layer.c_mlp_fc_w      = ggml_new_tensor_2d(ctx, wtype,           n_embd, 4*n_embd);
-            layer.c_mlp_fc_b      = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
-
-            layer.c_mlp_proj_w    = ggml_new_tensor_2d(ctx, wtype,         4*n_embd,   n_embd);
-            layer.c_mlp_proj_b    = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            // map by name
-            model.tensors["transformer.h." + std::to_string(i) + ".ln_1.weight"]          = layer.ln_1_g;
-            model.tensors["transformer.h." + std::to_string(i) + ".ln_1.bias"]            = layer.ln_1_b;
-
-            model.tensors["transformer.h." + std::to_string(i) + ".attn.q_proj.weight"]   = layer.c_attn_q_proj_w;
-            model.tensors["transformer.h." + std::to_string(i) + ".attn.k_proj.weight"]   = layer.c_attn_k_proj_w;
-            model.tensors["transformer.h." + std::to_string(i) + ".attn.v_proj.weight"]   = layer.c_attn_v_proj_w;
-
-            model.tensors["transformer.h." + std::to_string(i) + ".attn.out_proj.weight"] = layer.c_attn_proj_w;
-
-            model.tensors["transformer.h." + std::to_string(i) + ".mlp.fc_in.weight"]     = layer.c_mlp_fc_w;
-            model.tensors["transformer.h." + std::to_string(i) + ".mlp.fc_in.bias"]       = layer.c_mlp_fc_b;
-
-            model.tensors["transformer.h." + std::to_string(i) + ".mlp.fc_out.weight"]    = layer.c_mlp_proj_w;
-            model.tensors["transformer.h." + std::to_string(i) + ".mlp.fc_out.bias"]      = layer.c_mlp_proj_b;
-        }
-    }
-
-    // key + value memory
-    {
-        const auto & hparams = model.hparams;
-
-        const int n_embd  = hparams.n_embd;
-        const int n_layer = hparams.n_layer;
-        const int n_ctx   = hparams.n_ctx;
-
-        const int n_mem      = n_layer*n_ctx;
-        const int n_elements = n_embd*n_mem;
-
-        model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
-        model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
-
-        const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
-
-        printf("%s: memory_size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem);
-    }
-
-    // load weights
-    {
-        int n_tensors = 0;
-        size_t total_size = 0;
-
-        printf("%s: ", __func__);
-
-        while (true) {
-            int32_t n_dims;
-            int32_t length;
-            int32_t ttype;
-
-            fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
-            fin.read(reinterpret_cast<char *>(&length), sizeof(length));
-            fin.read(reinterpret_cast<char *>(&ttype),  sizeof(ttype));
-
-            if (fin.eof()) {
-                break;
-            }
-
-            int32_t nelements = 1;
-            int32_t ne[2] = { 1, 1 };
-            for (int i = 0; i < n_dims; ++i) {
-                fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
-                nelements *= ne[i];
-            }
-
-            std::string name(length, 0);
-            fin.read(&name[0], length);
-
-            if (model.tensors.find(name) == model.tensors.end()) {
-                fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str());
-                return false;
-            }
-
-            auto tensor = model.tensors[name];
-            if (ggml_nelements(tensor) != nelements) {
-                fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.c_str());
-                return false;
-            }
-
-            if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
-                fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
-                        __func__, name.c_str(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]);
-                return false;
-            }
-
-            // for debugging
-            if (0) {
-                printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
-            }
-
-            const size_t bpe = ggml_type_size(ggml_type(ttype));
-
-            if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
-                fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
-                        __func__, name.c_str(), ggml_nbytes(tensor), nelements*bpe);
-                return false;
-            }
-
-            fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
-
-            //printf("%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.c_str(), ne[0], ne[1], ttype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
-            total_size += ggml_nbytes(tensor);
-            if (++n_tensors % 8 == 0) {
-                printf(".");
-                fflush(stdout);
-            }
-        }
-
-        printf(" done\n");
-
-        printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors);
-    }
-
-    fin.close();
-
-    return true;
-}
-
-// evaluate the transformer
-//
-//   - model:     the model
-//   - n_threads: number of threads to use
-//   - n_past:    the context size so far
-//   - embd_inp:  the embeddings of the tokens in the context
-//   - embd_w:    the predicted logits for the next token
-//
-// The GPT-J model requires about 16MB of memory per input token.
-//
-bool gptj_eval(
-        const gptj_model & model,
-        const int n_threads,
-        const int n_past,
-        const std::vector<gpt_vocab::id> & embd_inp,
-              std::vector<float>         & embd_w,
-              size_t                     & mem_per_token) {
-    const int N = embd_inp.size();
-
-    const auto & hparams = model.hparams;
-
-    const int n_embd  = hparams.n_embd;
-    const int n_layer = hparams.n_layer;
-    const int n_ctx   = hparams.n_ctx;
-    const int n_head  = hparams.n_head;
-    const int n_vocab = hparams.n_vocab;
-    const int n_rot   = hparams.n_rot;
-
-    static size_t buf_size = 256u*1024*1024;
-    static void * buf = malloc(buf_size);
-
-    if (mem_per_token > 0 && mem_per_token*N > buf_size) {
-        const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
-        //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
-
-        // reallocate
-        buf_size = buf_size_new;
-        buf = realloc(buf, buf_size);
-        if (buf == nullptr) {
-            fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
-            return false;
-        }
-    }
-
-    struct ggml_init_params params = {
-        /*.mem_size   =*/ buf_size,
-        /*.mem_buffer =*/ buf,
-        /*.no_alloc   =*/ false,
-    };
-
-    struct ggml_context * ctx0 = ggml_init(params);
-    struct ggml_cgraph * gf = ggml_new_graph(ctx0);
-
-    // KQ_pos - contains the positions
-    struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
-    int * data = (int *) KQ_pos->data;
-    for (int i = 0; i < N; ++i) {
-        data[i] = n_past + i;
-    }
-
-    struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
-    memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
-
-    // wte
-    struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.wte, embd);
-
-    for (int il = 0; il < n_layer; ++il) {
-        struct ggml_tensor * cur;
-
-        // norm
-        {
-            cur = ggml_norm(ctx0, inpL, hparams.eps);
-
-            // cur = ln_1_g*cur + ln_1_b
-            cur = ggml_add(ctx0,
-                    ggml_mul(ctx0,
-                        ggml_repeat(ctx0, model.layers[il].ln_1_g, cur),
-                        cur),
-                    ggml_repeat(ctx0, model.layers[il].ln_1_b, cur));
-        }
-
-        struct ggml_tensor * inpSA = cur;
-
-        // self-attention
-        {
-            struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].c_attn_q_proj_w, cur), n_embd/n_head, n_head, N), KQ_pos, n_rot, 0, 0);
-            struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].c_attn_k_proj_w, cur), n_embd/n_head, n_head, N), KQ_pos, n_rot, 0, 0);
-
-            // store key and value to memory
-            {
-                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_mul_mat(ctx0, model.layers[il].c_attn_v_proj_w, cur));
-
-                struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
-                struct ggml_tensor * v = ggml_view_2d(ctx0, model.memory_v, N, n_embd,
-                        (   n_ctx)*ggml_element_size(model.memory_v),
-                        (il*n_ctx)*ggml_element_size(model.memory_v)*n_embd + n_past*ggml_element_size(model.memory_v));
-
-                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
-                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
-            }
-
-            // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
-            struct ggml_tensor * Q =
-                ggml_permute(ctx0,
-                        Qcur,
-                        0, 2, 1, 3);
-
-            // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
-            struct ggml_tensor * K =
-                ggml_permute(ctx0,
-                        ggml_reshape_3d(ctx0,
-                            ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
-                            n_embd/n_head, n_head, n_past + N),
-                        0, 2, 1, 3);
-
-            // K * Q
-            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
-
-            // KQ_scaled = KQ / sqrt(n_embd/n_head)
-            struct ggml_tensor * KQ_scaled =
-                ggml_scale_inplace(ctx0,
-                        KQ,
-                        1.0f/sqrt(float(n_embd)/n_head));
-
-            // KQ_masked = mask_past(KQ_scaled)
-            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
-
-            // KQ = soft_max(KQ_masked)
-            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
-
-            // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
-            struct ggml_tensor * V =
-                ggml_view_3d(ctx0, model.memory_v,
-                        n_past + N, n_embd/n_head, n_head,
-                        n_ctx*ggml_element_size(model.memory_v),
-                        n_ctx*ggml_element_size(model.memory_v)*n_embd/n_head,
-                        il*n_ctx*ggml_element_size(model.memory_v)*n_embd);
-
-            // KQV = transpose(V) * KQ_soft_max
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
-
-            // KQV_merged = KQV.permute(0, 2, 1, 3)
-            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
-
-            // cur = KQV_merged.contiguous().view(n_embd, N)
-            cur = ggml_cpy(ctx0,
-                    KQV_merged,
-                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
-
-            // projection (no bias)
-            cur = ggml_mul_mat(ctx0,
-                    model.layers[il].c_attn_proj_w,
-                    cur);
-        }
-
-        struct ggml_tensor * inpFF = cur;
-
-        // feed-forward network
-        // this is independent of the self-attention result, so it could be done in parallel to the self-attention
-        {
-            // note here we pass inpSA instead of cur
-            cur = ggml_mul_mat(ctx0,
-                    model.layers[il].c_mlp_fc_w,
-                    inpSA);
-
-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur),
-                    cur);
-
-            // GELU activation
-            cur = ggml_gelu(ctx0, cur);
-
-            // projection
-            // cur = proj_w*cur + proj_b
-            cur = ggml_mul_mat(ctx0,
-                    model.layers[il].c_mlp_proj_w,
-                    cur);
-
-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur),
-                    cur);
-        }
-
-        // self-attention + FF
-        cur  = ggml_add(ctx0, cur, inpFF);
-
-        // input for next layer
-        inpL = ggml_add(ctx0, cur, inpL);
-    }
-
-    // norm
-    {
-        inpL = ggml_norm(ctx0, inpL, hparams.eps);
-
-        // inpL = ln_f_g*inpL + ln_f_b
-        inpL = ggml_add(ctx0,
-                ggml_mul(ctx0,
-                    ggml_repeat(ctx0, model.ln_f_g, inpL),
-                    inpL),
-                ggml_repeat(ctx0, model.ln_f_b, inpL));
-    }
-
-    // lm_head
-    {
-        inpL = ggml_mul_mat(ctx0, model.lmh_g, inpL);
-
-        inpL = ggml_add(ctx0,
-                ggml_repeat(ctx0, model.lmh_b, inpL),
-                inpL);
-    }
-
-    // logits -> probs
-    //inpL = ggml_soft_max_inplace(ctx0, inpL);
-
-    // run the computation
-    ggml_build_forward_expand(gf, inpL);
-    ggml_graph_compute_with_ctx(ctx0, gf, n_threads);
-
-    //if (n_past%100 == 0) {
-    //    ggml_graph_print   (&gf);
-    //    ggml_graph_dump_dot(&gf, NULL, "gpt-j.dot");
-    //}
-
-    //embd_w.resize(n_vocab*N);
-    //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
-
-    // return result for just the last token
-    embd_w.resize(n_vocab);
-    memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
-
-    if (mem_per_token == 0) {
-        mem_per_token = ggml_used_mem(ctx0)/N;
-    }
-    //printf("used_mem = %zu\n", ggml_used_mem(ctx0));
-
-    ggml_free(ctx0);
-
-    return true;
-}
-
-int main(int argc, char ** argv) {
-    ggml_time_init();
-
-    const int64_t t_main_start_us = ggml_time_us();
-
-    gpt_params params;
-    params.model = "models/gpt-j-6B/ggml-model.bin";
-
-    if (gpt_params_parse(argc, argv, params) == false) {
-        return 1;
-    }
-
-    if (params.seed < 0) {
-        params.seed = time(NULL);
-    }
-
-    printf("%s: seed = %d\n", __func__, params.seed);
-
-    std::mt19937 rng(params.seed);
-    if (params.prompt.empty()) {
-        params.prompt = gpt_random_prompt(rng);
-    }
-
-    int64_t t_load_us = 0;
-
-    gpt_vocab vocab;
-    gptj_model model;
-
-    // load the model
-    {
-        const int64_t t_start_us = ggml_time_us();
-
-        if (!gptj_model_load(params.model, model, vocab)) {
-            fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
-            return 1;
-        }
-
-        t_load_us = ggml_time_us() - t_start_us;
-
-        test_gpt_tokenizer(vocab, params.token_test);
-    }
-
-    int n_past = 0;
-
-    int64_t t_sample_us  = 0;
-    int64_t t_predict_us = 0;
-
-    std::vector<float> logits;
-
-    // tokenize the prompt
-    std::vector<gpt_vocab::id> embd_inp = ::gpt_tokenize(vocab, params.prompt);
-
-    params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());
-
-    printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
-    printf("\n");
-
-    std::vector<gpt_vocab::id> embd;
-
-    // determine the required inference memory per token:
-    size_t mem_per_token = 0;
-    gptj_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
-
-    for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) {
-        // predict
-        if (embd.size() > 0) {
-            const int64_t t_start_us = ggml_time_us();
-
-            if (!gptj_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) {
-                printf("Failed to predict\n");
-                return 1;
-            }
-
-            t_predict_us += ggml_time_us() - t_start_us;
-        }
-
-        n_past += embd.size();
-        embd.clear();
-
-        if (i >= embd_inp.size()) {
-            // sample next token
-            const int   top_k = params.top_k;
-            const float top_p = params.top_p;
-            const float temp  = params.temp;
-
-            const int n_vocab = model.hparams.n_vocab;
-
-            gpt_vocab::id id = 0;
-
-            {
-                const int64_t t_start_sample_us = ggml_time_us();
-
-                id = gpt_sample_top_k_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng);
-
-                t_sample_us += ggml_time_us() - t_start_sample_us;
-            }
-
-            // add it to the context
-            embd.push_back(id);
-        } else {
-            // if here, it means we are still processing the input prompt
-            for (size_t k = i; k < embd_inp.size(); k++) {
-                embd.push_back(embd_inp[k]);
-                if (int32_t(embd.size()) > params.n_batch) {
-                    break;
-                }
-            }
-            i += embd.size() - 1;
-        }
-
-        // display text
-        for (auto id : embd) {
-            printf("%s", vocab.id_to_token[id].c_str());
-        }
-        fflush(stdout);
-
-        // end of text token
-        if (embd.back() == 50256) {
-            break;
-        }
-    }
-
-    // report timing
-    {
-        const int64_t t_main_end_us = ggml_time_us();
-
-        printf("\n\n");
-        printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token);
-        printf("%s:     load time = %8.2f ms\n", __func__, t_load_us/1000.0f);
-        printf("%s:   sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f);
-        printf("%s:  predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past);
-        printf("%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
-    }
-
-    ggml_free(model.ctx);
-
-    return 0;
-}
diff --git a/ggml/examples/gpt-j/quantize.cpp b/ggml/examples/gpt-j/quantize.cpp
deleted file mode 100644
index 437053b..0000000
--- a/ggml/examples/gpt-j/quantize.cpp
+++ /dev/null
@@ -1,182 +0,0 @@
-#include "ggml/ggml.h"
-
-#include "common.h"
-#include "common-ggml.h"
-
-#include <cassert>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <fstream>
-#include <map>
-#include <string>
-#include <vector>
-#include <regex>
-
-// default hparams (GPT-J 6B)
-struct gptj_hparams {
-    int32_t n_vocab = 50400;
-    int32_t n_ctx   = 2048;
-    int32_t n_embd  = 4096;
-    int32_t n_head  = 16;
-    int32_t n_layer = 28;
-    int32_t n_rot   = 64;
-    int32_t ftype   = 1;
-};
-
-// quantize a model
-bool gptj_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) {
-    gpt_vocab vocab;
-
-    printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
-
-    auto finp = std::ifstream(fname_inp, std::ios::binary);
-    if (!finp) {
-        fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__, fname_inp.c_str());
-        return false;
-    }
-
-    auto fout = std::ofstream(fname_out, std::ios::binary);
-    if (!fout) {
-        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname_out.c_str());
-        return false;
-    }
-
-    // verify magic
-    {
-        uint32_t magic;
-        finp.read((char *) &magic, sizeof(magic));
-        if (magic != GGML_FILE_MAGIC) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str());
-            return false;
-        }
-
-        fout.write((char *) &magic, sizeof(magic));
-    }
-
-    gptj_hparams hparams;
-
-    // load hparams
-    {
-        finp.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
-        finp.read((char *) &hparams.n_ctx,   sizeof(hparams.n_ctx));
-        finp.read((char *) &hparams.n_embd,  sizeof(hparams.n_embd));
-        finp.read((char *) &hparams.n_head,  sizeof(hparams.n_head));
-        finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-        finp.read((char *) &hparams.n_rot,   sizeof(hparams.n_rot));
-        finp.read((char *) &hparams.ftype,   sizeof(hparams.ftype));
-
-        const int32_t qntvr_src =    hparams.ftype / GGML_QNT_VERSION_FACTOR;
-        const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
-
-        printf("%s: n_vocab     = %d\n", __func__, hparams.n_vocab);
-        printf("%s: n_ctx       = %d\n", __func__, hparams.n_ctx);
-        printf("%s: n_embd      = %d\n", __func__, hparams.n_embd);
-        printf("%s: n_head      = %d\n", __func__, hparams.n_head);
-        printf("%s: n_layer     = %d\n", __func__, hparams.n_layer);
-        printf("%s: ftype (src) = %d\n", __func__, hparams.ftype);
-        printf("%s: qntvr (src) = %d\n", __func__, qntvr_src);
-        printf("%s: ftype (dst) = %d\n", __func__, ftype_dst);
-        printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
-
-        fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
-        fout.write((char *) &hparams.n_ctx,   sizeof(hparams.n_ctx));
-        fout.write((char *) &hparams.n_embd,  sizeof(hparams.n_embd));
-        fout.write((char *) &hparams.n_head,  sizeof(hparams.n_head));
-        fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-        fout.write((char *) &hparams.n_rot,   sizeof(hparams.n_rot));
-        fout.write((char *) &ftype_dst,       sizeof(ftype_dst));
-    }
-
-    // load vocab
-    {
-        int32_t n_vocab = 0;
-        finp.read ((char *) &n_vocab, sizeof(n_vocab));
-        fout.write((char *) &n_vocab, sizeof(n_vocab));
-
-        if (n_vocab != hparams.n_vocab) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
-                    __func__, fname_inp.c_str(), n_vocab, hparams.n_vocab);
-            return false;
-        }
-
-        std::string word;
-        for (int i = 0; i < n_vocab; i++) {
-            uint32_t len;
-            finp.read ((char *) &len, sizeof(len));
-            fout.write((char *) &len, sizeof(len));
-
-            word.resize(len);
-            finp.read ((char *) word.data(), len);
-            fout.write((char *) word.data(), len);
-
-            vocab.token_to_id[word] = i;
-            vocab.id_to_token[i] = word;
-        }
-    }
-
-    // regexes of tensor names to be quantized
-    const std::vector<std::string> to_quant = {
-        ".*weight",
-    };
-
-    if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) {
-        fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str());
-        return false;
-    }
-
-    finp.close();
-    fout.close();
-
-    return true;
-}
-
-// usage:
-//  ./gpt-2-quantize models/gpt-2-117M/ggml-model.bin models/gpt-2-117M/ggml-model-quant.bin type
-//
-int main(int argc, char ** argv) {
-    if (argc != 4) {
-        fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
-        ggml_print_ftypes(stderr);
-        return 1;
-    }
-
-    // needed to initialize f16 tables
-    {
-        struct ggml_init_params params = { 0, NULL, false };
-        struct ggml_context * ctx = ggml_init(params);
-        ggml_free(ctx);
-    }
-
-    const std::string fname_inp = argv[1];
-    const std::string fname_out = argv[2];
-
-    const ggml_ftype ftype = ggml_parse_ftype(argv[3]);
-
-    const int64_t t_main_start_us = ggml_time_us();
-
-    int64_t t_quantize_us = 0;
-
-    // load the model
-    {
-        const int64_t t_start_us = ggml_time_us();
-
-        if (!gptj_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
-            fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
-            return 1;
-        }
-
-        t_quantize_us = ggml_time_us() - t_start_us;
-    }
-
-    // report timing
-    {
-        const int64_t t_main_end_us = ggml_time_us();
-
-        printf("\n");
-        printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f);
-        printf("%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
-    }
-
-    return 0;
-}
diff --git a/ggml/examples/gpt-neox/CMakeLists.txt b/ggml/examples/gpt-neox/CMakeLists.txt
deleted file mode 100644
index 21a319b..0000000
--- a/ggml/examples/gpt-neox/CMakeLists.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-#
-# gpt-neox
-
-set(TEST_TARGET gpt-neox)
-add_executable(${TEST_TARGET} main.cpp)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)
-
-#
-# gpt-neox-quantize
-
-set(TEST_TARGET gpt-neox-quantize)
-add_executable(${TEST_TARGET} quantize.cpp)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)
diff --git a/ggml/examples/gpt-neox/README.md b/ggml/examples/gpt-neox/README.md
deleted file mode 100644
index 64c6d7c..0000000
--- a/ggml/examples/gpt-neox/README.md
+++ /dev/null
@@ -1,110 +0,0 @@
-# GPT-NeoX
-
-Transformer architecture: GPT-NeoX
-
-Ref: https://github.com/stability-AI/stableLM/#stablelm-alpha
-
-## Usage
-
-```bash
-# get the repo and build it
-git clone https://github.com/ggerganov/ggml
-cd ggml
-mkdir build && cd build
-cmake ..
-make -j
-
-# get the StableLM 3B Alpha model
-git clone https://huggingface.co/stabilityai/gpt_neox-base-alpha-3b
-
-# install Python dependencies
-python3 -m pip install -r ../requirements.txt
-
-# convert model to FP16
-python3 ../examples/gpt-neox/convert-h5-to-ggml.py ./stablelm-base-alpha-3b/ 1
-
-# run inference using FP16 precision
-make -j && ./bin/gpt-neox -m ./stablelm-base-alpha-3b/ggml-model-f16.bin -p "I believe the meaning of life is" -t 8 -n 64
-
-main: seed = 1681940611
-gpt_neox_model_load: loading model from 'models/stablelm-base-alpha-3b/ggml-model-f16.bin' - please wait ...
-gpt_neox_model_load: n_vocab = 50688
-gpt_neox_model_load: n_ctx   = 4096
-gpt_neox_model_load: n_embd  = 4096
-gpt_neox_model_load: n_head  = 32
-gpt_neox_model_load: n_layer = 16
-gpt_neox_model_load: n_rot   = 32
-gpt_neox_model_load: ftype   = 1
-gpt_neox_model_load: ggml ctx size = 10011.10 MB
-gpt_neox_model_load: memory_size =  2048.00 MB, n_mem = 65536
-gpt_neox_model_load: ................................ done
-gpt_neox_model_load: model size =  6939.28 MB / num tensors = 260
-main: number of tokens in prompt = 7
-main: token[0] =     42, I
-main: token[1] =   2868,  believe
-main: token[2] =    253,  the
-main: token[3] =   4495,  meaning
-main: token[4] =    273,  of
-main: token[5] =   1495,  life
-main: token[6] =    310,  is
-
-I believe the meaning of life is to grow, to find a way, to love, to find an appreciation for life, and to live it with all of its beauty.
-
-For I am the child of God. I am the offspring of God's love. I am the offspring of the light of the world. I am the offspring of the
-
-main: mem per token = 12186760 bytes
-main:     load time =  2118.55 ms
-main:   sample time =     9.59 ms
-main:  predict time =  4474.07 ms / 63.92 ms per token
-main:    total time =  6911.26 ms
-```
-
-## 5-bit integer quantization mode
-
-```bash
-# quantize the model to 5-bits using Q5_0 quantization
-./bin/gpt-neox-quantize ./stablelm-base-alpha-3b/ggml-model-f16.bin ./stablelm-base-alpha-3b/ggml-model-q5_0.bin q5_0
-
-# run the quantized model
-./bin/gpt-neox -m ./stablelm-base-alpha-3b/ggml-model-q5_0.bin -p "I believe the meaning of life is" -t 8 -n 64
-
-main: seed = 1682021489
-gpt_neox_model_load: loading model from 'models/stablelm-base-alpha-3b/ggml-model-q5_0.bin' - please wait ...
-gpt_neox_model_load: n_vocab = 50688
-gpt_neox_model_load: n_ctx   = 4096
-gpt_neox_model_load: n_embd  = 4096
-gpt_neox_model_load: n_head  = 32
-gpt_neox_model_load: n_layer = 16
-gpt_neox_model_load: n_rot   = 32
-gpt_neox_model_load: ftype   = 6
-gpt_neox_model_load: ggml ctx size = 5676.10 MB
-gpt_neox_model_load: memory_size =  1024.00 MB, n_mem = 65536
-gpt_neox_model_load: ........................ done
-gpt_neox_model_load: model size =  2604.28 MB / num tensors = 196
-main: number of tokens in prompt = 7
-main: token[0] =     42, I
-main: token[1] =   2868,  believe
-main: token[2] =    253,  the
-main: token[3] =   4495,  meaning
-main: token[4] =    273,  of
-main: token[5] =   1495,  life
-main: token[6] =    310,  is
-
-I believe the meaning of life is to love and be loved. The last three verses were enough to tie us all together. If you love someone you love them all. There are some things in this world that are just not equal in Heaven. - Be here in this moment.
-
-This world is not what is outside of us. It is what
-
-main: mem per token = 12958024 bytes
-main:     load time =   850.51 ms
-main:   sample time =     9.95 ms
-main:  predict time =  3103.81 ms / 44.34 ms per token
-main:    total time =  4177.68 ms
-
-```
-
-## Notes
-
-- No guarantees for correctness
-- The tokenizer is currently hacked - probably works only for English
-- Non-parallel residual is not supported
-- Contributions and improvements are welcome
diff --git a/ggml/examples/gpt-neox/convert-h5-to-ggml.py b/ggml/examples/gpt-neox/convert-h5-to-ggml.py
deleted file mode 100644
index f11a4cb..0000000
--- a/ggml/examples/gpt-neox/convert-h5-to-ggml.py
+++ /dev/null
@@ -1,107 +0,0 @@
-import sys
-import struct
-import json
-import numpy as np
-
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-if len(sys.argv) < 3:
-    print("Usage: convert-h5-to-ggml.py dir-model [use-f32]\n")
-    print("  ftype == 0 -> float32")
-    print("  ftype == 1 -> float16")
-    sys.exit(1)
-
-# output in the same directory as the model
-dir_model = sys.argv[1]
-fname_out = sys.argv[1] + "/ggml-model.bin"
-
-with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
-    hparams = json.load(f)
-
-# possible data types
-#   ftype == 0 -> float32
-#   ftype == 1 -> float16
-#
-# map from ftype to string
-ftype_str = ["f32", "f16"]
-
-ftype = 1
-if len(sys.argv) > 2:
-    ftype = int(sys.argv[2])
-    if ftype < 0 or ftype > 1:
-        print("Invalid ftype: " + str(ftype))
-        sys.exit(1)
-    fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin"
-
-
-tokenizer = AutoTokenizer.from_pretrained(dir_model)
-model = AutoModelForCausalLM.from_pretrained(dir_model, low_cpu_mem_usage=True)
-
-list_vars = model.state_dict()
-for name in list_vars.keys():
-    print(name, list_vars[name].shape, list_vars[name].dtype)
-
-fout = open(fname_out, "wb")
-
-print(hparams)
-
-fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
-fout.write(struct.pack("i", hparams["vocab_size"]))
-fout.write(struct.pack("i", hparams["max_position_embeddings"]))
-fout.write(struct.pack("i", hparams["hidden_size"]))
-fout.write(struct.pack("i", hparams["num_attention_heads"]))
-fout.write(struct.pack("i", hparams["num_hidden_layers"]))
-fout.write(struct.pack("i", int(hparams["rotary_pct"]*(hparams["hidden_size"]//hparams["num_attention_heads"]))))
-fout.write(struct.pack("i", hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True))
-fout.write(struct.pack("i", ftype))
-
-# TODO: temporary hack to not deal with implementing the tokenizer
-for i in range(hparams["vocab_size"]):
-    text = tokenizer.decode([i]).encode('utf-8')
-    fout.write(struct.pack("i", len(text)))
-    fout.write(text)
-
-for name in list_vars.keys():
-    data = list_vars[name].squeeze().numpy()
-    print("Processing variable: " + name + " with shape: ", data.shape)
-
-    # we don't need these
-    if name.endswith(".attention.masked_bias") or     \
-       name.endswith(".attention.bias") or \
-       name.endswith(".attention.rotary_emb.inv_freq"):
-        print("  Skipping variable: " + name)
-        continue
-
-    n_dims = len(data.shape)
-
-    # ftype == 0 -> float32, ftype == 1 -> float16
-    ftype_cur = 0
-    if ftype != 0:
-        if name[-7:] == ".weight" and n_dims == 2:
-            print("  Converting to float16")
-            data = data.astype(np.float16)
-            ftype_cur = 1
-        else:
-            print("  Converting to float32")
-            data = data.astype(np.float32)
-            ftype_cur = 0
-    else:
-        if data.dtype != np.float32:
-            print("  Converting to float32")
-            data = data.astype(np.float32)
-            ftype_cur = 0
-
-    # header
-    str = name.encode('utf-8')
-    fout.write(struct.pack("iii", n_dims, len(str), ftype_cur))
-    for i in range(n_dims):
-        fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
-    fout.write(str)
-
-    # data
-    data.tofile(fout)
-
-fout.close()
-
-print("Done. Output file: " + fname_out)
-print("")
diff --git a/ggml/examples/gpt-neox/main.cpp b/ggml/examples/gpt-neox/main.cpp
deleted file mode 100644
index 37f3c61..0000000
--- a/ggml/examples/gpt-neox/main.cpp
+++ /dev/null
@@ -1,820 +0,0 @@
-#include "ggml/ggml.h"
-
-#include "common.h"
-#include "common-ggml.h"
-
-#include <cassert>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <cinttypes>
-#include <fstream>
-#include <map>
-#include <string>
-#include <vector>
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-// default hparams (StableLM 3B)
-struct gpt_neox_hparams {
-    int32_t n_vocab = 50257;
-    int32_t n_ctx   = 4096;
-    int32_t n_embd  = 4096;
-    int32_t n_head  = 32;
-    int32_t n_layer = 16;
-    int32_t n_rot   = 32; // rotary_pct * (n_embd / n_head)
-    int32_t par_res = 1; // 1 = true, 0 = false
-    int32_t ftype   = 1;
-    float   eps     = 1e-5f;
-};
-
-struct gpt_neox_layer {
-    // pre normalization
-    struct ggml_tensor * ln_1_g;
-    struct ggml_tensor * ln_1_b;
-
-    // attention
-    struct ggml_tensor * c_attn_attn_w;
-    struct ggml_tensor * c_attn_attn_b;
-
-    struct ggml_tensor * c_attn_proj_w;
-    struct ggml_tensor * c_attn_proj_b;
-
-    // post normalization
-    struct ggml_tensor * ln_2_g;
-    struct ggml_tensor * ln_2_b;
-
-    // ff
-    struct ggml_tensor * c_mlp_fc_w;
-    struct ggml_tensor * c_mlp_fc_b;
-
-    struct ggml_tensor * c_mlp_proj_w;
-    struct ggml_tensor * c_mlp_proj_b;
-};
-
-struct gpt_neox_model {
-    gpt_neox_hparams hparams;
-
-    // normalization
-    struct ggml_tensor * ln_f_g;
-    struct ggml_tensor * ln_f_b;
-
-    struct ggml_tensor * wte; // position embedding
-
-    struct ggml_tensor * lmh_g; // language model head
-    //struct ggml_tensor * lmh_b; // language model bias
-
-    std::vector<gpt_neox_layer> layers;
-
-    // key + value memory
-    struct ggml_tensor * memory_k;
-    struct ggml_tensor * memory_v;
-
-    //
-    struct ggml_context * ctx;
-    std::map<std::string, struct ggml_tensor *> tensors;
-};
-
-// load the model's weights from a file
-bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt_vocab & vocab) {
-    printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
-
-    auto fin = std::ifstream(fname, std::ios::binary);
-    if (!fin) {
-        fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
-        return false;
-    }
-
-    // verify magic
-    {
-        uint32_t magic;
-        fin.read((char *) &magic, sizeof(magic));
-        if (magic != GGML_FILE_MAGIC) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
-            return false;
-        }
-    }
-
-    // load hparams
-    {
-        auto & hparams = model.hparams;
-
-        fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
-        fin.read((char *) &hparams.n_ctx,   sizeof(hparams.n_ctx));
-        fin.read((char *) &hparams.n_embd,  sizeof(hparams.n_embd));
-        fin.read((char *) &hparams.n_head,  sizeof(hparams.n_head));
-        fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-        fin.read((char *) &hparams.n_rot,   sizeof(hparams.n_rot));
-        fin.read((char *) &hparams.par_res, sizeof(hparams.par_res));
-        fin.read((char *) &hparams.ftype,   sizeof(hparams.ftype));
-
-        const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
-
-        printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
-        printf("%s: n_ctx   = %d\n", __func__, hparams.n_ctx);
-        printf("%s: n_embd  = %d\n", __func__, hparams.n_embd);
-        printf("%s: n_head  = %d\n", __func__, hparams.n_head);
-        printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
-        printf("%s: n_rot   = %d\n", __func__, hparams.n_rot);
-        printf("%s: par_res = %d\n", __func__, hparams.par_res);
-        printf("%s: ftype   = %d\n", __func__, hparams.ftype);
-        printf("%s: qntvr   = %d\n", __func__, qntvr);
-
-        hparams.ftype %= GGML_QNT_VERSION_FACTOR;
-    }
-
-    // load vocab
-    {
-        const int32_t n_vocab = model.hparams.n_vocab;
-
-        std::string word;
-        std::vector<char> buf(128);
-
-        for (int i = 0; i < n_vocab; i++) {
-            uint32_t len;
-            fin.read((char *) &len, sizeof(len));
-
-            buf.resize(len);
-            fin.read((char *) buf.data(), len);
-            word.assign(buf.data(), len);
-
-            vocab.token_to_id[word] = i;
-            vocab.id_to_token[i] = word;
-        }
-    }
-
-    // for the big tensors, we have the option to store the data in 16-bit floats or quantized
-    // in order to save memory and also to speed up the computation
-    ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
-    if (wtype == GGML_TYPE_COUNT) {
-        fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
-                __func__, fname.c_str(), model.hparams.ftype);
-        return false;
-    }
-
-    auto & ctx = model.ctx;
-
-    size_t ctx_size = 0;
-
-    {
-        const auto & hparams = model.hparams;
-
-        const size_t n_embd  = hparams.n_embd;
-        const size_t n_layer = hparams.n_layer;
-        const size_t n_ctx   = hparams.n_ctx;
-        const size_t n_vocab = hparams.n_vocab;
-
-        ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_g
-        ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_b
-
-        ctx_size += ggml_row_size(wtype, n_embd*n_vocab); // wte
-
-        ctx_size += ggml_row_size(wtype, n_embd*n_vocab); // lmh_g
-        //ctx_size += ggml_row_size(GGML_TYPE_F32, n_vocab); // lmh_b
-
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_g
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_b
-
-        ctx_size += n_layer*(ggml_row_size(wtype,         3*n_embd*n_embd)); // c_attn_attn_w
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 3*n_embd));        // c_attn_attn_b
-
-        ctx_size += n_layer*(ggml_row_size(wtype,         n_embd*n_embd)); // c_attn_proj_w
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd*n_embd)); // c_attn_proj_b
-
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_g
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_b
-
-        ctx_size += n_layer*(ggml_row_size(wtype,         4*n_embd*n_embd)); // c_mlp_fc_w
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 4*n_embd));        // c_mlp_fc_b
-
-        ctx_size += n_layer*(ggml_row_size(wtype,         4*n_embd*n_embd)); // c_mlp_proj_w
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32,   n_embd));        // c_mlp_proj_b
-
-        ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_k
-        ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_v
-
-        ctx_size += (6 + 16*n_layer)*1024; // object overhead
-
-        printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
-    }
-
-    // create the ggml context
-    {
-        struct ggml_init_params params = {
-            /*.mem_size   =*/ ctx_size,
-            /*.mem_buffer =*/ NULL,
-            /*.no_alloc   =*/ false,
-        };
-
-        model.ctx = ggml_init(params);
-        if (!model.ctx) {
-            fprintf(stderr, "%s: ggml_init() failed\n", __func__);
-            return false;
-        }
-    }
-
-    // prepare memory for the weights
-    {
-        const auto & hparams = model.hparams;
-
-        const int n_embd  = hparams.n_embd;
-        const int n_layer = hparams.n_layer;
-        const int n_vocab = hparams.n_vocab;
-
-        model.layers.resize(n_layer);
-
-        model.wte    = ggml_new_tensor_2d(ctx, wtype,         n_embd, n_vocab);
-
-        model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-        model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-
-        model.lmh_g  = ggml_new_tensor_2d(ctx, wtype,         n_embd, n_vocab);
-        //model.lmh_b  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_vocab);
-
-        // map by name
-        model.tensors["gpt_neox.embed_in.weight"] = model.wte;
-
-        model.tensors["gpt_neox.final_layer_norm.weight"] = model.ln_f_g;
-        model.tensors["gpt_neox.final_layer_norm.bias"]   = model.ln_f_b;
-
-        model.tensors["embed_out.weight"] = model.lmh_g;
-        //model.tensors["lm_head.bias"]   = model.lmh_b;
-
-        for (int i = 0; i < n_layer; ++i) {
-            auto & layer = model.layers[i];
-
-            layer.ln_1_g          = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-            layer.ln_1_b          = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            layer.c_attn_attn_w   = ggml_new_tensor_2d(ctx, wtype,           n_embd, 3*n_embd);
-            layer.c_attn_attn_b   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd);
-
-            layer.c_attn_proj_w   = ggml_new_tensor_2d(ctx, wtype,           n_embd,   n_embd);
-            layer.c_attn_proj_b   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            layer.ln_2_g          = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-            layer.ln_2_b          = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            layer.c_mlp_fc_w      = ggml_new_tensor_2d(ctx, wtype,           n_embd, 4*n_embd);
-            layer.c_mlp_fc_b      = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
-
-            layer.c_mlp_proj_w    = ggml_new_tensor_2d(ctx, wtype,         4*n_embd,   n_embd);
-            layer.c_mlp_proj_b    = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            // map by name
-            model.tensors["gpt_neox.layers." + std::to_string(i) + ".input_layernorm.weight"] = layer.ln_1_g;
-            model.tensors["gpt_neox.layers." + std::to_string(i) + ".input_layernorm.bias"]   = layer.ln_1_b;
-
-            model.tensors["gpt_neox.layers." + std::to_string(i) + ".attention.query_key_value.weight"] = layer.c_attn_attn_w;
-            model.tensors["gpt_neox.layers." + std::to_string(i) + ".attention.query_key_value.bias"]   = layer.c_attn_attn_b;
-
-            model.tensors["gpt_neox.layers." + std::to_string(i) + ".attention.dense.weight"] = layer.c_attn_proj_w;
-            model.tensors["gpt_neox.layers." + std::to_string(i) + ".attention.dense.bias"]   = layer.c_attn_proj_b;
-
-            model.tensors["gpt_neox.layers." + std::to_string(i) + ".post_attention_layernorm.weight"] = layer.ln_2_g;
-            model.tensors["gpt_neox.layers." + std::to_string(i) + ".post_attention_layernorm.bias"]   = layer.ln_2_b;
-
-            model.tensors["gpt_neox.layers." + std::to_string(i) + ".mlp.dense_h_to_4h.weight"] = layer.c_mlp_fc_w;
-            model.tensors["gpt_neox.layers." + std::to_string(i) + ".mlp.dense_h_to_4h.bias"]   = layer.c_mlp_fc_b;
-
-            model.tensors["gpt_neox.layers." + std::to_string(i) + ".mlp.dense_4h_to_h.weight"] = layer.c_mlp_proj_w;
-            model.tensors["gpt_neox.layers." + std::to_string(i) + ".mlp.dense_4h_to_h.bias"]   = layer.c_mlp_proj_b;
-        }
-    }
-
-    // key + value memory
-    {
-        const auto & hparams = model.hparams;
-
-        const int n_embd  = hparams.n_embd;
-        const int n_layer = hparams.n_layer;
-        const int n_ctx   = hparams.n_ctx;
-
-        const int64_t n_mem      = n_layer*n_ctx;
-        const int64_t n_elements = n_embd*n_mem;
-
-        model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
-        model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
-
-        const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
-
-        printf("%s: memory_size = %8.2f MB, n_mem = %" PRId64 "\n", __func__, memory_size/1024.0/1024.0, n_mem);
-    }
-
-    // load weights
-    {
-        int n_tensors = 0;
-        size_t total_size = 0;
-
-        printf("%s: ", __func__);
-
-        while (true) {
-            int32_t n_dims;
-            int32_t length;
-            int32_t ttype;
-
-            fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
-            fin.read(reinterpret_cast<char *>(&length), sizeof(length));
-            fin.read(reinterpret_cast<char *>(&ttype),  sizeof(ttype));
-
-            if (fin.eof()) {
-                break;
-            }
-
-            int32_t nelements = 1;
-            int32_t ne[2] = { 1, 1 };
-            for (int i = 0; i < n_dims; ++i) {
-                fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
-                nelements *= ne[i];
-            }
-
-            std::string name(length, 0);
-            fin.read(&name[0], length);
-
-            if (model.tensors.find(name) == model.tensors.end()) {
-                fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str());
-                return false;
-            }
-
-            auto tensor = model.tensors[name];
-            if (ggml_nelements(tensor) != nelements) {
-                fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.c_str());
-                return false;
-            }
-
-            if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
-                fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%5d, %5d], expected [%5d, %5d]\n",
-                        __func__, name.c_str(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]);
-                return false;
-            }
-
-            // for debugging
-            if (0) {
-                printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
-            }
-
-            const size_t bpe = ggml_type_size(ggml_type(ttype));
-
-            if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
-                fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
-                        __func__, name.c_str(), ggml_nbytes(tensor), nelements*bpe);
-                return false;
-            }
-
-            fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
-
-            total_size += ggml_nbytes(tensor);
-            if (++n_tensors % 8 == 0) {
-                printf(".");
-                fflush(stdout);
-            }
-        }
-
-        printf(" done\n");
-
-        printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors);
-    }
-
-    fin.close();
-
-    return true;
-}
-
-
-// feed-forward network
-ggml_tensor * gpt_neox_ff(
-        const gpt_neox_layer & layer,
-        ggml_context         * ctx0,
-        ggml_tensor          * inp,
-        float                  eps) {
-    ggml_tensor * cur = ggml_norm(ctx0, inp, eps);
-
-    cur = ggml_add(ctx0,
-        ggml_mul(ctx0,
-            ggml_repeat(ctx0, layer.ln_2_g, cur),
-            cur),
-        ggml_repeat(ctx0, layer.ln_2_b, cur));
-
-    cur = ggml_mul_mat(ctx0,
-            layer.c_mlp_fc_w,
-            cur);
-
-    cur = ggml_add(ctx0,
-            ggml_repeat(ctx0, layer.c_mlp_fc_b, cur),
-            cur);
-
-    // GELU activation
-    cur = ggml_gelu(ctx0, cur);
-
-    // projection
-    // cur = proj_w*cur + proj_b
-    cur = ggml_mul_mat(ctx0,
-            layer.c_mlp_proj_w,
-            cur);
-
-    cur = ggml_add(ctx0,
-            ggml_repeat(ctx0, layer.c_mlp_proj_b, cur),
-            cur);
-    return cur;
-}
-
-// evaluate the transformer
-//
-//   - model:     the model
-//   - n_threads: number of threads to use
-//   - n_past:    the context size so far
-//   - embd_inp:  the embeddings of the tokens in the context
-//   - embd_w:    the predicted logits for the next token
-//
-bool gpt_neox_eval(
-        const gpt_neox_model & model,
-        const int n_threads,
-        const int n_past,
-        const std::vector<gpt_vocab::id> & embd_inp,
-              std::vector<float>         & embd_w,
-              size_t                     & mem_per_token) {
-    const int N = embd_inp.size();
-
-    const auto & hparams = model.hparams;
-
-    const int n_embd  = hparams.n_embd;
-    const int n_layer = hparams.n_layer;
-    const int n_ctx   = hparams.n_ctx;
-    const int n_head  = hparams.n_head;
-    const int n_vocab = hparams.n_vocab;
-    const int n_rot   = hparams.n_rot;
-
-    static size_t buf_size = 256u*1024*1024;
-    static void * buf = malloc(buf_size);
-
-    // use 2 scratch buffers
-    // TODO: very hacky solution - reimplement in a more elegant way
-    static size_t scr0_size = 256u*1024*1024;
-    static void * scr0 = malloc(scr0_size);
-
-    static size_t scr1_size = 256u*1024*1024;
-    static void * scr1 = malloc(scr1_size);
-
-    if (mem_per_token > 0 && mem_per_token*N > buf_size) {
-        const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
-        //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
-
-        // reallocate
-        buf_size = buf_size_new;
-        buf = realloc(buf, buf_size);
-        if (buf == nullptr) {
-            fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
-            return false;
-        }
-    }
-
-    struct ggml_init_params params = {
-        /*.mem_size   =*/ buf_size,
-        /*.mem_buffer =*/ buf,
-        /*.no_alloc   =*/ false,
-    };
-
-    struct ggml_context * ctx0 = ggml_init(params);
-    struct ggml_cgraph * gf = ggml_new_graph(ctx0);
-
-    // KQ_pos - contains the positions
-    struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
-    int * data = (int *) KQ_pos->data;
-    for (int i = 0; i < N; ++i) {
-        data[i] = n_past + i;
-    }
-
-    struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
-    memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
-
-    // wte
-    struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.wte, embd);
-
-    for (int il = 0; il < n_layer; ++il) {
-        struct ggml_tensor * cur;
-
-        ggml_set_scratch(ctx0, { 0, scr0_size, scr0, });
-
-        // self-attention
-        {
-            {
-                cur = ggml_norm(ctx0, inpL, hparams.eps);
-
-                cur = ggml_add(ctx0,
-                        ggml_mul(ctx0,
-                            ggml_repeat(ctx0, model.layers[il].ln_1_g, cur),
-                            cur),
-                        ggml_repeat(ctx0, model.layers[il].ln_1_b, cur));
-            }
-
-            // compute QKV
-            {
-                cur = ggml_mul_mat(ctx0,
-                        model.layers[il].c_attn_attn_w,
-                        cur);
-
-                cur = ggml_add(ctx0,
-                        ggml_repeat(ctx0, model.layers[il].c_attn_attn_b, cur),
-                        cur);
-            }
-
-            struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 0*sizeof(float)*n_embd/n_head));
-            struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 1*sizeof(float)*n_embd/n_head));
-            struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 2*sizeof(float)*n_embd/n_head));
-
-            // using mode = 2 for GPT-NeoX mode
-            Qcur = ggml_rope_inplace(ctx0, Qcur, KQ_pos, n_rot, 2, 0);
-            Kcur = ggml_rope_inplace(ctx0, Kcur, KQ_pos, n_rot, 2, 0);
-
-            // store key and value to memory
-            {
-                Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd, N));
-
-                struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
-                struct ggml_tensor * v = ggml_view_2d(ctx0, model.memory_v, N, n_embd,
-                        (   n_ctx)*ggml_element_size(model.memory_v),
-                        (il*n_ctx)*ggml_element_size(model.memory_v)*n_embd + n_past*ggml_element_size(model.memory_v));
-
-                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
-                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
-            }
-
-            // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
-            struct ggml_tensor * Q =
-                ggml_permute(ctx0,
-                        Qcur,
-                        0, 2, 1, 3);
-
-            // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
-            struct ggml_tensor * K =
-                ggml_permute(ctx0,
-                        ggml_reshape_3d(ctx0,
-                            ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
-                            n_embd/n_head, n_head, n_past + N),
-                        0, 2, 1, 3);
-
-            // K * Q
-            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
-
-            // KQ_scaled = KQ / sqrt(n_embd/n_head)
-            struct ggml_tensor * KQ_scaled =
-                ggml_scale_inplace(ctx0,
-                        KQ,
-                        1.0f/sqrt(float(n_embd)/n_head));
-
-            // KQ_masked = mask_past(KQ_scaled)
-            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
-
-            // KQ = soft_max(KQ_masked)
-            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
-
-            // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
-            struct ggml_tensor * V =
-                ggml_view_3d(ctx0, model.memory_v,
-                        n_past + N, n_embd/n_head, n_head,
-                        n_ctx*ggml_element_size(model.memory_v),
-                        n_ctx*ggml_element_size(model.memory_v)*n_embd/n_head,
-                        il*n_ctx*ggml_element_size(model.memory_v)*n_embd);
-
-            // KQV = transpose(V) * KQ_soft_max
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
-
-            // KQV_merged = KQV.permute(0, 2, 1, 3)
-            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
-
-            // cur = KQV_merged.contiguous().view(n_embd, N)
-            cur = ggml_cpy(ctx0,
-                    KQV_merged,
-                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
-
-            // projection
-            {
-                cur = ggml_mul_mat(ctx0,
-                        model.layers[il].c_attn_proj_w,
-                        cur);
-
-                cur = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].c_attn_proj_b, cur), cur);
-            }
-        }
-
-        ggml_set_scratch(ctx0, { 0, scr1_size, scr1, });
-
-        if (hparams.par_res == 0) {
-            struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpL);
-
-            cur = gpt_neox_ff(model.layers[il], ctx0, inpFF, hparams.eps);
-
-            // input for next layer
-            inpL = ggml_add(ctx0, cur, inpFF);
-        } else {
-            struct ggml_tensor * inpFF = cur;
-
-            // this is independent of the self-attention result, so it could be done in parallel to the self-attention
-            // note here we pass inpL instead of cur
-            cur = gpt_neox_ff(model.layers[il], ctx0, inpL, hparams.eps);
-
-            // layer input + FF
-            cur  = ggml_add(ctx0, cur, inpFF);
-
-            // input for next layer
-            inpL = ggml_add(ctx0, cur, inpL);
-        }
-    }
-
-    ggml_set_scratch(ctx0, { 0, scr0_size, scr0, });
-
-    // norm
-    {
-        inpL = ggml_norm(ctx0, inpL, hparams.eps);
-
-        // inpL = ln_f_g*inpL + ln_f_b
-        inpL = ggml_add(ctx0,
-                ggml_mul(ctx0,
-                    ggml_repeat(ctx0, model.ln_f_g, inpL),
-                    inpL),
-                ggml_repeat(ctx0, model.ln_f_b, inpL));
-    }
-
-    ggml_set_scratch(ctx0, { 0, 0, nullptr, });
-
-    // lm_head
-    {
-        inpL = ggml_mul_mat(ctx0, model.lmh_g, inpL);
-
-        //inpL = ggml_add(ctx0,
-        //        ggml_repeat(ctx0, model.lmh_b, inpL),
-        //        inpL);
-    }
-
-    // logits -> probs
-    //inpL = ggml_soft_max_inplace(ctx0, inpL);
-
-    // run the computation
-    ggml_build_forward_expand(gf, inpL);
-    ggml_graph_compute_with_ctx(ctx0, gf, n_threads);
-
-    //if (n_past%100 == 0) {
-    //    ggml_graph_print   (&gf);
-    //    ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
-    //}
-
-    //embd_w.resize(n_vocab*N);
-    //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
-
-    // return result for just the last token
-    embd_w.resize(n_vocab);
-    memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
-
-    if (mem_per_token == 0) {
-        mem_per_token = ggml_used_mem(ctx0)/N;
-    }
-    //printf("used_mem = %zu\n", ggml_used_mem(ctx0));
-
-    ggml_free(ctx0);
-
-    return true;
-}
-
-int main(int argc, char ** argv) {
-    ggml_time_init();
-
-    const int64_t t_main_start_us = ggml_time_us();
-
-    gpt_params params;
-    params.model = "models/stablelm-base-alpha-3b/ggml-model-f16.bin";
-
-    if (gpt_params_parse(argc, argv, params) == false) {
-        return 1;
-    }
-
-    if (params.seed < 0) {
-        params.seed = time(NULL);
-    }
-
-    printf("%s: seed = %d\n", __func__, params.seed);
-
-    std::mt19937 rng(params.seed);
-    if (params.prompt.empty()) {
-        params.prompt = gpt_random_prompt(rng);
-    }
-
-    int64_t t_load_us = 0;
-
-    gpt_vocab vocab;
-    gpt_neox_model model;
-
-    // load the model
-    {
-        const int64_t t_start_us = ggml_time_us();
-
-        if (!gpt_neox_model_load(params.model, model, vocab)) {
-            fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
-            return 1;
-        }
-
-        t_load_us = ggml_time_us() - t_start_us;
-
-        test_gpt_tokenizer(vocab, params.token_test);
-    }
-
-    int n_past = 0;
-
-    int64_t t_sample_us  = 0;
-    int64_t t_predict_us = 0;
-
-    std::vector<float> logits;
-
-    // tokenize the prompt
-    std::vector<gpt_vocab::id> embd_inp = ::gpt_tokenize(vocab, params.prompt);
-
-    params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());
-
-    printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
-    for (size_t i = 0; i < embd_inp.size(); i++) {
-        printf("%s: token[%zu] = %6d, %s\n", __func__, i, embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str());
-    }
-    printf("\n");
-
-    std::vector<gpt_vocab::id> embd;
-
-    // determine the required inference memory per token:
-    size_t mem_per_token = 0;
-    gpt_neox_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
-
-    for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) {
-        // predict
-        if (embd.size() > 0) {
-            const int64_t t_start_us = ggml_time_us();
-
-            if (!gpt_neox_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) {
-                printf("Failed to predict\n");
-                return 1;
-            }
-
-            t_predict_us += ggml_time_us() - t_start_us;
-        }
-
-        n_past += embd.size();
-        embd.clear();
-
-        if (i >= embd_inp.size()) {
-            // sample next token
-            const int   top_k = params.top_k;
-            const float top_p = params.top_p;
-            const float temp  = params.temp;
-
-            const int n_vocab = model.hparams.n_vocab;
-
-            gpt_vocab::id id = 0;
-
-            {
-                const int64_t t_start_sample_us = ggml_time_us();
-
-                id = gpt_sample_top_k_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng);
-
-                t_sample_us += ggml_time_us() - t_start_sample_us;
-            }
-
-            // add it to the context
-            embd.push_back(id);
-        } else {
-            // if here, it means we are still processing the input prompt
-            for (size_t k = i; k < embd_inp.size(); k++) {
-                embd.push_back(embd_inp[k]);
-                if (int32_t(embd.size()) > params.n_batch) {
-                    break;
-                }
-            }
-            i += embd.size() - 1;
-        }
-
-        // display text
-        for (auto id : embd) {
-            printf("%s", vocab.id_to_token[id].c_str());
-        }
-        fflush(stdout);
-
-        // end of text token
-        if (embd.back() == 0) {
-            break;
-        }
-    }
-
-    // report timing
-    {
-        const int64_t t_main_end_us = ggml_time_us();
-
-        printf("\n\n");
-        printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token);
-        printf("%s:     load time = %8.2f ms\n", __func__, t_load_us/1000.0f);
-        printf("%s:   sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f);
-        printf("%s:  predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past);
-        printf("%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
-    }
-
-    ggml_free(model.ctx);
-
-    return 0;
-}
diff --git a/ggml/examples/gpt-neox/quantize.cpp b/ggml/examples/gpt-neox/quantize.cpp
deleted file mode 100644
index 96208c1..0000000
--- a/ggml/examples/gpt-neox/quantize.cpp
+++ /dev/null
@@ -1,178 +0,0 @@
-#include "ggml/ggml.h"
-
-#include "common.h"
-#include "common-ggml.h"
-
-#include <cassert>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <fstream>
-#include <map>
-#include <string>
-#include <vector>
-#include <regex>
-
-// default hparams (StableLM 3B)
-struct gpt_neox_hparams {
-    int32_t n_vocab = 50257;
-    int32_t n_ctx   = 4096;
-    int32_t n_embd  = 4096;
-    int32_t n_head  = 32;
-    int32_t n_layer = 16;
-    int32_t n_rot   = 32; // 0.25 * (n_embd / n_head)
-    int32_t par_res = 1; // 1 = true, 0 = false
-    int32_t ftype   = 1;
-};
-
-// quantize a model
-bool gpt_neox_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) {
-    gpt_vocab vocab;
-
-    printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
-
-    auto finp = std::ifstream(fname_inp, std::ios::binary);
-    if (!finp) {
-        fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__, fname_inp.c_str());
-        return false;
-    }
-
-    auto fout = std::ofstream(fname_out, std::ios::binary);
-    if (!fout) {
-        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname_out.c_str());
-        return false;
-    }
-
-    // verify magic
-    {
-        uint32_t magic;
-        finp.read((char *) &magic, sizeof(magic));
-        if (magic != GGML_FILE_MAGIC) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str());
-            return false;
-        }
-
-        fout.write((char *) &magic, sizeof(magic));
-    }
-
-    gpt_neox_hparams hparams;
-
-    // load hparams
-    {
-        finp.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
-        finp.read((char *) &hparams.n_ctx,   sizeof(hparams.n_ctx));
-        finp.read((char *) &hparams.n_embd,  sizeof(hparams.n_embd));
-        finp.read((char *) &hparams.n_head,  sizeof(hparams.n_head));
-        finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-        finp.read((char *) &hparams.n_rot,   sizeof(hparams.n_rot));
-        finp.read((char *) &hparams.par_res, sizeof(hparams.par_res));
-        finp.read((char *) &hparams.ftype,   sizeof(hparams.ftype));
-
-        const int32_t qntvr_src =    hparams.ftype / GGML_QNT_VERSION_FACTOR;
-        const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
-
-        printf("%s: n_vocab     = %d\n", __func__, hparams.n_vocab);
-        printf("%s: n_ctx       = %d\n", __func__, hparams.n_ctx);
-        printf("%s: n_embd      = %d\n", __func__, hparams.n_embd);
-        printf("%s: n_head      = %d\n", __func__, hparams.n_head);
-        printf("%s: n_layer     = %d\n", __func__, hparams.n_layer);
-        printf("%s: par_res     = %d\n", __func__, hparams.par_res);
-        printf("%s: ftype (src) = %d\n", __func__, hparams.ftype);
-        printf("%s: qntvr (src) = %d\n", __func__, qntvr_src);
-        printf("%s: ftype (dst) = %d\n", __func__, ftype_dst);
-        printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
-
-        fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
-        fout.write((char *) &hparams.n_ctx,   sizeof(hparams.n_ctx));
-        fout.write((char *) &hparams.n_embd,  sizeof(hparams.n_embd));
-        fout.write((char *) &hparams.n_head,  sizeof(hparams.n_head));
-        fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-        fout.write((char *) &hparams.n_rot,   sizeof(hparams.n_rot));
-        fout.write((char *) &hparams.par_res, sizeof(hparams.par_res));
-        fout.write((char *) &ftype_dst,       sizeof(ftype_dst));
-    }
-
-    // load vocab
-    {
-        const int32_t n_vocab = hparams.n_vocab;
-
-        std::string word;
-        for (int i = 0; i < n_vocab; i++) {
-            uint32_t len;
-            finp.read ((char *) &len, sizeof(len));
-            fout.write((char *) &len, sizeof(len));
-
-            word.resize(len);
-            finp.read ((char *) word.data(), len);
-            fout.write((char *) word.data(), len);
-
-            vocab.token_to_id[word] = i;
-            vocab.id_to_token[i] = word;
-        }
-    }
-
-    // regexes of tensor names to be quantized
-    const std::vector<std::string> to_quant = {
-        ".*weight",
-    };
-
-    if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) {
-        fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str());
-        return false;
-    }
-
-    finp.close();
-    fout.close();
-
-    return true;
-}
-
-// usage:
-//  ./gpt-neox-quantize models/stalellm2-117M/ggml-model.bin models/stablelm2-117M/ggml-model-quant.bin type
-//
-int main(int argc, char ** argv) {
-    if (argc != 4) {
-        fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
-        ggml_print_ftypes(stderr);
-        return 1;
-    }
-
-    // needed to initialize f16 tables
-    {
-        struct ggml_init_params params = { 0, NULL, false };
-        struct ggml_context * ctx = ggml_init(params);
-        ggml_free(ctx);
-    }
-
-    const std::string fname_inp = argv[1];
-    const std::string fname_out = argv[2];
-
-    const ggml_ftype ftype = ggml_parse_ftype(argv[3]);
-
-    const int64_t t_main_start_us = ggml_time_us();
-
-    int64_t t_quantize_us = 0;
-
-    // load the model
-    {
-        const int64_t t_start_us = ggml_time_us();
-
-        if (!gpt_neox_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
-            fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
-            return 1;
-        }
-
-        t_quantize_us = ggml_time_us() - t_start_us;
-    }
-
-    // report timing
-    {
-        const int64_t t_main_end_us = ggml_time_us();
-
-        printf("\n");
-        printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f);
-        printf("%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
-    }
-
-    return 0;
-}
diff --git a/ggml/examples/mnist/CMakeLists.txt b/ggml/examples/mnist/CMakeLists.txt
deleted file mode 100644
index 4d9b93e..0000000
--- a/ggml/examples/mnist/CMakeLists.txt
+++ /dev/null
@@ -1,40 +0,0 @@
-#
-# mnist
-
-set(TEST_TARGET mnist)
-add_executable(${TEST_TARGET} main.cpp)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml common)
-
-#
-# mnist-cnn
-
-set(TEST_TARGET mnist-cnn)
-add_executable(${TEST_TARGET} main-cnn.cpp)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml common)
-
-#
-# mnist-cpu
-
-set(TEST_TARGET mnist-cpu)
-add_executable(${TEST_TARGET} main-cpu.cpp)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml)
-
-if (APPLE)
-    #
-    # mnist-mtl
-
-    find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
-    find_library(METAL_FRAMEWORK    Metal      REQUIRED)
-    find_library(METALKIT_FRAMEWORK MetalKit   REQUIRED)
-    find_library(METALPERFORMANCE_FRAMEWORK MetalPerformanceShaders REQUIRED)
-
-    set(TEST_TARGET mnist-mtl)
-    add_executable(${TEST_TARGET} main-mtl.cpp main-mtl.h main-mtl.m)
-    target_link_libraries(${TEST_TARGET} PRIVATE
-        ggml
-        ${FOUNDATION_LIBRARY}
-        ${METAL_FRAMEWORK}
-        ${METALKIT_FRAMEWORK}
-        ${METALPERFORMANCE_FRAMEWORK}
-    )
-endif()
diff --git a/ggml/examples/mnist/README.md b/ggml/examples/mnist/README.md
deleted file mode 100644
index f6b66f7..0000000
--- a/ggml/examples/mnist/README.md
+++ /dev/null
@@ -1,128 +0,0 @@
-# MNIST Examples for GGML
-
-These are simple examples of how to use GGML for inferencing.
-The first example uses convolutional neural network (CNN), the second one uses fully connected neural network.
-
-## Building the examples
-
-```bash
-git clone https://github.com/ggerganov/ggml
-cd ggml
-mkdir build && cd build
-cmake ..
-make -j4 mnist-cnn mnist
-```
-
-## MNIST with CNN
-
-This implementation achieves ~99% accuracy on the MNIST test set.
-
-### Training the model
-
-Use the `mnist-cnn.py` script to train the model and convert it to GGUF format:
-
-```
-$ python3 ../examples/mnist/mnist-cnn.py train mnist-cnn-model
-...
-Keras model saved to 'mnist-cnn-model'
-```
-
-Convert the model to GGUF format:
-
-```
-$ python3 ../examples/mnist/mnist-cnn.py convert mnist-cnn-model
-...
-Model converted and saved to 'mnist-cnn-model.gguf'
-```
-
-### Running the example
-
-```bash
-$ ./bin/mnist-cnn mnist-cnn-model.gguf ../examples/mnist/models/mnist/t10k-images.idx3-ubyte
-main: loaded model in     5.17 ms
-_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
-_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
-_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
-_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
-_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ * * * * * _ _ _ _ _ _ _ 
-_ _ _ _ _ _ _ _ _ _ _ _ _ _ * * * * * * * * _ _ _ _ _ _ 
-_ _ _ _ _ _ _ _ _ _ _ _ * * * * * _ _ _ * * _ _ _ _ _ _ 
-_ _ _ _ _ _ _ _ _ _ _ _ * * _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
-_ _ _ _ _ _ _ _ _ _ _ * * * _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
-_ _ _ _ _ _ _ _ _ _ _ * * * _ _ _ _ _ _ _ _ * _ _ _ _ _ 
-_ _ _ _ _ _ _ _ _ _ _ * * * _ _ _ _ _ _ _ _ * * _ _ _ _ 
-_ _ _ _ _ _ _ _ _ _ _ * * * _ _ _ _ _ _ _ _ * * _ _ _ _ 
-_ _ _ _ _ _ _ _ _ _ _ _ * * * _ _ _ _ * * * * * _ _ _ _ 
-_ _ _ _ _ _ _ _ _ _ _ _ * * * * * * * * * _ _ _ _ _ _ _ 
-_ _ _ _ _ _ _ _ * * * * * * * * * * _ _ _ _ _ _ _ _ _ _ 
-_ _ _ _ _ _ _ * * * * * * _ _ * * * _ _ _ _ _ _ _ _ _ _ 
-_ _ _ _ _ _ * * * _ _ _ _ _ _ _ * * * _ _ _ _ _ _ _ _ _ 
-_ _ _ _ _ _ * * _ _ _ _ _ _ _ _ _ * * _ _ _ _ _ _ _ _ _ 
-_ _ _ _ _ _ * * _ _ _ _ _ _ _ _ _ * * * _ _ _ _ _ _ _ _ 
-_ _ _ _ _ _ * * _ _ _ _ _ _ _ _ _ * * * _ _ _ _ _ _ _ _ 
-_ _ _ _ _ _ * * * _ _ _ _ _ _ _ _ * * * _ _ _ _ _ _ _ _ 
-_ _ _ _ _ _ _ _ * * * _ _ _ _ _ _ * * * _ _ _ _ _ _ _ _ 
-_ _ _ _ _ _ _ _ _ * * * * * * * * * * _ _ _ _ _ _ _ _ _ 
-_ _ _ _ _ _ _ _ _ _ _ * * * * * * _ _ _ _ _ _ _ _ _ _ _ 
-_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
-_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
-_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
-_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
-
-ggml_graph_dump_dot: dot -Tpng mnist-cnn.dot -o mnist-cnn.dot.png && open mnist-cnn.dot.png
-main: predicted digit is 8
-```
-
-Computation graph:
-
-![mnist dot](https://user-images.githubusercontent.com/1991296/263763842-3b679b45-7ca1-4ee9-b19a-82e34396624f.png)
-
-## MNIST with fully connected network
-
-A fully connected layer + relu, followed by a fully connected layer + softmax.
-
-### Training the Model
-
-A Google Colab notebook for training a simple two-layer network to recognize digits is located here. You can
-use this to save a pytorch model to be converted to ggml format.
-
-[Colab](https://colab.research.google.com/drive/12n_8VNJnolBnX5dVS0HNWubnOjyEaFSb?usp=sharing)
-
-GGML "format" is whatever you choose for efficient loading. In our case, we just save the hyperparameters used
-plus the model weights and biases. Run convert-h5-to-ggml.py to convert your pytorch model. The output format is:
-
-- magic constant (int32)
-- repeated list of tensors
-- number of dimensions of tensor (int32)
-- tensor dimension (int32 repeated)
-- values of tensor (int32)
-
-Run ```convert-h5-to-ggml.py mnist_model.state_dict``` where `mnist_model.state_dict` is the saved pytorch model from the Google Colab. For
-quickstart, it is included in the mnist/models directory.
-
-```bash
-mkdir -p models/mnist
-python3 ../examples/mnist/convert-h5-to-ggml.py ../examples/mnist/models/mnist/mnist_model.state_dict
-```
-
-### Running the example
-
-```bash
-./bin/mnist ./models/mnist/ggml-model-f32.bin ../examples/mnist/models/mnist/t10k-images.idx3-ubyte
-```
-
-Computation graph:
-
-![mnist dot](https://user-images.githubusercontent.com/1991296/231882071-84e29d53-b226-4d73-bdc2-5bd6dcb7efd1.png)
-
-
-## Web demo
-
-The example can be compiled with Emscripten like this:
-
-```bash
-cd examples/mnist
-emcc -I../../include -I../../include/ggml -I../../examples ../../src/ggml.c main.cpp -o web/mnist.js -s EXPORTED_FUNCTIONS='["_wasm_eval","_wasm_random_digit","_malloc","_free"]' -s EXPORTED_RUNTIME_METHODS='["ccall"]' -s ALLOW_MEMORY_GROWTH=1 --preload-file models/mnist
-```
-
-Online demo: https://mnist.ggerganov.com
diff --git a/ggml/examples/mnist/convert-h5-to-ggml.py b/ggml/examples/mnist/convert-h5-to-ggml.py
deleted file mode 100644
index a4f7536..0000000
--- a/ggml/examples/mnist/convert-h5-to-ggml.py
+++ /dev/null
@@ -1,63 +0,0 @@
-# Convert MNIS h5 transformer model to ggml format
-#
-# Load the (state_dict) saved model using PyTorch
-# Iterate over all variables and write them to a binary file.
-#
-# For each variable, write the following:
-#   - Number of dimensions (int)
-#   - Name length (int)
-#   - Dimensions (int[n_dims])
-#   - Name (char[name_length])
-#   - Data (float[n_dims])
-#
-# At the start of the ggml file we write the model parameters
-
-import sys
-import struct
-import json
-import numpy as np
-import re
-
-
-import torch
-import torch.nn as nn
-import torchvision.datasets as dsets
-import torchvision.transforms as transforms
-from torch.autograd import Variable
-
-if len(sys.argv) != 2:
-    print("Usage: convert-h5-to-ggml.py model\n")
-    sys.exit(1)
-
-state_dict_file = sys.argv[1]
-fname_out = "models/mnist/ggml-model-f32.bin"
-
-state_dict = torch.load(state_dict_file, map_location=torch.device('cpu'))
-#print (model)
-
-list_vars = state_dict
-print (list_vars)
-
-fout = open(fname_out, "wb")
-
-fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
-
-
-for name in list_vars.keys():
-    data = list_vars[name].squeeze().numpy()
-    print("Processing variable: " + name + " with shape: ", data.shape) 
-    n_dims = len(data.shape);
-   
-    fout.write(struct.pack("i", n_dims))
-    
-    data = data.astype(np.float32)
-    for i in range(n_dims):
-        fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
-
-    # data
-    data.tofile(fout)
-
-fout.close()
-
-print("Done. Output file: " + fname_out)
-print("")
diff --git a/ggml/examples/mnist/main-cnn.cpp b/ggml/examples/mnist/main-cnn.cpp
deleted file mode 100644
index b013503..0000000
--- a/ggml/examples/mnist/main-cnn.cpp
+++ /dev/null
@@ -1,169 +0,0 @@
-#include "ggml/ggml.h"
-
-#include "common.h"
-
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <ctime>
-#include <fstream>
-#include <string>
-#include <vector>
-#include <algorithm>
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-struct mnist_model {
-    struct ggml_tensor * conv2d_1_kernel;
-    struct ggml_tensor * conv2d_1_bias;
-    struct ggml_tensor * conv2d_2_kernel;
-    struct ggml_tensor * conv2d_2_bias;
-    struct ggml_tensor * dense_weight;
-    struct ggml_tensor * dense_bias;
-    struct ggml_context * ctx;
-};
-
-bool mnist_model_load(const std::string & fname, mnist_model & model) {
-    struct gguf_init_params params = {
-        /*.no_alloc   =*/ false,
-        /*.ctx        =*/ &model.ctx,
-    };
-    gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
-    if (!ctx) {
-        fprintf(stderr, "%s: gguf_init_from_file() failed\n", __func__);
-        return false;
-    }
-    model.conv2d_1_kernel = ggml_get_tensor(model.ctx, "kernel1");
-    model.conv2d_1_bias = ggml_get_tensor(model.ctx, "bias1");
-    model.conv2d_2_kernel = ggml_get_tensor(model.ctx, "kernel2");
-    model.conv2d_2_bias = ggml_get_tensor(model.ctx, "bias2");
-    model.dense_weight = ggml_get_tensor(model.ctx, "dense_w");
-    model.dense_bias = ggml_get_tensor(model.ctx, "dense_b");
-    return true;
-}
-
-int mnist_eval(
-        const mnist_model & model,
-        const int n_threads,
-        std::vector<float> digit,
-        const char * fname_cgraph
-        )
-{
-    static size_t buf_size = 100000 * sizeof(float) * 4;
-    static void * buf = malloc(buf_size);
-
-    struct ggml_init_params params = {
-        /*.mem_size   =*/ buf_size,
-        /*.mem_buffer =*/ buf,
-        /*.no_alloc   =*/ false,
-    };
-
-    struct ggml_context * ctx0 = ggml_init(params);
-    struct ggml_cgraph * gf = ggml_new_graph(ctx0);
-
-    struct ggml_tensor * input = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, 28, 28, 1, 1);
-    memcpy(input->data, digit.data(), ggml_nbytes(input));
-    ggml_set_name(input, "input");
-    ggml_tensor * cur = ggml_conv_2d(ctx0, model.conv2d_1_kernel, input, 1, 1, 0, 0, 1, 1);
-    cur = ggml_add(ctx0, cur, model.conv2d_1_bias);
-    cur = ggml_relu(ctx0, cur);
-    // Output shape after Conv2D: (26 26 32 1)
-    cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_MAX, 2, 2, 2, 2, 0, 0);
-    // Output shape after MaxPooling2D: (13 13 32 1)
-    cur = ggml_conv_2d(ctx0, model.conv2d_2_kernel, cur, 1, 1, 0, 0, 1, 1);
-    cur = ggml_add(ctx0, cur, model.conv2d_2_bias);
-    cur = ggml_relu(ctx0, cur);
-    // Output shape after Conv2D: (11 11 64 1)
-    cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_MAX, 2, 2, 2, 2, 0, 0);
-    // Output shape after MaxPooling2D: (5 5 64 1)
-    cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3));
-    // Output shape after permute: (64 5 5 1)
-    cur = ggml_reshape_2d(ctx0, cur, 1600, 1);
-    // Final Dense layer
-    cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.dense_weight, cur), model.dense_bias);
-    ggml_tensor * probs = ggml_soft_max(ctx0, cur);
-    ggml_set_name(probs, "probs");
-
-    ggml_build_forward_expand(gf, probs);
-    ggml_graph_compute_with_ctx(ctx0, gf, n_threads);
-
-    //ggml_graph_print(&gf);
-    ggml_graph_dump_dot(gf, NULL, "mnist-cnn.dot");
-
-    if (fname_cgraph) {
-        // export the compute graph for later use
-        // see the "mnist-cpu" example
-        ggml_graph_export(gf, fname_cgraph);
-
-        fprintf(stderr, "%s: exported compute graph to '%s'\n", __func__, fname_cgraph);
-    }
-
-    const float * probs_data = ggml_get_data_f32(probs);
-    const int prediction = std::max_element(probs_data, probs_data + 10) - probs_data;
-    ggml_free(ctx0);
-    return prediction;
-}
-
-int main(int argc, char ** argv) {
-    srand(time(NULL));
-    ggml_time_init();
-
-    if (argc != 3) {
-        fprintf(stderr, "Usage: %s models/mnist/mnist-cnn.gguf models/mnist/t10k-images.idx3-ubyte\n", argv[0]);
-        exit(0);
-    }
-
-    uint8_t buf[784];
-    mnist_model model;
-    std::vector<float> digit;
-
-    // load the model
-    {
-        const int64_t t_start_us = ggml_time_us();
-
-        if (!mnist_model_load(argv[1], model)) {
-            fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, argv[1]);
-            return 1;
-        }
-
-        const int64_t t_load_us = ggml_time_us() - t_start_us;
-
-        fprintf(stdout, "%s: loaded model in %8.2f ms\n", __func__, t_load_us / 1000.0f);
-    }
-
-    // read a random digit from the test set
-    {
-        std::ifstream fin(argv[2], std::ios::binary);
-        if (!fin) {
-            fprintf(stderr, "%s: failed to open '%s'\n", __func__, argv[2]);
-            return 1;
-        }
-
-        // seek to a random digit: 16-byte header + 28*28 * (random 0 - 10000)
-        fin.seekg(16 + 784 * (rand() % 10000));
-        fin.read((char *) &buf, sizeof(buf));
-    }
-
-    // render the digit in ASCII
-    {
-        digit.resize(sizeof(buf));
-
-        for (int row = 0; row < 28; row++) {
-            for (int col = 0; col < 28; col++) {
-                fprintf(stderr, "%c ", (float)buf[row*28 + col] > 230 ? '*' : '_');
-                digit[row*28 + col] = ((float)buf[row*28 + col] / 255.0f);
-            }
-
-            fprintf(stderr, "\n");
-        }
-
-        fprintf(stderr, "\n");
-    }
-
-    const int prediction = mnist_eval(model, 1, digit, nullptr);
-    fprintf(stdout, "%s: predicted digit is %d\n", __func__, prediction);
-    ggml_free(model.ctx);
-    return 0;
-}
diff --git a/ggml/examples/mnist/main-cpu.cpp b/ggml/examples/mnist/main-cpu.cpp
deleted file mode 100644
index 3b759b0..0000000
--- a/ggml/examples/mnist/main-cpu.cpp
+++ /dev/null
@@ -1,122 +0,0 @@
-// Use a pre-generated MNIST compute graph for inference on the CPU
-//
-// You can generate a compute graph using the "mnist" tool:
-//
-// $ ./bin/mnist ./models/mnist/ggml-model-f32.bin ../examples/mnist/models/mnist/t10k-images.idx3-ubyte
-//
-// This command creates the "mnist.ggml" file, which contains the generated compute graph.
-// Now, you can re-use the compute graph with the "mnist-cpu" tool:
-//
-// $ ./bin/mnist-cpu ./models/mnist/mnist.ggml ../examples/mnist/models/mnist/t10k-images.idx3-ubyte
-//
-
-#include "ggml/ggml.h"
-
-#include <algorithm>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <ctime>
-#include <fstream>
-#include <vector>
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-// evaluate the MNIST compute graph
-//
-//   - fname_cgraph: path to the compute graph
-//   - n_threads:    number of threads to use
-//   - digit:        784 pixel values
-//
-// returns 0 - 9 prediction
-int mnist_eval(
-        const char * fname_cgraph,
-        const int n_threads,
-        std::vector<float> digit) {
-    // load the compute graph
-    struct ggml_context * ctx_data = NULL;
-    struct ggml_context * ctx_eval = NULL;
-
-    struct ggml_cgraph * gfi = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval);
-
-    // param export/import test
-    GGML_ASSERT(ggml_graph_get_tensor(gfi, "fc1_bias")->op_params[0] == int(0xdeadbeef));
-
-    // allocate work context
-    // needed during ggml_graph_compute() to allocate a work tensor
-    static size_t buf_size = 128ull*1024*1024; // TODO
-    static void * buf = malloc(buf_size);
-
-    struct ggml_init_params params = {
-        /*.mem_size   =*/ buf_size,
-        /*.mem_buffer =*/ buf,
-        /*.no_alloc   =*/ false,
-    };
-
-    struct ggml_context * ctx_work = ggml_init(params);
-
-    struct ggml_tensor * input = ggml_graph_get_tensor(gfi, "input");
-    memcpy(input->data, digit.data(), ggml_nbytes(input));
-
-    ggml_graph_compute_with_ctx(ctx_work, gfi, n_threads);
-
-    const float * probs_data = ggml_get_data_f32(ggml_graph_get_tensor(gfi, "probs"));
-
-    const int prediction = std::max_element(probs_data, probs_data + 10) - probs_data;
-
-    ggml_free(ctx_work);
-    ggml_free(ctx_data);
-    ggml_free(ctx_eval);
-
-    return prediction;
-}
-
-int main(int argc, char ** argv) {
-    srand(time(NULL));
-    ggml_time_init();
-
-    if (argc != 3) {
-        fprintf(stderr, "Usage: %s models/mnist/mnist.ggml models/mnist/t10k-images.idx3-ubyte\n", argv[0]);
-        exit(0);
-    }
-
-    uint8_t buf[784];
-    std::vector<float> digit;
-
-    // read a random digit from the test set
-    {
-        std::ifstream fin(argv[2], std::ios::binary);
-        if (!fin) {
-            fprintf(stderr, "%s: failed to open '%s'\n", __func__, argv[2]);
-            return 1;
-        }
-
-        // seek to a random digit: 16-byte header + 28*28 * (random 0 - 10000)
-        fin.seekg(16 + 784 * (rand() % 10000));
-        fin.read((char *) &buf, sizeof(buf));
-    }
-
-    // render the digit in ASCII
-    {
-        digit.resize(sizeof(buf));
-
-        for (int row = 0; row < 28; row++) {
-            for (int col = 0; col < 28; col++) {
-                fprintf(stderr, "%c ", (float)buf[row*28 + col] > 230 ? '*' : '_');
-                digit[row*28 + col] = ((float)buf[row*28 + col]);
-            }
-
-            fprintf(stderr, "\n");
-        }
-
-        fprintf(stderr, "\n");
-    }
-
-    const int prediction = mnist_eval(argv[1], 1, digit);
-
-    fprintf(stdout, "%s: predicted digit is %d\n", __func__, prediction);
-
-    return 0;
-}
diff --git a/ggml/examples/mnist/main-mtl.cpp b/ggml/examples/mnist/main-mtl.cpp
deleted file mode 100644
index 7d0eec8..0000000
--- a/ggml/examples/mnist/main-mtl.cpp
+++ /dev/null
@@ -1,125 +0,0 @@
-// Use a pre-generated MNIST compute graph for inference on the M1 GPU via MPS
-//
-// You can generate a compute graph using the "mnist" tool:
-//
-// $ ./bin/mnist ./models/mnist/ggml-model-f32.bin ../examples/mnist/models/mnist/t10k-images.idx3-ubyte
-//
-// This command creates the "mnist.ggml" file, which contains the generated compute graph.
-// Now, you can re-use the compute graph on the GPU with the "mnist-mtl" tool:
-//
-// $ ./bin/mnist-mtl ./models/mnist/mnist.ggml ../examples/mnist/models/mnist/t10k-images.idx3-ubyte
-//
-
-#include "ggml/ggml.h"
-
-#include "main-mtl.h"
-
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <ctime>
-#include <fstream>
-#include <vector>
-
-// evaluate the MNIST compute graph
-//
-//   - fname_cgraph: path to the compute graph
-//   - digit:        784 pixel values
-//
-// returns 0 - 9 prediction
-int mnist_eval(
-        const char * fname_cgraph,
-        std::vector<float> digit
-        ) {
-    // load the compute graph
-    struct ggml_context * ctx_data = NULL;
-    struct ggml_context * ctx_eval = NULL;
-
-    struct ggml_cgraph * gf = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval);
-
-    // allocate work context
-    static size_t buf_size = 128ull*1024*1024; // TODO
-    static void * buf = malloc(buf_size);
-
-    struct ggml_init_params params = {
-        /*.mem_size   =*/ buf_size,
-        /*.mem_buffer =*/ buf,
-        /*.no_alloc   =*/ false,
-    };
-
-    struct ggml_context * ctx_work = ggml_init(params);
-
-    // this allocates all Metal resources and memory buffers
-    auto ctx_mtl = mnist_mtl_init(ctx_data, ctx_eval, ctx_work, gf);
-
-    int prediction = -1;
-
-    for (int i = 0; i < 1; ++i) {
-        struct ggml_tensor * input = ggml_graph_get_tensor(gf, "input");
-
-        if (i % 2 == 0) {
-            memcpy(input->data, digit.data(), ggml_nbytes(input));
-        } else {
-            memset(input->data, 0, ggml_nbytes(input));
-        }
-
-        // the actual inference happens here
-        prediction = mnist_mtl_eval(ctx_mtl, gf);
-    }
-
-    mnist_mtl_free(ctx_mtl);
-
-    ggml_free(ctx_work);
-    ggml_free(ctx_data);
-    ggml_free(ctx_eval);
-
-    return prediction;
-}
-
-int main(int argc, char ** argv) {
-    srand(time(NULL));
-    ggml_time_init();
-
-    if (argc != 3) {
-        fprintf(stderr, "Usage: %s models/mnist/mnist.ggml models/mnist/t10k-images.idx3-ubyte\n", argv[0]);
-        exit(0);
-    }
-
-    uint8_t buf[784];
-    std::vector<float> digit;
-
-    // read a random digit from the test set
-    {
-        std::ifstream fin(argv[2], std::ios::binary);
-        if (!fin) {
-            fprintf(stderr, "%s: failed to open '%s'\n", __func__, argv[2]);
-            return 1;
-        }
-
-        // seek to a random digit: 16-byte header + 28*28 * (random 0 - 10000)
-        fin.seekg(16 + 784 * (rand() % 10000));
-        fin.read((char *) &buf, sizeof(buf));
-    }
-
-    // render the digit in ASCII
-    {
-        digit.resize(sizeof(buf));
-
-        for (int row = 0; row < 28; row++) {
-            for (int col = 0; col < 28; col++) {
-                fprintf(stderr, "%c ", (float)buf[row*28 + col] > 230 ? '*' : '_');
-                digit[row*28 + col] = ((float)buf[row*28 + col]);
-            }
-
-            fprintf(stderr, "\n");
-        }
-
-        fprintf(stderr, "\n");
-    }
-
-    const int prediction = mnist_eval(argv[1], digit);
-
-    fprintf(stdout, "%s: predicted digit is %d\n", __func__, prediction);
-
-    return 0;
-}
diff --git a/ggml/examples/mnist/main-mtl.h b/ggml/examples/mnist/main-mtl.h
deleted file mode 100644
index 4e661a4..0000000
--- a/ggml/examples/mnist/main-mtl.h
+++ /dev/null
@@ -1,26 +0,0 @@
-#pragma once
-
-struct ggml_context;
-struct ggml_cgraph;
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct ggml_mtl_context;
-
-struct ggml_mtl_context * mnist_mtl_init(
-        struct ggml_context * ctx_data,
-        struct ggml_context * ctx_eval,
-        struct ggml_context * ctx_work,
-        struct ggml_cgraph  * gf);
-
-void mnist_mtl_free(struct ggml_mtl_context * ctx);
-
-int mnist_mtl_eval(
-        struct ggml_mtl_context * ctx,
-        struct ggml_cgraph      * gf);
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/ggml/examples/mnist/main-mtl.m b/ggml/examples/mnist/main-mtl.m
deleted file mode 100644
index 4b77179..0000000
--- a/ggml/examples/mnist/main-mtl.m
+++ /dev/null
@@ -1,499 +0,0 @@
-#import "main-mtl.h"
-
-#import "ggml/ggml.h"
-
-#import <Foundation/Foundation.h>
-#import <Metal/Metal.h>
-#import <MetalPerformanceShaders/MetalPerformanceShaders.h>
-
-// TODO: couldn't get this to work
-//#define GGML_MTL_HEAP
-
-struct ggml_mtl_context {
-    struct ggml_context * ctx_data;
-    struct ggml_context * ctx_eval;
-    struct ggml_context * ctx_work;
-
-    id<MTLDevice>       device;
-    id<MTLCommandQueue> queue;
-    id<MTLLibrary>      library;
-
-#ifdef GGML_MTL_HEAP
-    id<MTLHeap> heap_data;
-    id<MTLHeap> heap_eval;
-#else
-    id<MTLBuffer> buffer_data;
-    id<MTLBuffer> buffer_eval;
-#endif
-
-    id<MTLBuffer> out;
-
-    // custom kernels
-    id<MTLFunction>             function_add;
-    id<MTLComputePipelineState> pipeline_add;
-
-    id<MTLFunction>             function_relu;
-    id<MTLComputePipelineState> pipeline_relu;
-
-    id<MTLFunction>             function_soft_max;
-    id<MTLComputePipelineState> pipeline_soft_max;
-};
-
-// MSL code
-NSString * const msl_library_mnist = @"\
-#include <metal_stdlib>                                                                 \n\
-using namespace metal;                                                                  \n\
-                                                                                        \n\
-#define MAX(x, y) ((x) > (y) ? (x) : (y))                                               \n\
-                                                                                        \n\
-constant int k_digits [[function_constant(0)]];                                         \n\
-                                                                                        \n\
-kernel void kernel_add(                                                                 \n\
-        device const float * src0,                                                      \n\
-        device const float * src1,                                                      \n\
-        device float * dst,                                                             \n\
-        uint gid[[thread_position_in_grid]]) {                                          \n\
-    dst[gid] = src0[gid] + src1[gid];                                                   \n\
-}                                                                                       \n\
-                                                                                        \n\
-kernel void kernel_relu(                                                                \n\
-        device const float * src,                                                       \n\
-        device       float * dst,                                                       \n\
-        uint gid[[thread_position_in_grid]]) {                                          \n\
-    dst[gid] = max(0.0f, src[gid]);                                                     \n\
-}                                                                                       \n\
-                                                                                        \n\
-kernel void kernel_soft_max(                                                            \n\
-        device const float * src,                                                       \n\
-        device       float * dst,                                                       \n\
-        uint gid[[thread_position_in_grid]]) {                                          \n\
-    float max = 0.0f;                                                                   \n\
-    for (int i = 0; i < k_digits; i++) {                                                \n\
-        max = MAX(max, src[i]);                                                         \n\
-    }                                                                                   \n\
-    float sum = 0.0f;                                                                   \n\
-    for (int i = 0; i < k_digits; i++) {                                                \n\
-        dst[i] = exp(src[i] - max);                                                     \n\
-        sum += dst[i];                                                                  \n\
-    }                                                                                   \n\
-    for (int i = 0; i < k_digits; i++) {                                                \n\
-        dst[i] /= sum;                                                                  \n\
-    }                                                                                   \n\
-}                                                                                       \n\
-";
-
-struct ggml_mtl_context * mnist_mtl_init(
-    struct ggml_context * ctx_data,
-    struct ggml_context * ctx_eval,
-    struct ggml_context * ctx_work,
-    struct ggml_cgraph  * gf) {
-    fprintf(stderr, "%s: allocating\n", __func__);
-
-    struct ggml_mtl_context * ctx = malloc(sizeof(struct ggml_mtl_context));
-
-    ctx->ctx_data = ctx_data;
-    ctx->ctx_eval = ctx_eval;
-    ctx->ctx_work = ctx_work;
-
-    ctx->device = MTLCreateSystemDefaultDevice();
-    ctx->queue  = [ctx->device newCommandQueue];
-
-    // determine if we can use MPS
-    if (MPSSupportsMTLDevice(ctx->device)) {
-        fprintf(stderr, "%s: using MPS\n", __func__);
-    } else {
-        fprintf(stderr, "%s: not using MPS\n", __func__);
-        GGML_ASSERT(false && "MPS not supported");
-    }
-
-    // compile from source string and show compile log
-    {
-        NSError * error = nil;
-        ctx->library = [ctx->device newLibraryWithSource:msl_library_mnist options:nil error:&error];
-        if (error) {
-            fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
-            exit(1);
-        }
-    }
-
-    // load kernels
-    {
-        const int k_digits = ggml_graph_get_tensor(gf, "probs")->ne[0];
-
-        MTLFunctionConstantValues * constants = [MTLFunctionConstantValues new];
-        [constants setConstantValue:&k_digits type:MTLDataTypeInt withName:@"k_digits"];
-
-        ctx->function_add = [ctx->library newFunctionWithName:@"kernel_add"];
-        ctx->pipeline_add = [ctx->device newComputePipelineStateWithFunction:ctx->function_add error:nil];
-        fprintf(stderr, "%s: loaded kernel_add: %p\n", __func__, (void *) ctx->pipeline_add);
-
-        ctx->function_relu = [ctx->library newFunctionWithName:@"kernel_relu"];
-        ctx->pipeline_relu = [ctx->device newComputePipelineStateWithFunction:ctx->function_relu error:nil];
-        fprintf(stderr, "%s: loaded kernel_relu: %p\n", __func__, (void *) ctx->pipeline_relu);
-
-        ctx->function_soft_max = [ctx->library newFunctionWithName:@"kernel_soft_max" constantValues:constants error:nil];
-        ctx->pipeline_soft_max = [ctx->device newComputePipelineStateWithFunction:ctx->function_soft_max error:nil];
-        fprintf(stderr, "%s: loaded kernel_soft_max: %p\n", __func__, (void *) ctx->pipeline_soft_max);
-    }
-
-#ifdef GGML_MTL_HEAP
-    // MTLHeap approach
-
-    // pin ctx_data memory to GPU
-    // use MTLStorageModeShared to allow us to initialize the weights from the CPU
-    // TODO: how to use MTLStorageModeManaged?
-    // TODO: see if we can avoid this copy somehow
-    {
-        const void * mem_buffer = ggml_get_mem_buffer(ctx_data);
-        const size_t mem_size   = ggml_get_mem_size(ctx_data);
-
-        MTLHeapDescriptor * heap_desc = [MTLHeapDescriptor new];
-        heap_desc.storageMode = MTLStorageModeShared;
-        heap_desc.size        = mem_size;
-
-        printf("heap_desc.size = %zu\n", mem_size);
-
-        ctx->heap_data = [ctx->device newHeapWithDescriptor:heap_desc];
-        [ctx->heap_data setPurgeableState:MTLPurgeableStateNonVolatile]; // TODO: is this needed?
-        ctx->heap_data.label = @"heap_data";
-
-        printf("ctx->heap_data.size = %zu\n", [ctx->heap_data size]);
-
-        id<MTLBuffer> buffer = [ctx->heap_data newBufferWithLength:mem_size options:MTLResourceStorageModeShared];
-        if (!buffer) {
-            fprintf(stderr, "%s: error: failed to allocate buffer\n", __func__);
-            exit(1);
-        }
-
-        // copy data from CPU to GPU
-        memcpy([buffer contents], mem_buffer, mem_size);
-
-        fprintf(stderr, "%s: allocated data heap, size = %zu\n", __func__, mem_size);
-    }
-
-    // pin ctx_eval memory to GPU
-    // this heap will be used for the intermediate results of the evaluation
-    {
-        const size_t mem_size = ggml_get_mem_size(ctx_eval);
-
-        MTLHeapDescriptor * heap_desc = [MTLHeapDescriptor new];
-        heap_desc.storageMode = MTLStorageModePrivate; // GPU only
-        heap_desc.size        = mem_size;
-
-        ctx->heap_eval = [ctx->device newHeapWithDescriptor:heap_desc];
-        [ctx->heap_eval setPurgeableState:MTLPurgeableStateNonVolatile]; // TODO: is this needed?
-
-        fprintf(stderr, "%s: allocated eval heap, size = %zu\n", __func__, mem_size);
-    }
-#else
-    // MTLBuffer approach
-
-    // pin ctx_data memory to GPU
-    // use MTLStorageModeShared to allow us to initialize the weights from the CPU
-    // TODO: how to use MTLStorageModeManaged?
-    // TODO: see if we can avoid this copy somehow
-    {
-        const void * mem_buffer = ggml_get_mem_buffer(ctx_data);
-        const size_t mem_size   = ggml_get_mem_size(ctx_data);
-
-        ctx->buffer_data = [ctx->device newBufferWithBytes:mem_buffer length:mem_size options:MTLResourceStorageModeShared];
-
-        fprintf(stderr, "%s: allocated data buffer, size = %zu\n", __func__, mem_size);
-    }
-
-    // pin ctx_eval memory to GPU
-    // this buffer will be used for the intermediate results of the evaluation
-    {
-        const size_t mem_size = ggml_get_mem_size(ctx_eval);
-
-        ctx->buffer_eval = [ctx->device newBufferWithLength:mem_size options:MTLResourceStorageModePrivate];
-
-        fprintf(stderr, "%s: allocated eval buffer, size = %zu\n", __func__, mem_size);
-    }
-#endif
-
-    // allocate buffer for result extraction
-    {
-        const size_t mem_size = ggml_nbytes(gf->nodes[gf->n_nodes - 1]);
-
-        ctx->out = [ctx->device newBufferWithLength:mem_size options:MTLResourceStorageModeShared];
-
-        fprintf(stderr, "%s: allocated out buffer, size = %zu\n", __func__, mem_size);
-    }
-
-    return ctx;
-}
-
-void mnist_mtl_free(struct ggml_mtl_context * ctx) {
-    fprintf(stderr, "%s: deallocating\n", __func__);
-
-    free(ctx);
-}
-
-#ifdef GGML_MTL_HEAP
-
-// make a view of the respective MTL heap
-id<MTLBuffer> mnist_mtl_get_buffer_on_heap(struct ggml_mtl_context * ctx, struct ggml_tensor * t) {
-    const int64_t offs_data = (int64_t) t->data - (int64_t) ggml_get_mem_buffer(ctx->ctx_data);
-    const int64_t offs_eval = (int64_t) t->data - (int64_t) ggml_get_mem_buffer(ctx->ctx_eval);
-
-    const bool is_data = (offs_eval < 0) || (offs_data >= 0 && offs_data < offs_eval);
-
-    const size_t t_size = ggml_nbytes(t);
-    const size_t t_offs = is_data ? offs_data : offs_eval;
-
-    id<MTLBuffer> result;
-
-    if (is_data) {
-        fprintf(stderr, "%s: data tensor '%16s', offs = %8ld, size = %8ld\n", __func__, t->name, t_offs, t_size);
-        result = [ctx->heap_data newBufferWithLength:t_size options:MTLResourceStorageModeShared offset:t_offs];
-    } else {
-        fprintf(stderr, "%s: eval tensor '%16s', offs = %8ld, size = %8ld\n", __func__, t->name, t_offs, t_size);
-        result = [ctx->heap_eval newBufferWithLength:t_size options:MTLResourceStorageModePrivate offset:t_offs];
-    }
-
-    if (result == nil) {
-        fprintf(stderr, "%s: error: buffer is nil\n", __func__);
-        GGML_ASSERT(false);
-    }
-
-    return result;
-}
-
-#else
-
-// get data / eval buffer + offset
-id<MTLBuffer> mnist_mtl_get_buffer(struct ggml_mtl_context * ctx, struct ggml_tensor * t, size_t * offs) {
-    const int64_t offs_data = (int64_t) t->data - (int64_t) ggml_get_mem_buffer(ctx->ctx_data);
-    const int64_t offs_eval = (int64_t) t->data - (int64_t) ggml_get_mem_buffer(ctx->ctx_eval);
-
-    const bool is_data = (offs_eval < 0) || (offs_data >= 0 && offs_data < offs_eval);
-
-    const size_t t_size = ggml_nbytes(t);
-    const size_t t_offs = is_data ? offs_data : offs_eval;
-
-    id<MTLBuffer> result;
-
-    if (is_data) {
-        fprintf(stderr, "%s: data tensor '%16s', offs = %8ld, size = %8ld\n", __func__, t->name, t_offs, t_size);
-        result = ctx->buffer_data;
-    } else {
-        fprintf(stderr, "%s: eval tensor '%16s', offs = %8ld, size = %8ld\n", __func__, t->name, t_offs, t_size);
-        result = ctx->buffer_eval;
-    }
-
-    if (result == nil) {
-        fprintf(stderr, "%s: error: buffer is nil\n", __func__);
-        GGML_ASSERT(false);
-    }
-
-    if (offs != nil) {
-        *offs = t_offs;
-    }
-
-    return result;
-}
-
-#endif
-
-int mnist_mtl_eval(
-        struct ggml_mtl_context * ctx,
-        struct ggml_cgraph      * gf) {
-    fprintf(stderr, "%s: evaluating\n", __func__);
-
-    id<MTLCommandBuffer> command_buffer  = [ctx->queue commandBuffer];
-    id<MTLComputeCommandEncoder> encoder = nil;
-
-    size_t offs_src0;
-    size_t offs_src1;
-    size_t offs_dst;
-
-    // copy the input data to the GPU
-    {
-        struct ggml_tensor * inp = ggml_graph_get_tensor(gf, "input");
-
-        id<MTLBuffer> id_dst = mnist_mtl_get_buffer(ctx, inp, &offs_src0);
-
-        memcpy((char *) id_dst.contents + offs_src0, inp->data, ggml_nbytes(inp));
-    }
-
-    for (int i = 0; i < gf->n_nodes; ++i) {
-        fprintf(stderr, "%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
-
-        switch (gf->nodes[i]->op) {
-            case GGML_OP_ADD:
-                {
-                    if (encoder == nil) {
-                        encoder = [command_buffer computeCommandEncoder];
-                    }
-
-                    id<MTLBuffer> id_src0 = mnist_mtl_get_buffer(ctx, gf->nodes[i]->src[0], &offs_src0);
-                    id<MTLBuffer> id_src1 = mnist_mtl_get_buffer(ctx, gf->nodes[i]->src[1], &offs_src1);
-                    id<MTLBuffer> id_dst  = mnist_mtl_get_buffer(ctx, gf->nodes[i],         &offs_dst);
-
-                    [encoder setComputePipelineState:ctx->pipeline_add];
-                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                    [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
-                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
-
-                    const int64_t n = ggml_nelements(gf->nodes[i]);
-
-                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                } break;
-            case GGML_OP_UNARY:
-                switch (ggml_get_unary_op(gf->nodes[i])) {
-                    case GGML_UNARY_OP_RELU:
-                        {
-                            if (encoder == nil) {
-                                encoder = [command_buffer computeCommandEncoder];
-                            }
-
-                            id<MTLBuffer> id_src = mnist_mtl_get_buffer(ctx, gf->nodes[i]->src[0], &offs_src0);
-                            id<MTLBuffer> id_dst = mnist_mtl_get_buffer(ctx, gf->nodes[i],       &offs_dst);
-
-                            [encoder setComputePipelineState:ctx->pipeline_relu];
-                            [encoder setBuffer:id_src offset:offs_src0 atIndex:0];
-                            [encoder setBuffer:id_dst offset:offs_dst  atIndex:1];
-
-                            const int64_t n = ggml_nelements(gf->nodes[i]);
-
-                            [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                        } break;
-                    default:
-                        {
-                            fprintf(stderr, "%s: node %3d, op = %8s, unary op %d not implemented\n", __func__, i, ggml_op_name(gf->nodes[i]->op), (int) ggml_get_unary_op(gf->nodes[i]));
-                            GGML_ASSERT(false);
-                            return -1;
-                        }
-                        break;
-                } break;
-            case GGML_OP_SOFT_MAX:
-                {
-#if 0
-                    // NOTE: MPSMatrixSoftMax is not working properly, probably there is a bug
-
-                    if (encoder != nil) {
-                        [encoder endEncoding];
-                        encoder = nil;
-                    }
-
-                    // use MPSMatrixSoftMax
-                    id<MTLBuffer> id_src = mnist_mtl_get_buffer(ctx, gf->nodes[i]->src0, &offs_src0);
-                    id<MTLBuffer> id_dst = mnist_mtl_get_buffer(ctx, gf->nodes[i],       &offs_dst);
-
-                    MPSMatrixDescriptor * desc = [MPSMatrixDescriptor
-                        matrixDescriptorWithRows:1 columns:gf->nodes[i]->ne[0] rowBytes:gf->nodes[i]->nb[1] dataType:MPSDataTypeFloat32];
-
-                    MPSMatrix * mat_src = [[MPSMatrix alloc] initWithBuffer:id_src offset:offs_src0 descriptor:desc];
-                    MPSMatrix * mat_dst = [[MPSMatrix alloc] initWithBuffer:id_dst offset:offs_dst  descriptor:desc];
-
-                    MPSMatrixSoftMax * softmax = [[MPSMatrixSoftMax alloc] initWithDevice:ctx->device];
-
-                    [softmax encodeToCommandBuffer:command_buffer inputMatrix:mat_src resultMatrix:mat_dst];
-#else
-                    if (encoder == nil) {
-                        encoder = [command_buffer computeCommandEncoder];
-                    }
-
-                    id<MTLBuffer> id_src = mnist_mtl_get_buffer(ctx, gf->nodes[i]->src[0], &offs_src0);
-                    id<MTLBuffer> id_dst = mnist_mtl_get_buffer(ctx, gf->nodes[i],       &offs_dst);
-
-                    [encoder setComputePipelineState:ctx->pipeline_soft_max];
-                    [encoder setBuffer:id_src offset:offs_src0 atIndex:0];
-                    [encoder setBuffer:id_dst offset:offs_dst  atIndex:1];
-
-                    [encoder dispatchThreadgroups:MTLSizeMake(1, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-#endif
-                } break;
-            case GGML_OP_MUL_MAT:
-                {
-                    if (encoder != nil) {
-                        [encoder endEncoding];
-                        encoder = nil;
-                    }
-
-                    // use MPSMatrixMultiplication
-                    id<MTLBuffer> id_src0 = mnist_mtl_get_buffer(ctx, gf->nodes[i]->src[0], &offs_src0);
-                    id<MTLBuffer> id_src1 = mnist_mtl_get_buffer(ctx, gf->nodes[i]->src[1], &offs_src1);
-                    id<MTLBuffer> id_dst  = mnist_mtl_get_buffer(ctx, gf->nodes[i],         &offs_dst);
-
-                    const int64_t ncols0 = gf->nodes[i]->src[0]->ne[0];
-                    const int64_t nrows0 = gf->nodes[i]->src[0]->ne[1];
-
-                    const int64_t ncols1 = gf->nodes[i]->src[1]->ne[0];
-                    const int64_t nrows1 = gf->nodes[i]->src[1]->ne[1];
-
-                    const int64_t ncols2 = gf->nodes[i]->ne[0];
-                    const int64_t nrows2 = gf->nodes[i]->ne[1];
-
-                    GGML_ASSERT(ncols0 == ncols1);
-
-                    MPSMatrixDescriptor * desc0 = [MPSMatrixDescriptor
-                        matrixDescriptorWithRows:nrows0 columns:ncols0 rowBytes:gf->nodes[i]->src[0]->nb[1] dataType:MPSDataTypeFloat32];
-                    MPSMatrixDescriptor * desc1 = [MPSMatrixDescriptor
-                        matrixDescriptorWithRows:nrows1 columns:ncols1 rowBytes:gf->nodes[i]->src[1]->nb[1] dataType:MPSDataTypeFloat32];
-                    MPSMatrixDescriptor * desc2 = [MPSMatrixDescriptor
-                        matrixDescriptorWithRows:nrows2 columns:ncols2 rowBytes:gf->nodes[i]->nb[1] dataType:MPSDataTypeFloat32];
-
-                    MPSMatrix * mat_src0 = [[MPSMatrix alloc] initWithBuffer:id_src0 offset:offs_src0 descriptor:desc0];
-                    MPSMatrix * mat_src1 = [[MPSMatrix alloc] initWithBuffer:id_src1 offset:offs_src1 descriptor:desc1];
-                    MPSMatrix * mat_dst  = [[MPSMatrix alloc] initWithBuffer:id_dst  offset:offs_dst  descriptor:desc2];
-
-                    MPSMatrixMultiplication * mul = [[MPSMatrixMultiplication alloc] initWithDevice:ctx->device
-                        transposeLeft:false transposeRight:true resultRows:nrows1 resultColumns:nrows0 interiorColumns:ncols0 alpha:1.0 beta:0.0];
-
-                    [mul encodeToCommandBuffer:command_buffer leftMatrix:mat_src1 rightMatrix:mat_src0 resultMatrix:mat_dst];
-                } break;
-            default:
-                {
-                    fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
-                    GGML_ASSERT(false);
-                    return -1;
-                }
-        }
-    }
-
-    // extract results from the GPU
-    {
-        if (encoder != nil) {
-            [encoder endEncoding];
-            encoder = nil;
-        }
-
-        struct ggml_tensor * out = gf->nodes[gf->n_nodes - 1];
-
-        id<MTLBuffer> id_src = mnist_mtl_get_buffer(ctx, out, &offs_src0);
-        id<MTLBuffer> id_dst = ctx->out;
-
-        id<MTLBlitCommandEncoder> encoder_blit = [command_buffer blitCommandEncoder];
-        [encoder_blit copyFromBuffer:id_src sourceOffset:offs_src0 toBuffer:id_dst destinationOffset:0 size:ggml_nbytes(out)];
-        [encoder_blit endEncoding];
-    }
-
-    [command_buffer commit];
-    [command_buffer waitUntilCompleted];
-
-    {
-        const double time_elapsed = [command_buffer GPUEndTime] - [command_buffer GPUStartTime];
-        fprintf(stderr, "%s: time elapsed = %f\n", __func__, time_elapsed);
-    }
-
-    // select the most probable digit
-    int result = -1;
-    {
-        const float * probs = ctx->out.contents;
-
-        float prob = probs[0];
-
-        for (int i = 0; i < 10; ++i) {
-            fprintf(stderr, "%s: probs[%2d] = %f\n", __func__, i, probs[i]);
-
-            if (probs[i] > prob) {
-                result = i;
-                prob = probs[i];
-            }
-        }
-    }
-
-    return result;
-}
diff --git a/ggml/examples/mnist/main.cpp b/ggml/examples/mnist/main.cpp
deleted file mode 100644
index 3580858..0000000
--- a/ggml/examples/mnist/main.cpp
+++ /dev/null
@@ -1,328 +0,0 @@
-#include "ggml/ggml.h"
-
-#include "common.h"
-
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <ctime>
-#include <fstream>
-#include <string>
-#include <vector>
-#include <algorithm>
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-// default hparams
-struct mnist_hparams {
-    int32_t n_input   = 784;
-    int32_t n_hidden  = 500;
-    int32_t n_classes = 10;
-};
-
-struct mnist_model {
-    mnist_hparams hparams;
-
-    struct ggml_tensor * fc1_weight;
-    struct ggml_tensor * fc1_bias;
-
-    struct ggml_tensor * fc2_weight;
-    struct ggml_tensor * fc2_bias;
-
-    struct ggml_context * ctx;
-};
-
-// load the model's weights from a file
-bool mnist_model_load(const std::string & fname, mnist_model & model) {
-    printf("%s: loading model from '%s'\n", __func__, fname.c_str());
-
-    auto fin = std::ifstream(fname, std::ios::binary);
-    if (!fin) {
-        fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
-        return false;
-    }
-
-    // verify magic
-    {
-        uint32_t magic;
-        fin.read((char *) &magic, sizeof(magic));
-        if (magic != GGML_FILE_MAGIC) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
-            return false;
-        }
-    }
-
-    auto & ctx = model.ctx;
-
-    size_t ctx_size = 0;
-
-    {
-        const auto & hparams = model.hparams;
-
-        const int n_input   = hparams.n_input;
-        const int n_hidden  = hparams.n_hidden;
-        const int n_classes = hparams.n_classes;
-
-        ctx_size += n_input * n_hidden * ggml_type_size(GGML_TYPE_F32); // fc1 weight
-        ctx_size +=           n_hidden * ggml_type_size(GGML_TYPE_F32); // fc1 bias
-
-        ctx_size += n_hidden * n_classes * ggml_type_size(GGML_TYPE_F32); // fc2 weight
-        ctx_size +=            n_classes * ggml_type_size(GGML_TYPE_F32); // fc2 bias
-
-        printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
-    }
-
-    // create the ggml context
-    {
-        struct ggml_init_params params = {
-            /*.mem_size   =*/ ctx_size + 1024*1024,
-            /*.mem_buffer =*/ NULL,
-            /*.no_alloc   =*/ false,
-        };
-
-        model.ctx = ggml_init(params);
-        if (!model.ctx) {
-            fprintf(stderr, "%s: ggml_init() failed\n", __func__);
-            return false;
-        }
-    }
-
-    // Read FC1 layer 1
-    {
-        // Read dimensions
-        int32_t n_dims;
-        fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
-
-        {
-            int32_t ne_weight[2] = { 1, 1 };
-            for (int i = 0; i < n_dims; ++i) {
-                fin.read(reinterpret_cast<char *>(&ne_weight[i]), sizeof(ne_weight[i]));
-            }
-
-            // FC1 dimensions taken from file, eg. 768x500
-            model.hparams.n_input  = ne_weight[0];
-            model.hparams.n_hidden = ne_weight[1];
-
-            model.fc1_weight = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, model.hparams.n_input, model.hparams.n_hidden);
-            fin.read(reinterpret_cast<char *>(model.fc1_weight->data), ggml_nbytes(model.fc1_weight));
-            ggml_set_name(model.fc1_weight, "fc1_weight");
-        }
-
-        {
-            int32_t ne_bias[2] = { 1, 1 };
-            for (int i = 0; i < n_dims; ++i) {
-                fin.read(reinterpret_cast<char *>(&ne_bias[i]), sizeof(ne_bias[i]));
-            }
-
-            model.fc1_bias = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_hidden);
-            fin.read(reinterpret_cast<char *>(model.fc1_bias->data), ggml_nbytes(model.fc1_bias));
-            ggml_set_name(model.fc1_bias, "fc1_bias");
-
-            // just for testing purposes, set some parameters to non-zero
-            model.fc1_bias->op_params[0] = 0xdeadbeef;
-        }
-    }
-
-    // Read FC2 layer 2
-    {
-        // Read dimensions
-        int32_t n_dims;
-        fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
-
-        {
-            int32_t ne_weight[2] = { 1, 1 };
-            for (int i = 0; i < n_dims; ++i) {
-                fin.read(reinterpret_cast<char *>(&ne_weight[i]), sizeof(ne_weight[i]));
-            }
-
-            // FC1 dimensions taken from file, eg. 10x500
-            model.hparams.n_classes = ne_weight[1];
-
-            model.fc2_weight = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, model.hparams.n_hidden, model.hparams.n_classes);
-            fin.read(reinterpret_cast<char *>(model.fc2_weight->data), ggml_nbytes(model.fc2_weight));
-            ggml_set_name(model.fc2_weight, "fc2_weight");
-        }
-
-        {
-            int32_t ne_bias[2] = { 1, 1 };
-            for (int i = 0; i < n_dims; ++i) {
-                fin.read(reinterpret_cast<char *>(&ne_bias[i]), sizeof(ne_bias[i]));
-            }
-
-            model.fc2_bias = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_classes);
-            fin.read(reinterpret_cast<char *>(model.fc2_bias->data), ggml_nbytes(model.fc2_bias));
-            ggml_set_name(model.fc2_bias, "fc2_bias");
-        }
-    }
-
-    fin.close();
-
-    return true;
-}
-
-// evaluate the model
-//
-//   - model:     the model
-//   - n_threads: number of threads to use
-//   - digit:     784 pixel values
-//
-// returns 0 - 9 prediction
-int mnist_eval(
-        const mnist_model & model,
-        const int n_threads,
-        std::vector<float> digit,
-        const char * fname_cgraph
-        ) {
-
-    const auto & hparams = model.hparams;
-
-    static size_t buf_size = hparams.n_input * sizeof(float) * 32;
-    static void * buf = malloc(buf_size);
-
-    struct ggml_init_params params = {
-        /*.mem_size   =*/ buf_size,
-        /*.mem_buffer =*/ buf,
-        /*.no_alloc   =*/ false,
-    };
-
-    struct ggml_context * ctx0 = ggml_init(params);
-    struct ggml_cgraph * gf = ggml_new_graph(ctx0);
-
-    struct ggml_tensor * input = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, hparams.n_input);
-    memcpy(input->data, digit.data(), ggml_nbytes(input));
-    ggml_set_name(input, "input");
-
-    // fc1 MLP = Ax + b
-    ggml_tensor * fc1 = ggml_add(ctx0, ggml_mul_mat(ctx0, model.fc1_weight, input),                model.fc1_bias);
-    ggml_tensor * fc2 = ggml_add(ctx0, ggml_mul_mat(ctx0, model.fc2_weight, ggml_relu(ctx0, fc1)), model.fc2_bias);
-
-    // soft max
-    ggml_tensor * probs = ggml_soft_max(ctx0, fc2);
-    ggml_set_name(probs, "probs");
-
-    // build / export / run the computation graph
-    ggml_build_forward_expand(gf, probs);
-    ggml_graph_compute_with_ctx(ctx0, gf, n_threads);
-
-    //ggml_graph_print   (&gf);
-    ggml_graph_dump_dot(gf, NULL, "mnist.dot");
-
-    if (fname_cgraph) {
-        // export the compute graph for later use
-        // see the "mnist-cpu" example
-        ggml_graph_export(gf, "mnist.ggml");
-
-        fprintf(stderr, "%s: exported compute graph to '%s'\n", __func__, fname_cgraph);
-    }
-
-    const float * probs_data = ggml_get_data_f32(probs);
-
-    const int prediction = std::max_element(probs_data, probs_data + 10) - probs_data;
-
-    ggml_free(ctx0);
-
-    return prediction;
-}
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-int wasm_eval(uint8_t * digitPtr) {
-    mnist_model model;
-    if (!mnist_model_load("models/mnist/ggml-model-f32.bin", model)) {
-        fprintf(stderr, "error loading model\n");
-        return -1;
-    }
-    std::vector<float> digit(digitPtr, digitPtr + 784);
-    int result = mnist_eval(model, 1, digit, nullptr);
-    ggml_free(model.ctx);
-
-    return result;
-}
-
-int wasm_random_digit(char * digitPtr) {
-    auto fin = std::ifstream("models/mnist/t10k-images.idx3-ubyte", std::ios::binary);
-    if (!fin) {
-        fprintf(stderr, "failed to open digits file\n");
-        return 0;
-    }
-    srand(time(NULL));
-
-    // Seek to a random digit: 16-byte header + 28*28 * (random 0 - 10000)
-    fin.seekg(16 + 784 * (rand() % 10000));
-    fin.read(digitPtr, 784);
-
-    return 1;
-}
-
-#ifdef __cplusplus
-}
-#endif
-
-int main(int argc, char ** argv) {
-    srand(time(NULL));
-    ggml_time_init();
-
-    if (argc != 3) {
-        fprintf(stderr, "Usage: %s models/mnist/ggml-model-f32.bin models/mnist/t10k-images.idx3-ubyte\n", argv[0]);
-        exit(0);
-    }
-
-    uint8_t buf[784];
-    mnist_model model;
-    std::vector<float> digit;
-
-    // load the model
-    {
-        const int64_t t_start_us = ggml_time_us();
-
-        if (!mnist_model_load(argv[1], model)) {
-            fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, "models/ggml-model-f32.bin");
-            return 1;
-        }
-
-        const int64_t t_load_us = ggml_time_us() - t_start_us;
-
-        fprintf(stdout, "%s: loaded model in %8.2f ms\n", __func__, t_load_us / 1000.0f);
-    }
-
-    // read a random digit from the test set
-    {
-        std::ifstream fin(argv[2], std::ios::binary);
-        if (!fin) {
-            fprintf(stderr, "%s: failed to open '%s'\n", __func__, argv[2]);
-            return 1;
-        }
-
-        // seek to a random digit: 16-byte header + 28*28 * (random 0 - 10000)
-        fin.seekg(16 + 784 * (rand() % 10000));
-        fin.read((char *) &buf, sizeof(buf));
-    }
-
-    // render the digit in ASCII
-    {
-        digit.resize(sizeof(buf));
-
-        for (int row = 0; row < 28; row++) {
-            for (int col = 0; col < 28; col++) {
-                fprintf(stderr, "%c ", (float)buf[row*28 + col] > 230 ? '*' : '_');
-                digit[row*28 + col] = ((float)buf[row*28 + col]);
-            }
-
-            fprintf(stderr, "\n");
-        }
-
-        fprintf(stderr, "\n");
-    }
-
-    const int prediction = mnist_eval(model, 1, digit, "mnist.ggml");
-
-    fprintf(stdout, "%s: predicted digit is %d\n", __func__, prediction);
-
-    ggml_free(model.ctx);
-
-    return 0;
-}
diff --git a/ggml/examples/mnist/mnist-cnn.py b/ggml/examples/mnist/mnist-cnn.py
deleted file mode 100644
index 35dda60..0000000
--- a/ggml/examples/mnist/mnist-cnn.py
+++ /dev/null
@@ -1,101 +0,0 @@
-#!/usr/bin/env python3
-import sys
-import gguf
-import numpy as np
-from tensorflow import keras
-from tensorflow.keras import layers
-
-def train(model_name):
-    # Model / data parameters
-    num_classes = 10
-    input_shape = (28, 28, 1)
-
-    # Load the data and split it between train and test sets
-    (x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()
-
-    # Scale images to the [0, 1] range
-    x_train = x_train.astype("float32") / 255
-    x_test = x_test.astype("float32") / 255
-    # Make sure images have shape (28, 28, 1)
-    x_train = np.expand_dims(x_train, -1)
-    x_test = np.expand_dims(x_test, -1)
-    print("x_train shape:", x_train.shape)
-    print(x_train.shape[0], "train samples")
-    print(x_test.shape[0], "test samples")
-
-    # convert class vectors to binary class matrices
-    y_train = keras.utils.to_categorical(y_train, num_classes)
-    y_test = keras.utils.to_categorical(y_test, num_classes)
-
-    model = keras.Sequential(
-        [
-            keras.Input(shape=input_shape),
-            layers.Conv2D(32, kernel_size=(3, 3), activation="relu"),
-            layers.MaxPooling2D(pool_size=(2, 2)),
-            layers.Conv2D(64, kernel_size=(3, 3), activation="relu"),
-            layers.MaxPooling2D(pool_size=(2, 2)),
-            layers.Flatten(),
-            layers.Dropout(0.5),
-            layers.Dense(num_classes, activation="softmax"),
-        ]
-    )
-
-    model.summary()
-    batch_size = 128
-    epochs = 15
-    model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
-    model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.1)
-
-    score = model.evaluate(x_test, y_test, verbose=0)
-    print("Test loss:", score[0])
-    print("Test accuracy:", score[1])
-    model.save(model_name)
-    print("Keras model saved to '" + model_name + "'")
-
-def convert(model_name):
-    model = keras.models.load_model(model_name)
-    gguf_model_name = model_name + ".gguf"
-    gguf_writer = gguf.GGUFWriter(gguf_model_name, "mnist-cnn")
-
-    kernel1 = model.layers[0].weights[0].numpy()
-    kernel1 = np.moveaxis(kernel1, [2,3], [0,1])
-    kernel1 = kernel1.astype(np.float16)
-    gguf_writer.add_tensor("kernel1", kernel1, raw_shape=(32, 1, 3, 3))
-
-    bias1 = model.layers[0].weights[1].numpy()
-    bias1 = np.repeat(bias1, 26*26)
-    gguf_writer.add_tensor("bias1", bias1, raw_shape=(1, 32, 26, 26))
-
-    kernel2 = model.layers[2].weights[0].numpy()
-    kernel2 = np.moveaxis(kernel2, [0,1,2,3], [2,3,1,0])
-    kernel2 = kernel2.astype(np.float16)
-    gguf_writer.add_tensor("kernel2", kernel2, raw_shape=(64, 32, 3, 3))
-
-    bias2 = model.layers[2].weights[1].numpy()
-    bias2 = np.repeat(bias2, 11*11)
-    gguf_writer.add_tensor("bias2", bias2, raw_shape=(1, 64, 11, 11))
-
-    dense_w = model.layers[-1].weights[0].numpy()
-    dense_w = dense_w.transpose()
-    gguf_writer.add_tensor("dense_w", dense_w, raw_shape=(10, 1600))
-
-    dense_b = model.layers[-1].weights[1].numpy()
-    gguf_writer.add_tensor("dense_b", dense_b)
-
-    gguf_writer.write_header_to_file()
-    gguf_writer.write_kv_data_to_file()
-    gguf_writer.write_tensors_to_file()
-    gguf_writer.close()
-    print("Model converted and saved to '{}'".format(gguf_model_name))
-
-if __name__ == '__main__':
-    if len(sys.argv) < 3:
-        print("Usage: %s <train|convert> <model_name>".format(sys.argv[0]))
-        sys.exit(1)
-    if sys.argv[1] == 'train':
-        train(sys.argv[2])
-    elif sys.argv[1] == 'convert':
-        convert(sys.argv[2])
-    else:
-        print("Usage: %s <train|convert> <model_name>".format(sys.argv[0]))
-        sys.exit(1)
diff --git a/ggml/examples/mpt/CMakeLists.txt b/ggml/examples/mpt/CMakeLists.txt
deleted file mode 100644
index 09408f9..0000000
--- a/ggml/examples/mpt/CMakeLists.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-#
-# mpt
-
-set(TEST_TARGET mpt)
-add_executable(${TEST_TARGET} main.cpp)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)
-
-#
-# mpt-quantize
-
-set(TEST_TARGET mpt-quantize)
-add_executable(${TEST_TARGET} quantize.cpp)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)
diff --git a/ggml/examples/mpt/README.md b/ggml/examples/mpt/README.md
deleted file mode 100644
index 39f46ba..0000000
--- a/ggml/examples/mpt/README.md
+++ /dev/null
@@ -1,27 +0,0 @@
-# MPT
-
-Ref: https://github.com/mosaicml/llm-foundry#mpt
-
-## Usage
-
-```bash
-# get the repo and build it
-git clone https://github.com/ggerganov/ggml
-cd ggml
-mkdir build && cd build
-cmake ..
-make -j
-
-# get the model from HuggingFace
-# be sure to have git-lfs installed
-git clone https://huggingface.co/mosaicml/mpt-30b
-
-# convert model to FP16
-python3 ../examples/mpt/convert-h5-to-ggml.py ./mpt-30b 1
-
-# run inference using FP16 precision
-./bin/mpt -m ./mpt-30b/ggml-model-f16.bin -p "I believe the meaning of life is" -t 8 -n 64
-
-# quantize the model to 5-bits using Q5_0 quantization
-./bin/mpt-quantize ./mpt-30b/ggml-model-f16.bin ./mpt-30b/ggml-model-q5_0.bin q5_0
-```
diff --git a/ggml/examples/mpt/convert-h5-to-ggml.py b/ggml/examples/mpt/convert-h5-to-ggml.py
deleted file mode 100644
index ccd6459..0000000
--- a/ggml/examples/mpt/convert-h5-to-ggml.py
+++ /dev/null
@@ -1,169 +0,0 @@
-import os
-import struct
-import sys
-
-import torch
-from transformers import AutoConfig, AutoTokenizer
-
-
-# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
-def bytes_to_unicode():
-    """
-    Returns list of utf-8 byte and a corresponding list of unicode strings.
-    The reversible bpe codes work on unicode strings.
-    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
-    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
-    This is a signficant percentage of your normal, say, 32K bpe vocab.
-    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
-    And avoids mapping to whitespace/control characters the bpe code barfs on.
-    """
-    bs = (
-        list(range(ord("!"), ord("~") + 1))
-        + list(range(ord("¡"), ord("¬") + 1))
-        + list(range(ord("®"), ord("ÿ") + 1))
-    )
-    cs = bs[:]
-    n = 0
-    for b in range(2**8):
-        if b not in bs:
-            bs.append(b)
-            cs.append(2**8 + n)
-            n += 1
-
-    cs = [chr(n) for n in cs]
-
-    return dict(zip(bs, cs))
-
-
-def count_model_parts(dir_model: str) -> int:
-    """Returns the number of model parts in the model directory."""
-    num_parts = 0
-    for filename in os.listdir(dir_model):
-        if filename.startswith("pytorch_model-"):
-            num_parts += 1
-
-    if num_parts > 0:
-        print(f"Found {num_parts} model parts in {dir_model}")
-    return num_parts
-
-
-if len(sys.argv) < 3:
-    print("Usage: convert-h5-to-ggml.py dir-model [use-f32]\n")
-    print("  ftype == 0 -> float32")
-    print("  ftype == 1 -> float16")
-    sys.exit(1)
-
-
-# output in the same directory as the model
-dir_model = sys.argv[1]
-# get number of model parts
-num_parts = count_model_parts(dir_model)
-
-# possible data types
-#   ftype == 0 -> float32
-#   ftype == 1 -> float16
-#
-# map from ftype to string
-ftype_str = ["f32", "f16"]
-
-ftype = 1
-if len(sys.argv) > 2:
-    ftype = int(sys.argv[2])
-    if ftype < 0 or ftype > 1:
-        print("Invalid ftype: " + str(ftype))
-        sys.exit(1)
-    fname_out = dir_model + "/ggml-model-" + ftype_str[ftype] + ".bin"
-
-
-tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
-config = AutoConfig.from_pretrained(dir_model, trust_remote_code=True)
-hparams = config.to_dict()
-
-fout = open(fname_out, "wb")
-
-fout.write(struct.pack("i", 0x67676D6C))  # magic: ggml in hex
-fout.write(struct.pack("i", hparams["d_model"]))
-fout.write(struct.pack("i", hparams["max_seq_len"]))
-fout.write(struct.pack("i", hparams["n_heads"]))
-fout.write(struct.pack("i", hparams["n_layers"]))
-fout.write(struct.pack("i", hparams["vocab_size"]))
-fout.write(struct.pack("f", hparams["attn_config"]["alibi_bias_max"]))
-fout.write(struct.pack("f", hparams["attn_config"]["clip_qkv"] or 0.0))
-fout.write(struct.pack("i", ftype))
-
-vocab_size = hparams["vocab_size"]
-
-encoder = tokenizer.vocab
-# Add added_tokens (special tokens) to the encoder
-encoder.update(tokenizer.get_added_vocab())
-
-byte_encoder = bytes_to_unicode()
-byte_decoder = {v: k for k, v in byte_encoder.items()}
-
-counter = 0
-# sort by value
-for key in sorted(encoder, key=encoder.get):
-    # workaround for key error when c not found
-    text = ""
-    for c in key:
-        if c not in byte_decoder:
-            text += c
-        else:
-            text += chr(byte_decoder[c])
-    text = bytearray(text, encoding="utf-8")
-    fout.write(struct.pack("i", len(text)))
-    fout.write(text)
-    counter += 1
-
-# Repeat last token until vocab_size
-while counter < vocab_size:
-    fout.write(struct.pack("i", len(text)))
-    fout.write(text)
-    counter += 1
-
-if num_parts == 0:
-    part_names = ("pytorch_model.bin",)
-else:
-    part_names = (
-        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
-    )
-
-for part_name in part_names:
-    print(f"\n* Loading part: {part_name}")
-    model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
-
-    for name in model_part.keys():
-        data = model_part[name].squeeze()
-        n_dims = len(data.shape)
-
-        # ftype == 0 -> float32, ftype == 1 -> float16
-        # default type is fp32
-        ftype_cur = 0
-        if ftype == 1 and name[-7:] == ".weight" and n_dims > 1:
-            ftype_cur = 1
-        data = data.to(dtype=torch.float16 if ftype_cur == 1 else torch.float32).numpy()
-
-        print(
-            "Processing variable: " + name + " with shape: ",
-            data.shape,
-            "->",
-            data.dtype,
-        )
-
-        # header
-        str = name.encode("utf-8")
-        fout.write(struct.pack("iii", n_dims, len(str), ftype_cur))
-        for i in range(n_dims):
-            fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
-        fout.write(str)
-
-        # data
-        data.tofile(fout)
-
-    # release memory
-    del model_part
-
-fout.close()
-
-print("Done. Output file: " + fname_out)
-print("")
diff --git a/ggml/examples/mpt/main.cpp b/ggml/examples/mpt/main.cpp
deleted file mode 100644
index a16367c..0000000
--- a/ggml/examples/mpt/main.cpp
+++ /dev/null
@@ -1,1042 +0,0 @@
-#include "ggml/ggml.h"
-
-#include "common-ggml.h"
-#include "common.h"
-
-#include <cmath>
-#include <cstddef>
-#include <cstdio>
-#include <cstring>
-#include <fstream>
-#include <cinttypes>
-#include <map>
-#include <string>
-#include <utility>
-#include <vector>
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-// no defaults for now
-struct mpt_hparams {
-    int32_t d_model      = 0;
-    int32_t max_seq_len  = 0;
-    int32_t n_heads      = 0;
-    int32_t n_layers     = 0;
-    int32_t n_vocab      = 0;
-    float alibi_bias_max = 0;
-    float clip_qkv       = 0;
-    int32_t ftype        = 0;
-    int32_t n_ctx        = 0;
-
-};
-
-struct mpt_layer {
-    // pre normalization
-    struct ggml_tensor * norm_1_weight;
-
-    // attention
-    struct ggml_tensor * c_attn_wqkv_weight;
-    struct ggml_tensor * c_attn_out_proj_weight;
-
-    // post normalization
-    struct ggml_tensor * norm_2_weight;
-
-    // ff
-    struct ggml_tensor * ffn_up_proj;
-    struct ggml_tensor * ffn_down_proj;
-};
-
-struct mpt_model {
-    mpt_hparams hparams;
-
-    struct ggml_tensor * wte_weight;    // position embedding
-    struct ggml_tensor * norm_f_weight; // language model head
-
-    std::vector<mpt_layer> layers;
-
-    // key + value memory
-    struct ggml_tensor * memory_k;
-    struct ggml_tensor * memory_v;
-
-    struct ggml_context * ctx;
-    std::map<std::string, struct ggml_tensor *> tensors;
-};
-
-struct mpt_params {
-    int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
-
-    int32_t seed           = -1; // RNG seed
-    int32_t n_predict      = 200; // new tokens to predict
-    int32_t n_batch        = 8; // batch size for prompt processing
-    int32_t n_ctx          = 512;
-
-    std::string model      = ""; // model path
-    std::string prompt     = "";
-    std::string token_test = "";
-
-    bool    perplexity     = false;
-
-    // sampling parameters
-    int32_t top_k          = 0;
-    float   top_p          = 1.0f;
-    float   temp           = 0.8f;
-    int32_t repeat_last_n  = 64;
-    float   repeat_penalty = 1.02f;
-
-};
-
-void mpt_print_usage(int /*argc*/, char ** argv, const mpt_params & params) {
-    fprintf(stderr, "usage: %s [options]\n", argv[0]);
-    fprintf(stderr, "\n");
-    fprintf(stderr, "options:\n");
-    fprintf(stderr, "  -h, --help            show this help message and exit\n");
-    fprintf(stderr, "  -s SEED, --seed SEED  RNG seed (default: -1)\n");
-    fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
-    fprintf(stderr, "  -p PROMPT, --prompt PROMPT\n");
-    fprintf(stderr, "                        prompt to start generation with (default: random)\n");
-    fprintf(stderr, "  -f FNAME, --file FNAME\n");
-    fprintf(stderr, "                        load prompt from a file\n");
-    fprintf(stderr, "  -tt TOKEN_TEST, --token_test TOKEN_TEST\n");
-    fprintf(stderr, "                        test tokenization\n");
-    fprintf(stderr, "  -n N, --n_predict N   number of tokens to predict (default: %d)\n", params.n_predict);
-    fprintf(stderr, "  --top_k N             top-k sampling (default: %d, 0 = n_vocab)\n", params.top_k);
-    fprintf(stderr, "  --top_p N             top-p sampling (default: %.2f)\n", params.top_p);
-    fprintf(stderr, "  --temp N              temperature (default: %.2f)\n", params.temp);
-    fprintf(stderr, "  --repeat-last-n N     last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n);
-    fprintf(stderr, "  --repeat-penalty N    penalize repeat sequence of tokens (default: %.2f, 1.0 = disabled)\n", (double)params.repeat_penalty);
-    fprintf(stderr, "  --perplexity          compute perplexity over the prompt\n");
-    fprintf(stderr, "  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
-    fprintf(stderr, "  -b N, --batch_size N  batch size for prompt processing (default: %d)\n", params.n_batch);
-    fprintf(stderr, "  -m FNAME, --model FNAME\n");
-    fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
-    fprintf(stderr, "\n");
-}
-
-bool mpt_params_parse(int argc, char ** argv, mpt_params & params) {
-    for (int i = 1; i < argc; i++) {
-        std::string arg = argv[i];
-
-        if (arg == "-s" || arg == "--seed") {
-            params.seed = std::stoi(argv[++i]);
-        } else if (arg == "-t" || arg == "--threads") {
-            params.n_threads = std::stoi(argv[++i]);
-        } else if (arg == "-p" || arg == "--prompt") {
-            params.prompt = argv[++i];
-        } else if (arg == "-n" || arg == "--n_predict") {
-            params.n_predict = std::stoi(argv[++i]);
-        } else if (arg == "--top_k") {
-            params.top_k = std::max(1, std::stoi(argv[++i]));
-        } else if (arg == "--top_p") {
-            params.top_p = std::stof(argv[++i]);
-        } else if (arg == "--temp") {
-            params.temp = std::stof(argv[++i]);
-        } else if (arg == "--repeat-last-n") {
-            params.repeat_last_n = std::stof(argv[++i]);
-        } else if (arg == "--repeat-penalty") {
-            params.repeat_penalty = std::stof(argv[++i]);
-        } else if (arg == "--perplexity") {
-            params.perplexity = true;
-        } else if (arg == "-c" || arg == "--ctx-size") {
-            params.n_ctx = std::stoi(argv[++i]);
-        } else if (arg == "-b" || arg == "--batch_size") {
-            params.n_batch = std::stoi(argv[++i]);
-        } else if (arg == "-m" || arg == "--model") {
-            params.model = argv[++i];
-        } else if (arg == "-h" || arg == "--help") {
-            mpt_print_usage(argc, argv, params);
-            exit(0);
-        } else if (arg == "-f" || arg == "--file") {
-            if (++i > argc) {
-                fprintf(stderr, "Invalid file param");
-                break;
-            }
-            std::ifstream file(argv[i]);
-            if (!file) {
-                fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
-                break;
-            }
-            params.prompt.clear();
-            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
-            if (params.prompt.back() == '\n') {
-                params.prompt.pop_back();
-            }
-        } else if (arg == "-tt" || arg == "--token_test") {
-            params.token_test = argv[++i];
-        } else {
-            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
-            mpt_print_usage(argc, argv, params);
-            exit(0);
-        }
-    }
-
-    return true;
-}
-
-// load the model's weights from a file
-bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vocab) {
-    printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
-
-    auto fin = std::ifstream(fname, std::ios::binary);
-    if (!fin) {
-        fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
-        return false;
-    }
-
-    // verify magic
-    {
-        uint32_t magic;
-        fin.read((char *)&magic, sizeof(magic));
-        if (magic != GGML_FILE_MAGIC) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
-            return false;
-        }
-    }
-
-    // load hparams
-    {
-        auto & hparams = model.hparams;
-
-        fin.read((char *) &hparams.d_model,        sizeof(hparams.d_model));
-        fin.read((char *) &hparams.max_seq_len,    sizeof(hparams.max_seq_len));
-        fin.read((char *) &hparams.n_heads,        sizeof(hparams.n_heads));
-        fin.read((char *) &hparams.n_layers,       sizeof(hparams.n_layers));
-        fin.read((char *) &hparams.n_vocab,        sizeof(hparams.n_vocab));
-        fin.read((char *) &hparams.alibi_bias_max, sizeof(hparams.alibi_bias_max));
-        fin.read((char *) &hparams.clip_qkv,       sizeof(hparams.clip_qkv));
-        fin.read((char *) &hparams.ftype,          sizeof(hparams.ftype));
-
-        hparams.n_ctx = std::min(hparams.max_seq_len, hparams.n_ctx);
-
-        const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
-
-        printf("%s: d_model        = %d\n", __func__, hparams.d_model);
-        printf("%s: max_seq_len    = %d\n", __func__, hparams.max_seq_len);
-        printf("%s: n_ctx          = %d\n", __func__, hparams.n_ctx);
-        printf("%s: n_heads        = %d\n", __func__, hparams.n_heads);
-        printf("%s: n_layers       = %d\n", __func__, hparams.n_layers);
-        printf("%s: n_vocab        = %d\n", __func__, hparams.n_vocab);
-        printf("%s: alibi_bias_max = %f\n", __func__, hparams.alibi_bias_max);
-        printf("%s: clip_qkv       = %f\n", __func__, hparams.clip_qkv);
-        printf("%s: ftype          = %d\n", __func__, hparams.ftype);
-        printf("%s: qntvr          = %d\n", __func__, qntvr);
-
-        hparams.ftype %= GGML_QNT_VERSION_FACTOR;
-    }
-
-    // load vocab
-    {
-        const int32_t n_vocab = model.hparams.n_vocab;
-
-        std::string word;
-        std::vector<char> buf(128);
-
-        for (int i = 0; i < n_vocab; i++) {
-            uint32_t len;
-            fin.read((char *) &len, sizeof(len));
-
-            buf.resize(len);
-            fin.read((char *) buf.data(), len);
-            word.assign(buf.data(), len);
-
-            // Convert token from utf-8
-            std::wstring word_multibytes = convert_to_wstring(word);
-            word.resize(word_multibytes.size());
-            for (size_t w = 0; w < word_multibytes.size(); w++) {
-                word[w] = uint8_t(word_multibytes[w]);
-            }
-
-            vocab.token_to_id[word] = i;
-            vocab.id_to_token[i] = word;
-        }
-    }
-
-    // for the big tensors, we have the option to store the data in 16-bit
-    // floats or quantized in order to save memory and also to speed up the
-    // computation
-    ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype)(model.hparams.ftype));
-    if (wtype == GGML_TYPE_COUNT) {
-        fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n", __func__, fname.c_str(),
-                model.hparams.ftype);
-        return false;
-    }
-
-    auto & ctx = model.ctx;
-
-    size_t ctx_size = 0;
-
-    const auto & hparams = model.hparams;
-    const size_t n_ctx = hparams.n_ctx;
-
-    {
-        const size_t n_embd = hparams.d_model;
-        const size_t n_layer = hparams.n_layers;
-        const size_t n_vocab = hparams.n_vocab;
-
-        ctx_size += ggml_row_size(wtype,         n_embd * n_vocab); // wte_weight
-        ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd);           // norm_f_weight
-
-        ctx_size += n_layer * (ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_weight
-
-        ctx_size += n_layer * (ggml_row_size(wtype, 3 * n_embd * n_embd)); // attn_Wqkv_weight
-        ctx_size += n_layer * (ggml_row_size(wtype,     n_embd * n_embd)); // attn_out_proj_weight
-
-        ctx_size += n_layer * (ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_weight
-
-        ctx_size += n_layer * (ggml_row_size(wtype, 4 * n_embd * n_embd)); // mlp_mlp_up_weight
-        ctx_size += n_layer * (ggml_row_size(wtype, 4 * n_embd * n_embd)); // mlp_mlp_down_weight
-
-        ctx_size += n_ctx * n_layer * ggml_row_size(GGML_TYPE_F16, n_embd); // memory_k
-        ctx_size += n_ctx * n_layer * ggml_row_size(GGML_TYPE_F16, n_embd); // memory_v
-
-        ctx_size += (1 + 6 * n_layer) * 512; // object overhead
-
-        printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size / (1024.0 * 1024.0));
-    }
-
-    // create the ggml context
-    {
-        struct ggml_init_params params = {
-            /*.mem_size   =*/ ctx_size,
-            /*.mem_buffer =*/ NULL,
-            /*.no_alloc   =*/ false,
-        };
-
-        model.ctx = ggml_init(params);
-        if (!model.ctx) {
-            fprintf(stderr, "%s: ggml_init() failed\n", __func__);
-            return false;
-        }
-    }
-
-    // prepare memory for the weights
-    {
-        const auto & hparams = model.hparams;
-
-        const size_t n_embd = hparams.d_model;
-        const size_t n_layer = hparams.n_layers;
-        const size_t n_vocab = hparams.n_vocab;
-
-        model.layers.resize(n_layer);
-
-        model.wte_weight    = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
-        model.norm_f_weight = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-
-        // map by name
-        model.tensors["transformer.wte.weight"]    = model.wte_weight;
-        model.tensors["transformer.norm_f.weight"] = model.norm_f_weight;
-
-        for (int i = 0; i < (int) n_layer; ++i) {
-            auto & layer = model.layers[i];
-
-            layer.norm_1_weight          = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,     n_embd);
-            layer.c_attn_wqkv_weight     = ggml_new_tensor_2d(ctx, wtype,             n_embd, 3 * n_embd);
-            layer.c_attn_out_proj_weight = ggml_new_tensor_2d(ctx, wtype,             n_embd,     n_embd);
-            layer.norm_2_weight          = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,     n_embd);
-            layer.ffn_up_proj            = ggml_new_tensor_2d(ctx, wtype,             n_embd, 4 * n_embd);
-            layer.ffn_down_proj          = ggml_new_tensor_2d(ctx, wtype,         4 * n_embd,     n_embd);
-
-            // map by name
-            model.tensors["transformer.blocks." + std::to_string(i) + ".norm_1.weight"]        = layer.norm_1_weight;
-            model.tensors["transformer.blocks." + std::to_string(i) + ".attn.Wqkv.weight"]     = layer.c_attn_wqkv_weight;
-            model.tensors["transformer.blocks." + std::to_string(i) + ".attn.out_proj.weight"] = layer.c_attn_out_proj_weight;
-            model.tensors["transformer.blocks." + std::to_string(i) + ".norm_2.weight"]        = layer.norm_2_weight;
-            model.tensors["transformer.blocks." + std::to_string(i) + ".ffn.up_proj.weight"]   = layer.ffn_up_proj;
-            model.tensors["transformer.blocks." + std::to_string(i) + ".ffn.down_proj.weight"] = layer.ffn_down_proj;
-        }
-    }
-
-    // key + value memory
-    {
-        const auto & hparams = model.hparams;
-
-        const size_t n_embd  = hparams.d_model;
-        const size_t n_layer = hparams.n_layers;
-
-        const int64_t n_mem      = n_layer * n_ctx;
-        const int64_t n_elements = n_embd  * n_mem;
-
-        model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
-        model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
-
-        const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
-
-        printf("%s: memory_size = %8.2f MB, n_mem = %" PRId64 "\n", __func__, memory_size / 1024.0 / 1024.0, n_mem);
-    }
-
-    // load weights
-    {
-        int n_tensors = 0;
-        size_t total_size = 0;
-
-        printf("%s: ", __func__);
-
-        while (true) {
-            int32_t n_dims;
-            int32_t length;
-            int32_t ttype;
-
-            fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
-            fin.read(reinterpret_cast<char *>(&length), sizeof(length));
-            fin.read(reinterpret_cast<char *>(&ttype), sizeof(ttype));
-
-            if (fin.eof()) {
-                break;
-            }
-
-            int32_t nelements = 1;
-            int32_t ne[2] = {1, 1};
-            for (int i = 0; i < n_dims; ++i) {
-                fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
-                nelements *= ne[i];
-            }
-
-            std::string name(length, 0);
-            fin.read(&name[0], length);
-
-            if (model.tensors.find(name) == model.tensors.end()) {
-                fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str());
-                return false;
-            }
-
-            auto tensor = model.tensors[name];
-            if (ggml_nelements(tensor) != nelements) {
-                fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.c_str());
-                return false;
-            }
-
-            if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
-                fprintf(stderr,
-                        "%s: tensor '%s' has wrong shape in model file: got [%5d, "
-                        "%5d], expected [%5d, %5d]\n",
-                        __func__, name.c_str(), (int)tensor->ne[0], (int)tensor->ne[1], ne[0], ne[1]);
-                return false;
-            }
-
-            // for debugging
-            if (0) {
-                printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), ne[0], ne[1],
-                       ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor) / 1024.0 / 1024.0, ggml_nbytes(tensor));
-            }
-
-            const size_t bpe = ggml_type_size(ggml_type(ttype));
-
-            if ((nelements * bpe) / ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
-                fprintf(stderr,
-                        "%s: tensor '%s' has wrong size in model file: got %zu, "
-                        "expected %zu\n",
-                        __func__, name.c_str(), ggml_nbytes(tensor), nelements * bpe);
-                return false;
-            }
-
-            fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
-
-            total_size += ggml_nbytes(tensor);
-            if (++n_tensors % 8 == 0) {
-                printf(".");
-                fflush(stdout);
-            }
-        }
-
-        printf(" done\n");
-
-        printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size / 1024.0 / 1024.0, n_tensors);
-    }
-
-    fin.close();
-
-    return true;
-}
-
-// evaluate the transformer
-//
-//   - model:     the model
-//   - n_threads: number of threads to use
-//   - n_past:    the context size so far
-//   - embd_inp:  the embeddings of the tokens in the context
-//   - embd_w:    the predicted logits for the next token
-//
-bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past,
-              const std::vector<gpt_vocab::id> & embd_inp, std::vector<float> & embd_w, bool logits_all, size_t & mem_per_token) {
-    const int N = embd_inp.size();
-
-    const auto & hparams = model.hparams;
-
-    const int n_embd  = hparams.d_model;
-    const int n_layer = hparams.n_layers;
-    const int n_head  = hparams.n_heads;
-    const int n_vocab = hparams.n_vocab;
-    const int n_ctx   = hparams.n_ctx;
-    const float eps   = 1e-5f;
-
-    static size_t buf_size = 256u * 1024 * 1024;
-    static void * buf = malloc(buf_size);
-
-    // use 2 scratch buffers
-    // TODO: very hacky solution - reimplement in a more elegant way
-    static size_t scr0_size = 256u*1024*1024;
-    static void * scr0 = malloc(scr0_size);
-
-    static size_t scr1_size = 256u*1024*1024;
-    static void * scr1 = malloc(scr1_size);
-
-    if (mem_per_token > 0 && mem_per_token * N > buf_size) {
-        const size_t buf_size_new = 1.1 * (mem_per_token * N); // add 10% to account for ggml object overhead
-        // printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__,
-        // buf_size, buf_size_new);
-
-        // reallocate
-        buf_size = buf_size_new;
-        buf = realloc(buf, buf_size);
-        if (buf == nullptr) {
-            fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
-            return false;
-        }
-    }
-
-    struct ggml_init_params params = {
-        /*.mem_size   =*/ buf_size,
-        /*.mem_buffer =*/ buf,
-        /*.no_alloc   =*/ false,
-    };
-
-    struct ggml_context * ctx0 = ggml_init(params);
-    struct ggml_cgraph * gf = ggml_new_graph(ctx0);
-
-    struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
-    memcpy(embd->data, embd_inp.data(), N * ggml_element_size(embd));
-
-    struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.wte_weight, embd);
-
-    for (int il = 0; il < n_layer; ++il) {
-
-        struct ggml_tensor * cur;
-
-        ggml_set_scratch(ctx0, { 0, scr0_size, scr0, });
-
-        // a = self.ln_1(x)
-        {
-            cur = ggml_norm(ctx0, inpL, eps);
-
-            cur = ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].norm_1_weight, cur), cur);
-        }
-
-        // self-attention
-        //  b, _, past_key_value = self.attn(a, past_key_value=past_key_value,
-        //  attn_bias=attn_bias, attention_mask=attention_mask,
-        //  is_causal=is_causal)
-        {
-            // compute QKV
-            cur = ggml_mul_mat(ctx0, model.layers[il].c_attn_wqkv_weight, cur);
-
-            if (model.hparams.clip_qkv > 0.0f) {
-                cur = ggml_clamp(ctx0, cur, -model.hparams.clip_qkv, model.hparams.clip_qkv);
-            }
-
-            struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0 * sizeof(float) * n_embd);
-            struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1 * sizeof(float) * n_embd);
-            struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2 * sizeof(float) * n_embd);
-
-            // store key and value to memory
-            {
-                struct ggml_tensor * k =
-                    ggml_view_1d(ctx0, model.memory_k, N * n_embd,
-                                 (ggml_element_size(model.memory_k) * n_embd) * (il * n_ctx + n_past));
-                struct ggml_tensor * v =
-                    ggml_view_1d(ctx0, model.memory_v, N * n_embd,
-                                 (ggml_element_size(model.memory_v) * n_embd) * (il * n_ctx + n_past));
-
-                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
-                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
-            }
-
-            // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0,
-            // 2, 1, 3) [64, N, 12]
-            struct ggml_tensor * Q = ggml_permute(
-                ctx0, ggml_cpy(ctx0, Qcur, ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd / n_head, n_head, N)), 0, 2,
-                1, 3);
-
-            // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1,
-            // 3) [64, n_past + N, 12]
-            struct ggml_tensor * K =
-                ggml_permute(ctx0,
-                             ggml_reshape_3d(ctx0,
-                                             ggml_view_1d(ctx0, model.memory_k, (n_past + N) * n_embd,
-                                                          il * n_ctx * ggml_element_size(model.memory_k) * n_embd),
-                                             n_embd / n_head, n_head, n_past + N),
-                             0, 2, 1, 3);
-            // K * Q
-            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
-
-            // KQ_scaled = KQ / sqrt(n_embd/n_head)
-            struct ggml_tensor * KQ_scaled =
-                ggml_scale(ctx0, KQ, 1.0f / sqrt(float(n_embd) / n_head));
-
-            struct ggml_tensor * KQ_scaled_alibi =
-                ggml_alibi(ctx0, KQ_scaled, n_past, n_head, model.hparams.alibi_bias_max);
-
-            // KQ_masked = mask_past(KQ_scaled)
-            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past);
-
-            // KQ = soft_max(KQ_masked)
-            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
-
-            // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1,
-            // 2, 0, 3).contiguous() [n_past + N, 64, 12]
-            struct ggml_tensor * V_trans = ggml_cpy(
-                ctx0,
-                ggml_permute(ctx0,
-                             ggml_reshape_3d(ctx0,
-                                             ggml_view_1d(ctx0, model.memory_v, (n_past + N) * n_embd,
-                                                          il * n_ctx * ggml_element_size(model.memory_v) * n_embd),
-                                             n_embd / n_head, n_head, n_past + N),
-                             1, 2, 0, 3),
-                ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd / n_head, n_head));
-
-            // KQV = transpose(V) * KQ_soft_max
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
-
-            // KQV_merged = KQV.permute(0, 2, 1, 3)
-            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
-
-            // cur = KQV_merged.contiguous().view(n_embd, N)
-            cur = ggml_cpy(ctx0, KQV_merged, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
-
-            // projection
-            { cur = ggml_mul_mat(ctx0, model.layers[il].c_attn_out_proj_weight, cur); }
-        }
-
-        inpL = ggml_add(ctx0, inpL, cur);
-
-        ggml_set_scratch(ctx0, { 0, scr1_size, scr1, });
-
-        // m = self.ln_2(x)
-        {
-            cur = ggml_norm(ctx0, inpL, eps);
-
-            cur = ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].norm_2_weight, cur), cur);
-        }
-
-        // n = self.mlp(m)
-        {
-
-            cur = ggml_mul_mat(ctx0, model.layers[il].ffn_up_proj, cur);
-
-            // GELU activation
-            cur = ggml_gelu(ctx0, cur);
-
-            // projection
-            // cur = proj_w*cur + proj_b
-            cur = ggml_mul_mat(ctx0, model.layers[il].ffn_down_proj, cur);
-        }
-
-        // x = x + n
-        inpL = ggml_add(ctx0, inpL, cur);
-    }
-
-    ggml_set_scratch(ctx0, { 0, scr0_size, scr0, });
-
-    // norm
-    {
-        inpL = ggml_norm(ctx0, inpL, eps);
-        // inpL = ln_f_g*inpL
-        inpL = ggml_mul(ctx0, ggml_repeat(ctx0, model.norm_f_weight, inpL), inpL);
-    }
-
-    ggml_set_scratch(ctx0, { 0, 0, nullptr, });
-
-    // output embedding weight tied to input embedding
-    inpL = ggml_mul_mat(ctx0, model.wte_weight, inpL);
-
-    // logits -> probs
-    // inpL = ggml_soft_max(ctx0, inpL);
-
-    // run the computation
-    ggml_build_forward_expand(gf, inpL);
-    ggml_graph_compute_with_ctx(ctx0, gf, n_threads);
-
-    // std::cout << "Qcur" << std::endl;
-    // print_tensor(Qcur);
-
-    // if (n_past%100 == 0) {
-    // ggml_graph_print(&gf);
-    // ggml_graph_dump_dot(&gf, NULL, "mpt-model.dot");
-    // }
-
-    if (logits_all) {
-        // return result for all tokens
-        embd_w.resize(n_vocab *N);
-        memcpy(embd_w.data(), (float *)ggml_get_data(inpL) , sizeof(float) * n_vocab * N);
-    } else {
-        // return result for just the last token
-        embd_w.resize(n_vocab);
-        memcpy(embd_w.data(), (float *)ggml_get_data(inpL) + (n_vocab * (N - 1)), sizeof(float) * n_vocab);
-    }
-
-    if (mem_per_token == 0) {
-        mem_per_token = ggml_used_mem(ctx0) / N;
-    }
-    // printf("used_mem = %zu\n", ggml_used_mem(ctx0));
-
-    ggml_free(ctx0);
-
-    return true;
-}
-
-std::vector<float> softmax(const std::vector<float> & logits) {
-    std::vector<float> probs(logits.size());
-    float max_logit = logits[0];
-    for (float v : logits) max_logit = std::max(max_logit, v);
-    double sum_exp = 0.0;
-    for (size_t i = 0; i < logits.size(); i++) {
-        // Subtract the maximum logit value from the current logit value for numerical stability
-        const float logit = logits[i] - max_logit;
-        const float exp_logit = expf(logit);
-        sum_exp += exp_logit;
-        probs[i] = exp_logit;
-    }
-    for (size_t i = 0; i < probs.size(); i++) probs[i] /= sum_exp;
-    return probs;
-}
-
-int perplexity(const mpt_params & params) {
-    ggml_time_init();
-
-    const int64_t t_main_start_us = ggml_time_us();
-
-    printf("%s: n_threads = %d\n", __func__, params.n_threads);
-    printf("%s: n_batch   = %d\n", __func__, params.n_batch);
-    printf("%s: n_ctx     = %d\n", __func__, params.n_ctx);
-    printf("\n");
-
-    int64_t t_load_us = 0;
-
-    gpt_vocab vocab;
-    mpt_model model;
-
-    model.hparams.n_ctx = params.n_ctx;
-
-    // load the model
-    {
-        const int64_t t_start_us = ggml_time_us();
-
-        if (!mpt_model_load(params.model, model, vocab)) {
-            fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
-            return 1;
-        }
-
-        t_load_us = ggml_time_us() - t_start_us;
-    }
-
-    int64_t t_predict_us = 0;
-
-    std::vector<float> logits;
-
-    // tokenize the prompt
-    std::vector<int> embd_inp = ::gpt_tokenize(vocab, params.prompt);
-
-    printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
-
-    // determine the required inference memory per token:
-    size_t mem_per_token = 0;
-    mpt_eval(model, params.n_threads, 0, {0, 1, 2, 3}, logits, false, mem_per_token);
-
-    int count   = 0;
-
-    const int n_chunk = embd_inp.size() / params.n_ctx;
-
-    const int n_vocab = model.hparams.n_vocab;
-    const int n_batch = params.n_batch;
-
-    double nll = 0.0;
-    fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
-
-    for (int i = 0; i < n_chunk; ++i) {
-
-        const int start =     i * params.n_ctx;
-        const int end   = start + params.n_ctx;
-
-        const int num_batches = (params.n_ctx + n_batch - 1) / n_batch;
-
-        std::vector<float> logits;
-
-        const auto t_start = std::chrono::high_resolution_clock::now();
-
-        for (int j = 0; j < num_batches; ++j) {
-
-            const int batch_start = start + j * n_batch;
-            const int batch_size  = std::min(end - batch_start, n_batch);
-
-            std::vector<gpt_vocab::id> embd;
-
-            for(int p=0;p<batch_size;p++) {
-                embd.push_back( embd_inp[batch_start+p]  );
-            }
-
-            std::vector<float> batch_logits;// = llama_get_logits(ctx);
-
-            const int64_t t_start_us = ggml_time_us();
-
-            if (!mpt_eval(model, params.n_threads, j * batch_size, embd, batch_logits, true, mem_per_token)) {
-                printf("%s: failed to evaluate model\n", __func__);
-                return 1;
-            }
-
-            t_predict_us += ggml_time_us() - t_start_us;
-
-            logits.insert(logits.end(), batch_logits.data(), batch_logits.data() + batch_size * n_vocab);
-
-        }
-
-        const auto t_end = std::chrono::high_resolution_clock::now();
-
-        if (i == 0) {
-            const float t_total = std::chrono::duration<float>(t_end - t_start).count();
-            fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
-            int total_seconds = (int)(t_total * n_chunk);
-            if (total_seconds >= 60*60) {
-                fprintf(stderr, "%d hours ", total_seconds / (60*60));
-                total_seconds = total_seconds % (60*60);
-            }
-            fprintf(stderr, "%d minutes\n", total_seconds / 60);
-
-            printf("\nChunk\tPPL cumulative\tPPL chunk\n");
-        }
-
-        // We get the logits for all the tokens in the context window (params.n_ctx)
-        // from llama_eval above.  Now, based on https://huggingface.co/docs/transformers/perplexity,
-        // calculate the perplexity over the last half of the window (so the model always has
-        // some context to predict the token).
-        //
-        // We rely on the fact that attention in the forward pass only looks at previous
-        // tokens here, so the logits returned for each token are an accurate representation
-        // of what the model would have predicted at that point.
-        //
-        // Example, we have a context window of 512, we will compute perplexity for each of the
-        // last 256 tokens.  Then, we split the input up into context window size chunks to
-        // process the entire prompt.
-
-        double nllchunk = 0.0;
-        int countchunk = 0;
-
-        for (int j = std::min(512, params.n_ctx / 2); j < params.n_ctx - 1; ++j) {
-            // Calculate probability of next token, given the previous ones.
-            const std::vector<float> tok_logits(
-                logits.begin() + (j + 0) * n_vocab,
-                logits.begin() + (j + 1) * n_vocab);
-
-            const float prob = softmax(tok_logits)[embd_inp[ start+ j + 1]];
-
-            nllchunk += -std::log(prob);
-            ++countchunk;
-        }
-
-		nll += nllchunk;
-		count += countchunk;
-
-        // perplexity is e^(average negative log-likelihood)
-        printf("%d\t%.8lf\t%.8lf\n", i + 1, std::exp(nll / count), std::exp(nllchunk/countchunk) );
-        fflush(stdout);
-    }
-
-    // report timing
-    {
-        const int64_t t_main_end_us = ggml_time_us();
-
-        printf("\n\n");
-        printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token);
-        printf("%s:     load time = %8.2f ms\n",   __func__, t_load_us / 1000.0f);
-        printf("%s:     eval time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us / 1000.0f, t_predict_us / 1000.0f / (n_chunk * params.n_ctx));
-        printf("%s:    total time = %8.2f ms\n",   __func__, (t_main_end_us - t_main_start_us) / 1000.0f);
-    }
-
-    ggml_free(model.ctx);
-
-    return 0;
-}
-
-int main(int argc, char ** argv) {
-    mpt_params params;
-
-    if (mpt_params_parse(argc, argv, params) == false) {
-        return 1;
-    }
-
-    if (params.perplexity) {
-        return perplexity(params);
-    }
-
-    ggml_time_init();
-
-    const int64_t t_main_start_us = ggml_time_us();
-
-    if (params.seed < 0) {
-        params.seed = time(NULL);
-    }
-
-    if (params.n_predict < 0) {
-        params.n_predict = 0;
-    }
-
-    printf("%s: seed      = %d\n",   __func__, params.seed);
-    printf("%s: n_threads = %d\n",   __func__, params.n_threads);
-    printf("%s: n_batch   = %d\n",   __func__, params.n_batch);
-    printf("%s: n_ctx     = %d\n",   __func__, params.n_ctx);
-    printf("%s: n_predict = %d\n\n", __func__, params.n_predict);
-
-    std::mt19937 rng(params.seed);
-    if (params.prompt.empty()) {
-        params.prompt = gpt_random_prompt(rng);
-    }
-
-    int64_t t_load_us = 0;
-
-    gpt_vocab vocab;
-    mpt_model model;
-
-    model.hparams.n_ctx = params.n_ctx;
-
-    // load the model
-    {
-        const int64_t t_start_us = ggml_time_us();
-
-        if (!mpt_model_load(params.model, model, vocab)) {
-            fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
-            return 1;
-        }
-
-        t_load_us = ggml_time_us() - t_start_us;
-
-        test_gpt_tokenizer(vocab, params.token_test);
-    }
-
-    if (params.top_k == 0) {
-        params.top_k = model.hparams.n_vocab;
-    }
-
-    if (params.repeat_last_n == -1) {
-        params.repeat_last_n = params.n_ctx;
-    }
-
-    printf("\n");
-    printf("%s: temp           = %.3f\n", __func__, params.temp);
-    printf("%s: top_k          = %d\n",   __func__, params.top_k);
-    printf("%s: top_p          = %.3f\n", __func__, params.top_p);
-    printf("%s: repeat_last_n  = %d\n",   __func__, params.repeat_last_n);
-    printf("%s: repeat_penalty = %.3f\n", __func__, params.repeat_penalty);
-
-    int64_t t_sample_us = 0;
-    int64_t t_predict_us = 0;
-
-    std::vector<int32_t> last_n_tokens(params.n_ctx);
-    std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
-
-    // tokenize the prompt
-    std::vector<int> embd_inp = ::gpt_tokenize(vocab, params.prompt);
-
-    printf("\n");
-    printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
-
-    for (size_t i = 0; i < embd_inp.size(); i++) {
-        printf("%s: token[%zu] = %6d\n", __func__, i, embd_inp[i]);
-    }
-    printf("\n");
-
-    std::vector<gpt_vocab::id> embd;
-    std::vector<float> logits;
-
-    // determine the required inference memory per token:
-    size_t mem_per_token = 0;
-    mpt_eval(model, params.n_threads, 0, {0, 1, 2, 3}, logits, false, mem_per_token);
-
-    int n_past     = 0;
-    int n_consumed = 0;
-    int n_sampled  = 0;
-
-    while (n_sampled < params.n_predict) {
-        // predict
-        if (embd.size() > 0) {
-            const int64_t t_start_us = ggml_time_us();
-
-            if (!mpt_eval(model, params.n_threads, n_past, embd, logits, false, mem_per_token)) {
-                printf("%s: failed to predict\n", __func__);
-                return 1;
-            }
-
-            t_predict_us += ggml_time_us() - t_start_us;
-
-            n_past += embd.size();
-            embd.clear();
-        }
-
-        if ((int)embd_inp.size() <= n_consumed) {
-            // sample next token
-
-            const int top_k = params.top_k;
-            const float top_p = params.top_p;
-            const float temp = params.temp;
-            const int repeat_last_n = params.repeat_last_n;
-            const float repeat_penalty = params.repeat_penalty;
-
-            gpt_vocab::id id = 0;
-
-            {
-                const int64_t t_start_sample_us = ggml_time_us();
-
-                id = gpt_sample_top_k_top_p_repeat(vocab, logits.data() + (logits.size() - model.hparams.n_vocab), last_n_tokens.data(), last_n_tokens.size(), top_k, top_p, temp, repeat_last_n, repeat_penalty, rng);
-
-                last_n_tokens.erase(last_n_tokens.begin());
-                last_n_tokens.push_back(id);
-
-                t_sample_us += ggml_time_us() - t_start_sample_us;
-            }
-
-            // add it to the context
-            embd.push_back(id);
-            ++n_sampled;
-
-        } else {
-            // if here, it means we are still processing the input prompt
-            while ((int) embd_inp.size() > n_consumed) {
-                embd.push_back(embd_inp[n_consumed]);
-
-                last_n_tokens.erase(last_n_tokens.begin());
-                last_n_tokens.push_back(embd_inp[n_consumed]);
-
-                ++n_consumed;
-                if ((int) embd.size() >= params.n_batch) {
-                    break;
-                }
-            }
-        }
-
-        // display text
-        for (auto id : embd) {
-           printf("%s", vocab.id_to_token[id].c_str());
-        }
-        fflush(stdout);
-
-        // end of text token
-        if (embd.back() == 0) {
-            break;
-        }
-    }
-
-    // report timing
-    {
-        const int64_t t_main_end_us = ggml_time_us();
-
-        printf("\n\n\n");
-        printf("%s: sampled tokens = %8d\n", __func__, n_sampled);
-        printf("%s:  mem per token = %8zu bytes\n", __func__, mem_per_token);
-        printf("%s:      load time = %8.2f ms\n", __func__, t_load_us / 1000.0f);
-        printf("%s:    sample time = %8.2f ms / %.2f ms per token\n", __func__, t_sample_us / 1000.0f, t_sample_us / 1000.0f / n_sampled);
-        printf("%s:      eval time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us / 1000.0f, t_predict_us / 1000.0f / n_past);
-        printf("%s:     total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us) / 1000.0f);
-    }
-
-    ggml_free(model.ctx);
-
-    return 0;
-}
diff --git a/ggml/examples/mpt/quantize.cpp b/ggml/examples/mpt/quantize.cpp
deleted file mode 100644
index d0c9dda..0000000
--- a/ggml/examples/mpt/quantize.cpp
+++ /dev/null
@@ -1,186 +0,0 @@
-#include "ggml/ggml.h"
-
-#include "common-ggml.h"
-#include "common.h"
-
-#include <cassert>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <fstream>
-#include <map>
-#include <regex>
-#include <string>
-#include <vector>
-
-struct mpt_hparams {
-    int32_t d_model      = 0;
-    int32_t max_seq_len  = 0;
-    int32_t n_heads      = 0;
-    int32_t n_layers     = 0;
-    int32_t n_vocab      = 0;
-    float alibi_bias_max = 0;
-    float clip_qkv       = 0;
-    int32_t ftype        = 0;
-};
-
-// quantize a model
-bool mpt_model_quantize(const std::string & fname_inp,
-                        const std::string & fname_out, ggml_ftype ftype) {
-
-    printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
-
-    auto finp = std::ifstream(fname_inp, std::ios::binary);
-    if (!finp) {
-        fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__,
-                fname_inp.c_str());
-        return false;
-    }
-
-    auto fout = std::ofstream(fname_out, std::ios::binary);
-    if (!fout) {
-        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__,
-                fname_out.c_str());
-        return false;
-    }
-
-    // verify magic
-    {
-        uint32_t magic;
-        finp.read((char *)&magic, sizeof(magic));
-        if (magic != GGML_FILE_MAGIC) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n",
-                    __func__, fname_inp.c_str());
-            return false;
-        }
-
-        fout.write((char *)&magic, sizeof(magic));
-    }
-
-    mpt_hparams hparams;
-
-    // load hparams
-    {
-        finp.read((char *) &hparams.d_model,        sizeof(hparams.d_model));
-        finp.read((char *) &hparams.max_seq_len,    sizeof(hparams.max_seq_len));
-        finp.read((char *) &hparams.n_heads,        sizeof(hparams.n_heads));
-        finp.read((char *) &hparams.n_layers,       sizeof(hparams.n_layers));
-        finp.read((char *) &hparams.n_vocab,        sizeof(hparams.n_vocab));
-        finp.read((char *) &hparams.alibi_bias_max, sizeof(hparams.alibi_bias_max));
-        finp.read((char *) &hparams.clip_qkv,       sizeof(hparams.clip_qkv));
-        finp.read((char *) &hparams.ftype,          sizeof(hparams.ftype));
-
-        const int32_t qntvr_src =    hparams.ftype / GGML_QNT_VERSION_FACTOR;
-        const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
-
-        printf("%s: d_model        = %d\n", __func__, hparams.d_model);
-        printf("%s: max_seq_len    = %d\n", __func__, hparams.max_seq_len);
-        printf("%s: n_heads        = %d\n", __func__, hparams.n_heads);
-        printf("%s: n_layers       = %d\n", __func__, hparams.n_layers);
-        printf("%s: n_vocab        = %d\n", __func__, hparams.n_vocab);
-        printf("%s: alibi_bias_max = %f\n", __func__, hparams.alibi_bias_max);
-        printf("%s: clip_qkv       = %f\n", __func__, hparams.clip_qkv);
-        printf("%s: ftype (src) = %d\n", __func__, hparams.ftype);
-        printf("%s: qntvr (src) = %d\n", __func__, qntvr_src);
-        printf("%s: ftype (dst) = %d\n", __func__, ftype_dst);
-        printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
-
-        fout.write((char *) &hparams.d_model,        sizeof(hparams.d_model));
-        fout.write((char *) &hparams.max_seq_len,    sizeof(hparams.max_seq_len));
-        fout.write((char *) &hparams.n_heads,        sizeof(hparams.n_heads));
-        fout.write((char *) &hparams.n_layers,       sizeof(hparams.n_layers));
-        fout.write((char *) &hparams.n_vocab,        sizeof(hparams.n_vocab));
-        fout.write((char *) &hparams.alibi_bias_max, sizeof(hparams.alibi_bias_max));
-        fout.write((char *) &hparams.clip_qkv,       sizeof(hparams.clip_qkv));
-        fout.write((char *) &ftype_dst,              sizeof(ftype_dst));
-    }
-
-    // load vocab
-    {
-        const int32_t n_vocab = hparams.n_vocab;
-
-        std::string word;
-        for (int i = 0; i < n_vocab; i++) {
-            uint32_t len;
-            finp.read((char *)&len, sizeof(len));
-            fout.write((char *)&len, sizeof(len));
-
-            word.resize(len);
-            finp.read((char *)word.data(), len);
-            fout.write((char *)word.data(), len);
-        }
-    }
-
-    printf("%s: quantizing tensors\n", __func__);
-
-    // regexes of tensor names to be quantized
-    const std::vector<std::string> to_quant = {
-        ".*weight",
-    };
-
-    if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) {
-        fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__,
-                fname_inp.c_str());
-        return false;
-    }
-
-    finp.close();
-    fout.close();
-
-    return true;
-}
-
-// usage:
-//  ./mpt-quantize models/mpt/ggml-model.bin
-//  models/mpt/ggml-model-quant.bin type
-//
-int main(int argc, char ** argv) {
-    if (argc != 4) {
-        fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n",
-                argv[0]);
-        ggml_print_ftypes(stderr);
-        return 1;
-    }
-
-    // needed to initialize f16 tables
-    {
-        struct ggml_init_params params = {0, NULL, false};
-        struct ggml_context * ctx = ggml_init(params);
-        ggml_free(ctx);
-    }
-
-    const std::string fname_inp = argv[1];
-    const std::string fname_out = argv[2];
-
-    const ggml_ftype ftype = ggml_parse_ftype(argv[3]);
-
-    const int64_t t_main_start_us = ggml_time_us();
-
-    int64_t t_quantize_us = 0;
-
-    // load the model
-    {
-        const int64_t t_start_us = ggml_time_us();
-
-        if (!mpt_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
-            fprintf(stderr, "%s: failed to quantize model from '%s'\n",
-                    __func__, fname_inp.c_str());
-            return 1;
-        }
-
-        t_quantize_us = ggml_time_us() - t_start_us;
-    }
-
-    // report timing
-    {
-        const int64_t t_main_end_us = ggml_time_us();
-
-        printf("\n");
-        printf("%s: quantize time = %8.2f ms\n", __func__,
-               t_quantize_us / 1000.0f);
-        printf("%s:    total time = %8.2f ms\n", __func__,
-               (t_main_end_us - t_main_start_us) / 1000.0f);
-    }
-
-    return 0;
-}
diff --git a/ggml/examples/prompts/dolly-v2.txt b/ggml/examples/prompts/dolly-v2.txt
deleted file mode 100644
index ecdb0b7..0000000
--- a/ggml/examples/prompts/dolly-v2.txt
+++ /dev/null
@@ -1,100 +0,0 @@
-Hello World! => 12092,3645,2
-I can't believe it's already Friday!" => 42,476,626,2868,352,434,2168,6794,1476
-The URL for the website is https://www.example.com." => 510,10611,323,253,4422,310,5987,1358,2700,15,11667,15,681,449
-"She said, 'I love to travel.'" => 3,2993,753,13,686,42,2389,281,4288,18574
-'The temperature is 25.5°C.' => 8,510,3276,310,2030,15,22,3272,36,2464
-"Let's meet at 2:30 p.m. in the park." => 3,1466,434,2525,387,374,27,1229,268,15,78,15,275,253,5603,449
-The book costs $19.99 => 510,1984,4815,370,746,15,1525
-"John's favorite color is blue." => 3,8732,434,7583,3295,310,4797,449
-Th@nk y0u f0r y0ur h3lp! => 1044,33,30664,340,17,86,269,17,83,340,17,321,288,20,24343,2
-C@n I g3t a c0ffee, pl3@se? => 36,33,79,309,305,20,85,247,260,17,71,6851,13,499,20,33,339,32
-W0w! Th@t's @m@zing! => 56,17,88,2,596,33,85,434,1214,78,33,8537,2
-H0w 4re y0u t0d@y? => 41,17,88,577,250,340,17,86,246,17,69,33,90,32
-I l0ve t0 tr@vel @r0und the w0rld. => 42,298,17,306,246,17,492,33,652,1214,83,17,1504,253,259,17,83,392,15
-Wh@t's y0ur f@v0rite m0vie? => 3152,33,85,434,340,17,321,269,33,87,17,3852,278,17,25858,32
-The cat is sleeping on the mat. => 510,5798,310,14343,327,253,1111,15
-I need to buy some groceries for dinner. => 42,878,281,4489,690,45160,447,323,8955,15
-The sun is shining brightly in the sky. => 510,5101,310,28115,43925,275,253,8467,15
-She is reading a book in the park. => 2993,310,4361,247,1984,275,253,5603,15
-We went for a walk on the beach yesterday. => 1231,2427,323,247,2940,327,253,11600,11066,15
-He plays the guitar like a pro. => 1328,7120,253,12609,751,247,354,15
-They are going to the movies tonight. => 3726,403,1469,281,253,11321,11608,15
-The flowers are blooming in the garden. => 510,12405,403,30601,272,275,253,10329,15
-I enjoy listening to classical music. => 42,4264,11298,281,8946,3440,15
-We need to buy groceries for the week. => 1231,878,281,4489,45160,447,323,253,2129,15
-The dog is chasing its tail in circles. => 510,4370,310,31702,697,8105,275,14240,15
-She is wearing a beautiful red dress. => 2993,310,9398,247,5389,2502,7619,15
-He is a talented actor in Hollywood. => 1328,310,247,21220,12353,275,14759,15
-The children are playing in the playground. => 510,2151,403,4882,275,253,41008,15
-I'm going to visit my grandparents this weekend. => 42,1353,1469,281,4143,619,37186,436,8849,15
-The coffee tastes bitter without sugar. => 510,8574,27491,17123,1293,8618,15
-They are planning a surprise party for her. => 3726,403,7219,247,9326,3128,323,617,15
-She sings like an angel on stage. => 2993,44718,751,271,23087,327,3924,15
-We should take a vacation to relax. => 1231,943,1379,247,18125,281,7921,15
-He is studying medicine at the university. => 1328,310,12392,9921,387,253,9835,15
-The rain is pouring heavily outside. => 510,9313,310,31226,11306,3345,15
-I enjoy watching romantic movies. => 42,4264,7487,18109,11321,15
-They are celebrating their anniversary today. => 3726,403,28765,616,19054,3063,15
-She dances gracefully to the music. => 2993,47078,14426,2920,281,253,3440,15
-He is an excellent basketball player. => 1328,310,271,7126,14648,4760,15
-The baby is sleeping soundly in the crib. => 510,6858,310,14343,3590,314,275,253,260,725,15
-I need to finish my homework before dinner. => 42,878,281,8416,619,32110,1078,8955,15
-They are organizing a charity event next month. => 3726,403,26169,247,19489,2362,1735,1770,15
-She is cooking a delicious meal for us. => 2993,310,12398,247,17319,11484,323,441,15
-We should go hiking in the mountains. => 1231,943,564,33061,275,253,14700,15
-The car broke down on the way to work. => 510,1113,9377,1066,327,253,1039,281,789,15
-He loves playing video games in his free time. => 1328,14528,4882,3492,3958,275,521,1959,673,15
-The birds are chirping in the trees. => 510,11260,403,36494,14650,275,253,7139,15
-I want to learn how to play the piano. => 42,971,281,3037,849,281,1132,253,18542,15
-They are building a new shopping mall in the city. => 3726,403,3652,247,747,12701,28974,275,253,2846,15
-She is writing a novel in her spare time. => 2993,310,4028,247,4460,275,617,18345,673,15
-We are going to the zoo this Saturday. => 1231,403,1469,281,253,41089,436,7814,15
-The cake looks delicious with chocolate frosting. => 510,15221,4453,17319,342,14354,34724,272,15
-He is a talented painter who sells his artwork. => 1328,310,247,21220,27343,665,27924,521,28227,15
-The students are studying for their exams. => 510,3484,403,12392,323,616,34666,15
-I enjoy swimming in the ocean. => 42,4264,17120,275,253,12927,15
-They are renovating their house. => 3726,403,30074,839,616,2419,15
-She is practicing yoga to stay healthy. => 2993,310,25815,25551,281,3297,5875,15
-We should plant flowers in the garden. => 1231,943,4444,12405,275,253,10329,15
-The traffic is heavy during rush hour. => 510,7137,310,5536,1309,16949,4964,15
-He is a skilled chef who creates amazing dishes. => 1328,310,247,18024,26540,665,10513,8644,17114,15
-The baby is crawling on the floor. => 510,6858,310,44922,327,253,5254,15
-I need to buy a new pair of shoes. => 42,878,281,4489,247,747,4667,273,12682,15
-They are going on a road trip across the country. => 3726,403,1469,327,247,3971,7408,2439,253,2586,15
-She is playing the piano beautifully. => 2993,310,4882,253,18542,27839,15
-We are going to a concert tomorrow night. => 1231,403,1469,281,247,12699,10873,2360,15
-The cake tastes delicious with vanilla frosting. => 510,15221,27491,17319,342,26724,34724,272,15
-He is a dedicated teacher who inspires his students. => 1328,310,247,9940,9732,665,6381,2731,521,3484,15
-The students are participating in a science fair. => 510,3484,403,15299,275,247,5859,4344,15
-I enjoy hiking in the mountains. => 42,4264,33061,275,253,14700,15
-They are organizing a beach cleanup next weekend. => 3726,403,26169,247,11600,34709,1735,8849,15
-She is taking photographs of nature. => 2993,310,3192,15928,273,3753,15
-We should try a new restaurant in town. => 1231,943,1611,247,747,10301,275,3874,15
-The traffic is moving slowly on the highway. => 510,7137,310,4886,7808,327,253,17657,15
-He is a talented singer with a beautiful voice. => 1328,310,247,21220,16057,342,247,5389,4318,15
-The baby is laughing and giggling. => 510,6858,310,17053,285,41542,1981,15
-I need to do laundry and wash my clothes. => 42,878,281,513,29023,285,14841,619,10015,15
-They are planning a trip to Europe. => 3726,403,7219,247,7408,281,3060,15
-She is learning how to play the guitar. => 2993,310,4715,849,281,1132,253,12609,15
-We are going to a museum this Sunday. => 1231,403,1469,281,247,16064,436,6926,15
-The coffee smells amazing in the morning. => 510,8574,34247,8644,275,253,4131,15
-He is a hardworking farmer who grows crops. => 1328,310,247,1892,21107,24718,665,17202,19492,15
-The students are presenting their research projects. => 510,3484,403,15250,616,2561,6493,15
-I enjoy playing soccer with my friends. => 42,4264,4882,20391,342,619,3858,15
-They are volunteering at a local shelter. => 3726,403,10057,2158,387,247,1980,17824,15
-She is practicing martial arts for self-defense. => 2993,310,25815,29731,14635,323,1881,14,29337,15
-We should try a new recipe for dinner. => 1231,943,1611,247,747,13612,323,8955,15
-The traffic is congest => 510,7137,310,25801
-The sun is shining brightly today. => 510,5101,310,28115,43925,3063,15
-I enjoy reading books in my free time. => 42,4264,4361,5098,275,619,1959,673,15
-She plays the piano beautifully. => 2993,7120,253,18542,27839,15
-The cat chased the mouse around the room. => 510,5798,40754,253,6521,1475,253,2316,15
-I love eating pizza with extra cheese. => 42,2389,9123,22534,342,4465,12173,15
-He always wears a hat wherever he goes. => 1328,1900,31394,247,7856,20312,344,4566,15
-The flowers in the garden are blooming. => 510,12405,275,253,10329,403,30601,272,15
-She danced gracefully on the stage. => 2993,39860,14426,2920,327,253,3924,15
-The dog barked loudly in the park. => 510,4370,21939,264,31311,275,253,5603,15
-We went swimming in the ocean yesterday. => 1231,2427,17120,275,253,12927,11066,15
-He speaks fluent French and Spanish. => 1328,16544,2938,290,5112,285,9883,15
-The train arrived at the station on time. => 510,6194,7244,387,253,4660,327,673,15
-She cooked a delicious meal for her family. => 2993,18621,247,17319,11484,323,617,2021,15
diff --git a/ggml/examples/prompts/gpt-2-chinese.txt b/ggml/examples/prompts/gpt-2-chinese.txt
deleted file mode 100644
index 919829d..0000000
--- a/ggml/examples/prompts/gpt-2-chinese.txt
+++ /dev/null
@@ -1 +0,0 @@
-请问洗手间在哪里? => 6435,7309,3819,2797,7313,1762,1525,7027,8043
diff --git a/ggml/examples/prompts/gpt-2.txt b/ggml/examples/prompts/gpt-2.txt
deleted file mode 100644
index a2ed931..0000000
--- a/ggml/examples/prompts/gpt-2.txt
+++ /dev/null
@@ -1,100 +0,0 @@
-Hello World! => 15496,2159,0
-I can't believe it's already Friday!" => 40,460,470,1975,340,338,1541,3217,2474
-The URL for the website is https://www.example.com." => 464,10289,329,262,3052,318,3740,1378,2503,13,20688,13,785,526
-"She said, 'I love to travel.'" => 1,3347,531,11,705,40,1842,284,3067,11496
-'The temperature is 25.5°C.' => 6,464,5951,318,1679,13,20,7200,34,2637
-"Let's meet at 2:30 p.m. in the park." => 1,5756,338,1826,379,362,25,1270,279,13,76,13,287,262,3952,526
-The book costs $19.99 => 464,1492,3484,720,1129,13,2079
-"John's favorite color is blue." => 1,7554,338,4004,3124,318,4171,526
-Th@nk y0u f0r y0ur h3lp! => 817,31,77,74,331,15,84,277,15,81,331,15,333,289,18,34431,0
-C@n I g3t a c0ffee, pl3@se? => 34,31,77,314,308,18,83,257,269,15,5853,11,458,18,31,325,30
-W0w! Th@t's @m@zing! => 54,15,86,0,536,31,83,338,2488,76,31,9510,0
-H0w 4re y0u t0d@y? => 39,15,86,604,260,331,15,84,256,15,67,31,88,30
-I l0ve t0 tr@vel @r0und the w0rld. => 40,300,15,303,256,15,491,31,626,2488,81,15,917,262,266,15,81,335,13
-Wh@t's y0ur f@v0rite m0vie? => 1199,31,83,338,331,15,333,277,31,85,15,6525,285,15,85,494,30
-The cat is sleeping on the mat. => 464,3797,318,11029,319,262,2603,13
-I need to buy some groceries for dinner. => 40,761,284,2822,617,38464,329,8073,13
-The sun is shining brightly in the sky. => 464,4252,318,22751,35254,287,262,6766,13
-She is reading a book in the park. => 3347,318,3555,257,1492,287,262,3952,13
-We went for a walk on the beach yesterday. => 1135,1816,329,257,2513,319,262,10481,7415,13
-He plays the guitar like a pro. => 1544,5341,262,10047,588,257,386,13
-They are going to the movies tonight. => 2990,389,1016,284,262,6918,9975,13
-The flowers are blooming in the garden. => 464,12734,389,24924,3383,287,262,11376,13
-I enjoy listening to classical music. => 40,2883,8680,284,15993,2647,13
-We need to buy groceries for the week. => 1135,761,284,2822,38464,329,262,1285,13
-The dog is chasing its tail in circles. => 464,3290,318,20023,663,7894,287,13332,13
-She is wearing a beautiful red dress. => 3347,318,5762,257,4950,2266,6576,13
-He is a talented actor in Hollywood. => 1544,318,257,12356,8674,287,8502,13
-The children are playing in the playground. => 464,1751,389,2712,287,262,24817,13
-I'm going to visit my grandparents this weekend. => 40,1101,1016,284,3187,616,28571,428,5041,13
-The coffee tastes bitter without sugar. => 464,6891,18221,12922,1231,7543,13
-They are planning a surprise party for her. => 2990,389,5410,257,5975,2151,329,607,13
-She sings like an angel on stage. => 3347,33041,588,281,18304,319,3800,13
-We should take a vacation to relax. => 1135,815,1011,257,14600,284,8960,13
-He is studying medicine at the university. => 1544,318,11065,9007,379,262,6403,13
-The rain is pouring heavily outside. => 464,6290,318,23147,7272,2354,13
-I enjoy watching romantic movies. => 40,2883,4964,14348,6918,13
-They are celebrating their anniversary today. => 2990,389,17499,511,11162,1909,13
-She dances gracefully to the music. => 3347,38207,11542,2759,284,262,2647,13
-He is an excellent basketball player. => 1544,318,281,6275,9669,2137,13
-The baby is sleeping soundly in the crib. => 464,5156,318,11029,2128,306,287,262,48083,13
-I need to finish my homework before dinner. => 40,761,284,5461,616,26131,878,8073,13
-They are organizing a charity event next month. => 2990,389,16924,257,11016,1785,1306,1227,13
-She is cooking a delicious meal for us. => 3347,318,10801,257,12625,9799,329,514,13
-We should go hiking in the mountains. => 1135,815,467,24522,287,262,12269,13
-The car broke down on the way to work. => 464,1097,6265,866,319,262,835,284,670,13
-He loves playing video games in his free time. => 1544,10408,2712,2008,1830,287,465,1479,640,13
-The birds are chirping in the trees. => 464,10087,389,442,343,13886,287,262,7150,13
-I want to learn how to play the piano. => 40,765,284,2193,703,284,711,262,19132,13
-They are building a new shopping mall in the city. => 2990,389,2615,257,649,9735,17374,287,262,1748,13
-She is writing a novel in her spare time. => 3347,318,3597,257,5337,287,607,13952,640,13
-We are going to the zoo this Saturday. => 1135,389,1016,284,262,26626,428,3909,13
-The cake looks delicious with chocolate frosting. => 464,12187,3073,12625,351,11311,21682,278,13
-He is a talented painter who sells his artwork. => 1544,318,257,12356,34537,508,16015,465,16257,13
-The students are studying for their exams. => 464,2444,389,11065,329,511,26420,13
-I enjoy swimming in the ocean. => 40,2883,14899,287,262,9151,13
-They are renovating their house. => 2990,389,24317,803,511,2156,13
-She is practicing yoga to stay healthy. => 3347,318,18207,20351,284,2652,5448,13
-We should plant flowers in the garden. => 1135,815,4618,12734,287,262,11376,13
-The traffic is heavy during rush hour. => 464,4979,318,4334,1141,10484,1711,13
-He is a skilled chef who creates amazing dishes. => 1544,318,257,14297,21221,508,8075,4998,16759,13
-The baby is crawling on the floor. => 464,5156,318,34499,319,262,4314,13
-I need to buy a new pair of shoes. => 40,761,284,2822,257,649,5166,286,10012,13
-They are going on a road trip across the country. => 2990,389,1016,319,257,2975,5296,1973,262,1499,13
-She is playing the piano beautifully. => 3347,318,2712,262,19132,21104,13
-We are going to a concert tomorrow night. => 1135,389,1016,284,257,10010,9439,1755,13
-The cake tastes delicious with vanilla frosting. => 464,12187,18221,12625,351,16858,21682,278,13
-He is a dedicated teacher who inspires his students. => 1544,318,257,7256,4701,508,38934,465,2444,13
-The students are participating in a science fair. => 464,2444,389,11983,287,257,3783,3148,13
-I enjoy hiking in the mountains. => 40,2883,24522,287,262,12269,13
-They are organizing a beach cleanup next weekend. => 2990,389,16924,257,10481,27425,1306,5041,13
-She is taking photographs of nature. => 3347,318,2263,12566,286,3450,13
-We should try a new restaurant in town. => 1135,815,1949,257,649,7072,287,3240,13
-The traffic is moving slowly on the highway. => 464,4979,318,3867,6364,319,262,12763,13
-He is a talented singer with a beautiful voice. => 1544,318,257,12356,14015,351,257,4950,3809,13
-The baby is laughing and giggling. => 464,5156,318,14376,290,30442,1359,13
-I need to do laundry and wash my clothes. => 40,761,284,466,25724,290,13502,616,8242,13
-They are planning a trip to Europe. => 2990,389,5410,257,5296,284,2031,13
-She is learning how to play the guitar. => 3347,318,4673,703,284,711,262,10047,13
-We are going to a museum this Sunday. => 1135,389,1016,284,257,13257,428,3502,13
-The coffee smells amazing in the morning. => 464,6891,25760,4998,287,262,3329,13
-He is a hardworking farmer who grows crops. => 1544,318,257,1327,16090,18739,508,13676,14450,13
-The students are presenting their research projects. => 464,2444,389,17728,511,2267,4493,13
-I enjoy playing soccer with my friends. => 40,2883,2712,11783,351,616,2460,13
-They are volunteering at a local shelter. => 2990,389,41434,379,257,1957,11772,13
-She is practicing martial arts for self-defense. => 3347,318,18207,15618,10848,329,2116,12,19774,13
-We should try a new recipe for dinner. => 1135,815,1949,257,649,8364,329,8073,13
-The traffic is congest => 464,4979,318,22791
-The sun is shining brightly today. => 464,4252,318,22751,35254,1909,13
-I enjoy reading books in my free time. => 40,2883,3555,3835,287,616,1479,640,13
-She plays the piano beautifully. => 3347,5341,262,19132,21104,13
-The cat chased the mouse around the room. => 464,3797,26172,262,10211,1088,262,2119,13
-I love eating pizza with extra cheese. => 40,1842,6600,14256,351,3131,9891,13
-He always wears a hat wherever he goes. => 1544,1464,17326,257,6877,14530,339,2925,13
-The flowers in the garden are blooming. => 464,12734,287,262,11376,389,24924,3383,13
-She danced gracefully on the stage. => 3347,39480,11542,2759,319,262,3800,13
-The dog barked loudly in the park. => 464,3290,21405,276,23112,287,262,3952,13
-We went swimming in the ocean yesterday. => 1135,1816,14899,287,262,9151,7415,13
-He speaks fluent French and Spanish. => 1544,9209,43472,4141,290,7897,13
-The train arrived at the station on time. => 464,4512,5284,379,262,4429,319,640,13
-She cooked a delicious meal for her family. => 3347,15847,257,12625,9799,329,607,1641,13
diff --git a/ggml/examples/prompts/gpt-j.txt b/ggml/examples/prompts/gpt-j.txt
deleted file mode 100644
index a2ed931..0000000
--- a/ggml/examples/prompts/gpt-j.txt
+++ /dev/null
@@ -1,100 +0,0 @@
-Hello World! => 15496,2159,0
-I can't believe it's already Friday!" => 40,460,470,1975,340,338,1541,3217,2474
-The URL for the website is https://www.example.com." => 464,10289,329,262,3052,318,3740,1378,2503,13,20688,13,785,526
-"She said, 'I love to travel.'" => 1,3347,531,11,705,40,1842,284,3067,11496
-'The temperature is 25.5°C.' => 6,464,5951,318,1679,13,20,7200,34,2637
-"Let's meet at 2:30 p.m. in the park." => 1,5756,338,1826,379,362,25,1270,279,13,76,13,287,262,3952,526
-The book costs $19.99 => 464,1492,3484,720,1129,13,2079
-"John's favorite color is blue." => 1,7554,338,4004,3124,318,4171,526
-Th@nk y0u f0r y0ur h3lp! => 817,31,77,74,331,15,84,277,15,81,331,15,333,289,18,34431,0
-C@n I g3t a c0ffee, pl3@se? => 34,31,77,314,308,18,83,257,269,15,5853,11,458,18,31,325,30
-W0w! Th@t's @m@zing! => 54,15,86,0,536,31,83,338,2488,76,31,9510,0
-H0w 4re y0u t0d@y? => 39,15,86,604,260,331,15,84,256,15,67,31,88,30
-I l0ve t0 tr@vel @r0und the w0rld. => 40,300,15,303,256,15,491,31,626,2488,81,15,917,262,266,15,81,335,13
-Wh@t's y0ur f@v0rite m0vie? => 1199,31,83,338,331,15,333,277,31,85,15,6525,285,15,85,494,30
-The cat is sleeping on the mat. => 464,3797,318,11029,319,262,2603,13
-I need to buy some groceries for dinner. => 40,761,284,2822,617,38464,329,8073,13
-The sun is shining brightly in the sky. => 464,4252,318,22751,35254,287,262,6766,13
-She is reading a book in the park. => 3347,318,3555,257,1492,287,262,3952,13
-We went for a walk on the beach yesterday. => 1135,1816,329,257,2513,319,262,10481,7415,13
-He plays the guitar like a pro. => 1544,5341,262,10047,588,257,386,13
-They are going to the movies tonight. => 2990,389,1016,284,262,6918,9975,13
-The flowers are blooming in the garden. => 464,12734,389,24924,3383,287,262,11376,13
-I enjoy listening to classical music. => 40,2883,8680,284,15993,2647,13
-We need to buy groceries for the week. => 1135,761,284,2822,38464,329,262,1285,13
-The dog is chasing its tail in circles. => 464,3290,318,20023,663,7894,287,13332,13
-She is wearing a beautiful red dress. => 3347,318,5762,257,4950,2266,6576,13
-He is a talented actor in Hollywood. => 1544,318,257,12356,8674,287,8502,13
-The children are playing in the playground. => 464,1751,389,2712,287,262,24817,13
-I'm going to visit my grandparents this weekend. => 40,1101,1016,284,3187,616,28571,428,5041,13
-The coffee tastes bitter without sugar. => 464,6891,18221,12922,1231,7543,13
-They are planning a surprise party for her. => 2990,389,5410,257,5975,2151,329,607,13
-She sings like an angel on stage. => 3347,33041,588,281,18304,319,3800,13
-We should take a vacation to relax. => 1135,815,1011,257,14600,284,8960,13
-He is studying medicine at the university. => 1544,318,11065,9007,379,262,6403,13
-The rain is pouring heavily outside. => 464,6290,318,23147,7272,2354,13
-I enjoy watching romantic movies. => 40,2883,4964,14348,6918,13
-They are celebrating their anniversary today. => 2990,389,17499,511,11162,1909,13
-She dances gracefully to the music. => 3347,38207,11542,2759,284,262,2647,13
-He is an excellent basketball player. => 1544,318,281,6275,9669,2137,13
-The baby is sleeping soundly in the crib. => 464,5156,318,11029,2128,306,287,262,48083,13
-I need to finish my homework before dinner. => 40,761,284,5461,616,26131,878,8073,13
-They are organizing a charity event next month. => 2990,389,16924,257,11016,1785,1306,1227,13
-She is cooking a delicious meal for us. => 3347,318,10801,257,12625,9799,329,514,13
-We should go hiking in the mountains. => 1135,815,467,24522,287,262,12269,13
-The car broke down on the way to work. => 464,1097,6265,866,319,262,835,284,670,13
-He loves playing video games in his free time. => 1544,10408,2712,2008,1830,287,465,1479,640,13
-The birds are chirping in the trees. => 464,10087,389,442,343,13886,287,262,7150,13
-I want to learn how to play the piano. => 40,765,284,2193,703,284,711,262,19132,13
-They are building a new shopping mall in the city. => 2990,389,2615,257,649,9735,17374,287,262,1748,13
-She is writing a novel in her spare time. => 3347,318,3597,257,5337,287,607,13952,640,13
-We are going to the zoo this Saturday. => 1135,389,1016,284,262,26626,428,3909,13
-The cake looks delicious with chocolate frosting. => 464,12187,3073,12625,351,11311,21682,278,13
-He is a talented painter who sells his artwork. => 1544,318,257,12356,34537,508,16015,465,16257,13
-The students are studying for their exams. => 464,2444,389,11065,329,511,26420,13
-I enjoy swimming in the ocean. => 40,2883,14899,287,262,9151,13
-They are renovating their house. => 2990,389,24317,803,511,2156,13
-She is practicing yoga to stay healthy. => 3347,318,18207,20351,284,2652,5448,13
-We should plant flowers in the garden. => 1135,815,4618,12734,287,262,11376,13
-The traffic is heavy during rush hour. => 464,4979,318,4334,1141,10484,1711,13
-He is a skilled chef who creates amazing dishes. => 1544,318,257,14297,21221,508,8075,4998,16759,13
-The baby is crawling on the floor. => 464,5156,318,34499,319,262,4314,13
-I need to buy a new pair of shoes. => 40,761,284,2822,257,649,5166,286,10012,13
-They are going on a road trip across the country. => 2990,389,1016,319,257,2975,5296,1973,262,1499,13
-She is playing the piano beautifully. => 3347,318,2712,262,19132,21104,13
-We are going to a concert tomorrow night. => 1135,389,1016,284,257,10010,9439,1755,13
-The cake tastes delicious with vanilla frosting. => 464,12187,18221,12625,351,16858,21682,278,13
-He is a dedicated teacher who inspires his students. => 1544,318,257,7256,4701,508,38934,465,2444,13
-The students are participating in a science fair. => 464,2444,389,11983,287,257,3783,3148,13
-I enjoy hiking in the mountains. => 40,2883,24522,287,262,12269,13
-They are organizing a beach cleanup next weekend. => 2990,389,16924,257,10481,27425,1306,5041,13
-She is taking photographs of nature. => 3347,318,2263,12566,286,3450,13
-We should try a new restaurant in town. => 1135,815,1949,257,649,7072,287,3240,13
-The traffic is moving slowly on the highway. => 464,4979,318,3867,6364,319,262,12763,13
-He is a talented singer with a beautiful voice. => 1544,318,257,12356,14015,351,257,4950,3809,13
-The baby is laughing and giggling. => 464,5156,318,14376,290,30442,1359,13
-I need to do laundry and wash my clothes. => 40,761,284,466,25724,290,13502,616,8242,13
-They are planning a trip to Europe. => 2990,389,5410,257,5296,284,2031,13
-She is learning how to play the guitar. => 3347,318,4673,703,284,711,262,10047,13
-We are going to a museum this Sunday. => 1135,389,1016,284,257,13257,428,3502,13
-The coffee smells amazing in the morning. => 464,6891,25760,4998,287,262,3329,13
-He is a hardworking farmer who grows crops. => 1544,318,257,1327,16090,18739,508,13676,14450,13
-The students are presenting their research projects. => 464,2444,389,17728,511,2267,4493,13
-I enjoy playing soccer with my friends. => 40,2883,2712,11783,351,616,2460,13
-They are volunteering at a local shelter. => 2990,389,41434,379,257,1957,11772,13
-She is practicing martial arts for self-defense. => 3347,318,18207,15618,10848,329,2116,12,19774,13
-We should try a new recipe for dinner. => 1135,815,1949,257,649,8364,329,8073,13
-The traffic is congest => 464,4979,318,22791
-The sun is shining brightly today. => 464,4252,318,22751,35254,1909,13
-I enjoy reading books in my free time. => 40,2883,3555,3835,287,616,1479,640,13
-She plays the piano beautifully. => 3347,5341,262,19132,21104,13
-The cat chased the mouse around the room. => 464,3797,26172,262,10211,1088,262,2119,13
-I love eating pizza with extra cheese. => 40,1842,6600,14256,351,3131,9891,13
-He always wears a hat wherever he goes. => 1544,1464,17326,257,6877,14530,339,2925,13
-The flowers in the garden are blooming. => 464,12734,287,262,11376,389,24924,3383,13
-She danced gracefully on the stage. => 3347,39480,11542,2759,319,262,3800,13
-The dog barked loudly in the park. => 464,3290,21405,276,23112,287,262,3952,13
-We went swimming in the ocean yesterday. => 1135,1816,14899,287,262,9151,7415,13
-He speaks fluent French and Spanish. => 1544,9209,43472,4141,290,7897,13
-The train arrived at the station on time. => 464,4512,5284,379,262,4429,319,640,13
-She cooked a delicious meal for her family. => 3347,15847,257,12625,9799,329,607,1641,13
diff --git a/ggml/examples/prompts/gpt-neox-japanese.txt b/ggml/examples/prompts/gpt-neox-japanese.txt
deleted file mode 100644
index c39df16..0000000
--- a/ggml/examples/prompts/gpt-neox-japanese.txt
+++ /dev/null
@@ -1 +0,0 @@
-明日の天気はどうですか。 => 263,7353,268,18461,271,1722,18405,265
diff --git a/ggml/examples/prompts/gpt-neox.txt b/ggml/examples/prompts/gpt-neox.txt
deleted file mode 100644
index ecdb0b7..0000000
--- a/ggml/examples/prompts/gpt-neox.txt
+++ /dev/null
@@ -1,100 +0,0 @@
-Hello World! => 12092,3645,2
-I can't believe it's already Friday!" => 42,476,626,2868,352,434,2168,6794,1476
-The URL for the website is https://www.example.com." => 510,10611,323,253,4422,310,5987,1358,2700,15,11667,15,681,449
-"She said, 'I love to travel.'" => 3,2993,753,13,686,42,2389,281,4288,18574
-'The temperature is 25.5°C.' => 8,510,3276,310,2030,15,22,3272,36,2464
-"Let's meet at 2:30 p.m. in the park." => 3,1466,434,2525,387,374,27,1229,268,15,78,15,275,253,5603,449
-The book costs $19.99 => 510,1984,4815,370,746,15,1525
-"John's favorite color is blue." => 3,8732,434,7583,3295,310,4797,449
-Th@nk y0u f0r y0ur h3lp! => 1044,33,30664,340,17,86,269,17,83,340,17,321,288,20,24343,2
-C@n I g3t a c0ffee, pl3@se? => 36,33,79,309,305,20,85,247,260,17,71,6851,13,499,20,33,339,32
-W0w! Th@t's @m@zing! => 56,17,88,2,596,33,85,434,1214,78,33,8537,2
-H0w 4re y0u t0d@y? => 41,17,88,577,250,340,17,86,246,17,69,33,90,32
-I l0ve t0 tr@vel @r0und the w0rld. => 42,298,17,306,246,17,492,33,652,1214,83,17,1504,253,259,17,83,392,15
-Wh@t's y0ur f@v0rite m0vie? => 3152,33,85,434,340,17,321,269,33,87,17,3852,278,17,25858,32
-The cat is sleeping on the mat. => 510,5798,310,14343,327,253,1111,15
-I need to buy some groceries for dinner. => 42,878,281,4489,690,45160,447,323,8955,15
-The sun is shining brightly in the sky. => 510,5101,310,28115,43925,275,253,8467,15
-She is reading a book in the park. => 2993,310,4361,247,1984,275,253,5603,15
-We went for a walk on the beach yesterday. => 1231,2427,323,247,2940,327,253,11600,11066,15
-He plays the guitar like a pro. => 1328,7120,253,12609,751,247,354,15
-They are going to the movies tonight. => 3726,403,1469,281,253,11321,11608,15
-The flowers are blooming in the garden. => 510,12405,403,30601,272,275,253,10329,15
-I enjoy listening to classical music. => 42,4264,11298,281,8946,3440,15
-We need to buy groceries for the week. => 1231,878,281,4489,45160,447,323,253,2129,15
-The dog is chasing its tail in circles. => 510,4370,310,31702,697,8105,275,14240,15
-She is wearing a beautiful red dress. => 2993,310,9398,247,5389,2502,7619,15
-He is a talented actor in Hollywood. => 1328,310,247,21220,12353,275,14759,15
-The children are playing in the playground. => 510,2151,403,4882,275,253,41008,15
-I'm going to visit my grandparents this weekend. => 42,1353,1469,281,4143,619,37186,436,8849,15
-The coffee tastes bitter without sugar. => 510,8574,27491,17123,1293,8618,15
-They are planning a surprise party for her. => 3726,403,7219,247,9326,3128,323,617,15
-She sings like an angel on stage. => 2993,44718,751,271,23087,327,3924,15
-We should take a vacation to relax. => 1231,943,1379,247,18125,281,7921,15
-He is studying medicine at the university. => 1328,310,12392,9921,387,253,9835,15
-The rain is pouring heavily outside. => 510,9313,310,31226,11306,3345,15
-I enjoy watching romantic movies. => 42,4264,7487,18109,11321,15
-They are celebrating their anniversary today. => 3726,403,28765,616,19054,3063,15
-She dances gracefully to the music. => 2993,47078,14426,2920,281,253,3440,15
-He is an excellent basketball player. => 1328,310,271,7126,14648,4760,15
-The baby is sleeping soundly in the crib. => 510,6858,310,14343,3590,314,275,253,260,725,15
-I need to finish my homework before dinner. => 42,878,281,8416,619,32110,1078,8955,15
-They are organizing a charity event next month. => 3726,403,26169,247,19489,2362,1735,1770,15
-She is cooking a delicious meal for us. => 2993,310,12398,247,17319,11484,323,441,15
-We should go hiking in the mountains. => 1231,943,564,33061,275,253,14700,15
-The car broke down on the way to work. => 510,1113,9377,1066,327,253,1039,281,789,15
-He loves playing video games in his free time. => 1328,14528,4882,3492,3958,275,521,1959,673,15
-The birds are chirping in the trees. => 510,11260,403,36494,14650,275,253,7139,15
-I want to learn how to play the piano. => 42,971,281,3037,849,281,1132,253,18542,15
-They are building a new shopping mall in the city. => 3726,403,3652,247,747,12701,28974,275,253,2846,15
-She is writing a novel in her spare time. => 2993,310,4028,247,4460,275,617,18345,673,15
-We are going to the zoo this Saturday. => 1231,403,1469,281,253,41089,436,7814,15
-The cake looks delicious with chocolate frosting. => 510,15221,4453,17319,342,14354,34724,272,15
-He is a talented painter who sells his artwork. => 1328,310,247,21220,27343,665,27924,521,28227,15
-The students are studying for their exams. => 510,3484,403,12392,323,616,34666,15
-I enjoy swimming in the ocean. => 42,4264,17120,275,253,12927,15
-They are renovating their house. => 3726,403,30074,839,616,2419,15
-She is practicing yoga to stay healthy. => 2993,310,25815,25551,281,3297,5875,15
-We should plant flowers in the garden. => 1231,943,4444,12405,275,253,10329,15
-The traffic is heavy during rush hour. => 510,7137,310,5536,1309,16949,4964,15
-He is a skilled chef who creates amazing dishes. => 1328,310,247,18024,26540,665,10513,8644,17114,15
-The baby is crawling on the floor. => 510,6858,310,44922,327,253,5254,15
-I need to buy a new pair of shoes. => 42,878,281,4489,247,747,4667,273,12682,15
-They are going on a road trip across the country. => 3726,403,1469,327,247,3971,7408,2439,253,2586,15
-She is playing the piano beautifully. => 2993,310,4882,253,18542,27839,15
-We are going to a concert tomorrow night. => 1231,403,1469,281,247,12699,10873,2360,15
-The cake tastes delicious with vanilla frosting. => 510,15221,27491,17319,342,26724,34724,272,15
-He is a dedicated teacher who inspires his students. => 1328,310,247,9940,9732,665,6381,2731,521,3484,15
-The students are participating in a science fair. => 510,3484,403,15299,275,247,5859,4344,15
-I enjoy hiking in the mountains. => 42,4264,33061,275,253,14700,15
-They are organizing a beach cleanup next weekend. => 3726,403,26169,247,11600,34709,1735,8849,15
-She is taking photographs of nature. => 2993,310,3192,15928,273,3753,15
-We should try a new restaurant in town. => 1231,943,1611,247,747,10301,275,3874,15
-The traffic is moving slowly on the highway. => 510,7137,310,4886,7808,327,253,17657,15
-He is a talented singer with a beautiful voice. => 1328,310,247,21220,16057,342,247,5389,4318,15
-The baby is laughing and giggling. => 510,6858,310,17053,285,41542,1981,15
-I need to do laundry and wash my clothes. => 42,878,281,513,29023,285,14841,619,10015,15
-They are planning a trip to Europe. => 3726,403,7219,247,7408,281,3060,15
-She is learning how to play the guitar. => 2993,310,4715,849,281,1132,253,12609,15
-We are going to a museum this Sunday. => 1231,403,1469,281,247,16064,436,6926,15
-The coffee smells amazing in the morning. => 510,8574,34247,8644,275,253,4131,15
-He is a hardworking farmer who grows crops. => 1328,310,247,1892,21107,24718,665,17202,19492,15
-The students are presenting their research projects. => 510,3484,403,15250,616,2561,6493,15
-I enjoy playing soccer with my friends. => 42,4264,4882,20391,342,619,3858,15
-They are volunteering at a local shelter. => 3726,403,10057,2158,387,247,1980,17824,15
-She is practicing martial arts for self-defense. => 2993,310,25815,29731,14635,323,1881,14,29337,15
-We should try a new recipe for dinner. => 1231,943,1611,247,747,13612,323,8955,15
-The traffic is congest => 510,7137,310,25801
-The sun is shining brightly today. => 510,5101,310,28115,43925,3063,15
-I enjoy reading books in my free time. => 42,4264,4361,5098,275,619,1959,673,15
-She plays the piano beautifully. => 2993,7120,253,18542,27839,15
-The cat chased the mouse around the room. => 510,5798,40754,253,6521,1475,253,2316,15
-I love eating pizza with extra cheese. => 42,2389,9123,22534,342,4465,12173,15
-He always wears a hat wherever he goes. => 1328,1900,31394,247,7856,20312,344,4566,15
-The flowers in the garden are blooming. => 510,12405,275,253,10329,403,30601,272,15
-She danced gracefully on the stage. => 2993,39860,14426,2920,327,253,3924,15
-The dog barked loudly in the park. => 510,4370,21939,264,31311,275,253,5603,15
-We went swimming in the ocean yesterday. => 1231,2427,17120,275,253,12927,11066,15
-He speaks fluent French and Spanish. => 1328,16544,2938,290,5112,285,9883,15
-The train arrived at the station on time. => 510,6194,7244,387,253,4660,327,673,15
-She cooked a delicious meal for her family. => 2993,18621,247,17319,11484,323,617,2021,15
diff --git a/ggml/examples/prompts/polyglot-ko.txt b/ggml/examples/prompts/polyglot-ko.txt
deleted file mode 100644
index 41fa008..0000000
--- a/ggml/examples/prompts/polyglot-ko.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-이것은 테스트 이다. => 12271,296,6474,28037,17
-걱정할 필요 없다. => 18311,482,1062,550,267,17
-버그는 언젠가 고쳐진다. => 6904,272,8575,10381,1765,17
diff --git a/ggml/examples/prompts/replit.txt b/ggml/examples/prompts/replit.txt
deleted file mode 100644
index 7b5ffcf..0000000
--- a/ggml/examples/prompts/replit.txt
+++ /dev/null
@@ -1,100 +0,0 @@
-Hello World! => 6466,147,2317,350
-I can't believe it's already Friday!" => 286,512,172,185,13392,393,172,155,3239,147,29249,8537
-The URL for the website is https://www.example.com." => 505,5635,250,170,11745,235,147,303,262,552,148,811,148,241,148,161
-"She said, 'I love to travel.'" => 161,10386,4089,150,206,286,8440,194,147,12363,148,172,161
-'The temperature is 25.5°C.' => 172,505,147,9502,235,147,20022,8516,228,148,172
-"Let's meet at 2:30 p.m. in the park." => 161,8997,172,155,17120,536,147,162,5245,147,207,148,204,148,219,170,147,17664,148,161
-The book costs $19.99 => 505,147,2277,17494,236,166,11824
-"John's favorite color is blue." => 161,7475,172,155,147,11105,147,349,235,17046,148,161
-Th@nk y0u f0r y0ur h3lp! => 6309,240,9019,147,237,159,247,147,202,159,223,147,237,159,2458,147,226,171,3899,350
-C@n I g3t a c0ffee, pl3@se? => 228,240,211,398,147,267,171,185,216,147,196,159,13360,163,150,147,1287,171,240,155,163,272
-W0w! Th@t's @m@zing! => 450,159,274,350,147,6309,240,185,172,155,268,204,240,301,248,350
-H0w 4re y0u t0d@y? => 304,159,274,320,440,147,237,159,247,147,185,159,182,240,237,272
-I l0ve t0 tr@vel @r0und the w0rld. => 286,997,159,1290,147,185,159,147,490,240,3893,268,223,159,3981,170,147,274,159,223,2833,148
-Wh@t's y0ur f@v0rite m0vie? => 450,226,240,185,172,155,147,237,159,2458,147,202,240,252,159,5961,163,147,204,159,24373,272
-The cat is sleeping on the mat. => 505,147,1604,235,147,3987,248,347,170,147,1297,148
-I need to buy some groceries for dinner. => 286,1645,194,147,8068,1499,147,10022,1037,10023,250,147,182,2749,148
-The sun is shining brightly in the sky. => 505,147,5852,235,147,7304,2967,147,215,649,391,219,170,147,7310,148
-She is reading a book in the park. => 10386,235,9838,216,147,2277,219,170,147,17664,148
-We went for a walk on the beach yesterday. => 3250,10825,250,216,147,8156,347,170,294,5371,147,28830,148
-He plays the guitar like a pro. => 5301,7084,155,170,147,4604,2214,1425,216,3474,148
-They are going to the movies tonight. => 18815,429,6552,194,170,147,15877,194,7907,148
-The flowers are blooming in the garden. => 505,147,22953,155,429,147,10411,2799,248,219,170,147,22140,148
-I enjoy listening to classical music. => 286,23162,15876,248,194,239,4251,147,7395,148
-We need to buy groceries for the week. => 3250,1645,194,147,8068,147,10022,1037,10023,250,170,9238,148
-The dog is chasing its tail in circles. => 505,147,6540,235,147,196,916,248,1602,147,5129,219,147,4095,155,148
-She is wearing a beautiful red dress. => 10386,235,147,16427,248,216,147,23447,147,1160,147,14592,148
-He is a talented actor in Hollywood. => 5301,235,216,147,29750,246,147,5112,219,147,16924,391,10477,148
-The children are playing in the playground. => 505,7934,429,7084,248,219,170,7084,12055,148
-I'm going to visit my grandparents this weekend. => 286,172,204,6552,194,9939,1247,147,11806,12019,291,9238,314,148
-The coffee tastes bitter without sugar. => 505,147,21526,147,20931,155,5145,1430,1988,147,28759,148
-They are planning a surprise party for her. => 18815,429,147,23661,216,147,29240,147,7344,250,1869,148
-She sings like an angel on stage. => 10386,147,155,6502,1425,426,147,26028,347,12685,148
-We should take a vacation to relax. => 3250,936,4654,216,147,15388,946,194,1998,2744,148
-He is studying medicine at the university. => 5301,235,7959,248,147,20742,1668,536,170,147,8025,148
-The rain is pouring heavily outside. => 505,147,6885,235,5306,248,1189,5451,391,8096,148
-I enjoy watching romantic movies. => 286,23162,147,3355,248,147,26080,4140,147,15877,148
-They are celebrating their anniversary today. => 18815,429,147,30000,5841,1669,147,24734,5464,1770,13386,148
-She dances gracefully to the music. => 10386,147,182,1626,155,147,267,8771,8001,194,170,147,7395,148
-He is an excellent basketball player. => 5301,235,426,147,12300,675,185,147,26646,5132,6294,148
-The baby is sleeping soundly in the crib. => 505,147,23597,235,147,3987,248,12642,391,219,170,147,7696,215,148
-I need to finish my homework before dinner. => 286,1645,194,147,6717,1247,147,1071,2722,2643,147,182,2749,148
-They are organizing a charity event next month. => 18815,429,147,16442,248,216,1054,1511,1663,2399,12821,148
-She is cooking a delicious meal for us. => 10386,235,147,20453,248,216,3936,23455,147,26658,250,147,539,148
-We should go hiking in the mountains. => 3250,936,4242,147,2254,5357,219,170,147,204,18028,155,148
-The car broke down on the way to work. => 505,7553,147,510,10036,4288,347,170,3699,194,1916,148
-He loves playing video games in his free time. => 5301,8440,155,7084,248,8722,147,11281,219,1439,4002,801,148
-The birds are chirping in the trees. => 505,147,13043,155,429,147,3904,223,4639,219,170,5311,155,148
-I want to learn how to play the piano. => 286,1857,194,14167,2496,194,7084,170,147,207,23635,148
-They are building a new shopping mall in the city. => 18815,429,11038,216,277,147,22184,147,204,609,219,170,147,2416,148
-She is writing a novel in her spare time. => 10386,235,3242,216,147,25814,219,1869,6772,2382,801,148
-We are going to the zoo this Saturday. => 3250,429,6552,194,170,147,25101,291,147,31426,148
-The cake looks delicious with chocolate frosting. => 505,147,24422,16303,3936,23455,312,147,5619,533,2239,147,202,3973,3431,148
-He is a talented painter who sells his artwork. => 5301,235,216,147,29750,246,147,9226,279,2888,13004,155,1439,12234,2722,148
-The students are studying for their exams. => 505,15707,429,7959,248,250,1669,147,12398,155,148
-I enjoy swimming in the ocean. => 286,23162,147,4729,8528,248,219,170,147,26193,148
-They are renovating their house. => 18815,429,991,10724,3643,1669,13788,148
-She is practicing yoga to stay healthy. => 10386,235,147,18453,248,147,5063,1186,194,15344,147,28550,148
-We should plant flowers in the garden. => 3250,936,147,9212,147,22953,155,219,170,147,22140,148
-The traffic is heavy during rush hour. => 505,147,11097,235,147,22232,4340,147,22319,147,5686,148
-He is a skilled chef who creates amazing dishes. => 5301,235,216,147,8891,246,9784,202,2888,13720,147,28880,147,23852,383,148
-The baby is crawling on the floor. => 505,147,23597,235,147,22120,248,347,170,147,5895,148
-I need to buy a new pair of shoes. => 286,1645,194,147,8068,216,277,12632,210,147,155,21953,155,148
-They are going on a road trip across the country. => 18815,429,6552,347,216,147,6362,147,11395,9762,170,11305,148
-She is playing the piano beautifully. => 10386,235,7084,248,170,147,207,23635,147,23447,391,148
-We are going to a concert tomorrow night. => 3250,429,6552,194,216,1710,4391,29524,12716,148
-The cake tastes delicious with vanilla frosting. => 505,147,24422,147,20931,155,3936,23455,312,5535,7476,147,202,3973,3431,148
-He is a dedicated teacher who inspires his students. => 5301,235,216,326,8298,3460,147,9675,2888,147,28801,155,1439,15707,148
-The students are participating in a science fair. => 505,15707,429,147,30961,3643,219,216,147,10587,147,7636,148
-I enjoy hiking in the mountains. => 286,23162,147,2254,5357,219,170,147,204,18028,155,148
-They are organizing a beach cleanup next weekend. => 18815,429,147,16442,248,216,294,5371,147,10401,2399,9238,314,148
-She is taking photographs of nature. => 10386,235,147,12345,147,4709,1547,155,210,147,211,8603,148
-We should try a new restaurant in town. => 3250,936,147,746,216,277,147,11007,219,147,10200,148
-The traffic is moving slowly on the highway. => 505,147,11097,235,147,8601,147,9880,391,347,170,5976,3330,148
-He is a talented singer with a beautiful voice. => 5301,235,216,147,29750,246,147,155,248,279,312,216,147,23447,147,9316,148
-The baby is laughing and giggling. => 505,147,23597,235,147,23066,248,221,147,2341,3631,2869,148
-I need to do laundry and wash my clothes. => 286,1645,194,543,960,3981,2154,221,147,27589,1247,147,22141,383,148
-They are planning a trip to Europe. => 18815,429,147,23661,216,147,11395,194,13131,148
-She is learning how to play the guitar. => 10386,235,11754,2496,194,7084,170,147,4604,2214,148
-We are going to a museum this Sunday. => 3250,429,6552,194,216,147,204,433,1177,291,147,29111,148
-The coffee smells amazing in the morning. => 505,147,21526,31454,155,147,28880,219,170,20701,148
-He is a hardworking farmer who grows crops. => 5301,235,216,8524,14992,147,16679,279,2888,147,6044,155,147,8650,155,148
-The students are presenting their research projects. => 505,15707,429,5130,248,1669,13217,14235,148
-I enjoy playing soccer with my friends. => 286,23162,7084,248,147,9351,5318,312,1247,147,5347,155,148
-They are volunteering at a local shelter. => 18815,429,147,5238,7478,163,12798,536,216,2491,2905,1359,279,148
-She is practicing martial arts for self-defense. => 10386,235,147,18453,248,147,3261,185,4381,12234,155,250,623,153,29896,148
-We should try a new recipe for dinner. => 3250,936,147,746,216,277,147,9851,250,147,182,2749,148
-The traffic is congest => 505,147,11097,235,1710,14169
-The sun is shining brightly today. => 505,147,5852,235,147,7304,2967,147,215,649,391,13386,148
-I enjoy reading books in my free time. => 286,23162,9838,147,9670,219,1247,4002,801,148
-She plays the piano beautifully. => 10386,7084,155,170,147,207,23635,147,23447,391,148
-The cat chased the mouse around the room. => 505,147,1604,147,196,916,246,170,12551,6890,170,9654,148
-I love eating pizza with extra cheese. => 286,8440,147,163,3643,147,207,8403,312,8230,9784,383,163,148
-He always wears a hat wherever he goes. => 5301,5418,147,16427,155,216,147,4879,2171,2433,1189,16177,148
-The flowers in the garden are blooming. => 505,147,22953,155,219,170,147,22140,429,147,10411,2799,248,148
-She danced gracefully on the stage. => 10386,13378,12408,147,267,8771,8001,347,170,12685,148
-The dog barked loudly in the park. => 505,147,6540,147,973,293,246,147,30182,391,219,170,147,17664,148
-We went swimming in the ocean yesterday. => 3250,10825,147,4729,8528,248,219,170,147,26193,147,28830,148
-He speaks fluent French and Spanish. => 5301,147,13285,155,147,21677,147,254,17590,221,147,31519,148
-The train arrived at the station on time. => 505,147,872,147,20712,182,536,170,147,7184,347,801,148
-She cooked a delicious meal for her family. => 10386,147,20453,246,216,3936,23455,147,26658,250,1869,147,2002,148
diff --git a/ggml/examples/prompts/starcoder.txt b/ggml/examples/prompts/starcoder.txt
deleted file mode 100644
index 03a5b22..0000000
--- a/ggml/examples/prompts/starcoder.txt
+++ /dev/null
@@ -1,100 +0,0 @@
-Hello World! => 8279,10896,19
-I can't believe it's already Friday!" => 59,883,1330,13710,561,1182,3425,506,25674,11555
-The URL for the website is https://www.example.com." => 1318,3834,436,322,9575,438,1678,555,1499,32,2763,32,508,3107
-"She said, 'I love to travel.'" => 20,25387,9884,30,330,59,14290,372,25283,29329
-'The temperature is 25.5°C.' => 25,1318,13587,438,225,36,39,32,39,23767,53,4564
-"Let's meet at 2:30 p.m. in the park." => 20,9809,1182,18450,821,225,36,44,37,34,298,32,95,32,328,322,880,93,3107
-The book costs $19.99 => 1318,7618,25950,398,35,43,32,43,43
-"John's favorite color is blue." => 20,19693,1182,27448,1963,438,10087,3107
-Th@nk y0u f0r y0ur h3lp! => 1027,50,19877,533,34,103,296,34,100,533,34,305,420,37,1915,19
-C@n I g3t a c0ffee, pl3@se? => 53,50,96,439,485,37,102,312,281,34,21298,30,1278,37,50,277,49
-W0w! Th@t's @m@zing! => 73,34,105,19,947,50,102,1182,477,95,50,26768,19
-H0w 4re y0u t0d@y? => 58,34,105,225,38,268,533,34,103,273,34,86,50,107,49
-I l0ve t0 tr@vel @r0und the w0rld. => 59,456,34,587,273,34,554,50,1203,477,100,34,642,322,341,34,100,1381,32
-Wh@t's y0ur f@v0rite m0vie? => 2444,50,102,1182,533,34,305,296,50,104,34,1049,345,34,104,1075,49
-The cat is sleeping on the mat. => 1318,10501,438,9368,299,544,322,2491,32
-I need to buy some groceries for dinner. => 59,1849,372,16968,1629,20234,85,6958,436,343,3369,32
-The sun is shining brightly in the sky. => 1318,15323,438,787,19068,38231,631,328,322,26718,32
-She is reading a book in the park. => 25387,438,9175,312,7618,328,322,880,93,32
-We went for a walk on the beach yesterday. => 3122,14236,436,312,13503,544,322,526,867,39485,32
-He plays the guitar like a pro. => 1331,41271,322,3932,19931,2124,312,534,32
-They are going to the movies tonight. => 31805,884,6783,372,322,27889,26076,694,32
-The flowers are blooming in the garden. => 1318,7290,483,884,323,18466,299,328,322,485,22461,32
-I enjoy listening to classical music. => 59,31567,20498,372,443,1578,17522,32
-We need to buy groceries for the week. => 3122,1849,372,16968,20234,85,6958,436,322,8209,32
-The dog is chasing its tail in circles. => 1318,27435,438,663,9949,2819,13203,328,46428,32
-She is wearing a beautiful red dress. => 25387,438,996,6992,312,36493,3346,343,714,32
-He is a talented actor in Hollywood. => 1331,438,312,273,9556,318,16038,328,48228,631,21118,32
-The children are playing in the playground. => 1318,5713,884,19788,328,322,4654,1749,32
-I'm going to visit my grandparents this weekend. => 59,3464,6783,372,7725,1672,33162,19277,458,40618,32
-The coffee tastes bitter without sugar. => 1318,36917,273,633,307,3493,391,2876,309,18628,32
-They are planning a surprise party for her. => 31805,884,26116,312,6178,9251,15270,436,7791,32
-She sings like an angel on stage. => 25387,309,2052,2124,600,600,17691,544,10019,32
-We should take a vacation to relax. => 3122,1395,4818,312,29164,367,372,41972,32
-He is studying medicine at the university. => 1331,438,14866,299,32388,482,821,322,707,9190,32
-The rain is pouring heavily outside. => 1318,36987,438,9202,299,46003,2801,11127,32
-I enjoy watching romantic movies. => 59,31567,37652,26045,7268,27889,32
-They are celebrating their anniversary today. => 31805,884,48278,839,1741,3623,23921,5810,672,11610,32
-She dances gracefully to the music. => 25387,343,3151,31376,4938,372,322,17522,32
-He is an excellent basketball player. => 1331,438,600,39203,48400,11653,4362,32
-The baby is sleeping soundly in the crib. => 1318,323,17156,438,9368,299,9934,631,328,322,281,7972,32
-I need to finish my homework before dinner. => 59,1849,372,11361,1672,6765,1007,2670,343,3369,32
-They are organizing a charity event next month. => 31805,884,10558,6183,312,1351,543,1692,2354,6811,32
-She is cooking a delicious meal for us. => 25387,438,23682,299,312,409,406,2406,597,279,436,1770,32
-We should go hiking in the mountains. => 3122,1395,1983,420,1546,299,328,322,10874,1907,32
-The car broke down on the way to work. => 1318,6346,43289,2835,544,322,3352,372,1389,32
-He loves playing video games in his free time. => 1331,598,4954,19788,6027,19705,328,6697,3741,1133,32
-The birds are chirping in the trees. => 1318,8424,3210,884,663,476,7075,328,322,23453,32
-I want to learn how to play the piano. => 59,2637,372,7350,2624,372,4654,322,298,25757,32
-They are building a new shopping mall in the city. => 31805,884,9038,312,537,40692,345,464,328,322,11297,32
-She is writing a novel in her spare time. => 25387,438,4127,312,32913,328,7791,1869,586,1133,32
-We are going to the zoo this Saturday. => 3122,884,6783,372,322,1288,604,458,358,30288,32
-The cake looks delicious with chocolate frosting. => 1318,281,1062,7780,409,406,2406,623,10408,27589,296,20932,299,32
-He is a talented painter who sells his artwork. => 1331,438,312,273,9556,318,42300,6560,10800,101,6697,5549,1007,32
-The students are studying for their exams. => 1318,16512,884,14866,299,436,3623,538,1462,32
-I enjoy swimming in the ocean. => 59,31567,2535,449,6714,328,322,337,18857,32
-They are renovating their house. => 31805,884,316,15007,1741,3623,17075,32
-She is practicing yoga to stay healthy. => 25387,438,11808,11636,533,40067,372,20005,44538,32
-We should plant flowers in the garden. => 3122,1395,26795,7290,483,328,322,485,22461,32
-The traffic is heavy during rush hour. => 1318,16391,438,32389,5929,540,1372,12021,32
-He is a skilled chef who creates amazing dishes. => 1331,438,312,3001,12088,44051,6560,9585,36986,1214,4279,32
-The baby is crawling on the floor. => 1318,323,17156,438,281,1294,2920,544,322,17648,32
-I need to buy a new pair of shoes. => 59,1849,372,16968,312,537,6092,432,787,37764,32
-They are going on a road trip across the country. => 31805,884,6783,544,312,24122,19337,10160,322,10769,32
-She is playing the piano beautifully. => 25387,438,19788,322,298,25757,526,4846,325,514,107,32
-We are going to a concert tomorrow night. => 3122,884,6783,372,312,457,6989,31841,19212,32
-The cake tastes delicious with vanilla frosting. => 1318,281,1062,273,633,307,409,406,2406,623,44653,296,20932,299,32
-He is a dedicated teacher who inspires his students. => 1331,438,312,23112,30877,6560,26194,8017,6697,16512,32
-The students are participating in a science fair. => 1318,16512,884,24623,1741,328,312,27536,19375,32
-I enjoy hiking in the mountains. => 59,31567,420,1546,299,328,322,10874,1907,32
-They are organizing a beach cleanup next weekend. => 31805,884,10558,6183,312,526,867,13144,2354,40618,32
-She is taking photographs of nature. => 25387,438,15137,15110,23626,432,24406,32
-We should try a new restaurant in town. => 3122,1395,1596,312,537,43719,328,38212,32
-The traffic is moving slowly on the highway. => 1318,16391,438,14089,12899,631,544,322,3857,3073,32
-He is a talented singer with a beautiful voice. => 1331,438,312,273,9556,318,309,10118,623,312,36493,20309,32
-The baby is laughing and giggling. => 1318,323,17156,438,2317,2943,299,461,485,365,36088,32
-I need to do laundry and wash my clothes. => 59,1849,372,745,2317,642,994,461,341,917,1672,7375,46948,32
-They are planning a trip to Europe. => 31805,884,26116,312,19337,372,27268,32
-She is learning how to play the guitar. => 25387,438,9608,2624,372,4654,322,3932,19931,32
-We are going to a museum this Sunday. => 3122,884,6783,372,312,345,539,378,458,358,28036,32
-The coffee smells amazing in the morning. => 1318,36917,309,42153,101,36986,328,322,33768,32
-He is a hardworking farmer who grows crops. => 1331,438,312,6784,13578,9019,2302,6560,485,2138,25170,1069,32
-The students are presenting their research projects. => 1318,16512,884,5024,299,3623,13234,8528,32
-I enjoy playing soccer with my friends. => 59,31567,19788,22682,10035,623,1672,22523,32
-They are volunteering at a local shelter. => 31805,884,3920,45585,8637,821,312,2196,309,2542,391,32
-She is practicing martial arts for self-defense. => 25387,438,11808,11636,345,502,564,5549,101,436,630,31,43694,32
-We should try a new recipe for dinner. => 3122,1395,1596,312,537,15233,436,343,3369,32
-The traffic is congest => 1318,16391,438,457,2776
-The sun is shining brightly today. => 1318,15323,438,787,19068,38231,631,11610,32
-I enjoy reading books in my free time. => 59,31567,9175,21739,328,1672,3741,1133,32
-She plays the piano beautifully. => 25387,41271,322,298,25757,526,4846,325,514,107,32
-The cat chased the mouse around the room. => 1318,10501,663,16109,322,8459,6835,322,8355,32
-I love eating pizza with extra cheese. => 59,14290,484,1741,47630,623,6717,8277,30315,32
-He always wears a hat wherever he goes. => 1331,5182,996,4177,312,25793,2154,424,938,13107,32
-The flowers in the garden are blooming. => 1318,7290,483,328,322,485,22461,884,323,18466,299,32
-She danced gracefully on the stage. => 25387,343,6087,31376,4938,544,322,10019,32
-The dog barked loudly in the park. => 1318,27435,323,1087,318,598,836,631,328,322,880,93,32
-We went swimming in the ocean yesterday. => 3122,14236,2535,449,6714,328,322,337,18857,39485,32
-He speaks fluent French and Spanish. => 1331,24498,101,38055,43652,461,14911,1708,32
-The train arrived at the station on time. => 1318,5683,2099,32114,821,322,18662,544,1133,32
-She cooked a delicious meal for her family. => 25387,23682,318,312,409,406,2406,597,279,436,7791,13872,32
diff --git a/ggml/examples/prompts/test-cases.txt b/ggml/examples/prompts/test-cases.txt
deleted file mode 100644
index 4d0bdbf..0000000
--- a/ggml/examples/prompts/test-cases.txt
+++ /dev/null
@@ -1,110 +0,0 @@
-# test case format
-# <language>: <sentence>
-
-English: Hello World!
-English: I can't believe it's already Friday!"
-English: The URL for the website is https://www.example.com."
-English: "She said, 'I love to travel.'"
-English: 'The temperature is 25.5°C.'
-English: "Let's meet at 2:30 p.m. in the park."
-English: The book costs $19.99
-English: "John's favorite color is blue."
-English: Th@nk y0u f0r y0ur h3lp!
-English: C@n I g3t a c0ffee, pl3@se?
-English: W0w! Th@t's @m@zing!
-English: H0w 4re y0u t0d@y?
-English: I l0ve t0 tr@vel @r0und the w0rld.
-English: Wh@t's y0ur f@v0rite m0vie?
-English: The cat is sleeping on the mat.
-English: I need to buy some groceries for dinner.
-English: The sun is shining brightly in the sky.
-English: She is reading a book in the park.
-English: We went for a walk on the beach yesterday.
-English: He plays the guitar like a pro.
-English: They are going to the movies tonight.
-English: The flowers are blooming in the garden.
-English: I enjoy listening to classical music.
-English: We need to buy groceries for the week.
-English: The dog is chasing its tail in circles.
-English: She is wearing a beautiful red dress.
-English: He is a talented actor in Hollywood.
-English: The children are playing in the playground.
-English: I'm going to visit my grandparents this weekend.
-English: The coffee tastes bitter without sugar.
-English: They are planning a surprise party for her.
-English: She sings like an angel on stage.
-English: We should take a vacation to relax.
-English: He is studying medicine at the university.
-English: The rain is pouring heavily outside.
-English: I enjoy watching romantic movies.
-English: They are celebrating their anniversary today.
-English: She dances gracefully to the music.
-English: He is an excellent basketball player.
-English: The baby is sleeping soundly in the crib.
-English: I need to finish my homework before dinner.
-English: They are organizing a charity event next month.
-English: She is cooking a delicious meal for us.
-English: We should go hiking in the mountains.
-English: The car broke down on the way to work.
-English: He loves playing video games in his free time.
-English: The birds are chirping in the trees.
-English: I want to learn how to play the piano.
-English: They are building a new shopping mall in the city.
-English: She is writing a novel in her spare time.
-English: We are going to the zoo this Saturday.
-English: The cake looks delicious with chocolate frosting.
-English: He is a talented painter who sells his artwork.
-English: The students are studying for their exams.
-English: I enjoy swimming in the ocean.
-English: They are renovating their house.
-English: She is practicing yoga to stay healthy.
-English: We should plant flowers in the garden.
-English: The traffic is heavy during rush hour.
-English: He is a skilled chef who creates amazing dishes.
-English: The baby is crawling on the floor.
-English: I need to buy a new pair of shoes.
-English: They are going on a road trip across the country.
-English: She is playing the piano beautifully.
-English: We are going to a concert tomorrow night.
-English: The cake tastes delicious with vanilla frosting.
-English: He is a dedicated teacher who inspires his students.
-English: The students are participating in a science fair.
-English: I enjoy hiking in the mountains.
-English: They are organizing a beach cleanup next weekend.
-English: She is taking photographs of nature.
-English: We should try a new restaurant in town.
-English: The traffic is moving slowly on the highway.
-English: He is a talented singer with a beautiful voice.
-English: The baby is laughing and giggling.
-English: I need to do laundry and wash my clothes.
-English: They are planning a trip to Europe.
-English: She is learning how to play the guitar.
-English: We are going to a museum this Sunday.
-English: The coffee smells amazing in the morning.
-English: He is a hardworking farmer who grows crops.
-English: The students are presenting their research projects.
-English: I enjoy playing soccer with my friends.
-English: They are volunteering at a local shelter.
-English: She is practicing martial arts for self-defense.
-English: We should try a new recipe for dinner.
-English: The traffic is congest
-English: The sun is shining brightly today.
-English: I enjoy reading books in my free time.
-English: She plays the piano beautifully.
-English: The cat chased the mouse around the room.
-English: I love eating pizza with extra cheese.
-English: He always wears a hat wherever he goes.
-English: The flowers in the garden are blooming.
-English: She danced gracefully on the stage.
-English: The dog barked loudly in the park.
-English: We went swimming in the ocean yesterday.
-English: He speaks fluent French and Spanish.
-English: The train arrived at the station on time.
-English: She cooked a delicious meal for her family.
-Korean: 이것은 테스트 이다.
-Korean: 걱정할 필요 없다.
-Korean: 버그는 언젠가 고쳐진다.
-Japanese: 明日の天気はどうですか。
-Chinese: 请问洗手间在哪里?
-Emoji: I'm feeling 😄 today!
-Unicode: ◑ ▢ ▣ ◱
\ No newline at end of file
diff --git a/ggml/examples/prompts/tokenize_huggingface.py b/ggml/examples/prompts/tokenize_huggingface.py
deleted file mode 100644
index 627771f..0000000
--- a/ggml/examples/prompts/tokenize_huggingface.py
+++ /dev/null
@@ -1,65 +0,0 @@
-import os
-from transformers import AutoTokenizer
-
-os.environ['TOKENIZERS_PARALLELISM'] = "false"
-
-list_repo_hf  = ["databricks/dolly-v2-3b",           # dolly-v2 (3b, 7b, 12b models share the same tokenizer)
-                 "gpt2",                             # gpt-2 (gpt2-xl, gpt2-large share the same tokenizer)
-                 "uer/gpt2-chinese-cluecorpussmall", # gpt-2-chinese
-                 "EleutherAI/gpt-j-6b",              # gpt-j
-                 "EleutherAI/gpt-neox-20b",          # gpt-neox
-                 "EleutherAI/polyglot-ko-1.3b",      # gpt-neox (polyglot-ko 5.8b and 12.8b share the same tokenizer")
-                 "rinna/japanese-gpt-neox-3.6b",     # gpt-neox
-                 # mpt-7b (uses gpt-neox-20b tokenizer)
-                 "replit/replit-code-v1-3b",         # replit
-                 "bigcode/starcoder",                # starcoder (huggingface-cli login required)
-                 "openai/whisper-tiny"               # whisper (base, large, large-v2 share the same tokenizer)
-                 ]
-
-repo2ggml     = {"databricks/dolly-v2-3b"           : "dolly-v2",
-                 "gpt2"                             : "gpt-2",
-                 "uer/gpt2-chinese-cluecorpussmall" : "gpt-2-chinese",
-                 "EleutherAI/gpt-j-6b"              : "gpt-j",
-                 "EleutherAI/gpt-neox-20b"          : "gpt-neox",
-                 "EleutherAI/polyglot-ko-1.3b"      : "polyglot-ko",
-                 "rinna/japanese-gpt-neox-3.6b"     : "gpt-neox-japanese",
-                 "replit/replit-code-v1-3b"         : "replit",
-                 "bigcode/starcoder"                : "starcoder",
-                 "openai/whisper-tiny"              : "whisper"}
-
-repo2language = {"databricks/dolly-v2-3b"           : "english",
-                 "gpt2"                             : "english",
-                 "uer/gpt2-chinese-cluecorpussmall" : "chinese",
-                 "EleutherAI/gpt-j-6b"              : "english",
-                 "EleutherAI/gpt-neox-20b"          : "english",
-                 "EleutherAI/polyglot-ko-1.3b"      : "korean",
-                 "rinna/japanese-gpt-neox-3.6b"     : "japanese",
-                 "replit/replit-code-v1-3b"         : "english",
-                 "bigcode/starcoder"                : "english",
-                 "openai/whisper-tiny"              : "english"}
-
-delimeter = ": "
-test_sentences = []
-with open("test-cases.txt", "r") as f:
-    lines = [l.rstrip() for l in f.readlines()]
-    for l in lines:
-        if delimeter in l:
-            language = l[:l.index(delimeter)]
-            sentence = l[l.index(delimeter) + len(delimeter):]
-            test_sentences.append((language.lower(), sentence))
-
-for repo in list_repo_hf:
-
-    target_language = repo2language[repo]
-
-    tokenizer = AutoTokenizer.from_pretrained(repo, trust_remote_code=True)
-
-    tokens_hf = []
-    for language, sentence in test_sentences:
-        if language == target_language:
-            tokens = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentence))
-            tokens_hf.append((sentence, tokens))
-
-    save_txt = repo2ggml[repo] + ".txt"
-    with open(save_txt, "w") as f:
-        f.writelines([sentence + " => " + ",".join(str(t) for t in tokens) + "\n" for sentence, tokens in tokens_hf])
diff --git a/ggml/examples/prompts/whisper.txt b/ggml/examples/prompts/whisper.txt
deleted file mode 100644
index a8f1caa..0000000
--- a/ggml/examples/prompts/whisper.txt
+++ /dev/null
@@ -1,100 +0,0 @@
-Hello World! => 15947,3937,0
-I can't believe it's already Friday!" => 40,393,380,1697,309,311,1217,6984,2963
-The URL for the website is https://www.example.com." => 2278,12905,337,220,3322,3144,307,34426,21492,17919,13,3121,335,781,13,1112,889
-"She said, 'I love to travel.'" => 1,9526,848,11,922,40,959,220,1353,220,17227,779,28763
-'The temperature is 25.5°C.' => 6,2278,220,18275,610,1503,307,3552,13,20,11782,34,4443
-"Let's meet at 2:30 p.m. in the park." => 1,8373,311,1677,412,568,25,3446,280,13,76,13,294,220,3322,3884,889
-The book costs $19.99 => 2278,1446,5497,1848,3405,13,8494
-"John's favorite color is blue." => 1,16938,311,2954,2017,307,3344,889
-Th@nk y0u f0r y0ur h3lp! => 2434,31,77,74,288,15,84,283,15,81,288,15,374,276,18,75,79,0
-C@n I g3t a c0ffee, pl3@se? => 34,31,77,286,290,18,83,257,269,15,4617,11,499,18,31,405,30
-W0w! Th@t's @m@zing! => 54,15,86,0,334,31,83,311,10428,76,31,8781,0
-H0w 4re y0u t0d@y? => 39,15,86,1017,265,288,15,84,220,83,15,67,31,88,30
-I l0ve t0 tr@vel @r0und the w0rld. => 40,287,15,303,220,83,15,220,6903,31,779,10428,81,15,997,220,3322,261,15,81,348,13
-Wh@t's y0ur f@v0rite m0vie? => 2471,31,83,311,288,15,374,283,31,85,15,35002,275,15,12702,30
-The cat is sleeping on the mat. => 2278,3857,307,8296,322,220,3322,3803,13
-I need to buy some groceries for dinner. => 40,643,220,1353,2256,512,31391,337,6148,13
-The sun is shining brightly in the sky. => 2278,3295,307,18269,47418,294,220,3322,5443,13
-She is reading a book in the park. => 9526,307,3760,257,1446,294,220,3322,3884,13
-We went for a walk on the beach yesterday. => 4360,1437,337,257,1792,322,220,3322,7534,5186,13
-He plays the guitar like a pro. => 5205,5749,220,3322,7531,411,257,447,13
-They are going to the movies tonight. => 8829,366,516,220,1353,220,3322,6233,220,1756,397,13
-The flowers are blooming in the garden. => 2278,8085,366,45294,294,220,3322,7431,13
-I enjoy listening to classical music. => 40,2103,4764,220,1353,13735,1318,13
-We need to buy groceries for the week. => 4360,643,220,1353,2256,31391,337,220,3322,1243,13
-The dog is chasing its tail in circles. => 2278,3000,307,17876,1080,220,14430,294,13040,13
-She is wearing a beautiful red dress. => 9526,307,4769,257,2238,2182,5231,13
-He is a talented actor in Hollywood. => 5205,307,257,220,32831,6003,8747,294,11628,13
-The children are playing in the playground. => 2278,2227,366,2433,294,220,3322,24646,13
-I'm going to visit my grandparents this weekend. => 40,478,516,220,1353,3441,452,21876,220,11176,6711,13
-The coffee tastes bitter without sugar. => 2278,4982,220,83,40246,13871,1553,5076,13
-They are planning a surprise party for her. => 8829,366,5038,257,6365,3595,337,720,13
-She sings like an angel on stage. => 9526,23250,411,364,14250,322,3233,13
-We should take a vacation to relax. => 4360,820,220,27612,257,12830,220,1353,5789,13
-He is studying medicine at the university. => 5205,307,7601,7195,412,220,3322,5454,13
-The rain is pouring heavily outside. => 2278,4830,307,20450,10950,2380,13
-I enjoy watching romantic movies. => 40,2103,1976,13590,6233,13
-They are celebrating their anniversary today. => 8829,366,15252,220,3322,347,12962,220,83,378,320,13
-She dances gracefully to the music. => 9526,28322,10042,2277,220,1353,220,3322,1318,13
-He is an excellent basketball player. => 5205,307,364,7103,11767,4256,13
-The baby is sleeping soundly in the crib. => 2278,3186,307,8296,1626,356,294,220,3322,47163,13
-I need to finish my homework before dinner. => 40,643,220,1353,2413,452,14578,949,6148,13
-They are organizing a charity event next month. => 8829,366,17608,257,16863,2280,958,1618,13
-She is cooking a delicious meal for us. => 9526,307,6361,257,4809,6791,337,505,13
-We should go hiking in the mountains. => 4360,820,352,23784,294,220,3322,10233,13
-The car broke down on the way to work. => 2278,1032,6902,760,322,220,3322,636,220,1353,589,13
-He loves playing video games in his free time. => 5205,6752,2433,960,2813,294,702,1737,220,3766,13
-The birds are chirping in the trees. => 2278,9009,366,36682,294,220,3322,220,3599,279,13
-I want to learn how to play the piano. => 40,528,220,1353,1466,577,220,1353,862,220,3322,9211,13
-They are building a new shopping mall in the city. => 8829,366,2390,257,777,8688,16026,294,220,3322,2307,13
-She is writing a novel in her spare time. => 9526,307,3579,257,7613,294,720,13798,220,3766,13
-We are going to the zoo this Saturday. => 4360,366,516,220,1353,220,3322,25347,220,11176,8803,13
-The cake looks delicious with chocolate frosting. => 2278,5908,1542,4809,365,6215,37048,13
-He is a talented painter who sells his artwork. => 5205,307,257,220,32831,6003,26619,567,20897,702,15829,13
-The students are studying for their exams. => 2278,1731,366,7601,337,220,3322,347,20514,13
-I enjoy swimming in the ocean. => 40,2103,11989,294,220,3322,7810,13
-They are renovating their house. => 8829,366,18845,990,220,3322,347,1782,13
-She is practicing yoga to stay healthy. => 9526,307,11350,15128,220,1353,1754,4627,13
-We should plant flowers in the garden. => 4360,820,3709,8085,294,220,3322,7431,13
-The traffic is heavy during rush hour. => 2278,220,17227,3341,307,4676,1830,9300,1773,13
-He is a skilled chef who creates amazing dishes. => 5205,307,257,19690,10530,567,7829,2243,10814,13
-The baby is crawling on the floor. => 2278,3186,307,32979,322,220,3322,4123,13
-I need to buy a new pair of shoes. => 40,643,220,1353,2256,257,777,6119,295,6654,13
-They are going on a road trip across the country. => 8829,366,516,322,257,3060,220,83,8400,2108,220,3322,1941,13
-She is playing the piano beautifully. => 9526,307,2433,220,3322,9211,16525,13
-We are going to a concert tomorrow night. => 4360,366,516,220,1353,257,8543,220,83,298,3162,1818,13
-The cake tastes delicious with vanilla frosting. => 2278,5908,220,83,40246,4809,365,17528,37048,13
-He is a dedicated teacher who inspires his students. => 5205,307,257,8374,220,975,4062,567,32566,702,1731,13
-The students are participating in a science fair. => 2278,1731,366,13950,294,257,3497,3143,13
-I enjoy hiking in the mountains. => 40,2103,23784,294,220,3322,10233,13
-They are organizing a beach cleanup next weekend. => 8829,366,17608,257,7534,40991,958,6711,13
-She is taking photographs of nature. => 9526,307,220,48625,17649,295,3687,13
-We should try a new restaurant in town. => 4360,820,220,83,627,257,777,6383,294,220,30401,13
-The traffic is moving slowly on the highway. => 2278,220,17227,3341,307,2684,5692,322,220,3322,17205,13
-He is a talented singer with a beautiful voice. => 5205,307,257,220,32831,6003,11564,365,257,2238,3177,13
-The baby is laughing and giggling. => 2278,3186,307,5059,293,290,24542,13
-I need to do laundry and wash my clothes. => 40,643,220,1353,360,19811,293,5675,452,5534,13
-They are planning a trip to Europe. => 8829,366,5038,257,220,83,8400,220,1353,3315,13
-She is learning how to play the guitar. => 9526,307,2539,577,220,1353,862,220,3322,7531,13
-We are going to a museum this Sunday. => 4360,366,516,220,1353,257,8441,220,11176,7776,13
-The coffee smells amazing in the morning. => 2278,4982,10036,2243,294,220,3322,2446,13
-He is a hardworking farmer who grows crops. => 5205,307,257,1152,22475,17891,567,13156,16829,13
-The students are presenting their research projects. => 2278,1731,366,15578,220,3322,347,2132,4455,13
-I enjoy playing soccer with my friends. => 40,2103,2433,15469,365,452,1855,13
-They are volunteering at a local shelter. => 8829,366,33237,412,257,2654,13341,13
-She is practicing martial arts for self-defense. => 9526,307,11350,20755,8609,337,2698,12,49268,13
-We should try a new recipe for dinner. => 4360,820,220,83,627,257,777,6782,337,6148,13
-The traffic is congest => 2278,220,17227,3341,307,31871
-The sun is shining brightly today. => 2278,3295,307,18269,47418,220,83,378,320,13
-I enjoy reading books in my free time. => 40,2103,3760,3642,294,452,1737,220,3766,13
-She plays the piano beautifully. => 9526,5749,220,3322,9211,16525,13
-The cat chased the mouse around the room. => 2278,3857,33091,220,3322,9719,926,220,3322,1808,13
-I love eating pizza with extra cheese. => 40,959,3936,8298,365,2857,5399,13
-He always wears a hat wherever he goes. => 5205,1009,20877,257,2385,8660,415,1709,13
-The flowers in the garden are blooming. => 2278,8085,294,220,3322,7431,366,45294,13
-She danced gracefully on the stage. => 9526,32909,10042,2277,322,220,3322,3233,13
-The dog barked loudly in the park. => 2278,3000,16202,292,22958,294,220,3322,3884,13
-We went swimming in the ocean yesterday. => 4360,1437,11989,294,220,3322,7810,5186,13
-He speaks fluent French and Spanish. => 5205,10789,40799,5522,293,8058,13
-The train arrived at the station on time. => 2278,220,83,7146,6678,412,220,3322,5214,322,220,3766,13
-She cooked a delicious meal for her family. => 9526,9267,257,4809,6791,337,720,1605,13
diff --git a/ggml/examples/python/README.md b/ggml/examples/python/README.md
deleted file mode 100644
index 480920f..0000000
--- a/ggml/examples/python/README.md
+++ /dev/null
@@ -1,115 +0,0 @@
-# Simple autogenerated Python bindings for ggml
-
-This folder contains:
-
-- Scripts to generate full Python bindings from ggml headers (+ stubs for autocompletion in IDEs)
-- Some barebones utils (see [ggml/utils.py](./ggml/utils.py)):
-  - `ggml.utils.init` builds a context that's freed automatically when the pointer gets GC'd
-  - `ggml.utils.copy` **copies between same-shaped tensors (numpy or ggml), w/ automatic (de/re)quantization**
-  - `ggml.utils.numpy` returns a numpy view over a ggml tensor; if it's quantized, it returns a copy (requires `allow_copy=True`)
-- Very basic examples (anyone wants to port [llama2.c](https://github.com/karpathy/llama2.c)?)
-
-Provided you set `GGML_LIBRARY=.../path/to/libggml_shared.so` (see instructions below), it's trivial to do some operations on quantized tensors:
-
-```python
-# Make sure libllama.so is in your [DY]LD_LIBRARY_PATH, or set GGML_LIBRARY=.../libggml_shared.so
-
-from ggml import lib, ffi
-from ggml.utils import init, copy, numpy
-import numpy as np
-
-ctx = init(mem_size=12*1024*1024)
-n = 256
-n_threads = 4
-
-a = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_Q5_K, n)
-b = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_F32, n) # Can't both be quantized
-sum = lib.ggml_add(ctx, a, b) # all zeroes for now. Will be quantized too!
-
-gf = ffi.new('struct ggml_cgraph*')
-lib.ggml_build_forward_expand(gf, sum)
-
-copy(np.array([i for i in range(n)], np.float32), a)
-copy(np.array([i*100 for i in range(n)], np.float32), b)
-
-lib.ggml_graph_compute_with_ctx(ctx, gf, n_threads)
-
-print(numpy(a, allow_copy=True))
-#  0.    1.0439453   2.0878906   3.131836    4.1757812   5.2197266. ...
-print(numpy(b))
-#  0.  100.        200.        300.        400.        500.         ...
-print(numpy(sum, allow_copy=True))
-#  0.  105.4375    210.875     316.3125    421.75      527.1875     ...
-```
-
-### Prerequisites
-
-You'll need a shared library of ggml to use the bindings.
-
-#### Build libggml_shared.so or libllama.so
-
-As of this writing the best is to use [ggerganov/llama.cpp](https://github.com/ggerganov/llama.cpp)'s generated `libggml_shared.so` or `libllama.so`, which you can build as follows:
-
-```bash
-git clone https://github.com/ggerganov/llama.cpp
-# On a CUDA-enabled system add -DLLAMA_CUBLAS=1
-# On a Mac add -DLLAMA_METAL=1
-cmake llama.cpp \
-  -B llama_build \
-  -DCMAKE_C_FLAGS=-Ofast \
-  -DLLAMA_NATIVE=1 \
-  -DLLAMA_LTO=1 \
-  -DBUILD_SHARED_LIBS=1 \
-  -DLLAMA_MPI=1 \
-  -DLLAMA_BUILD_TESTS=0 \
-  -DLLAMA_BUILD_EXAMPLES=0
-( cd llama_build && make -j )
-
-# On Mac, this will be libggml_shared.dylib instead
-export GGML_LIBRARY=$PWD/llama_build/libggml_shared.so
-# Alternatively, you can just copy it to your system's lib dir, e.g /usr/local/lib
-```
-
-#### (Optional) Regenerate the bindings and stubs
-
-If you added or changed any signatures of the C API, you'll want to regenerate the bindings ([ggml/cffi.py](./ggml/cffi.py)) and stubs ([ggml/__init__.pyi](./ggml/__init__.pyi)).
-
-Luckily it's a one-liner using [regenerate.py](./regenerate.py):
-
-```bash
-pip install -q cffi
-
-python regenerate.py
-```
-
-By default it assumes `llama.cpp` was cloned in ../../../llama.cpp (alongside the ggml folder). You can override this with:
-
-```bash
-C_INCLUDE_DIR=$LLAMA_CPP_DIR python regenerate.py
-```
-
-You can also edit [api.h](./api.h) to control which files should be included in the generated bindings (defaults to `llama.cpp/ggml*.h`)
-
-In fact, if you wanted to only generate bindings for the current version of the `ggml` repo itself (instead of `llama.cpp`; you'd loose support for k-quants), you could run:
-
-```bash
-API=../../include/ggml/ggml.h python regenerate.py
-```
-
-## Develop
-
-Run tests:
-
-```bash
-pytest
-```
-
-### Alternatives
-
-This example's goal is to showcase [cffi](https://cffi.readthedocs.io/)-generated bindings that are trivial to use and update, but there are already alternatives in the wild:
-
-- https://github.com/abetlen/ggml-python: these bindings seem to be hand-written and use [ctypes](https://docs.python.org/3/library/ctypes.html). It has [high-quality API reference docs](https://ggml-python.readthedocs.io/en/latest/api-reference/#ggml.ggml) that can be used with these bindings too, but it doesn't expose Metal, CUDA, MPI or OpenCL calls, doesn't support transparent (de/re)quantization like this example does (see [ggml.utils](./ggml/utils.py) module), and won't pick up your local changes.
-  
-- https://github.com/abetlen/llama-cpp-python: these expose the C++ `llama.cpp` interface, which this example cannot easily be extended to support (`cffi` only generates bindings of C libraries)
-
-- [pybind11](https://github.com/pybind/pybind11) and [nanobind](https://github.com/wjakob/nanobind) are two alternatives to cffi that support binding C++ libraries, but it doesn't seem either of them have an automatic generator (writing bindings is rather time-consuming).
diff --git a/ggml/examples/python/api.h b/ggml/examples/python/api.h
deleted file mode 100644
index 8d565bd..0000000
--- a/ggml/examples/python/api.h
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
-  List here all the headers you want to expose in the Python bindings,
-  then run `python regenerate.py` (see details in README.md)
-*/
-
-#include "ggml.h"
-#include "ggml-metal.h"
-#include "ggml-opencl.h"
-
-// Headers below are currently only present in the llama.cpp repository, comment them out if you don't have them.
-#include "k_quants.h"
-#include "ggml-alloc.h"
-#include "ggml-cuda.h"
-#include "ggml-mpi.h"
\ No newline at end of file
diff --git a/ggml/examples/python/example_add_quant.py b/ggml/examples/python/example_add_quant.py
deleted file mode 100644
index cecb44e..0000000
--- a/ggml/examples/python/example_add_quant.py
+++ /dev/null
@@ -1,25 +0,0 @@
-from ggml import lib, ffi
-from ggml.utils import init, copy, numpy
-import numpy as np
-
-ctx = init(mem_size=12*1024*1024) # automatically freed when pointer is GC'd
-n = 256
-n_threads = 4
-
-a = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_Q5_K, n)
-b = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_F32, n) # can't both be quantized
-sum = lib.ggml_add(ctx, a, b) # all zeroes for now. Will be quantized too!
-
-# See cffi's doc on how to allocate native memory: it's very simple!
-# https://cffi.readthedocs.io/en/latest/ref.html#ffi-interface
-gf = ffi.new('struct ggml_cgraph*')
-lib.ggml_build_forward_expand(gf, sum)
-
-copy(np.array([i for i in range(n)], np.float32), a)
-copy(np.array([i*100 for i in range(n)], np.float32), b)
-
-lib.ggml_graph_compute_with_ctx(ctx, gf, n_threads)
-
-print(numpy(a, allow_copy=True))
-print(numpy(b))
-print(numpy(sum, allow_copy=True))
\ No newline at end of file
diff --git a/ggml/examples/python/example_test_all_quants.py b/ggml/examples/python/example_test_all_quants.py
deleted file mode 100644
index 8d3c966..0000000
--- a/ggml/examples/python/example_test_all_quants.py
+++ /dev/null
@@ -1,68 +0,0 @@
-from ggml import ffi, lib
-from ggml.utils import init, numpy, copy
-import numpy as np
-from math import pi, cos, sin, ceil
-
-import matplotlib.pyplot as plt
-
-ctx = init(mem_size=100*1024*1024) # Will be auto-GC'd
-n = 256
-
-orig = np.array([
-    [
-        cos(j * 2 * pi / n) * (sin(i * 2 * pi / n))
-        for j in range(n)
-    ]
-    for i in range(n)
-], np.float32)
-orig_tensor = lib.ggml_new_tensor_2d(ctx, lib.GGML_TYPE_F32, n, n)
-copy(orig, orig_tensor)
-
-quants = [
-    type for type in range(lib.GGML_TYPE_COUNT)
-    if lib.ggml_is_quantized(type) and
-       type not in [lib.GGML_TYPE_Q8_1, lib.GGML_TYPE_Q8_K] # Apparently not supported
-]
-# quants = [lib.GGML_TYPE_Q2_K] # Test a single one
-
-def get_name(type):
-    name = lib.ggml_type_name(type)
-    return ffi.string(name).decode('utf-8') if name else '?'
-
-quants.sort(key=get_name)
-quants.insert(0, None)
-print(quants)
-
-ncols=4
-nrows = ceil(len(quants) / ncols)
-
-plt.figure(figsize=(ncols * 5, nrows * 5), layout='tight')
-
-for i, type in enumerate(quants):
-    plt.subplot(nrows, ncols, i + 1)
-    try:
-        if type == None:
-            plt.title('Original')
-            plt.imshow(orig)
-        else:
-            quantized_tensor = lib.ggml_new_tensor_2d(ctx, type, n, n)
-            copy(orig_tensor, quantized_tensor)
-            quantized = numpy(quantized_tensor, allow_copy=True)
-            d = quantized - orig
-            results = {
-                "l2": np.linalg.norm(d, 2),
-                "linf": np.linalg.norm(d, np.inf),
-                "compression":
-                    round(lib.ggml_nbytes(orig_tensor) /
-                          lib.ggml_nbytes(quantized_tensor), 1)
-            }
-            name = get_name(type)
-            print(f'{name}: {results}')
-
-            plt.title(f'{name} ({results["compression"]}x smaller)')
-            plt.imshow(quantized, interpolation='nearest')
-        
-    except Exception as e:
-        print(f'Error: {e}')
-
-plt.show()
\ No newline at end of file
diff --git a/ggml/examples/python/ggml/__init__.py b/ggml/examples/python/ggml/__init__.py
deleted file mode 100644
index 31a1910..0000000
--- a/ggml/examples/python/ggml/__init__.py
+++ /dev/null
@@ -1,58 +0,0 @@
-"""
-  Python bindings for the ggml library.
-
-  Usage example:
-
-      from ggml import lib, ffi
-      from ggml.utils import init, copy, numpy
-      import numpy as np
-
-      ctx = init(mem_size=10*1024*1024)
-      n = 1024
-      n_threads = 4
-
-      a = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_Q5_K, n)
-      b = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_F32, n)
-      sum = lib.ggml_add(ctx, a, b)
-
-      gf = ffi.new('struct ggml_cgraph*')
-      lib.ggml_build_forward_expand(gf, sum)
-
-      copy(np.array([i for i in range(n)], np.float32), a)
-      copy(np.array([i*100 for i in range(n)], np.float32), b)
-      lib.ggml_graph_compute_with_ctx(ctx, gf, n_threads)
-
-      print(numpy(sum, allow_copy=True))
-
-  See https://cffi.readthedocs.io/en/latest/cdef.html for more on cffi.
-"""
-
-try:
-    from ggml.cffi import ffi as ffi
-except ImportError as e:
-    raise ImportError(f"Couldn't find ggml bindings ({e}). Run `python regenerate.py` or check your PYTHONPATH.")
-
-import os, platform
-
-__exact_library = os.environ.get("GGML_LIBRARY")
-if __exact_library:
-    __candidates = [__exact_library]
-elif platform.system() == "Windows":
-    __candidates = ["ggml_shared.dll", "llama.dll"]
-else:
-    __candidates = ["libggml_shared.so", "libllama.so"]
-    if platform.system() == "Darwin":
-        __candidates += ["libggml_shared.dylib", "libllama.dylib"]
-
-for i, name in enumerate(__candidates):
-    try:
-        # This is where all the functions, enums and constants are defined
-        lib = ffi.dlopen(name)
-    except OSError:
-        if i < len(__candidates) - 1:
-            continue
-        raise OSError(f"Couldn't find ggml's shared library (tried names: {__candidates}). Add its directory to DYLD_LIBRARY_PATH (on Mac) or LD_LIBRARY_PATH, or define GGML_LIBRARY.")
-
-# This contains the cffi helpers such as new, cast, string, etc.
-# https://cffi.readthedocs.io/en/latest/ref.html#ffi-interface
-ffi = ffi
diff --git a/ggml/examples/python/ggml/__init__.pyi b/ggml/examples/python/ggml/__init__.pyi
deleted file mode 100644
index b08ed40..0000000
--- a/ggml/examples/python/ggml/__init__.pyi
+++ /dev/null
@@ -1,2412 +0,0 @@
-# auto-generated file
-import ggml.ffi as ffi
-import numpy as np
-class lib:
-  @property
-  def GGML_BACKEND_CPU(self) -> int: ...
-  @property
-  def GGML_BACKEND_GPU(self) -> int: ...
-  @property
-  def GGML_BACKEND_GPU_SPLIT(self) -> int: ...
-  @property
-  def GGML_FTYPE_ALL_F32(self) -> int: ...
-  @property
-  def GGML_FTYPE_MOSTLY_F16(self) -> int: ...
-  @property
-  def GGML_FTYPE_MOSTLY_Q2_K(self) -> int: ...
-  @property
-  def GGML_FTYPE_MOSTLY_Q3_K(self) -> int: ...
-  @property
-  def GGML_FTYPE_MOSTLY_Q4_0(self) -> int: ...
-  @property
-  def GGML_FTYPE_MOSTLY_Q4_1(self) -> int: ...
-  @property
-  def GGML_FTYPE_MOSTLY_Q4_1_SOME_F16(self) -> int: ...
-  @property
-  def GGML_FTYPE_MOSTLY_Q4_K(self) -> int: ...
-  @property
-  def GGML_FTYPE_MOSTLY_Q5_0(self) -> int: ...
-  @property
-  def GGML_FTYPE_MOSTLY_Q5_1(self) -> int: ...
-  @property
-  def GGML_FTYPE_MOSTLY_Q5_K(self) -> int: ...
-  @property
-  def GGML_FTYPE_MOSTLY_Q6_K(self) -> int: ...
-  @property
-  def GGML_FTYPE_MOSTLY_Q8_0(self) -> int: ...
-  @property
-  def GGML_FTYPE_UNKNOWN(self) -> int: ...
-  @property
-  def GGML_LINESEARCH_BACKTRACKING_ARMIJO(self) -> int: ...
-  @property
-  def GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE(self) -> int: ...
-  @property
-  def GGML_LINESEARCH_BACKTRACKING_WOLFE(self) -> int: ...
-  @property
-  def GGML_LINESEARCH_DEFAULT(self) -> int: ...
-  @property
-  def GGML_LINESEARCH_FAIL(self) -> int: ...
-  @property
-  def GGML_LINESEARCH_INVALID_PARAMETERS(self) -> int: ...
-  @property
-  def GGML_LINESEARCH_MAXIMUM_ITERATIONS(self) -> int: ...
-  @property
-  def GGML_LINESEARCH_MAXIMUM_STEP(self) -> int: ...
-  @property
-  def GGML_LINESEARCH_MINIMUM_STEP(self) -> int: ...
-  @property
-  def GGML_OBJECT_GRAPH(self) -> int: ...
-  @property
-  def GGML_OBJECT_TENSOR(self) -> int: ...
-  @property
-  def GGML_OBJECT_WORK_BUFFER(self) -> int: ...
-  @property
-  def GGML_OPT_ADAM(self) -> int: ...
-  @property
-  def GGML_OPT_DID_NOT_CONVERGE(self) -> int: ...
-  @property
-  def GGML_OPT_FAIL(self) -> int: ...
-  @property
-  def GGML_OPT_INVALID_WOLFE(self) -> int: ...
-  @property
-  def GGML_OPT_LBFGS(self) -> int: ...
-  @property
-  def GGML_OPT_NO_CONTEXT(self) -> int: ...
-  @property
-  def GGML_OPT_OK(self) -> int: ...
-  @property
-  def GGML_OP_ACC(self) -> int: ...
-  @property
-  def GGML_OP_ADD(self) -> int: ...
-  @property
-  def GGML_OP_ADD1(self) -> int: ...
-  @property
-  def GGML_OP_ALIBI(self) -> int: ...
-  @property
-  def GGML_OP_ARGMAX(self) -> int: ...
-  @property
-  def GGML_OP_CLAMP(self) -> int: ...
-  @property
-  def GGML_OP_CONT(self) -> int: ...
-  @property
-  def GGML_OP_CONV_1D(self) -> int: ...
-  @property
-  def GGML_OP_CONV_2D(self) -> int: ...
-  @property
-  def GGML_OP_COUNT(self) -> int: ...
-  @property
-  def GGML_OP_CPY(self) -> int: ...
-  @property
-  def GGML_OP_CROSS_ENTROPY_LOSS(self) -> int: ...
-  @property
-  def GGML_OP_CROSS_ENTROPY_LOSS_BACK(self) -> int: ...
-  @property
-  def GGML_OP_DIAG(self) -> int: ...
-  @property
-  def GGML_OP_DIAG_MASK_INF(self) -> int: ...
-  @property
-  def GGML_OP_DIAG_MASK_ZERO(self) -> int: ...
-  @property
-  def GGML_OP_DIV(self) -> int: ...
-  @property
-  def GGML_OP_DUP(self) -> int: ...
-  @property
-  def GGML_OP_FLASH_ATTN(self) -> int: ...
-  @property
-  def GGML_OP_FLASH_ATTN_BACK(self) -> int: ...
-  @property
-  def GGML_OP_FLASH_FF(self) -> int: ...
-  @property
-  def GGML_OP_GET_ROWS(self) -> int: ...
-  @property
-  def GGML_OP_GET_ROWS_BACK(self) -> int: ...
-  @property
-  def GGML_OP_LOG(self) -> int: ...
-  @property
-  def GGML_OP_MAP_BINARY(self) -> int: ...
-  @property
-  def GGML_OP_MAP_CUSTOM1(self) -> int: ...
-  @property
-  def GGML_OP_MAP_CUSTOM1_F32(self) -> int: ...
-  @property
-  def GGML_OP_MAP_CUSTOM2(self) -> int: ...
-  @property
-  def GGML_OP_MAP_CUSTOM2_F32(self) -> int: ...
-  @property
-  def GGML_OP_MAP_CUSTOM3(self) -> int: ...
-  @property
-  def GGML_OP_MAP_CUSTOM3_F32(self) -> int: ...
-  @property
-  def GGML_OP_MAP_UNARY(self) -> int: ...
-  @property
-  def GGML_OP_MEAN(self) -> int: ...
-  @property
-  def GGML_OP_MUL(self) -> int: ...
-  @property
-  def GGML_OP_MUL_MAT(self) -> int: ...
-  @property
-  def GGML_OP_NONE(self) -> int: ...
-  @property
-  def GGML_OP_NORM(self) -> int: ...
-  @property
-  def GGML_OP_OUT_PROD(self) -> int: ...
-  @property
-  def GGML_OP_PERMUTE(self) -> int: ...
-  @property
-  def GGML_OP_POOL_1D(self) -> int: ...
-  @property
-  def GGML_OP_POOL_2D(self) -> int: ...
-  @property
-  def GGML_OP_POOL_AVG(self) -> int: ...
-  @property
-  def GGML_OP_POOL_COUNT(self) -> int: ...
-  @property
-  def GGML_OP_POOL_MAX(self) -> int: ...
-  @property
-  def GGML_OP_REPEAT(self) -> int: ...
-  @property
-  def GGML_OP_REPEAT_BACK(self) -> int: ...
-  @property
-  def GGML_OP_RESHAPE(self) -> int: ...
-  @property
-  def GGML_OP_RMS_NORM(self) -> int: ...
-  @property
-  def GGML_OP_RMS_NORM_BACK(self) -> int: ...
-  @property
-  def GGML_OP_ROPE(self) -> int: ...
-  @property
-  def GGML_OP_ROPE_BACK(self) -> int: ...
-  @property
-  def GGML_OP_SCALE(self) -> int: ...
-  @property
-  def GGML_OP_SET(self) -> int: ...
-  @property
-  def GGML_OP_SILU_BACK(self) -> int: ...
-  @property
-  def GGML_OP_SOFT_MAX(self) -> int: ...
-  @property
-  def GGML_OP_SOFT_MAX_BACK(self) -> int: ...
-  @property
-  def GGML_OP_SQR(self) -> int: ...
-  @property
-  def GGML_OP_SQRT(self) -> int: ...
-  @property
-  def GGML_OP_SUB(self) -> int: ...
-  @property
-  def GGML_OP_SUM(self) -> int: ...
-  @property
-  def GGML_OP_SUM_ROWS(self) -> int: ...
-  @property
-  def GGML_OP_TRANSPOSE(self) -> int: ...
-  @property
-  def GGML_OP_UNARY(self) -> int: ...
-  @property
-  def GGML_OP_VIEW(self) -> int: ...
-  @property
-  def GGML_OP_WIN_PART(self) -> int: ...
-  @property
-  def GGML_OP_WIN_UNPART(self) -> int: ...
-  @property
-  def GGML_TASK_COMPUTE(self) -> int: ...
-  @property
-  def GGML_TASK_FINALIZE(self) -> int: ...
-  @property
-  def GGML_TASK_INIT(self) -> int: ...
-  @property
-  def GGML_TYPE_COUNT(self) -> int: ...
-  @property
-  def GGML_TYPE_F16(self) -> int: ...
-  @property
-  def GGML_TYPE_F32(self) -> int: ...
-  @property
-  def GGML_TYPE_I16(self) -> int: ...
-  @property
-  def GGML_TYPE_I32(self) -> int: ...
-  @property
-  def GGML_TYPE_I8(self) -> int: ...
-  @property
-  def GGML_TYPE_Q2_K(self) -> int: ...
-  @property
-  def GGML_TYPE_Q3_K(self) -> int: ...
-  @property
-  def GGML_TYPE_Q4_0(self) -> int: ...
-  @property
-  def GGML_TYPE_Q4_1(self) -> int: ...
-  @property
-  def GGML_TYPE_Q4_K(self) -> int: ...
-  @property
-  def GGML_TYPE_Q5_0(self) -> int: ...
-  @property
-  def GGML_TYPE_Q5_1(self) -> int: ...
-  @property
-  def GGML_TYPE_Q5_K(self) -> int: ...
-  @property
-  def GGML_TYPE_Q6_K(self) -> int: ...
-  @property
-  def GGML_TYPE_Q8_0(self) -> int: ...
-  @property
-  def GGML_TYPE_Q8_1(self) -> int: ...
-  @property
-  def GGML_TYPE_Q8_K(self) -> int: ...
-  @property
-  def GGML_UNARY_OP_ABS(self) -> int: ...
-  @property
-  def GGML_UNARY_OP_ELU(self) -> int: ...
-  @property
-  def GGML_UNARY_OP_GELU(self) -> int: ...
-  @property
-  def GGML_UNARY_OP_GELU_QUICK(self) -> int: ...
-  @property
-  def GGML_UNARY_OP_NEG(self) -> int: ...
-  @property
-  def GGML_UNARY_OP_RELU(self) -> int: ...
-  @property
-  def GGML_UNARY_OP_SGN(self) -> int: ...
-  @property
-  def GGML_UNARY_OP_SILU(self) -> int: ...
-  @property
-  def GGML_UNARY_OP_STEP(self) -> int: ...
-  @property
-  def GGML_UNARY_OP_TANH(self) -> int: ...
-  @property
-  def GGUF_TYPE_ARRAY(self) -> int: ...
-  @property
-  def GGUF_TYPE_BOOL(self) -> int: ...
-  @property
-  def GGUF_TYPE_COUNT(self) -> int: ...
-  @property
-  def GGUF_TYPE_FLOAT32(self) -> int: ...
-  @property
-  def GGUF_TYPE_INT16(self) -> int: ...
-  @property
-  def GGUF_TYPE_INT32(self) -> int: ...
-  @property
-  def GGUF_TYPE_INT8(self) -> int: ...
-  @property
-  def GGUF_TYPE_STRING(self) -> int: ...
-  @property
-  def GGUF_TYPE_UINT16(self) -> int: ...
-  @property
-  def GGUF_TYPE_UINT32(self) -> int: ...
-  @property
-  def GGUF_TYPE_UINT8(self) -> int: ...
-  def abort_callback(data: ffi.CData) -> bool:
-    """
-    abort ggml_graph_compute when true
-
-            bool (*abort_callback)(void * data);
-    """
-    ...
-  def dequantize_row_q2_K(x: ffi.CData, y: ffi.CData, k: int) -> None:
-    """
-    Dequantization
-
-    void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k);
-    """
-    ...
-  def dequantize_row_q3_K(x: ffi.CData, y: ffi.CData, k: int) -> None:
-    """void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k);"""
-    ...
-  def dequantize_row_q4_K(x: ffi.CData, y: ffi.CData, k: int) -> None:
-    """void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k);"""
-    ...
-  def dequantize_row_q5_K(x: ffi.CData, y: ffi.CData, k: int) -> None:
-    """void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k);"""
-    ...
-  def dequantize_row_q6_K(x: ffi.CData, y: ffi.CData, k: int) -> None:
-    """void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k);"""
-    ...
-  def dequantize_row_q8_K(x: ffi.CData, y: ffi.CData, k: int) -> None:
-    """void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k);"""
-    ...
-  def ggml_abs(ctx: ffi.CData, a: ffi.CData) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_abs(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a);
-    """
-    ...
-  def ggml_abs_inplace(ctx: ffi.CData, a: ffi.CData) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_abs_inplace(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a);
-    """
-    ...
-  def ggml_acc(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, nb1: int, nb2: int, nb3: int, offset: int) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_acc(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a,
-                struct ggml_tensor  * b,
-                size_t                nb1,
-                size_t                nb2,
-                size_t                nb3,
-                size_t                offset);
-    """
-    ...
-  def ggml_acc_inplace(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, nb1: int, nb2: int, nb3: int, offset: int) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_acc_inplace(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a,
-                struct ggml_tensor  * b,
-                size_t                nb1,
-                size_t                nb2,
-                size_t                nb3,
-                size_t                offset);
-    """
-    ...
-  def ggml_add(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_add(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a,
-                struct ggml_tensor  * b);
-    """
-    ...
-  def ggml_add1(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_add1(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a,
-                struct ggml_tensor  * b);
-    """
-    ...
-  def ggml_add1_inplace(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_add1_inplace(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a,
-                struct ggml_tensor  * b);
-    """
-    ...
-  def ggml_add_inplace(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_add_inplace(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a,
-                struct ggml_tensor  * b);
-    """
-    ...
-  def ggml_alibi(ctx: ffi.CData, a: ffi.CData, n_past: int, n_head: int, bias_max: float) -> ffi.CData:
-    """
-    alibi position embedding
-    in-place, returns view(a)
-
-        struct ggml_tensor * ggml_alibi(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a,
-                int                   n_past,
-                int                   n_head,
-                float                 bias_max);
-    """
-    ...
-  def ggml_allocr_alloc(alloc: ffi.CData, tensor: ffi.CData) -> None:
-    """GGML_API void   ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor);"""
-    ...
-  def ggml_allocr_alloc_graph(alloc: ffi.CData, graph: ffi.CData) -> int:
-    """GGML_API size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph);"""
-    ...
-  def ggml_allocr_free(alloc: ffi.CData) -> None:
-    """GGML_API void   ggml_allocr_free(struct ggml_allocr * alloc);"""
-    ...
-  def ggml_allocr_is_measure(alloc: ffi.CData) -> bool:
-    """GGML_API bool   ggml_allocr_is_measure(struct ggml_allocr * alloc);"""
-    ...
-  def ggml_allocr_new(data: ffi.CData, size: int, alignment: int) -> ffi.CData:
-    """GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment);"""
-    ...
-  def ggml_allocr_new_measure(alignment: int) -> ffi.CData:
-    """GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);"""
-    ...
-  def ggml_allocr_reset(alloc: ffi.CData) -> None:
-    """GGML_API void   ggml_allocr_reset(struct ggml_allocr * alloc);"""
-    ...
-  def ggml_allocr_set_parse_seq(alloc: ffi.CData, list: ffi.CData, n: int) -> None:
-    """
-    tell the allocator to parse nodes following the order described in the list
-    you should call this if your graph are optimized to execute out-of-order
-
-    GGML_API void   ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n);
-    """
-    ...
-  def ggml_are_same_shape(t0: ffi.CData, t1: ffi.CData) -> bool:
-    """    GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);"""
-    ...
-  def ggml_argmax(ctx: ffi.CData, a: ffi.CData) -> ffi.CData:
-    """
-    argmax along rows
-
-        GGML_API struct ggml_tensor * ggml_argmax(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a);
-    """
-    ...
-  def ggml_blck_size(type: int) -> int:
-    """    GGML_API int     ggml_blck_size (enum ggml_type type);"""
-    ...
-  def ggml_build_backward(ctx: ffi.CData, gf: ffi.CData, keep: bool) -> ffi.CData:
-    """    GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);"""
-    ...
-  def ggml_build_forward(tensor: ffi.CData) -> ffi.CData:
-    """    GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);"""
-    ...
-  def ggml_build_forward_ctx(ctx: ffi.CData, tensor: ffi.CData) -> ffi.CData:
-    """    GGML_API struct ggml_cgraph * ggml_build_forward_ctx(struct ggml_context * ctx, struct ggml_tensor * tensor);"""
-    ...
-  def ggml_build_forward_expand(cgraph: ffi.CData, tensor: ffi.CData) -> None:
-    """    GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);"""
-    ...
-  def ggml_cl_can_mul_mat(src0: ffi.CData, src1: ffi.CData, dst: ffi.CData) -> bool:
-    """bool   ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);"""
-    ...
-  def ggml_cl_free_data(tensor: ffi.CData) -> None:
-    """void ggml_cl_free_data(const struct ggml_tensor* tensor);"""
-    ...
-  def ggml_cl_host_free(ptr: ffi.CData) -> None:
-    """void   ggml_cl_host_free(void * ptr);"""
-    ...
-  def ggml_cl_host_malloc(size: int) -> ffi.CData:
-    """void * ggml_cl_host_malloc(size_t size);"""
-    ...
-  def ggml_cl_init() -> None:
-    """void ggml_cl_init(void);"""
-    ...
-  def ggml_cl_mul(src0: ffi.CData, src1: ffi.CData, dst: ffi.CData) -> None:
-    """void   ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);"""
-    ...
-  def ggml_cl_mul_mat(src0: ffi.CData, src1: ffi.CData, dst: ffi.CData, wdata: ffi.CData, wsize: int) -> None:
-    """void   ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);"""
-    ...
-  def ggml_cl_mul_mat_get_wsize(src0: ffi.CData, src1: ffi.CData, dst: ffi.CData) -> int:
-    """size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);"""
-    ...
-  def ggml_cl_transform_tensor(data: ffi.CData, tensor: ffi.CData) -> None:
-    """void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor);"""
-    ...
-  def ggml_clamp(ctx: ffi.CData, a: ffi.CData, min: float, max: float) -> ffi.CData:
-    """
-    clamp
-    in-place, returns view(a)
-
-        struct ggml_tensor * ggml_clamp(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a,
-                float                 min,
-                float                 max);
-    """
-    ...
-  def ggml_cont(ctx: ffi.CData, a: ffi.CData) -> ffi.CData:
-    """
-    make contiguous
-
-        GGML_API struct ggml_tensor * ggml_cont(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a);
-    """
-    ...
-  def ggml_conv_1d(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, s0: int, p0: int, d0: int) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_conv_1d(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a,
-                struct ggml_tensor  * b,
-                int                   s0,  // stride
-                int                   p0,  // padding
-                int                   d0); // dilation
-    """
-    ...
-  def ggml_conv_1d_ph(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, s: int, d: int) -> ffi.CData:
-    """
-    conv_1d with padding = half
-    alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
-
-        GGML_API struct ggml_tensor * ggml_conv_1d_ph(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a,
-                struct ggml_tensor  * b,
-                int                   s,
-                int                   d);
-    """
-    ...
-  def ggml_conv_2d(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, s0: int, s1: int, p0: int, p1: int, d0: int, d1: int) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_conv_2d(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a,
-                struct ggml_tensor  * b,
-                int                   s0,
-                int                   s1,
-                int                   p0,
-                int                   p1,
-                int                   d0,
-                int                   d1);
-    """
-    ...
-  def ggml_cpu_has_arm_fma() -> int:
-    """    GGML_API int ggml_cpu_has_arm_fma    (void);"""
-    ...
-  def ggml_cpu_has_avx() -> int:
-    """    GGML_API int ggml_cpu_has_avx        (void);"""
-    ...
-  def ggml_cpu_has_avx2() -> int:
-    """    GGML_API int ggml_cpu_has_avx2       (void);"""
-    ...
-  def ggml_cpu_has_avx512() -> int:
-    """    GGML_API int ggml_cpu_has_avx512     (void);"""
-    ...
-  def ggml_cpu_has_avx512_vbmi() -> int:
-    """    GGML_API int ggml_cpu_has_avx512_vbmi(void);"""
-    ...
-  def ggml_cpu_has_avx512_vnni() -> int:
-    """    GGML_API int ggml_cpu_has_avx512_vnni(void);"""
-    ...
-  def ggml_cpu_has_blas() -> int:
-    """    GGML_API int ggml_cpu_has_blas       (void);"""
-    ...
-  def ggml_cpu_has_clblast() -> int:
-    """    GGML_API int ggml_cpu_has_clblast    (void);"""
-    ...
-  def ggml_cpu_has_cublas() -> int:
-    """    GGML_API int ggml_cpu_has_cublas     (void);"""
-    ...
-  def ggml_cpu_has_f16c() -> int:
-    """    GGML_API int ggml_cpu_has_f16c       (void);"""
-    ...
-  def ggml_cpu_has_fma() -> int:
-    """    GGML_API int ggml_cpu_has_fma        (void);"""
-    ...
-  def ggml_cpu_has_fp16_va() -> int:
-    """    GGML_API int ggml_cpu_has_fp16_va    (void);"""
-    ...
-  def ggml_cpu_has_gpublas() -> int:
-    """    GGML_API int ggml_cpu_has_gpublas    (void);"""
-    ...
-  def ggml_cpu_has_neon() -> int:
-    """    GGML_API int ggml_cpu_has_neon       (void);"""
-    ...
-  def ggml_cpu_has_sse3() -> int:
-    """    GGML_API int ggml_cpu_has_sse3       (void);"""
-    ...
-  def ggml_cpu_has_vsx() -> int:
-    """    GGML_API int ggml_cpu_has_vsx        (void);"""
-    ...
-  def ggml_cpu_has_wasm_simd() -> int:
-    """    GGML_API int ggml_cpu_has_wasm_simd  (void);"""
-    ...
-  def ggml_cpy(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData:
-    """
-    a -> b, return view(b)
-
-        GGML_API struct ggml_tensor * ggml_cpy(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a,
-                struct ggml_tensor  * b);
-    """
-    ...
-  def ggml_cross_entropy_loss(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_cross_entropy_loss(
-                struct ggml_context         * ctx,
-                struct ggml_tensor          * a,
-                struct ggml_tensor          * b);
-    """
-    ...
-  def ggml_cross_entropy_loss_back(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, c: ffi.CData) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_cross_entropy_loss_back(
-                struct ggml_context         * ctx,
-                struct ggml_tensor          * a,
-                struct ggml_tensor          * b,
-                struct ggml_tensor          * c);
-    """
-    ...
-  def ggml_cuda_assign_buffers(tensor: ffi.CData) -> None:
-    """GGML_API void   ggml_cuda_assign_buffers(struct ggml_tensor * tensor);"""
-    ...
-  def ggml_cuda_assign_buffers_force_inplace(tensor: ffi.CData) -> None:
-    """GGML_API void   ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);"""
-    ...
-  def ggml_cuda_assign_buffers_no_scratch(tensor: ffi.CData) -> None:
-    """GGML_API void   ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);"""
-    ...
-  def ggml_cuda_can_mul_mat(src0: ffi.CData, src1: ffi.CData, dst: ffi.CData) -> bool:
-    """GGML_API bool   ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);"""
-    ...
-  def ggml_cuda_compute_forward(params: ffi.CData, tensor: ffi.CData) -> bool:
-    """GGML_API bool   ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);"""
-    ...
-  def ggml_cuda_free_data(tensor: ffi.CData) -> None:
-    """GGML_API void   ggml_cuda_free_data(struct ggml_tensor * tensor);"""
-    ...
-  def ggml_cuda_free_scratch() -> None:
-    """GGML_API void   ggml_cuda_free_scratch(void);"""
-    ...
-  def ggml_cuda_get_device_count() -> int:
-    """GGML_API int    ggml_cuda_get_device_count(void);"""
-    ...
-  def ggml_cuda_get_device_description(device: int, description: ffi.CData, description_size: int) -> None:
-    """GGML_API void   ggml_cuda_get_device_description(int device, char * description, size_t description_size);"""
-    ...
-  def ggml_cuda_host_free(ptr: ffi.CData) -> None:
-    """GGML_API void   ggml_cuda_host_free(void * ptr);"""
-    ...
-  def ggml_cuda_host_malloc(size: int) -> ffi.CData:
-    """GGML_API void * ggml_cuda_host_malloc(size_t size);"""
-    ...
-  def ggml_cuda_set_main_device(main_device: int) -> None:
-    """GGML_API void   ggml_cuda_set_main_device(int main_device);"""
-    ...
-  def ggml_cuda_set_mul_mat_q(mul_mat_q: bool) -> None:
-    """GGML_API void   ggml_cuda_set_mul_mat_q(bool mul_mat_q);"""
-    ...
-  def ggml_cuda_set_scratch_size(scratch_size: int) -> None:
-    """GGML_API void   ggml_cuda_set_scratch_size(size_t scratch_size);"""
-    ...
-  def ggml_cuda_set_tensor_split(tensor_split: ffi.CData) -> None:
-    """GGML_API void   ggml_cuda_set_tensor_split(const float * tensor_split);"""
-    ...
-  def ggml_cuda_transform_tensor(data: ffi.CData, tensor: ffi.CData) -> None:
-    """GGML_API void   ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor);"""
-    ...
-  def ggml_cycles() -> int:
-    """    GGML_API int64_t ggml_cycles(void);"""
-    ...
-  def ggml_cycles_per_ms() -> int:
-    """    GGML_API int64_t ggml_cycles_per_ms(void);"""
-    ...
-  def ggml_diag(ctx: ffi.CData, a: ffi.CData) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_diag(
-            struct ggml_context     * ctx,
-            struct ggml_tensor      * a);
-    """
-    ...
-  def ggml_diag_mask_inf(ctx: ffi.CData, a: ffi.CData, n_past: int) -> ffi.CData:
-    """
-    set elements above the diagonal to -INF
-
-        GGML_API struct ggml_tensor * ggml_diag_mask_inf(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a,
-                int                   n_past);
-    """
-    ...
-  def ggml_diag_mask_inf_inplace(ctx: ffi.CData, a: ffi.CData, n_past: int) -> ffi.CData:
-    """
-    in-place, returns view(a)
-
-        GGML_API struct ggml_tensor * ggml_diag_mask_inf_inplace(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a,
-                int                   n_past);
-    """
-    ...
-  def ggml_diag_mask_zero(ctx: ffi.CData, a: ffi.CData, n_past: int) -> ffi.CData:
-    """
-    set elements above the diagonal to 0
-
-        GGML_API struct ggml_tensor * ggml_diag_mask_zero(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a,
-                int                   n_past);
-    """
-    ...
-  def ggml_diag_mask_zero_inplace(ctx: ffi.CData, a: ffi.CData, n_past: int) -> ffi.CData:
-    """
-    in-place, returns view(a)
-
-        GGML_API struct ggml_tensor * ggml_diag_mask_zero_inplace(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a,
-                int                   n_past);
-    """
-    ...
-  def ggml_div(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_div(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a,
-                struct ggml_tensor  * b);
-    """
-    ...
-  def ggml_div_inplace(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_div_inplace(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a,
-                struct ggml_tensor  * b);
-    """
-    ...
-  def ggml_dup(ctx: ffi.CData, a: ffi.CData) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_dup(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a);
-    """
-    ...
-  def ggml_dup_inplace(ctx: ffi.CData, a: ffi.CData) -> ffi.CData:
-    """
-    in-place, returns view(a)
-
-        GGML_API struct ggml_tensor * ggml_dup_inplace(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a);
-    """
-    ...
-  def ggml_dup_tensor(ctx: ffi.CData, src: ffi.CData) -> ffi.CData:
-    """    GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);"""
-    ...
-  def ggml_element_size(tensor: ffi.CData) -> int:
-    """    GGML_API size_t  ggml_element_size(const struct ggml_tensor * tensor);"""
-    ...
-  def ggml_elu(ctx: ffi.CData, a: ffi.CData) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_elu(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a);
-    """
-    ...
-  def ggml_elu_inplace(ctx: ffi.CData, a: ffi.CData) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_elu_inplace(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a);
-    """
-    ...
-  def ggml_flash_attn(ctx: ffi.CData, q: ffi.CData, k: ffi.CData, v: ffi.CData, masked: bool) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_flash_attn(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * q,
-                struct ggml_tensor  * k,
-                struct ggml_tensor  * v,
-                bool                  masked);
-    """
-    ...
-  def ggml_flash_attn_back(ctx: ffi.CData, q: ffi.CData, k: ffi.CData, v: ffi.CData, d: ffi.CData, masked: bool) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_flash_attn_back(
-               struct ggml_context * ctx,
-               struct ggml_tensor  * q,
-               struct ggml_tensor  * k,
-               struct ggml_tensor  * v,
-               struct ggml_tensor  * d,
-               bool                  masked);
-    """
-    ...
-  def ggml_flash_ff(ctx: ffi.CData, a: ffi.CData, b0: ffi.CData, b1: ffi.CData, c0: ffi.CData, c1: ffi.CData) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_flash_ff(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a,
-                struct ggml_tensor  * b0,
-                struct ggml_tensor  * b1,
-                struct ggml_tensor  * c0,
-                struct ggml_tensor  * c1);
-    """
-    ...
-  def ggml_format_name(tensor: ffi.CData, fmt: ffi.CData, *args2) -> ffi.CData:
-    """    GGML_API struct ggml_tensor * ggml_format_name(      struct ggml_tensor * tensor, const char * fmt, ...);"""
-    ...
-  def ggml_fp16_to_fp32(x: np.float16) -> float:
-    """
-    convert FP16 <-> FP32
-
-        GGML_API float       ggml_fp16_to_fp32(ggml_fp16_t x);
-    """
-    ...
-  def ggml_fp16_to_fp32_row(x: ffi.CData, y: ffi.CData, n: int) -> None:
-    """    GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n);"""
-    ...
-  def ggml_fp32_to_fp16(x: float) -> np.float16:
-    """    GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);"""
-    ...
-  def ggml_fp32_to_fp16_row(x: ffi.CData, y: ffi.CData, n: int) -> None:
-    """    GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n);"""
-    ...
-  def ggml_free(ctx: ffi.CData) -> None:
-    """    GGML_API void                  ggml_free(struct ggml_context * ctx);"""
-    ...
-  def ggml_ftype_to_ggml_type(ftype: int) -> int:
-    """
-    TODO: temporary until model loading of ggml examples is refactored
-
-        GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
-    """
-    ...
-  def ggml_gelu(ctx: ffi.CData, a: ffi.CData) -> ffi.CData:
-    """
-    TODO: double-check this computation is correct
-
-        GGML_API struct ggml_tensor * ggml_gelu(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a);
-    """
-    ...
-  def ggml_gelu_inplace(ctx: ffi.CData, a: ffi.CData) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_gelu_inplace(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a);
-    """
-    ...
-  def ggml_gelu_quick(ctx: ffi.CData, a: ffi.CData) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_gelu_quick(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a);
-    """
-    ...
-  def ggml_gelu_quick_inplace(ctx: ffi.CData, a: ffi.CData) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_gelu_quick_inplace(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a);
-    """
-    ...
-  def ggml_get_data(tensor: ffi.CData) -> ffi.CData:
-    """    GGML_API void *  ggml_get_data    (const struct ggml_tensor * tensor);"""
-    ...
-  def ggml_get_data_f32(tensor: ffi.CData) -> ffi.CData:
-    """    GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);"""
-    ...
-  def ggml_get_f32_1d(tensor: ffi.CData, i: int) -> float:
-    """    GGML_API float   ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);"""
-    ...
-  def ggml_get_i32_1d(tensor: ffi.CData, i: int) -> int:
-    """    GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);"""
-    ...
-  def ggml_get_max_tensor_size(ctx: ffi.CData) -> int:
-    """    GGML_API size_t  ggml_get_max_tensor_size(const struct ggml_context * ctx);"""
-    ...
-  def ggml_get_mem_buffer(ctx: ffi.CData) -> ffi.CData:
-    """    GGML_API void *  ggml_get_mem_buffer     (const struct ggml_context * ctx);"""
-    ...
-  def ggml_get_mem_size(ctx: ffi.CData) -> int:
-    """    GGML_API size_t  ggml_get_mem_size       (const struct ggml_context * ctx);"""
-    ...
-  def ggml_get_name(tensor: ffi.CData) -> ffi.CData:
-    """    GGML_API const char *         ggml_get_name   (const struct ggml_tensor * tensor);"""
-    ...
-  def ggml_get_no_alloc(ctx: ffi.CData) -> bool:
-    """    GGML_API bool    ggml_get_no_alloc(struct ggml_context * ctx);"""
-    ...
-  def ggml_get_rows(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_get_rows(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a,
-                struct ggml_tensor  * b);
-    """
-    ...
-  def ggml_get_rows_back(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, c: ffi.CData) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_get_rows_back(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a,
-                struct ggml_tensor  * b,
-                struct ggml_tensor  * c);
-    """
-    ...
-  def ggml_get_tensor(ctx: ffi.CData, name: ffi.CData) -> ffi.CData:
-    """    GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);"""
-    ...
-  def ggml_get_unary_op(tensor: ffi.CData) -> int:
-    """    GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);"""
-    ...
-  def ggml_graph_compute(cgraph: ffi.CData, cplan: ffi.CData) -> int:
-    """    GGML_API               int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);"""
-    ...
-  def ggml_graph_compute_with_ctx(ctx: ffi.CData, cgraph: ffi.CData, n_threads: int) -> None:
-    """
-    same as ggml_graph_compute() but the work data is allocated as a part of the context
-    note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
-
-        GGML_API void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
-    """
-    ...
-  def ggml_graph_dump_dot(gb: ffi.CData, gf: ffi.CData, filename: ffi.CData) -> None:
-    """
-    dump the graph into a file using the dot format
-
-        GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
-    """
-    ...
-  def ggml_graph_export(cgraph: ffi.CData, fname: ffi.CData) -> None:
-    """    GGML_API void               ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);"""
-    ...
-  def ggml_graph_get_tensor(cgraph: ffi.CData, name: ffi.CData) -> ffi.CData:
-    """    GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);"""
-    ...
-  def ggml_graph_import(fname: ffi.CData, ctx_data: ffi.CData, ctx_eval: ffi.CData) -> ffi.CData:
-    """    GGML_API struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);"""
-    ...
-  def ggml_graph_overhead() -> int:
-    """    GGML_API size_t ggml_graph_overhead(void);"""
-    ...
-  def ggml_graph_plan(cgraph: ffi.CData, n_threads: int) -> ffi.CData:
-    """
-    ggml_graph_plan() has to be called before ggml_graph_compute()
-    when plan.work_size > 0, caller must allocate memory for plan.work_data
-
-        GGML_API struct ggml_cplan ggml_graph_plan   (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
-    """
-    ...
-  def ggml_graph_print(cgraph: ffi.CData) -> None:
-    """
-    print info and performance information for the graph
-
-        GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
-    """
-    ...
-  def ggml_graph_reset(cgraph: ffi.CData) -> None:
-    """    GGML_API              void ggml_graph_reset  (struct ggml_cgraph * cgraph);"""
-    ...
-  def ggml_init(params: ffi.CData) -> ffi.CData:
-    """    GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);"""
-    ...
-  def ggml_init_cublas() -> None:
-    """GGML_API void   ggml_init_cublas(void);"""
-    ...
-  def ggml_internal_get_type_traits(type: int) -> ffi.CData:
-    """    ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);"""
-    ...
-  def ggml_is_contiguous(tensor: ffi.CData) -> bool:
-    """    GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);"""
-    ...
-  def ggml_is_numa() -> bool:
-    """    GGML_API bool    ggml_is_numa(void); // true if init detected that system has >1 NUMA node"""
-    ...
-  def ggml_is_permuted(tensor: ffi.CData) -> bool:
-    """    GGML_API bool ggml_is_permuted  (const struct ggml_tensor * tensor);"""
-    ...
-  def ggml_is_quantized(type: int) -> bool:
-    """    GGML_API bool    ggml_is_quantized(enum ggml_type type);"""
-    ...
-  def ggml_is_transposed(tensor: ffi.CData) -> bool:
-    """    GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);"""
-    ...
-  def ggml_log(ctx: ffi.CData, a: ffi.CData) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_log(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a);
-    """
-    ...
-  def ggml_log_inplace(ctx: ffi.CData, a: ffi.CData) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_log_inplace(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a);
-    """
-    ...
-  def ggml_map_binary_f32(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, fun: ffi.CData) -> ffi.CData:
-    """
-        GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_f32(
-                struct ggml_context         * ctx,
-                struct ggml_tensor          * a,
-                struct ggml_tensor          * b,
-                       ggml_binary_op_f32_t   fun),
-            "use ggml_map_custom2 instead");
-    """
-    ...
-  def ggml_map_binary_inplace_f32(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, fun: ffi.CData) -> ffi.CData:
-    """
-        GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_inplace_f32(
-                struct ggml_context         * ctx,
-                struct ggml_tensor          * a,
-                struct ggml_tensor          * b,
-                       ggml_binary_op_f32_t   fun),
-            "use ggml_map_custom2_inplace instead");
-    """
-    ...
-  def ggml_map_custom1(ctx: ffi.CData, a: ffi.CData, fun: ffi.CData, n_tasks: int, userdata: ffi.CData) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_map_custom1(
-                struct ggml_context   * ctx,
-                struct ggml_tensor    * a,
-                ggml_custom1_op_t       fun,
-                int                     n_tasks,
-                void                  * userdata);
-    """
-    ...
-  def ggml_map_custom1_f32(ctx: ffi.CData, a: ffi.CData, fun: ffi.CData) -> ffi.CData:
-    """
-        GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_f32(
-                struct ggml_context          * ctx,
-                struct ggml_tensor           * a,
-                       ggml_custom1_op_f32_t   fun),
-            "use ggml_map_custom1 instead");
-    """
-    ...
-  def ggml_map_custom1_inplace(ctx: ffi.CData, a: ffi.CData, fun: ffi.CData, n_tasks: int, userdata: ffi.CData) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_map_custom1_inplace(
-                struct ggml_context   * ctx,
-                struct ggml_tensor    * a,
-                ggml_custom1_op_t       fun,
-                int                     n_tasks,
-                void                  * userdata);
-    """
-    ...
-  def ggml_map_custom1_inplace_f32(ctx: ffi.CData, a: ffi.CData, fun: ffi.CData) -> ffi.CData:
-    """
-        GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_inplace_f32(
-                struct ggml_context          * ctx,
-                struct ggml_tensor           * a,
-                       ggml_custom1_op_f32_t   fun),
-            "use ggml_map_custom1_inplace instead");
-    """
-    ...
-  def ggml_map_custom2(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, fun: ffi.CData, n_tasks: int, userdata: ffi.CData) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_map_custom2(
-                struct ggml_context   * ctx,
-                struct ggml_tensor    * a,
-                struct ggml_tensor    * b,
-                ggml_custom2_op_t       fun,
-                int                     n_tasks,
-                void                  * userdata);
-    """
-    ...
-  def ggml_map_custom2_f32(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, fun: ffi.CData) -> ffi.CData:
-    """
-        GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_f32(
-                struct ggml_context          * ctx,
-                struct ggml_tensor           * a,
-                struct ggml_tensor           * b,
-                       ggml_custom2_op_f32_t   fun),
-            "use ggml_map_custom2 instead");
-    """
-    ...
-  def ggml_map_custom2_inplace(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, fun: ffi.CData, n_tasks: int, userdata: ffi.CData) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_map_custom2_inplace(
-                struct ggml_context   * ctx,
-                struct ggml_tensor    * a,
-                struct ggml_tensor    * b,
-                ggml_custom2_op_t       fun,
-                int                     n_tasks,
-                void                  * userdata);
-    """
-    ...
-  def ggml_map_custom2_inplace_f32(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, fun: ffi.CData) -> ffi.CData:
-    """
-        GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_inplace_f32(
-                struct ggml_context          * ctx,
-                struct ggml_tensor           * a,
-                struct ggml_tensor           * b,
-                       ggml_custom2_op_f32_t   fun),
-            "use ggml_map_custom2_inplace instead");
-    """
-    ...
-  def ggml_map_custom3(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, c: ffi.CData, fun: ffi.CData, n_tasks: int, userdata: ffi.CData) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_map_custom3(
-                struct ggml_context   * ctx,
-                struct ggml_tensor    * a,
-                struct ggml_tensor    * b,
-                struct ggml_tensor    * c,
-                ggml_custom3_op_t       fun,
-                int                     n_tasks,
-                void                  * userdata);
-    """
-    ...
-  def ggml_map_custom3_f32(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, c: ffi.CData, fun: ffi.CData) -> ffi.CData:
-    """
-        GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_f32(
-                struct ggml_context          * ctx,
-                struct ggml_tensor           * a,
-                struct ggml_tensor           * b,
-                struct ggml_tensor           * c,
-                       ggml_custom3_op_f32_t   fun),
-            "use ggml_map_custom3 instead");
-    """
-    ...
-  def ggml_map_custom3_inplace(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, c: ffi.CData, fun: ffi.CData, n_tasks: int, userdata: ffi.CData) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_map_custom3_inplace(
-                struct ggml_context   * ctx,
-                struct ggml_tensor    * a,
-                struct ggml_tensor    * b,
-                struct ggml_tensor    * c,
-                ggml_custom3_op_t       fun,
-                int                     n_tasks,
-                void                  * userdata);
-    """
-    ...
-  def ggml_map_custom3_inplace_f32(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, c: ffi.CData, fun: ffi.CData) -> ffi.CData:
-    """
-        GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_inplace_f32(
-                struct ggml_context          * ctx,
-                struct ggml_tensor           * a,
-                struct ggml_tensor           * b,
-                struct ggml_tensor           * c,
-                       ggml_custom3_op_f32_t   fun),
-            "use ggml_map_custom3_inplace instead");
-    """
-    ...
-  def ggml_map_unary_f32(ctx: ffi.CData, a: ffi.CData, fun: ffi.CData) -> ffi.CData:
-    """
-        GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_f32(
-                struct ggml_context        * ctx,
-                struct ggml_tensor         * a,
-                       ggml_unary_op_f32_t   fun),
-            "use ggml_map_custom1 instead");
-    """
-    ...
-  def ggml_map_unary_inplace_f32(ctx: ffi.CData, a: ffi.CData, fun: ffi.CData) -> ffi.CData:
-    """
-        GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_inplace_f32(
-                struct ggml_context        * ctx,
-                struct ggml_tensor         * a,
-                       ggml_unary_op_f32_t   fun),
-            "use ggml_map_custom1_inplace instead");
-    """
-    ...
-  def ggml_mean(ctx: ffi.CData, a: ffi.CData) -> ffi.CData:
-    """
-    mean along rows
-
-        GGML_API struct ggml_tensor * ggml_mean(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a);
-    """
-    ...
-  def ggml_metal_add_buffer(ctx: ffi.CData, name: ffi.CData, data: ffi.CData, size: int, max_size: int) -> bool:
-    """
-    creates a mapping between a host memory buffer and a device memory buffer
-    - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
-    - the mapping is used during computation to determine the arguments of the compute kernels
-    - you don't need to keep the host memory buffer allocated as it is never accessed by Metal
-    - max_size specifies the maximum size of a tensor and is used to create shared views such
-    that it is guaranteed that the tensor will fit in at least one of the views
-
-
-    bool ggml_metal_add_buffer(
-            struct ggml_metal_context * ctx,
-                           const char * name,
-                                 void * data,
-                               size_t   size,
-                               size_t   max_size);
-    """
-    ...
-  def ggml_metal_free(ctx: ffi.CData) -> None:
-    """void ggml_metal_free(struct ggml_metal_context * ctx);"""
-    ...
-  def ggml_metal_get_concur_list(ctx: ffi.CData) -> ffi.CData:
-    """
-    output the concur_list for ggml_alloc
-
-    int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx);
-    """
-    ...
-  def ggml_metal_get_tensor(ctx: ffi.CData, t: ffi.CData) -> None:
-    """
-    get data from the device into host memory
-
-    void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
-    """
-    ...
-  def ggml_metal_graph_compute(ctx: ffi.CData, gf: ffi.CData) -> None:
-    """
-    same as ggml_graph_compute but uses Metal
-    creates gf->n_threads command buffers in parallel
-
-    void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
-    """
-    ...
-  def ggml_metal_graph_find_concurrency(ctx: ffi.CData, gf: ffi.CData, check_mem: bool) -> None:
-    """
-    try to find operations that can be run concurrently in the graph
-    you should run it again if the topology of your graph changes
-
-    void ggml_metal_graph_find_concurrency(struct ggml_metal_context * ctx, struct ggml_cgraph * gf, bool check_mem);
-    """
-    ...
-  def ggml_metal_host_free(data: ffi.CData) -> None:
-    """void   ggml_metal_host_free  (void * data);"""
-    ...
-  def ggml_metal_host_malloc(n: int) -> ffi.CData:
-    """void * ggml_metal_host_malloc(size_t n);"""
-    ...
-  def ggml_metal_if_optimized(ctx: ffi.CData) -> int:
-    """
-    if the graph has been optimized for concurrently dispatch, return length of the concur_list if optimized
-
-    int ggml_metal_if_optimized(struct ggml_metal_context * ctx);
-    """
-    ...
-  def ggml_metal_init(n_cb: int) -> ffi.CData:
-    """
-    number of command buffers to use
-
-    struct ggml_metal_context * ggml_metal_init(int n_cb);
-    """
-    ...
-  def ggml_metal_set_n_cb(ctx: ffi.CData, n_cb: int) -> None:
-    """
-    set the number of command buffers to use
-
-    void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb);
-    """
-    ...
-  def ggml_metal_set_tensor(ctx: ffi.CData, t: ffi.CData) -> None:
-    """
-    set data from host memory into the device
-
-    void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
-    """
-    ...
-  def ggml_mpi_backend_free() -> None:
-    """void ggml_mpi_backend_free(void);"""
-    ...
-  def ggml_mpi_backend_init() -> None:
-    """void ggml_mpi_backend_init(void);"""
-    ...
-  def ggml_mpi_eval_init(ctx_mpi: ffi.CData, n_tokens: ffi.CData, n_past: ffi.CData, n_threads: ffi.CData) -> None:
-    """
-    void ggml_mpi_eval_init(
-            struct ggml_mpi_context * ctx_mpi,
-                                int * n_tokens,
-                                int * n_past,
-                                int * n_threads);
-    """
-    ...
-  def ggml_mpi_free(ctx: ffi.CData) -> None:
-    """void ggml_mpi_free(struct ggml_mpi_context * ctx);"""
-    ...
-  def ggml_mpi_graph_compute_post(ctx_mpi: ffi.CData, gf: ffi.CData, n_layers: int) -> None:
-    """
-    void ggml_mpi_graph_compute_post(
-            struct ggml_mpi_context * ctx_mpi,
-                 struct ggml_cgraph * gf,
-                                int   n_layers);
-    """
-    ...
-  def ggml_mpi_graph_compute_pre(ctx_mpi: ffi.CData, gf: ffi.CData, n_layers: int) -> None:
-    """
-    void ggml_mpi_graph_compute_pre(
-            struct ggml_mpi_context * ctx_mpi,
-                 struct ggml_cgraph * gf,
-                                int   n_layers);
-    """
-    ...
-  def ggml_mpi_init() -> ffi.CData:
-    """struct ggml_mpi_context * ggml_mpi_init(void);"""
-    ...
-  def ggml_mpi_rank(ctx: ffi.CData) -> int:
-    """int ggml_mpi_rank(struct ggml_mpi_context * ctx);"""
-    ...
-  def ggml_mul(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_mul(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a,
-                struct ggml_tensor  * b);
-    """
-    ...
-  def ggml_mul_inplace(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_mul_inplace(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a,
-                struct ggml_tensor  * b);
-    """
-    ...
-  def ggml_mul_mat(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData:
-    """
-    A: n columns, m rows
-    B: n columns, p rows  (i.e. we transpose it internally)
-    result is m columns, p rows
-
-        GGML_API struct ggml_tensor * ggml_mul_mat(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a,
-                struct ggml_tensor  * b);
-    """
-    ...
-  def ggml_nbytes(tensor: ffi.CData) -> int:
-    """    GGML_API size_t  ggml_nbytes      (const struct ggml_tensor * tensor);"""
-    ...
-  def ggml_nbytes_pad(tensor: ffi.CData) -> int:
-    """    GGML_API size_t  ggml_nbytes_pad  (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN"""
-    ...
-  def ggml_nbytes_split(tensor: ffi.CData, nrows_split: int) -> int:
-    """    GGML_API size_t  ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);"""
-    ...
-  def ggml_neg(ctx: ffi.CData, a: ffi.CData) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_neg(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a);
-    """
-    ...
-  def ggml_neg_inplace(ctx: ffi.CData, a: ffi.CData) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_neg_inplace(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a);
-    """
-    ...
-  def ggml_nelements(tensor: ffi.CData) -> int:
-    """    GGML_API int64_t ggml_nelements   (const struct ggml_tensor * tensor);"""
-    ...
-  def ggml_new_f32(ctx: ffi.CData, value: float) -> ffi.CData:
-    """    GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);"""
-    ...
-  def ggml_new_graph(ctx: ffi.CData) -> ffi.CData:
-    """
-    graph allocation in a context
-
-        GGML_API struct ggml_cgraph * ggml_new_graph        (struct ggml_context * ctx);
-    """
-    ...
-  def ggml_new_i32(ctx: ffi.CData, value: int) -> ffi.CData:
-    """    GGML_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);"""
-    ...
-  def ggml_new_tensor(ctx: ffi.CData, type: int, n_dims: int, ne: ffi.CData) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_new_tensor(
-                struct ggml_context * ctx,
-                enum   ggml_type type,
-                int    n_dims,
-                const int64_t *ne);
-    """
-    ...
-  def ggml_new_tensor_1d(ctx: ffi.CData, type: int, ne0: int) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_new_tensor_1d(
-                struct ggml_context * ctx,
-                enum   ggml_type type,
-                int64_t ne0);
-    """
-    ...
-  def ggml_new_tensor_2d(ctx: ffi.CData, type: int, ne0: int, ne1: int) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_new_tensor_2d(
-                struct ggml_context * ctx,
-                enum   ggml_type type,
-                int64_t ne0,
-                int64_t ne1);
-    """
-    ...
-  def ggml_new_tensor_3d(ctx: ffi.CData, type: int, ne0: int, ne1: int, ne2: int) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_new_tensor_3d(
-                struct ggml_context * ctx,
-                enum   ggml_type type,
-                int64_t ne0,
-                int64_t ne1,
-                int64_t ne2);
-    """
-    ...
-  def ggml_new_tensor_4d(ctx: ffi.CData, type: int, ne0: int, ne1: int, ne2: int, ne3: int) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_new_tensor_4d(
-                struct ggml_context * ctx,
-                enum   ggml_type type,
-                int64_t ne0,
-                int64_t ne1,
-                int64_t ne2,
-                int64_t ne3);
-    """
-    ...
-  def ggml_norm(ctx: ffi.CData, a: ffi.CData) -> ffi.CData:
-    """
-    normalize along rows
-    TODO: eps is hardcoded to 1e-5 for now
-
-        GGML_API struct ggml_tensor * ggml_norm(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a);
-    """
-    ...
-  def ggml_norm_inplace(ctx: ffi.CData, a: ffi.CData) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_norm_inplace(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a);
-    """
-    ...
-  def ggml_nrows(tensor: ffi.CData) -> int:
-    """    GGML_API int64_t ggml_nrows       (const struct ggml_tensor * tensor);"""
-    ...
-  def ggml_numa_init() -> None:
-    """    GGML_API void    ggml_numa_init(void); // call once for better performance on NUMA systems"""
-    ...
-  def ggml_op_name(op: int) -> ffi.CData:
-    """    GGML_API const char * ggml_op_name  (enum ggml_op   op);"""
-    ...
-  def ggml_op_symbol(op: int) -> ffi.CData:
-    """    GGML_API const char * ggml_op_symbol(enum ggml_op   op);"""
-    ...
-  def ggml_opt(ctx: ffi.CData, params: ffi.CData, f: ffi.CData) -> int:
-    """
-    optimize the function defined by the tensor f
-
-        GGML_API enum ggml_opt_result ggml_opt(
-                struct ggml_context * ctx,
-                struct ggml_opt_params params,
-                struct ggml_tensor * f);
-    """
-    ...
-  def ggml_opt_default_params(type: int) -> ffi.CData:
-    """    GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);"""
-    ...
-  def ggml_opt_init(ctx: ffi.CData, opt: ffi.CData, params: ffi.CData, nx: int) -> None:
-    """
-    initialize optimizer context
-
-        GGML_API void ggml_opt_init(
-                struct ggml_context * ctx,
-                struct ggml_opt_context * opt,
-                struct ggml_opt_params params,
-                int64_t nx);
-    """
-    ...
-  def ggml_opt_resume(ctx: ffi.CData, opt: ffi.CData, f: ffi.CData) -> int:
-    """
-    continue optimizing the function defined by the tensor f
-
-        GGML_API enum ggml_opt_result ggml_opt_resume(
-                struct ggml_context * ctx,
-                struct ggml_opt_context * opt,
-                struct ggml_tensor * f);
-    """
-    ...
-  def ggml_opt_resume_g(ctx: ffi.CData, opt: ffi.CData, f: ffi.CData, gf: ffi.CData, gb: ffi.CData) -> int:
-    """
-    continue optimizing the function defined by the tensor f
-
-        GGML_API enum ggml_opt_result ggml_opt_resume_g(
-                struct ggml_context * ctx,
-                struct ggml_opt_context * opt,
-                struct ggml_tensor * f,
-                struct ggml_cgraph * gf,
-                struct ggml_cgraph * gb);
-    """
-    ...
-  def ggml_out_prod(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData:
-    """
-    A: m columns, n rows,
-    B: p columns, n rows,
-    result is m columns, p rows
-
-        GGML_API struct ggml_tensor * ggml_out_prod(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a,
-                struct ggml_tensor  * b);
-    """
-    ...
-  def ggml_permute(ctx: ffi.CData, a: ffi.CData, axis0: int, axis1: int, axis2: int, axis3: int) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_permute(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a,
-                int                   axis0,
-                int                   axis1,
-                int                   axis2,
-                int                   axis3);
-    """
-    ...
-  def ggml_pool_1d(ctx: ffi.CData, a: ffi.CData, op: int, k0: int, s0: int, p0: int) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_pool_1d(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a,
-                enum ggml_op_pool     op,
-                int                   k0, // kernel size
-                int                   s0, // stride
-                int                   p0); // padding
-    """
-    ...
-  def ggml_pool_2d(ctx: ffi.CData, a: ffi.CData, op: int, k0: int, k1: int, s0: int, s1: int, p0: int, p1: int) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_pool_2d(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a,
-                enum ggml_op_pool     op,
-                int                   k0,
-                int                   k1,
-                int                   s0,
-                int                   s1,
-                int                   p0,
-                int                   p1);
-    """
-    ...
-  def ggml_print_object(obj: ffi.CData) -> None:
-    """    GGML_API void    ggml_print_object (const struct ggml_object * obj);"""
-    ...
-  def ggml_print_objects(ctx: ffi.CData) -> None:
-    """    GGML_API void    ggml_print_objects(const struct ggml_context * ctx);"""
-    ...
-  def ggml_quantize_chunk(type: int, src: ffi.CData, dst: ffi.CData, start: int, n: int, hist: ffi.CData) -> int:
-    """    GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);"""
-    ...
-  def ggml_quantize_q2_K(src: ffi.CData, dst: ffi.CData, n: int, k: int, hist: ffi.CData) -> int:
-    """
-    Quantization with histogram collection
-
-    size_t ggml_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist);
-    """
-    ...
-  def ggml_quantize_q3_K(src: ffi.CData, dst: ffi.CData, n: int, k: int, hist: ffi.CData) -> int:
-    """size_t ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist);"""
-    ...
-  def ggml_quantize_q4_0(src: ffi.CData, dst: ffi.CData, n: int, k: int, hist: ffi.CData) -> int:
-    """    GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);"""
-    ...
-  def ggml_quantize_q4_1(src: ffi.CData, dst: ffi.CData, n: int, k: int, hist: ffi.CData) -> int:
-    """    GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);"""
-    ...
-  def ggml_quantize_q4_K(src: ffi.CData, dst: ffi.CData, n: int, k: int, hist: ffi.CData) -> int:
-    """size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);"""
-    ...
-  def ggml_quantize_q5_0(src: ffi.CData, dst: ffi.CData, n: int, k: int, hist: ffi.CData) -> int:
-    """    GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);"""
-    ...
-  def ggml_quantize_q5_1(src: ffi.CData, dst: ffi.CData, n: int, k: int, hist: ffi.CData) -> int:
-    """    GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);"""
-    ...
-  def ggml_quantize_q5_K(src: ffi.CData, dst: ffi.CData, n: int, k: int, hist: ffi.CData) -> int:
-    """size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);"""
-    ...
-  def ggml_quantize_q6_K(src: ffi.CData, dst: ffi.CData, n: int, k: int, hist: ffi.CData) -> int:
-    """size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);"""
-    ...
-  def ggml_quantize_q8_0(src: ffi.CData, dst: ffi.CData, n: int, k: int, hist: ffi.CData) -> int:
-    """    GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);"""
-    ...
-  def ggml_relu(ctx: ffi.CData, a: ffi.CData) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_relu(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a);
-    """
-    ...
-  def ggml_relu_inplace(ctx: ffi.CData, a: ffi.CData) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_relu_inplace(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a);
-    """
-    ...
-  def ggml_repeat(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData:
-    """
-    if a is the same shape as b, and a is not parameter, return a
-    otherwise, return a new tensor: repeat(a) to fit in b
-
-        GGML_API struct ggml_tensor * ggml_repeat(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a,
-                struct ggml_tensor  * b);
-    """
-    ...
-  def ggml_repeat_back(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_repeat_back(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a,
-                struct ggml_tensor  * b);
-    """
-    ...
-  def ggml_reshape(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData:
-    """
-    return view(a), b specifies the new shape
-    TODO: when we start computing gradient, make a copy instead of view
-
-        GGML_API struct ggml_tensor * ggml_reshape(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a,
-                struct ggml_tensor  * b);
-    """
-    ...
-  def ggml_reshape_1d(ctx: ffi.CData, a: ffi.CData, ne0: int) -> ffi.CData:
-    """
-    return view(a)
-    TODO: when we start computing gradient, make a copy instead of view
-
-        GGML_API struct ggml_tensor * ggml_reshape_1d(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a,
-                int64_t               ne0);
-    """
-    ...
-  def ggml_reshape_2d(ctx: ffi.CData, a: ffi.CData, ne0: int, ne1: int) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_reshape_2d(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a,
-                int64_t               ne0,
-                int64_t               ne1);
-    """
-    ...
-  def ggml_reshape_3d(ctx: ffi.CData, a: ffi.CData, ne0: int, ne1: int, ne2: int) -> ffi.CData:
-    """
-    return view(a)
-    TODO: when we start computing gradient, make a copy instead of view
-
-        GGML_API struct ggml_tensor * ggml_reshape_3d(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a,
-                int64_t               ne0,
-                int64_t               ne1,
-                int64_t               ne2);
-    """
-    ...
-  def ggml_reshape_4d(ctx: ffi.CData, a: ffi.CData, ne0: int, ne1: int, ne2: int, ne3: int) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_reshape_4d(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a,
-                int64_t               ne0,
-                int64_t               ne1,
-                int64_t               ne2,
-                int64_t               ne3);
-    """
-    ...
-  def ggml_rms_norm(ctx: ffi.CData, a: ffi.CData, eps: float) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_rms_norm(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a,
-                float                 eps);
-    """
-    ...
-  def ggml_rms_norm_back(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData:
-    """
-    a - x
-    b - dy
-    TODO: update with configurable eps
-
-        GGML_API struct ggml_tensor * ggml_rms_norm_back(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a,
-                struct ggml_tensor  * b);
-    """
-    ...
-  def ggml_rms_norm_inplace(ctx: ffi.CData, a: ffi.CData, eps: float) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_rms_norm_inplace(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a,
-                float                 eps);
-    """
-    ...
-  def ggml_rope(ctx: ffi.CData, a: ffi.CData, n_past: int, n_dims: int, mode: int, n_ctx: int) -> ffi.CData:
-    """
-    rotary position embedding
-    if mode & 1 == 1, skip n_past elements
-    if mode & 2 == 1, GPT-NeoX style
-    if mode & 4 == 1, ChatGLM style
-    TODO: avoid creating a new tensor every time
-
-        GGML_API struct ggml_tensor * ggml_rope(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a,
-                int                   n_past,
-                int                   n_dims,
-                int                   mode,
-                int                   n_ctx);
-    """
-    ...
-  def ggml_rope_back(ctx: ffi.CData, a: ffi.CData, n_past: int, n_dims: int, mode: int, n_ctx: int) -> ffi.CData:
-    """
-    rotary position embedding backward, i.e compute dx from dy
-    a - dy
-
-        GGML_API struct ggml_tensor * ggml_rope_back(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a,
-                int                   n_past,
-                int                   n_dims,
-                int                   mode,
-                int                   n_ctx);
-    """
-    ...
-  def ggml_rope_custom(ctx: ffi.CData, a: ffi.CData, n_past: int, n_dims: int, mode: int, n_ctx: int, freq_base: float, freq_scale: float) -> ffi.CData:
-    """
-    custom RoPE
-
-        GGML_API struct ggml_tensor * ggml_rope_custom(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a,
-                int                   n_past,
-                int                   n_dims,
-                int                   mode,
-                int                   n_ctx,
-                float                 freq_base,
-                float                 freq_scale);
-    """
-    ...
-  def ggml_rope_custom_inplace(ctx: ffi.CData, a: ffi.CData, n_past: int, n_dims: int, mode: int, n_ctx: int, freq_base: float, freq_scale: float) -> ffi.CData:
-    """
-    in-place, returns view(a)
-
-        GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a,
-                int                   n_past,
-                int                   n_dims,
-                int                   mode,
-                int                   n_ctx,
-                float                 freq_base,
-                float                 freq_scale);
-    """
-    ...
-  def ggml_rope_inplace(ctx: ffi.CData, a: ffi.CData, n_past: int, n_dims: int, mode: int, n_ctx: int) -> ffi.CData:
-    """
-    in-place, returns view(a)
-
-        GGML_API struct ggml_tensor * ggml_rope_inplace(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a,
-                int                   n_past,
-                int                   n_dims,
-                int                   mode,
-                int                   n_ctx);
-    """
-    ...
-  def ggml_scale(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_scale(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a,
-                struct ggml_tensor  * b);
-    """
-    ...
-  def ggml_scale_inplace(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData:
-    """
-    in-place, returns view(a)
-
-        GGML_API struct ggml_tensor * ggml_scale_inplace(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a,
-                struct ggml_tensor  * b);
-    """
-    ...
-  def ggml_set(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, nb1: int, nb2: int, nb3: int, offset: int) -> ffi.CData:
-    """
-    b -> view(a,offset,nb1,nb2,3), return modified a
-
-        GGML_API struct ggml_tensor * ggml_set(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a,
-                struct ggml_tensor  * b,
-                size_t                nb1,
-                size_t                nb2,
-                size_t                nb3,
-                size_t                offset);
-    """
-    ...
-  def ggml_set_1d(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, offset: int) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_set_1d(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a,
-                struct ggml_tensor  * b,
-                size_t                offset);
-    """
-    ...
-  def ggml_set_1d_inplace(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, offset: int) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_set_1d_inplace(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a,
-                struct ggml_tensor  * b,
-                size_t                offset);
-    """
-    ...
-  def ggml_set_2d(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, nb1: int, offset: int) -> ffi.CData:
-    """
-    b -> view(a,offset,nb1,nb2,3), return modified a
-
-        GGML_API struct ggml_tensor * ggml_set_2d(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a,
-                struct ggml_tensor  * b,
-                size_t                nb1,
-                size_t                offset);
-    """
-    ...
-  def ggml_set_2d_inplace(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, nb1: int, offset: int) -> ffi.CData:
-    """
-    b -> view(a,offset,nb1,nb2,3), return view(a)
-
-        GGML_API struct ggml_tensor * ggml_set_2d_inplace(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a,
-                struct ggml_tensor  * b,
-                size_t                nb1,
-                size_t                offset);
-    """
-    ...
-  def ggml_set_f32(tensor: ffi.CData, value: float) -> ffi.CData:
-    """    GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);"""
-    ...
-  def ggml_set_f32_1d(tensor: ffi.CData, i: int, value: float) -> None:
-    """    GGML_API void    ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);"""
-    ...
-  def ggml_set_i32(tensor: ffi.CData, value: int) -> ffi.CData:
-    """    GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);"""
-    ...
-  def ggml_set_i32_1d(tensor: ffi.CData, i: int, value: int) -> None:
-    """    GGML_API void    ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);"""
-    ...
-  def ggml_set_inplace(ctx: ffi.CData, a: ffi.CData, b: ffi.CData, nb1: int, nb2: int, nb3: int, offset: int) -> ffi.CData:
-    """
-    b -> view(a,offset,nb1,nb2,3), return view(a)
-
-        GGML_API struct ggml_tensor * ggml_set_inplace(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a,
-                struct ggml_tensor  * b,
-                size_t                nb1,
-                size_t                nb2,
-                size_t                nb3,
-                size_t                offset);
-    """
-    ...
-  def ggml_set_name(tensor: ffi.CData, name: ffi.CData) -> ffi.CData:
-    """    GGML_API struct ggml_tensor * ggml_set_name   (      struct ggml_tensor * tensor, const char * name);"""
-    ...
-  def ggml_set_no_alloc(ctx: ffi.CData, no_alloc: bool) -> None:
-    """    GGML_API void    ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);"""
-    ...
-  def ggml_set_param(ctx: ffi.CData, tensor: ffi.CData) -> None:
-    """
-        GGML_API void ggml_set_param(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * tensor);
-    """
-    ...
-  def ggml_set_scratch(ctx: ffi.CData, scratch: ffi.CData) -> int:
-    """    GGML_API size_t  ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);"""
-    ...
-  def ggml_set_zero(tensor: ffi.CData) -> ffi.CData:
-    """    GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);"""
-    ...
-  def ggml_sgn(ctx: ffi.CData, a: ffi.CData) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_sgn(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a);
-    """
-    ...
-  def ggml_sgn_inplace(ctx: ffi.CData, a: ffi.CData) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_sgn_inplace(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a);
-    """
-    ...
-  def ggml_silu(ctx: ffi.CData, a: ffi.CData) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_silu(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a);
-    """
-    ...
-  def ggml_silu_back(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData:
-    """
-    a - x
-    b - dy
-
-        GGML_API struct ggml_tensor * ggml_silu_back(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a,
-                struct ggml_tensor  * b);
-    """
-    ...
-  def ggml_silu_inplace(ctx: ffi.CData, a: ffi.CData) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_silu_inplace(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a);
-    """
-    ...
-  def ggml_soft_max(ctx: ffi.CData, a: ffi.CData) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_soft_max(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a);
-    """
-    ...
-  def ggml_soft_max_back(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_soft_max_back(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a,
-                struct ggml_tensor  * b);
-    """
-    ...
-  def ggml_soft_max_back_inplace(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData:
-    """
-    in-place, returns view(a)
-
-        GGML_API struct ggml_tensor * ggml_soft_max_back_inplace(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a,
-                struct ggml_tensor  * b);
-    """
-    ...
-  def ggml_soft_max_inplace(ctx: ffi.CData, a: ffi.CData) -> ffi.CData:
-    """
-    in-place, returns view(a)
-
-        GGML_API struct ggml_tensor * ggml_soft_max_inplace(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a);
-    """
-    ...
-  def ggml_sqr(ctx: ffi.CData, a: ffi.CData) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_sqr(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a);
-    """
-    ...
-  def ggml_sqr_inplace(ctx: ffi.CData, a: ffi.CData) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_sqr_inplace(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a);
-    """
-    ...
-  def ggml_sqrt(ctx: ffi.CData, a: ffi.CData) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_sqrt(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a);
-    """
-    ...
-  def ggml_sqrt_inplace(ctx: ffi.CData, a: ffi.CData) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_sqrt_inplace(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a);
-    """
-    ...
-  def ggml_step(ctx: ffi.CData, a: ffi.CData) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_step(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a);
-    """
-    ...
-  def ggml_step_inplace(ctx: ffi.CData, a: ffi.CData) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_step_inplace(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a);
-    """
-    ...
-  def ggml_sub(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_sub(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a,
-                struct ggml_tensor  * b);
-    """
-    ...
-  def ggml_sub_inplace(ctx: ffi.CData, a: ffi.CData, b: ffi.CData) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_sub_inplace(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a,
-                struct ggml_tensor  * b);
-    """
-    ...
-  def ggml_sum(ctx: ffi.CData, a: ffi.CData) -> ffi.CData:
-    """
-    return scalar
-
-        GGML_API struct ggml_tensor * ggml_sum(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a);
-    """
-    ...
-  def ggml_sum_rows(ctx: ffi.CData, a: ffi.CData) -> ffi.CData:
-    """
-    sums along rows, with input shape [a,b,c,d] return shape [1,b,c,d]
-
-        GGML_API struct ggml_tensor * ggml_sum_rows(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a);
-    """
-    ...
-  def ggml_tanh(ctx: ffi.CData, a: ffi.CData) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_tanh(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a);
-    """
-    ...
-  def ggml_tanh_inplace(ctx: ffi.CData, a: ffi.CData) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_tanh_inplace(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a);
-    """
-    ...
-  def ggml_tensor_overhead() -> int:
-    """
-    use this to compute the memory overhead of a tensor
-
-        GGML_API size_t ggml_tensor_overhead(void);
-    """
-    ...
-  def ggml_time_init() -> None:
-    """    GGML_API void    ggml_time_init(void); // call this once at the beginning of the program"""
-    ...
-  def ggml_time_ms() -> int:
-    """    GGML_API int64_t ggml_time_ms(void);"""
-    ...
-  def ggml_time_us() -> int:
-    """    GGML_API int64_t ggml_time_us(void);"""
-    ...
-  def ggml_transpose(ctx: ffi.CData, a: ffi.CData) -> ffi.CData:
-    """
-    alias for ggml_permute(ctx, a, 1, 0, 2, 3)
-
-        GGML_API struct ggml_tensor * ggml_transpose(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a);
-    """
-    ...
-  def ggml_type_name(type: int) -> ffi.CData:
-    """    GGML_API const char * ggml_type_name(enum ggml_type type);"""
-    ...
-  def ggml_type_size(type: int) -> int:
-    """    GGML_API size_t  ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block"""
-    ...
-  def ggml_type_sizef(type: int) -> float:
-    """    GGML_API float   ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float"""
-    ...
-  def ggml_unary(ctx: ffi.CData, a: ffi.CData, op: int) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_unary(
-                struct ggml_context * ctx,
-                 struct ggml_tensor * a,
-                 enum ggml_unary_op op);
-    """
-    ...
-  def ggml_unary_inplace(ctx: ffi.CData, a: ffi.CData, op: int) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_unary_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            enum ggml_unary_op op);
-    """
-    ...
-  def ggml_used_mem(ctx: ffi.CData) -> int:
-    """    GGML_API size_t  ggml_used_mem(const struct ggml_context * ctx);"""
-    ...
-  def ggml_vec_dot_q2_K_q8_K(n: int, s: ffi.CData, vx: ffi.CData, vy: ffi.CData) -> None:
-    """
-    Dot product
-
-    void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-    """
-    ...
-  def ggml_vec_dot_q3_K_q8_K(n: int, s: ffi.CData, vx: ffi.CData, vy: ffi.CData) -> None:
-    """void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);"""
-    ...
-  def ggml_vec_dot_q4_K_q8_K(n: int, s: ffi.CData, vx: ffi.CData, vy: ffi.CData) -> None:
-    """void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);"""
-    ...
-  def ggml_vec_dot_q5_K_q8_K(n: int, s: ffi.CData, vx: ffi.CData, vy: ffi.CData) -> None:
-    """void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);"""
-    ...
-  def ggml_vec_dot_q6_K_q8_K(n: int, s: ffi.CData, vx: ffi.CData, vy: ffi.CData) -> None:
-    """void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);"""
-    ...
-  def ggml_view_1d(ctx: ffi.CData, a: ffi.CData, ne0: int, offset: int) -> ffi.CData:
-    """
-    offset in bytes
-
-        GGML_API struct ggml_tensor * ggml_view_1d(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a,
-                int64_t               ne0,
-                size_t                offset);
-    """
-    ...
-  def ggml_view_2d(ctx: ffi.CData, a: ffi.CData, ne0: int, ne1: int, nb1: int, offset: int) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_view_2d(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a,
-                int64_t               ne0,
-                int64_t               ne1,
-                size_t                nb1, // row stride in bytes
-                size_t                offset);
-    """
-    ...
-  def ggml_view_3d(ctx: ffi.CData, a: ffi.CData, ne0: int, ne1: int, ne2: int, nb1: int, nb2: int, offset: int) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_view_3d(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a,
-                int64_t               ne0,
-                int64_t               ne1,
-                int64_t               ne2,
-                size_t                nb1, // row   stride in bytes
-                size_t                nb2, // slice stride in bytes
-                size_t                offset);
-    """
-    ...
-  def ggml_view_4d(ctx: ffi.CData, a: ffi.CData, ne0: int, ne1: int, ne2: int, ne3: int, nb1: int, nb2: int, nb3: int, offset: int) -> ffi.CData:
-    """
-        GGML_API struct ggml_tensor * ggml_view_4d(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a,
-                int64_t               ne0,
-                int64_t               ne1,
-                int64_t               ne2,
-                int64_t               ne3,
-                size_t                nb1, // row   stride in bytes
-                size_t                nb2, // slice stride in bytes
-                size_t                nb3,
-                size_t                offset);
-    """
-    ...
-  def ggml_view_tensor(ctx: ffi.CData, src: ffi.CData) -> ffi.CData:
-    """    GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);"""
-    ...
-  def ggml_win_part(ctx: ffi.CData, a: ffi.CData, w: int) -> ffi.CData:
-    """
-    partition into non-overlapping windows with padding if needed
-    example:
-    a:   768   64   64    1
-    w:    14
-    res: 768   14   14    25
-    used in sam
-
-        GGML_API struct ggml_tensor * ggml_win_part(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a,
-                int                   w);
-    """
-    ...
-  def ggml_win_unpart(ctx: ffi.CData, a: ffi.CData, w0: int, h0: int, w: int) -> ffi.CData:
-    """
-    reverse of ggml_win_part
-    used in sam
-
-        GGML_API struct ggml_tensor * ggml_win_unpart(
-                struct ggml_context * ctx,
-                struct ggml_tensor  * a,
-                int                   w0,
-                int                   h0,
-                int                   w);
-    """
-    ...
-  def gguf_add_tensor(ctx: ffi.CData, tensor: ffi.CData) -> None:
-    """
-    manage tensor info
-
-        GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor);
-    """
-    ...
-  def gguf_find_key(ctx: ffi.CData, key: ffi.CData) -> int:
-    """    GGML_API int          gguf_find_key(struct gguf_context * ctx, const char * key);"""
-    ...
-  def gguf_find_tensor(ctx: ffi.CData, name: ffi.CData) -> int:
-    """    GGML_API int    gguf_find_tensor      (struct gguf_context * ctx, const char * name);"""
-    ...
-  def gguf_free(ctx: ffi.CData) -> None:
-    """    GGML_API void gguf_free(struct gguf_context * ctx);"""
-    ...
-  def gguf_get_alignment(ctx: ffi.CData) -> int:
-    """    GGML_API size_t gguf_get_alignment  (struct gguf_context * ctx);"""
-    ...
-  def gguf_get_arr_data(ctx: ffi.CData, i: int) -> ffi.CData:
-    """    GGML_API const void * gguf_get_arr_data(struct gguf_context * ctx, int i);"""
-    ...
-  def gguf_get_arr_n(ctx: ffi.CData, i: int) -> int:
-    """    GGML_API int          gguf_get_arr_n   (struct gguf_context * ctx, int i);"""
-    ...
-  def gguf_get_arr_str(ctx: ffi.CData, key_id: int, i: int) -> ffi.CData:
-    """    GGML_API const char * gguf_get_arr_str (struct gguf_context * ctx, int key_id, int i);"""
-    ...
-  def gguf_get_arr_type(ctx: ffi.CData, i: int) -> int:
-    """    GGML_API enum gguf_type gguf_get_arr_type(struct gguf_context * ctx, int i);"""
-    ...
-  def gguf_get_data(ctx: ffi.CData) -> ffi.CData:
-    """    GGML_API void * gguf_get_data       (struct gguf_context * ctx);"""
-    ...
-  def gguf_get_data_offset(ctx: ffi.CData) -> int:
-    """    GGML_API size_t gguf_get_data_offset(struct gguf_context * ctx);"""
-    ...
-  def gguf_get_key(ctx: ffi.CData, i: int) -> ffi.CData:
-    """    GGML_API const char * gguf_get_key (struct gguf_context * ctx, int i);"""
-    ...
-  def gguf_get_kv_type(ctx: ffi.CData, i: int) -> int:
-    """    GGML_API enum gguf_type gguf_get_kv_type (struct gguf_context * ctx, int i);"""
-    ...
-  def gguf_get_meta_data(ctx: ffi.CData, data: ffi.CData) -> None:
-    """    GGML_API void   gguf_get_meta_data(struct gguf_context * ctx, void * data);"""
-    ...
-  def gguf_get_meta_size(ctx: ffi.CData) -> int:
-    """
-    get the size in bytes of the meta data (header, kv pairs, tensor info) including padding
-
-        GGML_API size_t gguf_get_meta_size(struct gguf_context * ctx);
-    """
-    ...
-  def gguf_get_n_kv(ctx: ffi.CData) -> int:
-    """    GGML_API int          gguf_get_n_kv(struct gguf_context * ctx);"""
-    ...
-  def gguf_get_n_tensors(ctx: ffi.CData) -> int:
-    """    GGML_API int    gguf_get_n_tensors    (struct gguf_context * ctx);"""
-    ...
-  def gguf_get_tensor_name(ctx: ffi.CData, i: int) -> ffi.CData:
-    """    GGML_API char * gguf_get_tensor_name  (struct gguf_context * ctx, int i);"""
-    ...
-  def gguf_get_tensor_offset(ctx: ffi.CData, i: int) -> int:
-    """    GGML_API size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i);"""
-    ...
-  def gguf_get_val_bool(ctx: ffi.CData, i: int) -> bool:
-    """    GGML_API bool         gguf_get_val_bool(struct gguf_context * ctx, int i);"""
-    ...
-  def gguf_get_val_f32(ctx: ffi.CData, i: int) -> float:
-    """    GGML_API float        gguf_get_val_f32 (struct gguf_context * ctx, int i);"""
-    ...
-  def gguf_get_val_i16(ctx: ffi.CData, i: int) -> int:
-    """    GGML_API int16_t      gguf_get_val_i16 (struct gguf_context * ctx, int i);"""
-    ...
-  def gguf_get_val_i32(ctx: ffi.CData, i: int) -> int:
-    """    GGML_API int32_t      gguf_get_val_i32 (struct gguf_context * ctx, int i);"""
-    ...
-  def gguf_get_val_i8(ctx: ffi.CData, i: int) -> int:
-    """    GGML_API int8_t       gguf_get_val_i8  (struct gguf_context * ctx, int i);"""
-    ...
-  def gguf_get_val_str(ctx: ffi.CData, i: int) -> ffi.CData:
-    """    GGML_API const char * gguf_get_val_str (struct gguf_context * ctx, int i);"""
-    ...
-  def gguf_get_val_u16(ctx: ffi.CData, i: int) -> int:
-    """    GGML_API uint16_t     gguf_get_val_u16 (struct gguf_context * ctx, int i);"""
-    ...
-  def gguf_get_val_u32(ctx: ffi.CData, i: int) -> int:
-    """    GGML_API uint32_t     gguf_get_val_u32 (struct gguf_context * ctx, int i);"""
-    ...
-  def gguf_get_val_u8(ctx: ffi.CData, i: int) -> int:
-    """
-    results are undefined if the wrong type is used for the key
-
-        GGML_API uint8_t      gguf_get_val_u8  (struct gguf_context * ctx, int i);
-    """
-    ...
-  def gguf_get_version(ctx: ffi.CData) -> int:
-    """    GGML_API int    gguf_get_version    (struct gguf_context * ctx);"""
-    ...
-  def gguf_init_empty() -> ffi.CData:
-    """    GGML_API struct gguf_context * gguf_init_empty(void);"""
-    ...
-  def gguf_init_from_file(fname: ffi.CData, params: ffi.CData) -> ffi.CData:
-    """    GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);"""
-    ...
-  def gguf_set_arr_data(ctx: ffi.CData, key: ffi.CData, type: int, data: ffi.CData, n: int) -> None:
-    """    GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n);"""
-    ...
-  def gguf_set_arr_str(ctx: ffi.CData, key: ffi.CData, data: ffi.CData, n: int) -> None:
-    """    GGML_API void gguf_set_arr_str (struct gguf_context * ctx, const char * key, const char ** data, int n);"""
-    ...
-  def gguf_set_kv(ctx: ffi.CData, src: ffi.CData) -> None:
-    """
-    set or add KV pairs from another context
-
-        GGML_API void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src);
-    """
-    ...
-  def gguf_set_tensor_data(ctx: ffi.CData, name: ffi.CData, data: ffi.CData, size: int) -> None:
-    """    GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size);"""
-    ...
-  def gguf_set_tensor_type(ctx: ffi.CData, name: ffi.CData, type: int) -> None:
-    """    GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type);"""
-    ...
-  def gguf_set_val_bool(ctx: ffi.CData, key: ffi.CData, val: bool) -> None:
-    """    GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool     val);"""
-    ...
-  def gguf_set_val_f32(ctx: ffi.CData, key: ffi.CData, val: float) -> None:
-    """    GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float    val);"""
-    ...
-  def gguf_set_val_i16(ctx: ffi.CData, key: ffi.CData, val: int) -> None:
-    """    GGML_API void gguf_set_val_i16 (struct gguf_context * ctx, const char * key, int16_t  val);"""
-    ...
-  def gguf_set_val_i32(ctx: ffi.CData, key: ffi.CData, val: int) -> None:
-    """    GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t  val);"""
-    ...
-  def gguf_set_val_i8(ctx: ffi.CData, key: ffi.CData, val: int) -> None:
-    """    GGML_API void gguf_set_val_i8  (struct gguf_context * ctx, const char * key, int8_t   val);"""
-    ...
-  def gguf_set_val_str(ctx: ffi.CData, key: ffi.CData, val: ffi.CData) -> None:
-    """    GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val);"""
-    ...
-  def gguf_set_val_u16(ctx: ffi.CData, key: ffi.CData, val: int) -> None:
-    """    GGML_API void gguf_set_val_u16 (struct gguf_context * ctx, const char * key, uint16_t val);"""
-    ...
-  def gguf_set_val_u32(ctx: ffi.CData, key: ffi.CData, val: int) -> None:
-    """    GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val);"""
-    ...
-  def gguf_set_val_u8(ctx: ffi.CData, key: ffi.CData, val: int) -> None:
-    """
-    overrides existing values or adds a new one
-
-        GGML_API void gguf_set_val_u8  (struct gguf_context * ctx, const char * key, uint8_t  val);
-    """
-    ...
-  def gguf_type_name(type: int) -> ffi.CData:
-    """    GGML_API const char * gguf_type_name(enum gguf_type type);"""
-    ...
-  def gguf_write_to_file(ctx: ffi.CData, fname: ffi.CData, only_meta: bool) -> None:
-    """
-    write the entire context to a binary file
-
-        GGML_API void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only_meta);
-    """
-    ...
-  def quantize_row_q2_K(x: ffi.CData, y: ffi.CData, k: int) -> None:
-    """void quantize_row_q2_K(const float * restrict x, void * restrict y, int k);"""
-    ...
-  def quantize_row_q2_K_reference(x: ffi.CData, y: ffi.CData, k: int) -> None:
-    """
-    Quantization
-
-    void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k);
-    """
-    ...
-  def quantize_row_q3_K(x: ffi.CData, y: ffi.CData, k: int) -> None:
-    """void quantize_row_q3_K(const float * restrict x, void * restrict y, int k);"""
-    ...
-  def quantize_row_q3_K_reference(x: ffi.CData, y: ffi.CData, k: int) -> None:
-    """void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k);"""
-    ...
-  def quantize_row_q4_K(x: ffi.CData, y: ffi.CData, k: int) -> None:
-    """void quantize_row_q4_K(const float * restrict x, void * restrict y, int k);"""
-    ...
-  def quantize_row_q4_K_reference(x: ffi.CData, y: ffi.CData, k: int) -> None:
-    """void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k);"""
-    ...
-  def quantize_row_q5_K(x: ffi.CData, y: ffi.CData, k: int) -> None:
-    """void quantize_row_q5_K(const float * restrict x, void * restrict y, int k);"""
-    ...
-  def quantize_row_q5_K_reference(x: ffi.CData, y: ffi.CData, k: int) -> None:
-    """void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k);"""
-    ...
-  def quantize_row_q6_K(x: ffi.CData, y: ffi.CData, k: int) -> None:
-    """void quantize_row_q6_K(const float * restrict x, void * restrict y, int k);"""
-    ...
-  def quantize_row_q6_K_reference(x: ffi.CData, y: ffi.CData, k: int) -> None:
-    """void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);"""
-    ...
-  def quantize_row_q8_K(x: ffi.CData, y: ffi.CData, k: int) -> None:
-    """void quantize_row_q8_K(const float * restrict x, void * restrict y, int k);"""
-    ...
-  def quantize_row_q8_K_reference(x: ffi.CData, y: ffi.CData, k: int) -> None:
-    """void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);"""
-    ...
diff --git a/ggml/examples/python/ggml/cffi.py b/ggml/examples/python/ggml/cffi.py
deleted file mode 100644
index ecc1bed..0000000
--- a/ggml/examples/python/ggml/cffi.py
+++ /dev/null
@@ -1,11 +0,0 @@
-# auto-generated file
-import _cffi_backend
-
-ffi = _cffi_backend.FFI('ggml.cffi',
-    _version = 0x2601,
-    _types = b'\x00\x00\xB6\x0D\x00\x00\x09\x0B\x00\x00\x00\x0F\x00\x00\xB6\x0D\x00\x04\x2F\x03\x00\x00\x00\x0F\x00\x00\xB6\x0D\x00\x04\x31\x03\x00\x04\x3D\x03\x00\x00\x00\x0F\x00\x00\xB6\x0D\x00\x04\x32\x03\x00\x00\x00\x0F\x00\x00\xB6\x0D\x00\x04\x34\x03\x00\x03\xFE\x03\x00\x04\x53\x03\x00\x00\x0A\x01\x00\x00\x0A\x01\x00\x00\x00\x0F\x00\x00\xB6\x0D\x00\x04\x3D\x03\x00\x00\x00\x0F\x00\x00\xB6\x0D\x00\x00\x15\x11\x00\x00\x15\x11\x00\x00\x00\x0F\x00\x00\xB6\x0D\x00\x00\x15\x11\x00\x00\x15\x11\x00\x00\x08\x11\x00\x00\x00\x0F\x00\x00\xB6\x0D\x00\x04\x3E\x03\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\xB6\x0D\x00\x00\x10\x11\x00\x00\x00\x0F\x00\x00\xB6\x0D\x00\x00\x00\x0F\x00\x02\xD0\x0D\x00\x00\x21\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x0F\x0D\x00\x00\x04\x0B\x00\x00\x00\x0F\x00\x00\x0F\x0D\x00\x00\x01\x11\x00\x00\x00\x0F\x00\x00\x0F\x0D\x00\x00\x0B\x0B\x00\x00\x00\x0F\x00\x00\x0F\x0D\x00\x00\x15\x11\x00\x00\x00\x0F\x00\x00\x0F\x0D\x00\x00\x21\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x0F\x0D\x00\x00\x21\x11\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x16\x0D\x00\x00\x0B\x11\x00\x04\x38\x03\x00\x00\x08\x11\x00\x00\x00\x0F\x00\x04\x16\x0D\x00\x00\x0B\x11\x00\x00\x44\x11\x00\x00\x08\x11\x00\x04\x30\x03\x00\x00\x4B\x11\x00\x00\x00\x0F\x00\x04\x16\x0D\x00\x00\x0B\x11\x00\x00\x20\x09\x00\x00\x08\x11\x00\x00\x00\x0F\x00\x00\x01\x0D\x00\x00\x01\x0B\x00\x00\x00\x0F\x00\x01\x14\x0D\x00\x00\x15\x11\x00\x00\x00\x0F\x00\x00\x34\x0D\x00\x00\x21\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x02\x7E\x0D\x00\x00\x15\x11\x00\x00\x00\x0F\x00\x00\xF4\x0D\x00\x00\x01\x11\x00\x00\x00\x0F\x00\x00\xF4\x0D\x00\x00\x15\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\xF4\x0D\x00\x00\x21\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\xF4\x0D\x00\x00\x06\x01\x00\x00\x00\x0F\x00\x04\x18\x0D\x00\x00\x01\x11\x00\x00\x00\x0F\x00\x02\xE9\x0D\x00\x00\x0E\x11\x00\x00\x00\x0F\x00\x00\x22\x0D\x00\x00\x01\x11\x00\x00\x00\x0F\x00\x00\x22\x0D\x00\x00\x4B\x11\x00\x04\x33\x03\x00\x00\x00\x0F\x00\x00\x22\x0D\x00\x00\x0E\x11\x00\x00\x00\x0F\x00\x00\x22\x0D\x00\x04\x35\x03\x00\x00\x00\x0F\x00\x00\x22\x0D\x00\x00\x15\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x22\x0D\x00\x00\x21\x11\x00\x00\x00\x0F\x00\x00\x22\x0D\x00\x00\x21\x11\x00\x00\x0F\x11\x00\x00\x00\x0F\x00\x00\x22\x0D\x00\x00\x21\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x22\x0D\x00\x00\x00\x0F\x00\x00\xDB\x0D\x00\x00\x15\x11\x00\x00\x00\x0F\x00\x00\xDB\x0D\x00\x00\x00\x0F\x00\x03\xB0\x0D\x00\x00\x21\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x03\xB5\x0D\x00\x00\x21\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x04\x0D\x00\x00\x0A\x01\x00\x00\x00\x0F\x00\x00\x04\x0D\x00\x00\x10\x11\x00\x00\x0A\x01\x00\x00\x0A\x01\x00\x00\x00\x0F\x00\x00\x4B\x0D\x00\x00\x0B\x11\x00\x00\x00\x0F\x00\x00\x4B\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x00\x0F\x00\x04\x30\x0D\x00\x00\x0F\x11\x00\x00\x0B\x03\x00\x00\xB0\x11\x00\x00\x00\x0F\x00\x04\x30\x0D\x00\x00\x0B\x11\x00\x00\x4B\x11\x00\x00\x01\x01\x00\x00\x00\x0F\x00\x04\x30\x0D\x00\x00\x08\x11\x00\x00\x00\x0F\x00\x00\x0B\x0D\x00\x00\x1B\x09\x00\x00\x00\x0F\x00\x04\x33\x0D\x00\x00\x4B\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x0E\x0D\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x7F\x0D\x00\x00\x00\x0F\x00\x00\x50\x0D\x00\x00\x07\x0B\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x4B\x11\x00\x00\x0F\x11\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x0F\x11\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x01\x11\x00\x00\x07\x01\x00\x00\xDB\x03\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x01\x11\x00\x00\x0B\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x01\x11\x00\x00\x0B\x01\x00\x00\x0B\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x01\x11\x00\x00\x0B\x01\x00\x00\x0B\x01\x00\x00\x0B\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x01\x11\x00\x00\x0B\x01\x00\x00\x0B\x01\x00\x00\x0B\x01\x00\x00\x0B\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x0D\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x05\x0B\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x01\x01\x11\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x0A\x0B\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x0D\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x0D\x01\x00\x00\x0D\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x0D\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x0D\x01\x00\x00\x0D\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x0B\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x0B\x01\x00\x00\x0B\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x0B\x01\x00\x00\x0B\x01\x00\x00\x0B\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x0B\x01\x00\x00\x0B\x01\x00\x00\x0B\x01\x00\x00\x0B\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x0B\x01\x00\x00\x0B\x01\x00\x00\x0B\x01\x00\x00\x0B\x01\x00\x00\x0A\x01\x00\x00\x0A\x01\x00\x00\x0A\x01\x00\x00\x0A\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x0B\x01\x00\x00\x0B\x01\x00\x00\x0B\x01\x00\x00\x0A\x01\x00\x00\x0A\x01\x00\x00\x0A\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x0B\x01\x00\x00\x0B\x01\x00\x00\x0A\x01\x00\x00\x0A\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x0B\x01\x00\x00\x0A\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x00\x01\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x00\x01\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x03\x5C\x03\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x03\x62\x03\x00\x00\x07\x01\x00\x00\x10\x11\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x00\x0A\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x00\x0A\x01\x00\x00\x0A\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x00\x0A\x01\x00\x00\x0A\x01\x00\x00\x0A\x01\x00\x00\x0A\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x02\xD8\x03\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x03\x4F\x03\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x08\x11\x00\x03\x54\x03\x00\x00\x07\x01\x00\x00\x10\x11\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x02\xD3\x03\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x03\x44\x03\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x03\x48\x03\x00\x00\x07\x01\x00\x00\x10\x11\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x0B\x11\x00\x00\x15\x11\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x08\x11\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x08\x11\x00\x00\x0F\x11\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x08\x11\x00\x00\x0F\x11\x00\x00\x01\x0F\x00\x00\x08\x0D\x00\x00\x08\x11\x00\x00\x0D\x01\x00\x00\x00\x0F\x00\x00\x08\x0D\x00\x00\x08\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x21\x0D\x00\x00\x0F\x11\x00\x00\x24\x09\x00\x00\x00\x0F\x00\x00\x21\x0D\x00\x00\x00\x0F\x00\x03\xBA\x0D\x00\x00\x21\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x03\xBF\x0D\x00\x00\x21\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x11\x0D\x00\x00\x01\x11\x00\x00\x00\x0F\x00\x00\x11\x0D\x00\x00\x01\x11\x00\x00\xF4\x03\x00\x00\x10\x11\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\xDB\x03\x00\x00\x00\x0F\x00\x00\x11\x0D\x00\x02\x35\x11\x00\x00\x10\x11\x00\x00\x07\x01\x00\x00\x07\x01\x00\x02\x39\x11\x00\x00\x00\x0F\x00\x00\x11\x0D\x00\x00\x04\x11\x00\x00\x4B\x11\x00\x00\x00\x0F\x00\x00\x11\x0D\x00\x00\x0B\x11\x00\x00\x21\x09\x00\x00\x00\x0F\x00\x00\x11\x0D\x00\x04\x32\x03\x00\x00\x00\x0F\x00\x00\x11\x0D\x00\x00\x15\x11\x00\x00\x00\x0F\x00\x00\x11\x0D\x00\x00\x15\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x11\x0D\x00\x00\x15\x11\x00\x00\x15\x11\x00\x00\x08\x11\x00\x00\x00\x0F\x00\x00\x11\x0D\x00\x00\x21\x11\x00\x00\x00\x0F\x00\x00\x11\x0D\x00\x00\x21\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x11\x0D\x00\x00\x00\x0F\x00\x00\x6C\x0D\x00\x00\x0D\x01\x00\x00\x00\x0F\x00\x00\x6C\x0D\x00\x00\x21\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x00\x10\x0D\x00\x02\x4B\x11\x00\x00\x00\x0F\x00\x00\x10\x0D\x00\x00\x15\x11\x00\x00\x00\x0F\x00\x00\x10\x0D\x00\x00\x21\x11\x00\x00\x00\x0F\x00\x00\x10\x0D\x00\x00\x0A\x01\x00\x00\x00\x0F\x00\x02\xE1\x0D\x00\x00\x21\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x01\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x03\xF8\x03\x00\x00\xF4\x03\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x03\xF9\x03\x00\x02\x7E\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x03\xFA\x03\x00\x02\x7E\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x03\xFB\x03\x00\x02\x7E\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x03\xFC\x03\x00\x02\x7E\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x03\xFD\x03\x00\x02\x7E\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x0F\x11\x00\x00\x0F\x11\x00\x00\x07\x01\x00\x00\x0F\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x02\x35\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x02\x35\x11\x00\x03\xF8\x03\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x02\x35\x11\x00\x03\xF9\x03\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x02\x35\x11\x00\x03\xFA\x03\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x02\x35\x11\x00\x03\xFB\x03\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x02\x35\x11\x00\x03\xFC\x03\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x02\x35\x11\x00\x03\xFD\x03\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x02\x35\x11\x00\x00\x6C\x03\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x02\x35\x11\x00\x00\x10\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x07\x01\x00\x03\xFE\x03\x00\x00\x0A\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x07\x01\x00\x02\x7E\x11\x00\x02\x35\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x07\x01\x00\x02\x7E\x11\x00\x02\x35\x11\x00\x02\x35\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x07\x01\x00\x02\x7E\x11\x00\x04\x53\x03\x00\x02\xE1\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x04\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x04\x11\x00\x00\x22\x03\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x04\x11\x00\x00\x08\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x4B\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x4B\x11\x00\x00\x08\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x04\x30\x03\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x02\xF8\x11\x00\x00\x0F\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x02\xF8\x11\x00\x02\xF8\x11\x00\x00\x0F\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x0B\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x0B\x11\x00\x00\x01\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x0B\x11\x00\x00\x4B\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x0B\x11\x00\x00\x44\x11\x00\x00\x50\x11\x00\x00\x0B\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x0B\x11\x00\x00\x08\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x02\x4B\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x0E\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x0E\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x0E\x11\x00\x00\x4B\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x0E\x11\x00\x00\x4B\x11\x00\x00\x01\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x0E\x11\x00\x00\x08\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x7F\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x7F\x11\x00\x02\xE9\x11\x00\x02\xE9\x11\x00\x02\xE9\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x7F\x11\x00\x00\x4B\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x04\x37\x03\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x08\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x08\x11\x00\x00\x15\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x08\x11\x00\x00\x15\x11\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x10\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x08\x11\x00\x00\x15\x11\x00\x00\x15\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x08\x11\x00\x00\x15\x11\x00\x00\x15\x11\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x10\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x08\x11\x00\x00\x15\x11\x00\x00\x15\x11\x00\x00\x15\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x08\x11\x00\x00\x15\x11\x00\x00\x15\x11\x00\x00\x15\x11\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x10\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x15\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x15\x11\x00\x00\x07\x01\x00\x00\x0D\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x15\x11\x00\x00\x07\x01\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x15\x11\x00\x00\x15\x11\x00\x00\x08\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x15\x11\x00\x00\x15\x11\x00\x00\x08\x11\x00\x00\x10\x11\x00\x00\x0A\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x21\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x21\x11\x00\x00\x0F\x11\x00\x00\x01\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x21\x11\x00\x00\x0F\x11\x00\x00\x0F\x03\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x21\x11\x00\x00\x0F\x11\x00\x00\x0F\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x21\x11\x00\x00\x0F\x11\x00\x00\x01\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x21\x11\x00\x00\x0F\x11\x00\x00\x34\x11\x00\x02\xE1\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x21\x11\x00\x00\x0F\x11\x00\x00\x0D\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x21\x11\x00\x00\x0F\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x21\x11\x00\x00\x0F\x11\x00\x00\x05\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x21\x11\x00\x00\x0F\x11\x00\x00\x03\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x21\x11\x00\x00\x0F\x11\x00\x00\x04\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x21\x11\x00\x00\x0F\x11\x00\x00\x08\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x21\x11\x00\x00\x0F\x11\x00\x00\x06\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x21\x11\x00\x00\x0F\x11\x00\x02\xE1\x11\x00\x00\x0A\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x21\x11\x00\x00\x15\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x21\x11\x00\x00\x21\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x21\x11\x00\x00\x10\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x0A\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x6C\x03\x00\x02\x7E\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x10\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x10\x11\x00\x00\x08\x11\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x02\xE1\x11\x00\x02\x7E\x11\x00\x00\x07\x01\x00\x00\x00\x0F\x00\x04\x53\x0D\x00\x00\x00\x0F\x00\x00\x24\x03\x00\x00\x0D\x09\x00\x00\x0E\x09\x00\x00\x0F\x09\x00\x00\x10\x09\x00\x00\x11\x09\x00\x00\x12\x09\x00\x00\x13\x09\x00\x00\x14\x09\x00\x00\x04\x09\x00\x00\x05\x09\x00\x00\x06\x09\x00\x00\x07\x09\x00\x00\x08\x09\x00\x00\x09\x09\x00\x00\x0A\x09\x00\x00\x02\x01\x00\x03\xFE\x05\x00\x00\x00\x80\x00\x03\xFE\x05\x00\x00\x00\x10\x00\x03\xFE\x05\x00\x00\x00\xC0\x00\x03\xFE\x05\x00\x00\x00\x25\x00\x03\xFE\x05\x00\x00\x00\x28\x00\x03\xFE\x05\x00\x00\x00\x04\x00\x03\xFE\x05\x00\x00\x00\x38\x00\x03\xFE\x05\x00\x00\x00\x40\x00\x03\xFE\x05\x00\x00\x1F\xF0\x00\x03\xFE\x05\x00\x00\x00\x08\x00\x00\x00\x0B\x00\x00\x02\x0B\x00\x00\x03\x0B\x00\x00\x06\x0B\x00\x00\x08\x0B\x00\x00\x0B\x09\x00\x00\x22\x05\x00\x00\x10\x00\x00\x00\x22\x05\x00\x00\x00\x08\x00\x00\x0F\x01\x00\x00\xDB\x05\x00\x00\x00\x04\x00\x00\x09\x01\x00\x03\xB0\x05\x00\x00\x00\x10\x00\x03\xB5\x05\x00\x00\x00\x10\x00\x03\xB5\x05\x00\x00\x01\x00\x00\x00\x00\x09\x00\x00\x01\x09\x00\x00\x02\x09\x00\x00\x03\x09\x00\x04\x2C\x03\x00\x00\x0C\x09\x00\x04\x2E\x03\x00\x00\x15\x09\x00\x00\x16\x09\x00\x00\x17\x09\x00\x00\x18\x09\x00\x00\x19\x09\x00\x00\x1A\x09\x00\x00\x1C\x09\x00\x00\x1D\x09\x00\x04\x37\x03\x00\x00\x1E\x09\x00\x00\x1F\x09\x00\x00\x08\x05\x00\x00\x10\x00\x00\x00\x08\x05\x00\x00\x00\x06\x00\x00\x22\x09\x00\x00\x23\x09\x00\x03\xBA\x03\x00\x03\xBA\x05\x00\x00\x00\x80\x00\x03\xBA\x05\x00\x00\x00\x0C\x00\x03\xBA\x05\x00\x00\x00\x10\x00\x03\xBA\x05\x00\x00\x00\x20\x00\x03\xBA\x05\x00\x00\x00\x40\x00\x00\x0C\x01\x00\x00\x11\x05\x00\x00\x00\x04\x00\x00\x10\x05\x00\x00\x20\x51\x00\x02\xC6\x03\x00\x02\xDE\x03\x00\x03\xE0\x03\x00\x03\xE7\x03\x00\x00\x00\x01',
-    _globals = (b'\xFF\xFF\xFF\x0BGGML_BACKEND_CPU',0,b'\xFF\xFF\xFF\x0BGGML_BACKEND_GPU',10,b'\xFF\xFF\xFF\x0BGGML_BACKEND_GPU_SPLIT',20,b'\xFF\xFF\xFF\x0BGGML_FTYPE_ALL_F32',0,b'\xFF\xFF\xFF\x0BGGML_FTYPE_MOSTLY_F16',1,b'\xFF\xFF\xFF\x0BGGML_FTYPE_MOSTLY_Q2_K',10,b'\xFF\xFF\xFF\x0BGGML_FTYPE_MOSTLY_Q3_K',11,b'\xFF\xFF\xFF\x0BGGML_FTYPE_MOSTLY_Q4_0',2,b'\xFF\xFF\xFF\x0BGGML_FTYPE_MOSTLY_Q4_1',3,b'\xFF\xFF\xFF\x0BGGML_FTYPE_MOSTLY_Q4_1_SOME_F16',4,b'\xFF\xFF\xFF\x0BGGML_FTYPE_MOSTLY_Q4_K',12,b'\xFF\xFF\xFF\x0BGGML_FTYPE_MOSTLY_Q5_0',8,b'\xFF\xFF\xFF\x0BGGML_FTYPE_MOSTLY_Q5_1',9,b'\xFF\xFF\xFF\x0BGGML_FTYPE_MOSTLY_Q5_K',13,b'\xFF\xFF\xFF\x0BGGML_FTYPE_MOSTLY_Q6_K',14,b'\xFF\xFF\xFF\x0BGGML_FTYPE_MOSTLY_Q8_0',7,b'\xFF\xFF\xFF\x0BGGML_FTYPE_UNKNOWN',-1,b'\xFF\xFF\xFF\x1FGGML_GRAPH_SIZE',164520,b'\xFF\xFF\xFF\x0BGGML_LINESEARCH_BACKTRACKING_ARMIJO',0,b'\xFF\xFF\xFF\x0BGGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE',2,b'\xFF\xFF\xFF\x0BGGML_LINESEARCH_BACKTRACKING_WOLFE',1,b'\xFF\xFF\xFF\x0BGGML_LINESEARCH_DEFAULT',1,b'\xFF\xFF\xFF\x0BGGML_LINESEARCH_FAIL',-128,b'\xFF\xFF\xFF\x0BGGML_LINESEARCH_INVALID_PARAMETERS',-124,b'\xFF\xFF\xFF\x0BGGML_LINESEARCH_MAXIMUM_ITERATIONS',-125,b'\xFF\xFF\xFF\x0BGGML_LINESEARCH_MAXIMUM_STEP',-126,b'\xFF\xFF\xFF\x0BGGML_LINESEARCH_MINIMUM_STEP',-127,b'\xFF\xFF\xFF\x0BGGML_OBJECT_GRAPH',1,b'\xFF\xFF\xFF\x1FGGML_OBJECT_SIZE',32,b'\xFF\xFF\xFF\x0BGGML_OBJECT_TENSOR',0,b'\xFF\xFF\xFF\x0BGGML_OBJECT_WORK_BUFFER',2,b'\xFF\xFF\xFF\x0BGGML_OPT_ADAM',0,b'\xFF\xFF\xFF\x0BGGML_OPT_DID_NOT_CONVERGE',1,b'\xFF\xFF\xFF\x0BGGML_OPT_FAIL',4,b'\xFF\xFF\xFF\x0BGGML_OPT_INVALID_WOLFE',3,b'\xFF\xFF\xFF\x0BGGML_OPT_LBFGS',1,b'\xFF\xFF\xFF\x0BGGML_OPT_NO_CONTEXT',2,b'\xFF\xFF\xFF\x0BGGML_OPT_OK',0,b'\xFF\xFF\xFF\x0BGGML_OP_ACC',4,b'\xFF\xFF\xFF\x0BGGML_OP_ADD',2,b'\xFF\xFF\xFF\x0BGGML_OP_ADD1',3,b'\xFF\xFF\xFF\x0BGGML_OP_ALIBI',40,b'\xFF\xFF\xFF\x0BGGML_OP_ARGMAX',14,b'\xFF\xFF\xFF\x0BGGML_OP_CLAMP',41,b'\xFF\xFF\xFF\x0BGGML_OP_CONT',26,b'\xFF\xFF\xFF\x0BGGML_OP_CONV_1D',42,b'\xFF\xFF\xFF\x0BGGML_OP_CONV_2D',43,b'\xFF\xFF\xFF\x0BGGML_OP_COUNT',62,b'\xFF\xFF\xFF\x0BGGML_OP_CPY',25,b'\xFF\xFF\xFF\x0BGGML_OP_CROSS_ENTROPY_LOSS',60,b'\xFF\xFF\xFF\x0BGGML_OP_CROSS_ENTROPY_LOSS_BACK',61,b'\xFF\xFF\xFF\x0BGGML_OP_DIAG',33,b'\xFF\xFF\xFF\x0BGGML_OP_DIAG_MASK_INF',34,b'\xFF\xFF\xFF\x0BGGML_OP_DIAG_MASK_ZERO',35,b'\xFF\xFF\xFF\x0BGGML_OP_DIV',7,b'\xFF\xFF\xFF\x0BGGML_OP_DUP',1,b'\xFF\xFF\xFF\x0BGGML_OP_FLASH_ATTN',46,b'\xFF\xFF\xFF\x0BGGML_OP_FLASH_ATTN_BACK',48,b'\xFF\xFF\xFF\x0BGGML_OP_FLASH_FF',47,b'\xFF\xFF\xFF\x0BGGML_OP_GET_ROWS',31,b'\xFF\xFF\xFF\x0BGGML_OP_GET_ROWS_BACK',32,b'\xFF\xFF\xFF\x0BGGML_OP_LOG',10,b'\xFF\xFF\xFF\x0BGGML_OP_MAP_BINARY',53,b'\xFF\xFF\xFF\x0BGGML_OP_MAP_CUSTOM1',57,b'\xFF\xFF\xFF\x0BGGML_OP_MAP_CUSTOM1_F32',54,b'\xFF\xFF\xFF\x0BGGML_OP_MAP_CUSTOM2',58,b'\xFF\xFF\xFF\x0BGGML_OP_MAP_CUSTOM2_F32',55,b'\xFF\xFF\xFF\x0BGGML_OP_MAP_CUSTOM3',59,b'\xFF\xFF\xFF\x0BGGML_OP_MAP_CUSTOM3_F32',56,b'\xFF\xFF\xFF\x0BGGML_OP_MAP_UNARY',52,b'\xFF\xFF\xFF\x0BGGML_OP_MEAN',13,b'\xFF\xFF\xFF\x0BGGML_OP_MUL',6,b'\xFF\xFF\xFF\x0BGGML_OP_MUL_MAT',21,b'\xFF\xFF\xFF\x0BGGML_OP_NONE',0,b'\xFF\xFF\xFF\x0BGGML_OP_NORM',18,b'\xFF\xFF\xFF\x0BGGML_OP_OUT_PROD',22,b'\xFF\xFF\xFF\x0BGGML_OP_PERMUTE',29,b'\xFF\xFF\xFF\x0BGGML_OP_POOL_1D',44,b'\xFF\xFF\xFF\x0BGGML_OP_POOL_2D',45,b'\xFF\xFF\xFF\x0BGGML_OP_POOL_AVG',1,b'\xFF\xFF\xFF\x0BGGML_OP_POOL_COUNT',2,b'\xFF\xFF\xFF\x0BGGML_OP_POOL_MAX',0,b'\xFF\xFF\xFF\x0BGGML_OP_REPEAT',15,b'\xFF\xFF\xFF\x0BGGML_OP_REPEAT_BACK',16,b'\xFF\xFF\xFF\x0BGGML_OP_RESHAPE',27,b'\xFF\xFF\xFF\x0BGGML_OP_RMS_NORM',19,b'\xFF\xFF\xFF\x0BGGML_OP_RMS_NORM_BACK',20,b'\xFF\xFF\xFF\x0BGGML_OP_ROPE',38,b'\xFF\xFF\xFF\x0BGGML_OP_ROPE_BACK',39,b'\xFF\xFF\xFF\x0BGGML_OP_SCALE',23,b'\xFF\xFF\xFF\x0BGGML_OP_SET',24,b'\xFF\xFF\xFF\x0BGGML_OP_SILU_BACK',17,b'\xFF\xFF\xFF\x0BGGML_OP_SOFT_MAX',36,b'\xFF\xFF\xFF\x0BGGML_OP_SOFT_MAX_BACK',37,b'\xFF\xFF\xFF\x0BGGML_OP_SQR',8,b'\xFF\xFF\xFF\x0BGGML_OP_SQRT',9,b'\xFF\xFF\xFF\x0BGGML_OP_SUB',5,b'\xFF\xFF\xFF\x0BGGML_OP_SUM',11,b'\xFF\xFF\xFF\x0BGGML_OP_SUM_ROWS',12,b'\xFF\xFF\xFF\x0BGGML_OP_TRANSPOSE',30,b'\xFF\xFF\xFF\x0BGGML_OP_UNARY',51,b'\xFF\xFF\xFF\x0BGGML_OP_VIEW',28,b'\xFF\xFF\xFF\x0BGGML_OP_WIN_PART',49,b'\xFF\xFF\xFF\x0BGGML_OP_WIN_UNPART',50,b'\xFF\xFF\xFF\x0BGGML_TASK_COMPUTE',1,b'\xFF\xFF\xFF\x0BGGML_TASK_FINALIZE',2,b'\xFF\xFF\xFF\x0BGGML_TASK_INIT',0,b'\xFF\xFF\xFF\x1FGGML_TENSOR_SIZE',288,b'\xFF\xFF\xFF\x0BGGML_TYPE_COUNT',19,b'\xFF\xFF\xFF\x0BGGML_TYPE_F16',1,b'\xFF\xFF\xFF\x0BGGML_TYPE_F32',0,b'\xFF\xFF\xFF\x0BGGML_TYPE_I16',17,b'\xFF\xFF\xFF\x0BGGML_TYPE_I32',18,b'\xFF\xFF\xFF\x0BGGML_TYPE_I8',16,b'\xFF\xFF\xFF\x0BGGML_TYPE_Q2_K',10,b'\xFF\xFF\xFF\x0BGGML_TYPE_Q3_K',11,b'\xFF\xFF\xFF\x0BGGML_TYPE_Q4_0',2,b'\xFF\xFF\xFF\x0BGGML_TYPE_Q4_1',3,b'\xFF\xFF\xFF\x0BGGML_TYPE_Q4_K',12,b'\xFF\xFF\xFF\x0BGGML_TYPE_Q5_0',6,b'\xFF\xFF\xFF\x0BGGML_TYPE_Q5_1',7,b'\xFF\xFF\xFF\x0BGGML_TYPE_Q5_K',13,b'\xFF\xFF\xFF\x0BGGML_TYPE_Q6_K',14,b'\xFF\xFF\xFF\x0BGGML_TYPE_Q8_0',8,b'\xFF\xFF\xFF\x0BGGML_TYPE_Q8_1',9,b'\xFF\xFF\xFF\x0BGGML_TYPE_Q8_K',15,b'\xFF\xFF\xFF\x0BGGML_UNARY_OP_ABS',0,b'\xFF\xFF\xFF\x0BGGML_UNARY_OP_ELU',5,b'\xFF\xFF\xFF\x0BGGML_UNARY_OP_GELU',7,b'\xFF\xFF\xFF\x0BGGML_UNARY_OP_GELU_QUICK',8,b'\xFF\xFF\xFF\x0BGGML_UNARY_OP_NEG',2,b'\xFF\xFF\xFF\x0BGGML_UNARY_OP_RELU',6,b'\xFF\xFF\xFF\x0BGGML_UNARY_OP_SGN',1,b'\xFF\xFF\xFF\x0BGGML_UNARY_OP_SILU',9,b'\xFF\xFF\xFF\x0BGGML_UNARY_OP_STEP',3,b'\xFF\xFF\xFF\x0BGGML_UNARY_OP_TANH',4,b'\xFF\xFF\xFF\x0BGGUF_TYPE_ARRAY',9,b'\xFF\xFF\xFF\x0BGGUF_TYPE_BOOL',7,b'\xFF\xFF\xFF\x0BGGUF_TYPE_COUNT',10,b'\xFF\xFF\xFF\x0BGGUF_TYPE_FLOAT32',6,b'\xFF\xFF\xFF\x0BGGUF_TYPE_INT16',3,b'\xFF\xFF\xFF\x0BGGUF_TYPE_INT32',5,b'\xFF\xFF\xFF\x0BGGUF_TYPE_INT8',1,b'\xFF\xFF\xFF\x0BGGUF_TYPE_STRING',8,b'\xFF\xFF\xFF\x0BGGUF_TYPE_UINT16',2,b'\xFF\xFF\xFF\x0BGGUF_TYPE_UINT32',4,b'\xFF\xFF\xFF\x0BGGUF_TYPE_UINT8',0,b'\x00\x02\x9A\x23__assert_rtn',0,b'\x00\x02\x7C\x23dequantize_row_q2_K',0,b'\x00\x02\x81\x23dequantize_row_q3_K',0,b'\x00\x02\x86\x23dequantize_row_q4_K',0,b'\x00\x02\x8B\x23dequantize_row_q5_K',0,b'\x00\x02\x90\x23dequantize_row_q6_K',0,b'\x00\x02\x95\x23dequantize_row_q8_K',0,b'\x00\x00\xFA\x23ggml_abs',0,b'\x00\x00\xFA\x23ggml_abs_inplace',0,b'\x00\x01\xDD\x23ggml_acc',0,b'\x00\x01\xDD\x23ggml_acc_inplace',0,b'\x00\x01\x84\x23ggml_add',0,b'\x00\x01\x84\x23ggml_add1',0,b'\x00\x01\x84\x23ggml_add1_inplace',0,b'\x00\x01\x84\x23ggml_add_inplace',0,b'\x00\x01\x26\x23ggml_alibi',0,b'\x00\x02\xEC\x23ggml_allocr_alloc',0,b'\x00\x02\x42\x23ggml_allocr_alloc_graph',0,b'\x00\x02\xE4\x23ggml_allocr_free',0,b'\x00\x00\x03\x23ggml_allocr_is_measure',0,b'\x00\x00\xA2\x23ggml_allocr_new',0,b'\x00\x00\x9F\x23ggml_allocr_new_measure',0,b'\x00\x02\xE4\x23ggml_allocr_reset',0,b'\x00\x02\xE7\x23ggml_allocr_set_parse_seq',0,b'\x00\x00\x17\x23ggml_are_same_shape',0,b'\x00\x00\xFA\x23ggml_argmax',0,b'\x00\x00\x74\x23ggml_blck_size',0,b'\x00\x00\xB3\x23ggml_build_backward',0,b'\x00\x00\xB8\x23ggml_build_forward',0,b'\x00\x00\xAA\x23ggml_build_forward_ctx',0,b'\x00\x02\xF3\x23ggml_build_forward_expand',0,b'\x00\x00\x1B\x23ggml_cl_can_mul_mat',0,b'\x00\x03\x6B\x23ggml_cl_free_data',0,b'\x00\x03\xE0\x23ggml_cl_host_free',0,b'\x00\x02\x72\x23ggml_cl_host_malloc',0,b'\x00\x03\xEC\x23ggml_cl_init',0,b'\x00\x03\x78\x23ggml_cl_mul',0,b'\x00\x03\x7D\x23ggml_cl_mul_mat',0,b'\x00\x02\x54\x23ggml_cl_mul_mat_get_wsize',0,b'\x00\x03\xE3\x23ggml_cl_transform_tensor',0,b'\x00\x01\x1B\x23ggml_clamp',0,b'\x00\x00\xFA\x23ggml_cont',0,b'\x00\x01\x90\x23ggml_conv_1d',0,b'\x00\x01\x89\x23ggml_conv_1d_ph',0,b'\x00\x01\x98\x23ggml_conv_2d',0,b'\x00\x00\x90\x23ggml_cpu_has_arm_fma',0,b'\x00\x00\x90\x23ggml_cpu_has_avx',0,b'\x00\x00\x90\x23ggml_cpu_has_avx2',0,b'\x00\x00\x90\x23ggml_cpu_has_avx512',0,b'\x00\x00\x90\x23ggml_cpu_has_avx512_vbmi',0,b'\x00\x00\x90\x23ggml_cpu_has_avx512_vnni',0,b'\x00\x00\x90\x23ggml_cpu_has_blas',0,b'\x00\x00\x90\x23ggml_cpu_has_clblast',0,b'\x00\x00\x90\x23ggml_cpu_has_cublas',0,b'\x00\x00\x90\x23ggml_cpu_has_f16c',0,b'\x00\x00\x90\x23ggml_cpu_has_fma',0,b'\x00\x00\x90\x23ggml_cpu_has_fp16_va',0,b'\x00\x00\x90\x23ggml_cpu_has_gpublas',0,b'\x00\x00\x90\x23ggml_cpu_has_neon',0,b'\x00\x00\x90\x23ggml_cpu_has_sse3',0,b'\x00\x00\x90\x23ggml_cpu_has_vsx',0,b'\x00\x00\x90\x23ggml_cpu_has_wasm_simd',0,b'\x00\x01\x84\x23ggml_cpy',0,b'\x00\x01\x84\x23ggml_cross_entropy_loss',0,b'\x00\x01\xA3\x23ggml_cross_entropy_loss_back',0,b'\x00\x03\x41\x23ggml_cuda_assign_buffers',0,b'\x00\x03\x41\x23ggml_cuda_assign_buffers_force_inplace',0,b'\x00\x03\x41\x23ggml_cuda_assign_buffers_no_scratch',0,b'\x00\x00\x1B\x23ggml_cuda_can_mul_mat',0,b'\x00\x00\x06\x23ggml_cuda_compute_forward',0,b'\x00\x03\x41\x23ggml_cuda_free_data',0,b'\x00\x03\xEC\x23ggml_cuda_free_scratch',0,b'\x00\x00\x90\x23ggml_cuda_get_device_count',0,b'\x00\x02\xCE\x23ggml_cuda_get_device_description',0,b'\x00\x03\xE0\x23ggml_cuda_host_free',0,b'\x00\x02\x72\x23ggml_cuda_host_malloc',0,b'\x00\x02\xCB\x23ggml_cuda_set_main_device',0,b'\x00\x02\x79\x23ggml_cuda_set_mul_mat_q',0,b'\x00\x03\xD8\x23ggml_cuda_set_scratch_size',0,b'\x00\x02\xA0\x23ggml_cuda_set_tensor_split',0,b'\x00\x03\xE3\x23ggml_cuda_transform_tensor',0,b'\x00\x00\x95\x23ggml_cycles',0,b'\x00\x00\x95\x23ggml_cycles_per_ms',0,b'\x00\x00\xFA\x23ggml_diag',0,b'\x00\x01\x21\x23ggml_diag_mask_inf',0,b'\x00\x01\x21\x23ggml_diag_mask_inf_inplace',0,b'\x00\x01\x21\x23ggml_diag_mask_zero',0,b'\x00\x01\x21\x23ggml_diag_mask_zero_inplace',0,b'\x00\x01\x84\x23ggml_div',0,b'\x00\x01\x84\x23ggml_div_inplace',0,b'\x00\x00\xFA\x23ggml_dup',0,b'\x00\x00\xFA\x23ggml_dup_inplace',0,b'\x00\x02\x0B\x23ggml_dup_tensor',0,b'\x00\x02\x4D\x23ggml_element_size',0,b'\x00\x00\xFA\x23ggml_elu',0,b'\x00\x00\xFA\x23ggml_elu_inplace',0,b'\x00\x01\xA9\x23ggml_flash_attn',0,b'\x00\x01\xB0\x23ggml_flash_attn_back',0,b'\x00\x01\xB8\x23ggml_flash_ff',0,b'\x00\x02\x16\x23ggml_format_name',0,b'\x00\x00\x6B\x23ggml_fp16_to_fp32',0,b'\x00\x03\xDB\x23ggml_fp16_to_fp32_row',0,b'\x00\x02\x62\x23ggml_fp32_to_fp16',0,b'\x00\x02\xC1\x23ggml_fp32_to_fp16_row',0,b'\x00\x03\x03\x23ggml_free',0,b'\x00\x00\x53\x23ggml_ftype_to_ggml_type',0,b'\x00\x00\xFA\x23ggml_gelu',0,b'\x00\x00\xFA\x23ggml_gelu_inplace',0,b'\x00\x00\xFA\x23ggml_gelu_quick',0,b'\x00\x00\xFA\x23ggml_gelu_quick_inplace',0,b'\x00\x02\x6C\x23ggml_get_data',0,b'\x00\x00\x5D\x23ggml_get_data_f32',0,b'\x00\x00\x63\x23ggml_get_f32_1d',0,b'\x00\x00\x81\x23ggml_get_i32_1d',0,b'\x00\x02\x4A\x23ggml_get_max_tensor_size',0,b'\x00\x02\x69\x23ggml_get_mem_buffer',0,b'\x00\x02\x4A\x23ggml_get_mem_size',0,b'\x00\x00\x36\x23ggml_get_name',0,b'\x00\x00\x0A\x23ggml_get_no_alloc',0,b'\x00\x01\x84\x23ggml_get_rows',0,b'\x00\x01\xA3\x23ggml_get_rows_back',0,b'\x00\x00\xCE\x23ggml_get_tensor',0,b'\x00\x00\x56\x23ggml_get_unary_op',0,b'\x00\x00\x77\x23ggml_graph_compute',0,b'\x00\x03\x0A\x23ggml_graph_compute_with_ctx',0,b'\x00\x02\xFE\x23ggml_graph_dump_dot',0,b'\x00\x02\xFA\x23ggml_graph_export',0,b'\x00\x00\xCA\x23ggml_graph_get_tensor',0,b'\x00\x00\xAE\x23ggml_graph_import',0,b'\x00\x02\x60\x23ggml_graph_overhead',0,b'\x00\x00\xBE\x23ggml_graph_plan',0,b'\x00\x02\xF7\x23ggml_graph_print',0,b'\x00\x02\xF0\x23ggml_graph_reset',0,b'\x00\x00\xBB\x23ggml_init',0,b'\x00\x03\xEC\x23ggml_init_cublas',0,b'\x00\x00\x6E\x23ggml_internal_get_type_traits',0,b'\x00\x00\x14\x23ggml_is_contiguous',0,b'\x00\x00\x27\x23ggml_is_numa',0,b'\x00\x00\x14\x23ggml_is_permuted',0,b'\x00\x00\x00\x23ggml_is_quantized',0,b'\x00\x00\x14\x23ggml_is_transposed',0,b'\x00\x00\xFA\x23ggml_log',0,b'\x00\x00\xFA\x23ggml_log_inplace',0,b'\x00\x01\xE6\x23ggml_map_binary_f32',0,b'\x00\x01\xE6\x23ggml_map_binary_inplace_f32',0,b'\x00\x02\x04\x23ggml_map_custom1',0,b'\x00\x01\xFF\x23ggml_map_custom1_f32',0,b'\x00\x02\x04\x23ggml_map_custom1_inplace',0,b'\x00\x01\xFF\x23ggml_map_custom1_inplace_f32',0,b'\x00\x01\xF2\x23ggml_map_custom2',0,b'\x00\x01\xEC\x23ggml_map_custom2_f32',0,b'\x00\x01\xF2\x23ggml_map_custom2_inplace',0,b'\x00\x01\xEC\x23ggml_map_custom2_inplace_f32',0,b'\x00\x01\xC7\x23ggml_map_custom3',0,b'\x00\x01\xC0\x23ggml_map_custom3_f32',0,b'\x00\x01\xC7\x23ggml_map_custom3_inplace',0,b'\x00\x01\xC0\x23ggml_map_custom3_inplace_f32',0,b'\x00\x01\xFA\x23ggml_map_unary_f32',0,b'\x00\x01\xFA\x23ggml_map_unary_inplace_f32',0,b'\x00\x00\xFA\x23ggml_mean',0,b'\x00\x00\x0D\x23ggml_metal_add_buffer',0,b'\x00\x03\x1C\x23ggml_metal_free',0,b'\x00\x00\x71\x23ggml_metal_get_concur_list',0,b'\x00\x03\x2C\x23ggml_metal_get_tensor',0,b'\x00\x03\x23\x23ggml_metal_graph_compute',0,b'\x00\x03\x27\x23ggml_metal_graph_find_concurrency',0,b'\x00\x03\xE0\x23ggml_metal_host_free',0,b'\x00\x02\x72\x23ggml_metal_host_malloc',0,b'\x00\x00\x7B\x23ggml_metal_if_optimized',0,b'\x00\x00\xC2\x23ggml_metal_init',0,b'\x00\x03\x1F\x23ggml_metal_set_n_cb',0,b'\x00\x03\x2C\x23ggml_metal_set_tensor',0,b'\x00\x03\xEC\x23ggml_mpi_backend_free',0,b'\x00\x03\xEC\x23ggml_mpi_backend_init',0,b'\x00\x03\x33\x23ggml_mpi_eval_init',0,b'\x00\x03\x30\x23ggml_mpi_free',0,b'\x00\x03\x39\x23ggml_mpi_graph_compute_post',0,b'\x00\x03\x39\x23ggml_mpi_graph_compute_pre',0,b'\x00\x00\xC5\x23ggml_mpi_init',0,b'\x00\x00\x7E\x23ggml_mpi_rank',0,b'\x00\x01\x84\x23ggml_mul',0,b'\x00\x01\x84\x23ggml_mul_inplace',0,b'\x00\x01\x84\x23ggml_mul_mat',0,b'\x00\x02\x4D\x23ggml_nbytes',0,b'\x00\x02\x4D\x23ggml_nbytes_pad',0,b'\x00\x02\x50\x23ggml_nbytes_split',0,b'\x00\x00\xFA\x23ggml_neg',0,b'\x00\x00\xFA\x23ggml_neg_inplace',0,b'\x00\x00\x92\x23ggml_nelements',0,b'\x00\x00\xF2\x23ggml_new_f32',0,b'\x00\x00\xA7\x23ggml_new_graph',0,b'\x00\x00\xF6\x23ggml_new_i32',0,b'\x00\x00\xD2\x23ggml_new_tensor',0,b'\x00\x00\xD8\x23ggml_new_tensor_1d',0,b'\x00\x00\xDD\x23ggml_new_tensor_2d',0,b'\x00\x00\xE3\x23ggml_new_tensor_3d',0,b'\x00\x00\xEA\x23ggml_new_tensor_4d',0,b'\x00\x00\xFA\x23ggml_norm',0,b'\x00\x00\xFA\x23ggml_norm_inplace',0,b'\x00\x00\x92\x23ggml_nrows',0,b'\x00\x03\xEC\x23ggml_numa_init',0,b'\x00\x00\x2D\x23ggml_op_name',0,b'\x00\x00\x2D\x23ggml_op_symbol',0,b'\x00\x00\x4E\x23ggml_opt',0,b'\x00\x00\xC7\x23ggml_opt_default_params',0,b'\x00\x03\x0F\x23ggml_opt_init',0,b'\x00\x00\x42\x23ggml_opt_resume',0,b'\x00\x00\x47\x23ggml_opt_resume_g',0,b'\x00\x01\x84\x23ggml_out_prod',0,b'\x00\x01\x34\x23ggml_permute',0,b'\x00\x00\xFE\x23ggml_pool_1d',0,b'\x00\x01\x06\x23ggml_pool_2d',0,b'\x00\x03\x3E\x23ggml_print_object',0,b'\x00\x03\x19\x23ggml_print_objects',0,b'\x00\x02\x33\x23ggml_quantize_chunk',0,b'\x00\x02\x3B\x23ggml_quantize_q2_K',0,b'\x00\x02\x3B\x23ggml_quantize_q3_K',0,b'\x00\x02\x3B\x23ggml_quantize_q4_0',0,b'\x00\x02\x3B\x23ggml_quantize_q4_1',0,b'\x00\x02\x3B\x23ggml_quantize_q4_K',0,b'\x00\x02\x3B\x23ggml_quantize_q5_0',0,b'\x00\x02\x3B\x23ggml_quantize_q5_1',0,b'\x00\x02\x3B\x23ggml_quantize_q5_K',0,b'\x00\x02\x3B\x23ggml_quantize_q6_K',0,b'\x00\x02\x3B\x23ggml_quantize_q8_0',0,b'\x00\x00\xFA\x23ggml_relu',0,b'\x00\x00\xFA\x23ggml_relu_inplace',0,b'\x00\x01\x84\x23ggml_repeat',0,b'\x00\x01\x84\x23ggml_repeat_back',0,b'\x00\x01\x84\x23ggml_reshape',0,b'\x00\x01\x46\x23ggml_reshape_1d',0,b'\x00\x01\x4B\x23ggml_reshape_2d',0,b'\x00\x01\x51\x23ggml_reshape_3d',0,b'\x00\x01\x58\x23ggml_reshape_4d',0,b'\x00\x01\x16\x23ggml_rms_norm',0,b'\x00\x01\x84\x23ggml_rms_norm_back',0,b'\x00\x01\x16\x23ggml_rms_norm_inplace',0,b'\x00\x01\x34\x23ggml_rope',0,b'\x00\x01\x34\x23ggml_rope_back',0,b'\x00\x01\x3C\x23ggml_rope_custom',0,b'\x00\x01\x3C\x23ggml_rope_custom_inplace',0,b'\x00\x01\x34\x23ggml_rope_inplace',0,b'\x00\x01\x84\x23ggml_scale',0,b'\x00\x01\x84\x23ggml_scale_inplace',0,b'\x00\x01\xDD\x23ggml_set',0,b'\x00\x01\xD0\x23ggml_set_1d',0,b'\x00\x01\xD0\x23ggml_set_1d_inplace',0,b'\x00\x01\xD6\x23ggml_set_2d',0,b'\x00\x01\xD6\x23ggml_set_2d_inplace',0,b'\x00\x02\x1A\x23ggml_set_f32',0,b'\x00\x03\x6E\x23ggml_set_f32_1d',0,b'\x00\x02\x1E\x23ggml_set_i32',0,b'\x00\x03\x73\x23ggml_set_i32_1d',0,b'\x00\x01\xDD\x23ggml_set_inplace',0,b'\x00\x02\x12\x23ggml_set_name',0,b'\x00\x03\x06\x23ggml_set_no_alloc',0,b'\x00\x03\x15\x23ggml_set_param',0,b'\x00\x02\x46\x23ggml_set_scratch',0,b'\x00\x02\x0F\x23ggml_set_zero',0,b'\x00\x00\xFA\x23ggml_sgn',0,b'\x00\x00\xFA\x23ggml_sgn_inplace',0,b'\x00\x00\xFA\x23ggml_silu',0,b'\x00\x01\x84\x23ggml_silu_back',0,b'\x00\x00\xFA\x23ggml_silu_inplace',0,b'\x00\x00\xFA\x23ggml_soft_max',0,b'\x00\x01\x84\x23ggml_soft_max_back',0,b'\x00\x01\x84\x23ggml_soft_max_back_inplace',0,b'\x00\x00\xFA\x23ggml_soft_max_inplace',0,b'\x00\x00\xFA\x23ggml_sqr',0,b'\x00\x00\xFA\x23ggml_sqr_inplace',0,b'\x00\x00\xFA\x23ggml_sqrt',0,b'\x00\x00\xFA\x23ggml_sqrt_inplace',0,b'\x00\x00\xFA\x23ggml_step',0,b'\x00\x00\xFA\x23ggml_step_inplace',0,b'\x00\x01\x84\x23ggml_sub',0,b'\x00\x01\x84\x23ggml_sub_inplace',0,b'\x00\x00\xFA\x23ggml_sum',0,b'\x00\x00\xFA\x23ggml_sum_rows',0,b'\x00\x00\xFA\x23ggml_tanh',0,b'\x00\x00\xFA\x23ggml_tanh_inplace',0,b'\x00\x02\x60\x23ggml_tensor_overhead',0,b'\x00\x03\xEC\x23ggml_time_init',0,b'\x00\x00\x95\x23ggml_time_ms',0,b'\x00\x00\x95\x23ggml_time_us',0,b'\x00\x00\xFA\x23ggml_transpose',0,b'\x00\x00\x30\x23ggml_type_name',0,b'\x00\x02\x30\x23ggml_type_size',0,b'\x00\x00\x60\x23ggml_type_sizef',0,b'\x00\x01\x11\x23ggml_unary',0,b'\x00\x01\x11\x23ggml_unary_inplace',0,b'\x00\x02\x4A\x23ggml_used_mem',0,b'\x00\x02\xDE\x23ggml_vec_dot_q2_K_q8_K',0,b'\x00\x02\xDE\x23ggml_vec_dot_q3_K_q8_K',0,b'\x00\x02\xDE\x23ggml_vec_dot_q4_K_q8_K',0,b'\x00\x02\xDE\x23ggml_vec_dot_q5_K_q8_K',0,b'\x00\x02\xDE\x23ggml_vec_dot_q6_K_q8_K',0,b'\x00\x01\x7E\x23ggml_view_1d',0,b'\x00\x01\x76\x23ggml_view_2d',0,b'\x00\x01\x6C\x23ggml_view_3d',0,b'\x00\x01\x60\x23ggml_view_4d',0,b'\x00\x02\x0B\x23ggml_view_tensor',0,b'\x00\x01\x21\x23ggml_win_part',0,b'\x00\x01\x2D\x23ggml_win_unpart',0,b'\x00\x03\xCC\x23gguf_add_tensor',0,b'\x00\x00\x88\x23gguf_find_key',0,b'\x00\x00\x88\x23gguf_find_tensor',0,b'\x00\x03\x84\x23gguf_free',0,b'\x00\x02\x59\x23gguf_get_alignment',0,b'\x00\x02\x75\x23gguf_get_arr_data',0,b'\x00\x00\x8C\x23gguf_get_arr_n',0,b'\x00\x00\x3D\x23gguf_get_arr_str',0,b'\x00\x00\x59\x23gguf_get_arr_type',0,b'\x00\x02\x6F\x23gguf_get_data',0,b'\x00\x02\x59\x23gguf_get_data_offset',0,b'\x00\x00\x39\x23gguf_get_key',0,b'\x00\x00\x59\x23gguf_get_kv_type',0,b'\x00\x03\xD4\x23gguf_get_meta_data',0,b'\x00\x02\x59\x23gguf_get_meta_size',0,b'\x00\x00\x85\x23gguf_get_n_kv',0,b'\x00\x00\x85\x23gguf_get_n_tensors',0,b'\x00\x00\x29\x23gguf_get_tensor_name',0,b'\x00\x02\x5C\x23gguf_get_tensor_offset',0,b'\x00\x00\x20\x23gguf_get_val_bool',0,b'\x00\x00\x67\x23gguf_get_val_f32',0,b'\x00\x00\x97\x23gguf_get_val_i16',0,b'\x00\x00\x8C\x23gguf_get_val_i32',0,b'\x00\x00\x9B\x23gguf_get_val_i8',0,b'\x00\x00\x39\x23gguf_get_val_str',0,b'\x00\x02\x65\x23gguf_get_val_u16',0,b'\x00\x02\x2C\x23gguf_get_val_u32',0,b'\x00\x02\x28\x23gguf_get_val_u8',0,b'\x00\x00\x85\x23gguf_get_version',0,b'\x00\x02\x26\x23gguf_init_empty',0,b'\x00\x02\x22\x23gguf_init_from_file',0,b'\x00\x03\x9C\x23gguf_set_arr_data',0,b'\x00\x03\x8C\x23gguf_set_arr_str',0,b'\x00\x03\xD0\x23gguf_set_kv',0,b'\x00\x03\xC6\x23gguf_set_tensor_data',0,b'\x00\x03\x97\x23gguf_set_tensor_type',0,b'\x00\x03\x87\x23gguf_set_val_bool',0,b'\x00\x03\xA3\x23gguf_set_val_f32',0,b'\x00\x03\xAD\x23gguf_set_val_i16',0,b'\x00\x03\xA8\x23gguf_set_val_i32',0,b'\x00\x03\xB2\x23gguf_set_val_i8',0,b'\x00\x03\x92\x23gguf_set_val_str',0,b'\x00\x03\xC1\x23gguf_set_val_u16',0,b'\x00\x03\xBC\x23gguf_set_val_u32',0,b'\x00\x03\xB7\x23gguf_set_val_u8',0,b'\x00\x00\x33\x23gguf_type_name',0,b'\x00\x03\x87\x23gguf_write_to_file',0,b'\x00\x02\xC6\x23quantize_row_q2_K',0,b'\x00\x02\xA3\x23quantize_row_q2_K_reference',0,b'\x00\x02\xC6\x23quantize_row_q3_K',0,b'\x00\x02\xA8\x23quantize_row_q3_K_reference',0,b'\x00\x02\xC6\x23quantize_row_q4_K',0,b'\x00\x02\xAD\x23quantize_row_q4_K_reference',0,b'\x00\x02\xC6\x23quantize_row_q5_K',0,b'\x00\x02\xB2\x23quantize_row_q5_K_reference',0,b'\x00\x02\xC6\x23quantize_row_q6_K',0,b'\x00\x02\xB7\x23quantize_row_q6_K_reference',0,b'\x00\x02\xC6\x23quantize_row_q8_K',0,b'\x00\x02\xBC\x23quantize_row_q8_K_reference',0),
-    _struct_unions = ((b'\x00\x00\x04\x27\x00\x00\x00\x02$1',b'\x00\x00\x22\x11n_iter',b'\x00\x00\xF4\x11sched',b'\x00\x00\xF4\x11decay',b'\x00\x00\xF4\x11alpha',b'\x00\x00\xF4\x11beta1',b'\x00\x00\xF4\x11beta2',b'\x00\x00\xF4\x11eps',b'\x00\x00\xF4\x11eps_f',b'\x00\x00\xF4\x11eps_g'),(b'\x00\x00\x04\x28\x00\x00\x00\x02$2',b'\x00\x00\x22\x11m',b'\x00\x00\x22\x11n_iter',b'\x00\x00\x22\x11max_linesearch',b'\x00\x00\xF4\x11eps',b'\x00\x00\xF4\x11ftol',b'\x00\x00\xF4\x11wolfe',b'\x00\x00\xF4\x11min_step',b'\x00\x00\xF4\x11max_step',b'\x00\x04\x14\x11linesearch'),(b'\x00\x00\x04\x29\x00\x00\x00\x02$3',b'\x00\x00\x08\x11x',b'\x00\x00\x08\x11g1',b'\x00\x00\x08\x11g2',b'\x00\x00\x08\x11m',b'\x00\x00\x08\x11v',b'\x00\x00\x08\x11mh',b'\x00\x00\x08\x11vh',b'\x00\x00\x08\x11pf',b'\x00\x00\xF4\x11fx_best',b'\x00\x00\xF4\x11fx_prev',b'\x00\x00\x22\x11n_no_improvement'),(b'\x00\x00\x04\x2A\x00\x00\x00\x02$4',b'\x00\x00\x08\x11x',b'\x00\x00\x08\x11xp',b'\x00\x00\x08\x11g',b'\x00\x00\x08\x11gp',b'\x00\x00\x08\x11d',b'\x00\x00\x08\x11pf',b'\x00\x00\x08\x11lmal',b'\x00\x00\x08\x11lmys',b'\x00\x00\x08\x11lms',b'\x00\x00\x08\x11lmy',b'\x00\x00\xF4\x11fx_best',b'\x00\x00\xF4\x11step',b'\x00\x00\x22\x11j',b'\x00\x00\x22\x11k',b'\x00\x00\x22\x11end',b'\x00\x00\x22\x11n_no_improvement'),(b'\x00\x00\x03\xF7\x00\x00\x00\x03$__mbstate_t',b'\x00\x03\xFF\x11__mbstate8',b'\x00\x00\xDB\x11_mbstateL'),(b'\x00\x00\x03\xF8\x00\x00\x00\x02$block_q2_K',b'\x00\x04\x44\x11scales',b'\x00\x04\x48\x11qs',b'\x00\x00\x6C\x11d',b'\x00\x00\x6C\x11dmin'),(b'\x00\x00\x03\xF9\x00\x00\x00\x02$block_q3_K',b'\x00\x04\x46\x11hmask',b'\x00\x04\x48\x11qs',b'\x00\x04\x42\x11scales',b'\x00\x00\x6C\x11d'),(b'\x00\x00\x03\xFA\x00\x00\x00\x02$block_q4_K',b'\x00\x00\x6C\x11d',b'\x00\x00\x6C\x11dmin',b'\x00\x04\x42\x11scales',b'\x00\x04\x40\x11qs'),(b'\x00\x00\x03\xFB\x00\x00\x00\x02$block_q5_K',b'\x00\x00\x6C\x11d',b'\x00\x00\x6C\x11dmin',b'\x00\x04\x42\x11scales',b'\x00\x04\x46\x11qh',b'\x00\x04\x40\x11qs'),(b'\x00\x00\x03\xFC\x00\x00\x00\x02$block_q6_K',b'\x00\x04\x40\x11ql',b'\x00\x04\x48\x11qh',b'\x00\x04\x23\x11scales',b'\x00\x00\x6C\x11d'),(b'\x00\x00\x03\xFD\x00\x00\x00\x02$block_q8_K',b'\x00\x00\xF4\x11d',b'\x00\x04\x25\x11qs',b'\x00\x04\x21\x11bsums'),(b'\x00\x00\x04\x18\x00\x00\x00\x02$ggml_type_traits_t',b'\x00\x00\x0F\x11type_name',b'\x00\x00\x22\x11blck_size',b'\x00\x00\x11\x11type_size',b'\x00\x00\xB6\x11is_quantized',b'\x00\x04\x52\x11to_float',b'\x00\x04\x4F\x11from_float',b'\x00\x04\x4F\x11from_float_reference',b'\x00\x04\x50\x11vec_dot',b'\x00\x00\x01\x11vec_dot_type'),(b'\x00\x00\x04\x2C\x00\x00\x00\x02__darwin_pthread_handler_rec',b'\x00\x04\x51\x11__routine',b'\x00\x00\x10\x11__arg',b'\x00\x04\x2B\x11__next'),(b'\x00\x00\x03\xEF\x00\x00\x00\x02_opaque_pthread_attr_t',b'\x00\x04\x20\x11__sig',b'\x00\x04\x0B\x11__opaque'),(b'\x00\x00\x03\xF0\x00\x00\x00\x02_opaque_pthread_cond_t',b'\x00\x04\x20\x11__sig',b'\x00\x04\x07\x11__opaque'),(b'\x00\x00\x03\xF1\x00\x00\x00\x02_opaque_pthread_condattr_t',b'\x00\x04\x20\x11__sig',b'\x00\x04\x11\x11__opaque'),(b'\x00\x00\x03\xF2\x00\x00\x00\x02_opaque_pthread_mutex_t',b'\x00\x04\x20\x11__sig',b'\x00\x04\x0B\x11__opaque'),(b'\x00\x00\x03\xF3\x00\x00\x00\x02_opaque_pthread_mutexattr_t',b'\x00\x04\x20\x11__sig',b'\x00\x04\x11\x11__opaque'),(b'\x00\x00\x03\xF4\x00\x00\x00\x02_opaque_pthread_once_t',b'\x00\x04\x20\x11__sig',b'\x00\x04\x11\x11__opaque'),(b'\x00\x00\x03\xF5\x00\x00\x00\x02_opaque_pthread_rwlock_t',b'\x00\x04\x20\x11__sig',b'\x00\x04\x03\x11__opaque'),(b'\x00\x00\x03\xF6\x00\x00\x00\x02_opaque_pthread_rwlockattr_t',b'\x00\x04\x20\x11__sig',b'\x00\x04\x01\x11__opaque'),(b'\x00\x00\x04\x2E\x00\x00\x00\x02_opaque_pthread_t',b'\x00\x04\x20\x11__sig',b'\x00\x04\x2B\x11__cleanup_stack',b'\x00\x04\x0F\x11__opaque'),(b'\x00\x00\x04\x2F\x00\x00\x00\x10ggml_allocr',),(b'\x00\x00\x04\x30\x00\x00\x00\x02ggml_cgraph',b'\x00\x00\x22\x11n_nodes',b'\x00\x00\x22\x11n_leafs',b'\x00\x04\x39\x11nodes',b'\x00\x04\x39\x11grads',b'\x00\x04\x39\x11leafs',b'\x00\x04\x4D\x11visited_hash_table',b'\x00\x00\x22\x11perf_runs',b'\x00\x00\xDB\x11perf_cycles',b'\x00\x00\xDB\x11perf_time_us'),(b'\x00\x00\x04\x31\x00\x00\x00\x02ggml_compute_params',b'\x00\x04\x17\x11type',b'\x00\x00\x22\x11ith',b'\x00\x00\x22\x11nth',b'\x00\x00\x11\x11wsize',b'\x00\x00\x10\x11wdata'),(b'\x00\x00\x04\x32\x00\x00\x00\x10ggml_context',),(b'\x00\x00\x04\x33\x00\x00\x00\x02ggml_cplan',b'\x00\x00\x11\x11work_size',b'\x00\x04\x3F\x11work_data',b'\x00\x00\x22\x11n_threads',b'\x00\x04\x19\x11n_tasks',b'\x00\x03\xEE\x11abort_callback',b'\x00\x00\x10\x11abort_callback_data'),(b'\x00\x00\x00\xBC\x00\x00\x00\x02ggml_init_params',b'\x00\x00\x11\x11mem_size',b'\x00\x00\x10\x11mem_buffer',b'\x00\x00\xB6\x11no_alloc'),(b'\x00\x00\x04\x34\x00\x00\x00\x10ggml_metal_context',),(b'\x00\x00\x04\x35\x00\x00\x00\x10ggml_mpi_context',),(b'\x00\x00\x04\x37\x00\x00\x00\x02ggml_object',b'\x00\x00\x11\x11offs',b'\x00\x00\x11\x11size',b'\x00\x04\x36\x11next',b'\x00\x04\x15\x11type',b'\x00\x04\x09\x11padding'),(b'\x00\x00\x04\x38\x00\x00\x00\x02ggml_opt_context',b'\x00\x00\x0B\x11ctx',b'\x00\x00\x50\x11params',b'\x00\x00\x22\x11iter',b'\x00\x00\xDB\x11nx',b'\x00\x00\xB6\x11just_initialized',b'\x00\x04\x29\x11adam',b'\x00\x04\x2A\x11lbfgs'),(b'\x00\x00\x00\x50\x00\x00\x00\x02ggml_opt_params',b'\x00\x00\xC8\x11type',b'\x00\x00\x22\x11n_threads',b'\x00\x00\x22\x11past',b'\x00\x00\xF4\x11delta',b'\x00\x00\x22\x11max_no_improvement',b'\x00\x00\xB6\x11print_forward_graph',b'\x00\x00\xB6\x11print_backward_graph',b'\x00\x04\x27\x11adam',b'\x00\x04\x28\x11lbfgs'),(b'\x00\x00\x02\x48\x00\x00\x00\x02ggml_scratch',b'\x00\x00\x11\x11offs',b'\x00\x00\x11\x11size',b'\x00\x00\x10\x11data'),(b'\x00\x00\x04\x3D\x00\x00\x00\x02ggml_tensor',b'\x00\x00\x01\x11type',b'\x00\x04\x13\x11backend',b'\x00\x00\x22\x11n_dims',b'\x00\x04\x1E\x11ne',b'\x00\x04\x4B\x11nb',b'\x00\x00\x2E\x11op',b'\x00\x04\x1B\x11op_params',b'\x00\x00\xB6\x11is_param',b'\x00\x00\x08\x11grad',b'\x00\x04\x3B\x11src',b'\x00\x00\x22\x11perf_runs',b'\x00\x00\xDB\x11perf_cycles',b'\x00\x00\xDB\x11perf_time_us',b'\x00\x00\x10\x11data',b'\x00\x04\x0D\x11name',b'\x00\x00\x10\x11extra',b'\x00\x04\x09\x11padding'),(b'\x00\x00\x04\x3E\x00\x00\x00\x10gguf_context',),(b'\x00\x00\x02\x24\x00\x00\x00\x02gguf_init_params',b'\x00\x00\xB6\x11no_alloc',b'\x00\x00\xB0\x11ctx')),
-    _enums = (b'\x00\x00\x04\x13\x00\x00\x00\x16ggml_backend\x00GGML_BACKEND_CPU,GGML_BACKEND_GPU,GGML_BACKEND_GPU_SPLIT',b'\x00\x00\x00\x54\x00\x00\x00\x15ggml_ftype\x00GGML_FTYPE_UNKNOWN,GGML_FTYPE_ALL_F32,GGML_FTYPE_MOSTLY_F16,GGML_FTYPE_MOSTLY_Q4_0,GGML_FTYPE_MOSTLY_Q4_1,GGML_FTYPE_MOSTLY_Q4_1_SOME_F16,GGML_FTYPE_MOSTLY_Q8_0,GGML_FTYPE_MOSTLY_Q5_0,GGML_FTYPE_MOSTLY_Q5_1,GGML_FTYPE_MOSTLY_Q2_K,GGML_FTYPE_MOSTLY_Q3_K,GGML_FTYPE_MOSTLY_Q4_K,GGML_FTYPE_MOSTLY_Q5_K,GGML_FTYPE_MOSTLY_Q6_K',b'\x00\x00\x04\x14\x00\x00\x00\x16ggml_linesearch\x00GGML_LINESEARCH_DEFAULT,GGML_LINESEARCH_BACKTRACKING_ARMIJO,GGML_LINESEARCH_BACKTRACKING_WOLFE,GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE',b'\x00\x00\x04\x15\x00\x00\x00\x16ggml_object_type\x00GGML_OBJECT_TENSOR,GGML_OBJECT_GRAPH,GGML_OBJECT_WORK_BUFFER',b'\x00\x00\x00\x2E\x00\x00\x00\x16ggml_op\x00GGML_OP_NONE,GGML_OP_DUP,GGML_OP_ADD,GGML_OP_ADD1,GGML_OP_ACC,GGML_OP_SUB,GGML_OP_MUL,GGML_OP_DIV,GGML_OP_SQR,GGML_OP_SQRT,GGML_OP_LOG,GGML_OP_SUM,GGML_OP_SUM_ROWS,GGML_OP_MEAN,GGML_OP_ARGMAX,GGML_OP_REPEAT,GGML_OP_REPEAT_BACK,GGML_OP_SILU_BACK,GGML_OP_NORM,GGML_OP_RMS_NORM,GGML_OP_RMS_NORM_BACK,GGML_OP_MUL_MAT,GGML_OP_OUT_PROD,GGML_OP_SCALE,GGML_OP_SET,GGML_OP_CPY,GGML_OP_CONT,GGML_OP_RESHAPE,GGML_OP_VIEW,GGML_OP_PERMUTE,GGML_OP_TRANSPOSE,GGML_OP_GET_ROWS,GGML_OP_GET_ROWS_BACK,GGML_OP_DIAG,GGML_OP_DIAG_MASK_INF,GGML_OP_DIAG_MASK_ZERO,GGML_OP_SOFT_MAX,GGML_OP_SOFT_MAX_BACK,GGML_OP_ROPE,GGML_OP_ROPE_BACK,GGML_OP_ALIBI,GGML_OP_CLAMP,GGML_OP_CONV_1D,GGML_OP_CONV_2D,GGML_OP_POOL_1D,GGML_OP_POOL_2D,GGML_OP_FLASH_ATTN,GGML_OP_FLASH_FF,GGML_OP_FLASH_ATTN_BACK,GGML_OP_WIN_PART,GGML_OP_WIN_UNPART,GGML_OP_UNARY,GGML_OP_MAP_UNARY,GGML_OP_MAP_BINARY,GGML_OP_MAP_CUSTOM1_F32,GGML_OP_MAP_CUSTOM2_F32,GGML_OP_MAP_CUSTOM3_F32,GGML_OP_MAP_CUSTOM1,GGML_OP_MAP_CUSTOM2,GGML_OP_MAP_CUSTOM3,GGML_OP_CROSS_ENTROPY_LOSS,GGML_OP_CROSS_ENTROPY_LOSS_BACK,GGML_OP_COUNT',b'\x00\x00\x01\x01\x00\x00\x00\x16ggml_op_pool\x00GGML_OP_POOL_MAX,GGML_OP_POOL_AVG,GGML_OP_POOL_COUNT',b'\x00\x00\x04\x16\x00\x00\x00\x15ggml_opt_result\x00GGML_OPT_OK,GGML_OPT_DID_NOT_CONVERGE,GGML_OPT_NO_CONTEXT,GGML_OPT_INVALID_WOLFE,GGML_OPT_FAIL,GGML_LINESEARCH_FAIL,GGML_LINESEARCH_MINIMUM_STEP,GGML_LINESEARCH_MAXIMUM_STEP,GGML_LINESEARCH_MAXIMUM_ITERATIONS,GGML_LINESEARCH_INVALID_PARAMETERS',b'\x00\x00\x00\xC8\x00\x00\x00\x16ggml_opt_type\x00GGML_OPT_ADAM,GGML_OPT_LBFGS',b'\x00\x00\x04\x17\x00\x00\x00\x16ggml_task_type\x00GGML_TASK_INIT,GGML_TASK_COMPUTE,GGML_TASK_FINALIZE',b'\x00\x00\x00\x01\x00\x00\x00\x16ggml_type\x00GGML_TYPE_F32,GGML_TYPE_F16,GGML_TYPE_Q4_0,GGML_TYPE_Q4_1,GGML_TYPE_Q5_0,GGML_TYPE_Q5_1,GGML_TYPE_Q8_0,GGML_TYPE_Q8_1,GGML_TYPE_Q2_K,GGML_TYPE_Q3_K,GGML_TYPE_Q4_K,GGML_TYPE_Q5_K,GGML_TYPE_Q6_K,GGML_TYPE_Q8_K,GGML_TYPE_I8,GGML_TYPE_I16,GGML_TYPE_I32,GGML_TYPE_COUNT',b'\x00\x00\x01\x14\x00\x00\x00\x16ggml_unary_op\x00GGML_UNARY_OP_ABS,GGML_UNARY_OP_SGN,GGML_UNARY_OP_NEG,GGML_UNARY_OP_STEP,GGML_UNARY_OP_TANH,GGML_UNARY_OP_ELU,GGML_UNARY_OP_RELU,GGML_UNARY_OP_GELU,GGML_UNARY_OP_GELU_QUICK,GGML_UNARY_OP_SILU',b'\x00\x00\x00\x34\x00\x00\x00\x16gguf_type\x00GGUF_TYPE_UINT8,GGUF_TYPE_INT8,GGUF_TYPE_UINT16,GGUF_TYPE_INT16,GGUF_TYPE_UINT32,GGUF_TYPE_INT32,GGUF_TYPE_FLOAT32,GGUF_TYPE_BOOL,GGUF_TYPE_STRING,GGUF_TYPE_ARRAY,GGUF_TYPE_COUNT'),
-    _typenames = (b'\x00\x00\x00\xDB__darwin_blkcnt_t',b'\x00\x00\x00\x22__darwin_blksize_t',b'\x00\x00\x00\x11__darwin_clock_t',b'\x00\x00\x00\x22__darwin_ct_rune_t',b'\x00\x00\x00\x22__darwin_dev_t',b'\x00\x00\x03\xBF__darwin_fsblkcnt_t',b'\x00\x00\x03\xBF__darwin_fsfilcnt_t',b'\x00\x00\x03\xBF__darwin_gid_t',b'\x00\x00\x03\xBF__darwin_id_t',b'\x00\x00\x04\x4A__darwin_ino64_t',b'\x00\x00\x04\x4A__darwin_ino_t',b'\x00\x00\x04\x20__darwin_intptr_t',b'\x00\x00\x03\xBF__darwin_mach_port_name_t',b'\x00\x00\x03\xBF__darwin_mach_port_t',b'\x00\x00\x03\xF7__darwin_mbstate_t',b'\x00\x00\x00\x6C__darwin_mode_t',b'\x00\x00\x03\xBF__darwin_natural_t',b'\x00\x00\x00\xDB__darwin_off_t',b'\x00\x00\x00\x22__darwin_pid_t',b'\x00\x00\x03\xEF__darwin_pthread_attr_t',b'\x00\x00\x03\xF0__darwin_pthread_cond_t',b'\x00\x00\x03\xF1__darwin_pthread_condattr_t',b'\x00\x00\x00\x11__darwin_pthread_key_t',b'\x00\x00\x03\xF2__darwin_pthread_mutex_t',b'\x00\x00\x03\xF3__darwin_pthread_mutexattr_t',b'\x00\x00\x03\xF4__darwin_pthread_once_t',b'\x00\x00\x03\xF5__darwin_pthread_rwlock_t',b'\x00\x00\x03\xF6__darwin_pthread_rwlockattr_t',b'\x00\x00\x04\x2D__darwin_pthread_t',b'\x00\x00\x04\x20__darwin_ptrdiff_t',b'\x00\x00\x00\x22__darwin_rune_t',b'\x00\x00\x03\xBF__darwin_sigset_t',b'\x00\x00\x00\x11__darwin_size_t',b'\x00\x00\x03\xBF__darwin_socklen_t',b'\x00\x00\x04\x20__darwin_ssize_t',b'\x00\x00\x00\x22__darwin_suseconds_t',b'\x00\x00\x04\x20__darwin_time_t',b'\x00\x00\x03\xBF__darwin_uid_t',b'\x00\x00\x03\xBF__darwin_useconds_t',b'\x00\x00\x04\x05__darwin_uuid_string_t',b'\x00\x00\x04\x44__darwin_uuid_t',b'\x00\x00\x00\x22__darwin_wchar_t',b'\x00\x00\x00\x22__darwin_wint_t',b'\x00\x00\x03\xB0__int16_t',b'\x00\x00\x00\x22__int32_t',b'\x00\x00\x00\xDB__int64_t',b'\x00\x00\x03\xB5__int8_t',b'\x00\x00\x03\xF7__mbstate_t',b'\x00\x00\x00\x6C__uint16_t',b'\x00\x00\x03\xBF__uint32_t',b'\x00\x00\x04\x4A__uint64_t',b'\x00\x00\x03\xBA__uint8_t',b'\x00\x00\x03\xF8block_q2_K',b'\x00\x00\x03\xF9block_q3_K',b'\x00\x00\x03\xFAblock_q4_K',b'\x00\x00\x03\xFBblock_q5_K',b'\x00\x00\x03\xFCblock_q6_K',b'\x00\x00\x03\xFDblock_q8_K',b'\x00\x00\x01\xEAggml_binary_op_f32_t',b'\x00\x00\x02\x02ggml_custom1_op_f32_t',b'\x00\x00\x02\x07ggml_custom1_op_t',b'\x00\x00\x01\xF0ggml_custom2_op_f32_t',b'\x00\x00\x01\xF6ggml_custom2_op_t',b'\x00\x00\x01\xC5ggml_custom3_op_f32_t',b'\x00\x00\x01\xCCggml_custom3_op_t',b'\x00\x00\x00\x6Cggml_fp16_t',b'\x00\x00\x04\x4Fggml_from_float_t',b'\x00\x00\x04\x52ggml_to_float_t',b'\x00\x00\x04\x18ggml_type_traits_t',b'\x00\x00\x01\xFDggml_unary_op_f32_t',b'\x00\x00\x04\x50ggml_vec_dot_t',b'\x00\x00\x03\xB0int16_t',b'\x00\x00\x00\x22int32_t',b'\x00\x00\x00\xDBint64_t',b'\x00\x00\x03\xB5int8_t',b'\x00\x00\x03\xB0int_fast16_t',b'\x00\x00\x00\x22int_fast32_t',b'\x00\x00\x00\xDBint_fast64_t',b'\x00\x00\x03\xB5int_fast8_t',b'\x00\x00\x03\xB0int_least16_t',b'\x00\x00\x00\x22int_least32_t',b'\x00\x00\x00\xDBint_least64_t',b'\x00\x00\x03\xB5int_least8_t',b'\x00\x00\x04\x20intmax_t',b'\x00\x00\x04\x20intptr_t',b'\x00\x00\x04\x1Dmax_align_t',b'\x00\x00\x04\x20ptrdiff_t',b'\x00\x00\x00\xDBregister_t',b'\x00\x00\x00\x11rsize_t',b'\x00\x00\x00\x11size_t',b'\x00\x00\x04\x4Asyscall_arg_t',b'\x00\x00\x00\x6Cu_int16_t',b'\x00\x00\x03\xBFu_int32_t',b'\x00\x00\x04\x4Au_int64_t',b'\x00\x00\x03\xBAu_int8_t',b'\x00\x00\x00\x6Cuint16_t',b'\x00\x00\x03\xBFuint32_t',b'\x00\x00\x04\x4Auint64_t',b'\x00\x00\x03\xBAuint8_t',b'\x00\x00\x00\x6Cuint_fast16_t',b'\x00\x00\x03\xBFuint_fast32_t',b'\x00\x00\x04\x4Auint_fast64_t',b'\x00\x00\x03\xBAuint_fast8_t',b'\x00\x00\x00\x6Cuint_least16_t',b'\x00\x00\x03\xBFuint_least32_t',b'\x00\x00\x04\x4Auint_least64_t',b'\x00\x00\x03\xBAuint_least8_t',b'\x00\x00\x00\x11uintmax_t',b'\x00\x00\x00\x11uintptr_t',b'\x00\x00\x04\x4Auser_addr_t',b'\x00\x00\x00\xDBuser_long_t',b'\x00\x00\x00\xDBuser_off_t',b'\x00\x00\x04\x4Auser_size_t',b'\x00\x00\x00\xDBuser_ssize_t',b'\x00\x00\x00\xDBuser_time_t',b'\x00\x00\x04\x4Auser_ulong_t',b'\x00\x00\x00\x22wchar_t'),
-)
diff --git a/ggml/examples/python/ggml/ffi/__init__.pyi b/ggml/examples/python/ggml/ffi/__init__.pyi
deleted file mode 100644
index 73117a1..0000000
--- a/ggml/examples/python/ggml/ffi/__init__.pyi
+++ /dev/null
@@ -1,7 +0,0 @@
-# Phony stubs.
-
-class CData:
-    pass
-
-class CType:
-    pass
\ No newline at end of file
diff --git a/ggml/examples/python/ggml/utils.py b/ggml/examples/python/ggml/utils.py
deleted file mode 100644
index 7cea2bf..0000000
--- a/ggml/examples/python/ggml/utils.py
+++ /dev/null
@@ -1,182 +0,0 @@
-"""
-  Common helpers for working with ggml + numpy
-"""
-from ggml import ffi, lib
-from typing import Union, Optional
-import numpy as np
-
-def init(mem_size: int, mem_buffer: ffi.CData = ffi.NULL, no_alloc: bool = False) -> ffi.CData:
-    """
-      Initialize a ggml context, which will be freed automatically when the pointer is garbage collected.
-    """
-    params = ffi.new('struct ggml_init_params*')
-    params.mem_size = mem_size
-    params.mem_buffer = mem_buffer
-    params.no_alloc = no_alloc
-    return ffi.gc(lib.ggml_init(params[0]), lib.ggml_free)
- 
-TensorLike = Union[ffi.CData, np.ndarray]
-
-def copy(from_tensor: TensorLike, to_tensor: TensorLike, allow_requantize: bool = True):
-    """
-      Copy the contents of one tensor to another, doing any necessary (de/re)quantization transparently.
-      Works across numpy & ggml tensors, but they must have the same shape (and be contiguous).
-
-      Parameters
-      ----------
-      from_tensor : TensorLike
-          The tensor to copy from (a numpy array or possibly-quantized ggml tensor)
-      to_tensor : TensorLike
-          The tensor to copy to (a numpy array or possibly-quantized ggml tensor)
-      allow_requantize : bool
-          If False, will throw an error if requantization is required (i.e. both from_tensor
-          and to_tensor are quantized with different quantization types)
-    """
-    if id(from_tensor) == id(to_tensor):
-        return
- 
-    __expect_same_layout("source", from_tensor, "destination", to_tensor)
-    __check_shape_consistent_with_type(from_tensor)
-    __check_shape_consistent_with_type(to_tensor)
-
-    from_type = __get_type(from_tensor)
-    to_type = __get_type(to_tensor)
-
-    if from_type == to_type:
-        ffi.memmove(__get_data(to_tensor), __get_data(from_tensor), __get_nbytes(from_tensor))
-    else:
-        assert allow_requantize or not lib.ggml_is_quantized(from_type) or not lib.ggml_is_quantized(to_type), \
-            f"Requantizing from {__type_name(from_type)} to {__type_name(to_type)} is disabled. Force with allow_requantize=True"
- 
-        __set_floats(to_tensor, __get_floats(from_tensor))
-
-def numpy(tensor: ffi.CData, allow_copy: Union[bool, np.ndarray] = False, allow_requantize=False) -> np.ndarray:
-    """
-      Convert a ggml tensor to a numpy array.
-      If the tensor isn't quantized, the returned numpy array will be a view over its data.
- 
-      If it is quantized (and allow_copy is True), the copy will involve dequantization and the returned array will
-      be a copy of the original tensor (any changes to the numpy array won't then be reflected back to the tensor).
-
-      Parameters
-      ----------
-      tensor : ffi.CData
-          The tensor to convert to a numpy array
-      allow_copy : bool or np.ndarray
-          If False, will throw an error if the tensor is quantized (since dequantization requires extra memory).
-          If True, will dequantize the tensor and return a copy of the data in a new float32 numpy array.
-          If an np.ndarray, will copy the data into the given array (which must be the same shape as the tensor) when dequantization is needed
-      allow_requantize : bool
-          If allow_copy is a tensor with a different quantization type than the source tensor, will throw an error unless allow_requantize is True.
-    """
-    shape = __get_shape(tensor)
-
-    if lib.ggml_is_quantized(tensor.type):
-        if allow_copy == False:
-            raise ValueError(f"{__describe(tensor)} is quantized, conversion to numpy requires a copy (pass allow_copy=True; changes to the numpy array won't affect the original).")
-        elif isinstance(allow_copy, np.ndarray):
-            __expect_same_layout("source tensor", tensor, "dequantization output tensor", allow_copy)
-            destination = allow_copy
-        else:
-            destination = np.empty(shape, dtype=np.float32)
-
-        copy(tensor, destination, allow_requantize=allow_requantize)
-        return destination
-    else:
-        dtype = __type_to_dtype(tensor.type)
-        if not dtype:
-            raise NotImplementedError(f'Cannot convert {__describe(tensor)} to numpy')
-
-        assert __is_contiguous(tensor), f"Cannot convert {__describe(tensor)} to numpy (support contiguous tensors only)"
-        nbytes = lib.ggml_nelements(tensor) * lib.ggml_type_size(tensor.type)
-        array = np.frombuffer(ffi.buffer(lib.ggml_get_data(tensor), nbytes), dtype=dtype)
-        array.shape = shape
-        return array
-
-def __type_name(type: int) -> str:
-    name = lib.ggml_type_name(type)
-    return ffi.string(name).decode('utf-8') if name else None
-
-__k_quant_types = set([
-  lib.GGML_TYPE_Q2_K,
-  lib.GGML_TYPE_Q3_K,
-  lib.GGML_TYPE_Q4_K,
-  lib.GGML_TYPE_Q5_K,
-  lib.GGML_TYPE_Q6_K,
-  lib.GGML_TYPE_Q8_K,
-])
-
-__type_to_dtype_dict = {
-  lib.GGML_TYPE_I8: np.int8,
-  lib.GGML_TYPE_I16: np.int16,
-  lib.GGML_TYPE_I32: np.int32,
-  lib.GGML_TYPE_F16: np.float16,
-  lib.GGML_TYPE_F32: np.float32,
-}
-
-def __type_to_dtype(type: int) -> Optional[np.dtype]: return __type_to_dtype_dict.get(type)
-def __dtype_to_type(dtype: np.dtype):
-    if dtype == np.float32: return lib.GGML_TYPE_F32
-    elif dtype == np.float16: return lib.GGML_TYPE_F16
-    elif dtype == np.int32: return lib.GGML_TYPE_I32
-    elif dtype == np.int16: return lib.GGML_TYPE_I16
-    elif dtype == np.int8: return lib.GGML_TYPE_I8
-    else: raise ValueError(f"Unsupported dtype: {dtype}")
-
-def __describe(tensor: ffi.CType): return f'Tensor[{__type_name(__get_type(tensor))}, {__get_shape(tensor)}]'
-def __get_type(tensor: TensorLike): return __dtype_to_type(tensor.dtype) if isinstance(tensor, np.ndarray) else tensor.type
-def __get_shape(x: TensorLike): return x.shape if isinstance(x, np.ndarray) else tuple([x.ne[i] for i in range(x.n_dims)])
-def __get_strides(x: TensorLike): return x.strides if isinstance(x, np.ndarray) else tuple([x.nb[i] for i in range(x.n_dims)])
-def __get_data(x: TensorLike) -> ffi.CData: return ffi.from_buffer(x) if isinstance(x, np.ndarray) else lib.ggml_get_data(x)
-def __get_nbytes(tensor: TensorLike): return tensor.nbytes if isinstance(tensor, np.ndarray) else lib.ggml_nbytes(tensor)
-def __get_nelements(tensor: TensorLike): return tensor.size if isinstance(tensor, np.ndarray) else lib.ggml_nelements(tensor)
-def __is_contiguous(tensor: TensorLike): return tensor.flags['C_CONTIGUOUS'] if isinstance(tensor, np.ndarray) else lib.ggml_is_contiguous(tensor)
-
-def __get_floats(tensor: TensorLike) -> ffi.CData:
-    data, type = __get_data(tensor), __get_type(tensor)
-    if type == lib.GGML_TYPE_F32:
-        return ffi.cast('float*', data)
-    else:
-      nelements = __get_nelements(tensor)
-      floats = ffi.new('float[]', nelements)
-      if type == lib.GGML_TYPE_F16:
-          lib.ggml_fp16_to_fp32_row(ffi.cast('uint16_t*', data), floats, nelements)
-      elif lib.ggml_is_quantized(type):
-          qtype = lib.ggml_internal_get_type_traits(type)
-          assert qtype.to_float, f"Type {__type_name(type)} is not supported by ggml"
-          qtype.to_float(data, floats, nelements)
-      else:
-          raise NotImplementedError(f'Cannot read floats from {__describe(tensor)}')
-      return floats
-
-def __set_floats(tensor: TensorLike, f32_data: ffi.CData) -> None:
-    data, type, nbytes = __get_data(tensor), __get_type(tensor), __get_nbytes(tensor)
-    if type == lib.GGML_TYPE_F32:
-        ffi.memmove(data, f32_data, nbytes)
-    else:
-      nelements = __get_nelements(tensor)
-      if type == lib.GGML_TYPE_F16:
-          lib.ggml_fp32_to_fp16_row(f32_data, ffi.cast('uint16_t*', data), nelements)
-      elif lib.ggml_is_quantized(type):
-          qtype = lib.ggml_internal_get_type_traits(type)
-          assert qtype.from_float, f"Type {__type_name(type)} is not supported by ggml"
-          qtype.from_float(f32_data, data, nelements)
-      else:
-          raise NotImplementedError(f'Cannot write floats to {__describe(tensor)}')
-
-def __expect_same_layout(name1: str, tensor1: TensorLike, name2: str, tensor2: TensorLike):
-    shape1, shape2 = __get_shape(tensor1), __get_shape(tensor2)
-    assert shape1 == shape2, f"Shape mismatch: {name1} has {shape1} but {name2} has {shape2}"
-    assert __is_contiguous(tensor1) and __is_contiguous(tensor2), f"Only contiguous tensors are supported (got {name1} with strides {__get_strides(tensor1)} and {name2} with strides {__get_strides(tensor2)})"
-
-def __check_shape_consistent_with_type(tensor: TensorLike):
-    type = __get_type(tensor)
-    if not lib.ggml_is_quantized(type):
-        return
-    shape = __get_shape(tensor)
-
-    block_size = lib.ggml_blck_size(type)
-    assert not (block_size == 0 and type in __k_quant_types), f"Can't quantize, native library was not compiled with USE_K_QUANTS!"
-    assert block_size > 0, f"Invalid block size {block_size} for type {__type_name(type)}"
-    for i, d in enumerate(shape):
-        assert d % block_size == 0, f"Dimension {i} of {__describe(tensor)} is not divisible by {block_size}, required for quantization."
diff --git a/ggml/examples/python/regenerate.py b/ggml/examples/python/regenerate.py
deleted file mode 100644
index 08d84c0..0000000
--- a/ggml/examples/python/regenerate.py
+++ /dev/null
@@ -1,42 +0,0 @@
-# Generates bindings for the ggml library.
-#
-# cffi requires prior C preprocessing of the headers, and it uses pycparser which chokes on a couple of things
-# so we help it a bit (e.g. replace sizeof expressions with their value, remove exotic syntax found in Darwin headers).
-import os, sys, re, subprocess
-import cffi
-from stubs import generate_stubs
-
-API = os.environ.get('API', 'api.h')
-CC = os.environ.get('CC') or 'gcc'
-C_INCLUDE_DIR = os.environ.get('C_INCLUDE_DIR', '../../../llama.cpp')
-CPPFLAGS = [
-    "-I", C_INCLUDE_DIR,
-    '-D__fp16=uint16_t',  # pycparser doesn't support __fp16
-    '-D__attribute__(x)=',
-    '-D_Static_assert(x, m)=',
-] + [x for x in os.environ.get('CPPFLAGS', '').split(' ') if x != '']
-
-try: header = subprocess.run([CC, "-E", *CPPFLAGS, API], capture_output=True, text=True, check=True).stdout
-except subprocess.CalledProcessError as e: print(f'{e.stderr}\n{e}', file=sys.stderr); raise
-
-header = '\n'.join([l for l in header.split('\n') if '__darwin_va_list' not in l]) # pycparser hates this
-
-# Replace constant size expressions w/ their value (compile & run a mini exe for each, because why not).
-# First, extract anyting *inside* square brackets and anything that looks like a sizeof call.
-for expr in set(re.findall(f'(?<=\\[)[^\\]]+(?=])|sizeof\\s*\\([^()]+\\)', header)):
-    if re.match(r'^(\d+|\s*)$', expr): continue # skip constants and empty bracket contents
-    subprocess.run([CC, "-o", "eval_size_expr", *CPPFLAGS, "-x", "c", "-"], text=True, check=True,
-                   input=f'''#include <stdio.h>
-                             #include "{API}"
-                             int main() {{ printf("%lu", (size_t)({expr})); }}''')
-    size = subprocess.run(["./eval_size_expr"], capture_output=True, text=True, check=True).stdout
-    print(f'Computed constexpr {expr} = {size}')
-    header = header.replace(expr, size)
-
-ffibuilder = cffi.FFI()
-ffibuilder.cdef(header)
-ffibuilder.set_source(f'ggml.cffi', None) # we're not compiling a native extension, as this quickly gets hairy
-ffibuilder.compile(verbose=True)
-
-with open("ggml/__init__.pyi", "wt") as f:
-    f.write(generate_stubs(header))
\ No newline at end of file
diff --git a/ggml/examples/python/stubs.py b/ggml/examples/python/stubs.py
deleted file mode 100644
index adf3d6c..0000000
--- a/ggml/examples/python/stubs.py
+++ /dev/null
@@ -1,128 +0,0 @@
-"""
-  This generates .pyi stubs for the cffi Python bindings generated by regenerate.py
-"""
-import sys, re, itertools
-sys.path.extend(['.', '..']) # for pycparser
-
-from pycparser import c_ast, parse_file, CParser
-import pycparser.plyparser
-from pycparser.c_ast import PtrDecl, TypeDecl, FuncDecl, EllipsisParam, IdentifierType, Struct, Enum, Typedef
-from typing import Tuple
-
-__c_type_to_python_type = {
-    'void': 'None', '_Bool': 'bool',
-    'char': 'int', 'short': 'int', 'int': 'int', 'long': 'int',
-    'ptrdiff_t': 'int', 'size_t': 'int',
-    'int8_t': 'int', 'uint8_t': 'int',
-    'int16_t': 'int', 'uint16_t': 'int',
-    'int32_t': 'int', 'uint32_t': 'int',
-    'int64_t': 'int', 'uint64_t': 'int',
-    'float': 'float', 'double': 'float',
-    'ggml_fp16_t': 'np.float16',
-}
-
-def format_type(t: TypeDecl):
-    if isinstance(t, PtrDecl) or isinstance(t, Struct):
-        return 'ffi.CData'
-    if isinstance(t, Enum):
-        return 'int'
-    if isinstance(t, TypeDecl):
-        return format_type(t.type)
-    if isinstance(t, IdentifierType):
-        assert len(t.names) == 1, f'Expected a single name, got {t.names}'
-        return __c_type_to_python_type.get(t.names[0]) or 'ffi.CData'
-    return t.name
-
-class PythonStubFuncDeclVisitor(c_ast.NodeVisitor):
-    def __init__(self):
-        self.sigs = {}
-        self.sources = {}
-
-    def get_source_snippet_lines(self, coord: pycparser.plyparser.Coord) -> Tuple[list[str], list[str]]:
-        if coord.file not in self.sources:
-            with open(coord.file, 'rt') as f:
-                self.sources[coord.file] = f.readlines()
-        source_lines = self.sources[coord.file]
-        ncomment_lines = len(list(itertools.takewhile(lambda i: re.search(r'^\s*(//|/\*)', source_lines[i]), range(coord.line - 2, -1, -1))))
-        comment_lines = [l.strip() for l in source_lines[coord.line - 1 - ncomment_lines:coord.line - 1]]
-        decl_lines = []
-        for line in source_lines[coord.line - 1:]:
-            decl_lines.append(line.rstrip())
-            if (';' in line) or ('{' in line): break
-        return (comment_lines, decl_lines)
-
-    def visit_Enum(self, node: Enum):
-        if node.values is not None:
-          for e in node.values.enumerators:
-              self.sigs[e.name] = f'  @property\n  def {e.name}(self) -> int: ...'
-
-    def visit_Typedef(self, node: Typedef):
-        pass
-
-    def visit_FuncDecl(self, node: FuncDecl):
-        ret_type = node.type
-        is_ptr = False
-        while isinstance(ret_type, PtrDecl):
-            ret_type = ret_type.type
-            is_ptr = True
-
-        fun_name = ret_type.declname
-        if fun_name.startswith('__'):
-            return
-
-        args = []
-        argnames = []
-        def gen_name(stem):
-            i = 1
-            while True:
-                new_name = stem if i == 1 else f'{stem}{i}'
-                if new_name not in argnames: return new_name
-                i += 1
-
-        for a in node.args.params:
-            if isinstance(a, EllipsisParam):
-                arg_name = gen_name('args')
-                argnames.append(arg_name)
-                args.append('*' + gen_name('args'))
-            elif format_type(a.type) == 'None':
-                continue
-            else:
-                arg_name = a.name or gen_name('arg')
-                argnames.append(arg_name)
-                args.append(f'{arg_name}: {format_type(a.type)}')
-
-        ret = format_type(ret_type if not is_ptr else node.type)
-
-        comment_lines, decl_lines = self.get_source_snippet_lines(node.coord)
-
-        lines = [f'  def {fun_name}({", ".join(args)}) -> {ret}:']
-        if len(comment_lines) == 0 and len(decl_lines) == 1:
-            lines += [f'    """{decl_lines[0]}"""']
-        else:
-            lines += ['    """']
-            lines += [f'    {c.lstrip("/* ")}' for c in comment_lines]
-            if len(comment_lines) > 0:
-                lines += ['']
-            lines += [f'    {d}' for d in decl_lines]
-            lines += ['    """']
-        lines += ['    ...']
-        self.sigs[fun_name] = '\n'.join(lines)
-
-def generate_stubs(header: str):
-    """
-      Generates a .pyi Python stub file for the GGML API using C header files.
-    """
-
-    v = PythonStubFuncDeclVisitor()
-    v.visit(CParser().parse(header, "<input>"))
-
-    keys = list(v.sigs.keys())
-    keys.sort()
-
-    return '\n'.join([
-        '# auto-generated file',
-        'import ggml.ffi as ffi',
-        'import numpy as np',
-        'class lib:',
-        *[v.sigs[k] for k in keys]
-    ])
diff --git a/ggml/examples/python/test_tensor.py b/ggml/examples/python/test_tensor.py
deleted file mode 100644
index 1a365fa..0000000
--- a/ggml/examples/python/test_tensor.py
+++ /dev/null
@@ -1,258 +0,0 @@
-import pytest
-from pytest import raises
-
-from ggml import lib, ffi
-from ggml.utils import init, copy, numpy
-import numpy as np
-import numpy.testing as npt
-
-@pytest.fixture()
-def ctx():
-    print("setup")
-    yield init(mem_size=10*1024*1024)
-    print("teardown")
-
-class TestNumPy:
-    
-    # Single element
-
-    def test_set_get_single_i32(self, ctx):
-        i = lib.ggml_new_i32(ctx, 42)
-        assert lib.ggml_get_i32_1d(i, 0) == 42
-        assert numpy(i) == np.array([42], dtype=np.int32)
-
-    def test_set_get_single_f32(self, ctx):
-        i = lib.ggml_new_f32(ctx, 4.2)
-        
-        epsilon = 0.000001 # Not sure why so large a difference??
-        pytest.approx(lib.ggml_get_f32_1d(i, 0), 4.2, epsilon)
-        pytest.approx(numpy(i), np.array([4.2], dtype=np.float32), epsilon)
-
-    def _test_copy_np_to_ggml(self, a: np.ndarray, t: ffi.CData):
-        a2 = a.copy() # Clone original
-        copy(a, t)
-        npt.assert_array_equal(numpy(t), a2)
-
-    # I32
-
-    def test_copy_np_to_ggml_1d_i32(self, ctx):
-        t = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_I32, 10)
-        a = np.arange(10, dtype=np.int32)
-        self._test_copy_np_to_ggml(a, t)
-
-    def test_copy_np_to_ggml_2d_i32(self, ctx):
-        t = lib.ggml_new_tensor_2d(ctx, lib.GGML_TYPE_I32, 2, 3)
-        a = np.arange(2 * 3, dtype=np.int32).reshape((2, 3))
-        self._test_copy_np_to_ggml(a, t)
-
-    def test_copy_np_to_ggml_3d_i32(self, ctx):
-        t = lib.ggml_new_tensor_3d(ctx, lib.GGML_TYPE_I32, 2, 3, 4)
-        a = np.arange(2 * 3 * 4, dtype=np.int32).reshape((2, 3, 4))
-        self._test_copy_np_to_ggml(a, t)
-
-    def test_copy_np_to_ggml_4d_i32(self, ctx):
-        t = lib.ggml_new_tensor_4d(ctx, lib.GGML_TYPE_I32, 2, 3, 4, 5)
-        a = np.arange(2 * 3 * 4 * 5, dtype=np.int32).reshape((2, 3, 4, 5))
-        self._test_copy_np_to_ggml(a, t)
-
-    def test_copy_np_to_ggml_4d_n_i32(self, ctx):
-        dims = [2, 3, 4, 5] # GGML_MAX_DIMS is 4, going beyond would crash
-        pdims = ffi.new('int64_t[]', len(dims))
-        for i, d in enumerate(dims): pdims[i] = d
-        t = lib.ggml_new_tensor(ctx, lib.GGML_TYPE_I32, len(dims), pdims)
-        a = np.arange(np.prod(dims), dtype=np.int32).reshape(tuple(pdims))
-        self._test_copy_np_to_ggml(a, t)
-
-    # F32
-
-    def test_copy_np_to_ggml_1d_f32(self, ctx):
-        t = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_F32, 10)
-        a = np.arange(10, dtype=np.float32)
-        self._test_copy_np_to_ggml(a, t)
-
-    def test_copy_np_to_ggml_2d_f32(self, ctx):
-        t = lib.ggml_new_tensor_2d(ctx, lib.GGML_TYPE_F32, 2, 3)
-        a = np.arange(2 * 3, dtype=np.float32).reshape((2, 3))
-        self._test_copy_np_to_ggml(a, t)
-
-    def test_copy_np_to_ggml_3d_f32(self, ctx):
-        t = lib.ggml_new_tensor_3d(ctx, lib.GGML_TYPE_F32, 2, 3, 4)
-        a = np.arange(2 * 3 * 4, dtype=np.float32).reshape((2, 3, 4))
-        self._test_copy_np_to_ggml(a, t)
-
-    def test_copy_np_to_ggml_4d_f32(self, ctx):
-        t = lib.ggml_new_tensor_4d(ctx, lib.GGML_TYPE_F32, 2, 3, 4, 5)
-        a = np.arange(2 * 3 * 4 * 5, dtype=np.float32).reshape((2, 3, 4, 5))
-        self._test_copy_np_to_ggml(a, t)
-
-    def test_copy_np_to_ggml_4d_n_f32(self, ctx):
-        dims = [2, 3, 4, 5] # GGML_MAX_DIMS is 4, going beyond would crash
-        pdims = ffi.new('int64_t[]', len(dims))
-        for i, d in enumerate(dims): pdims[i] = d
-        t = lib.ggml_new_tensor(ctx, lib.GGML_TYPE_F32, len(dims), pdims)
-        a = np.arange(np.prod(dims), dtype=np.float32).reshape(tuple(pdims))
-        self._test_copy_np_to_ggml(a, t)
-
-    # F16
-
-    def test_copy_np_to_ggml_1d_f16(self, ctx):
-        t = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_F16, 10)
-        a = np.arange(10, dtype=np.float16)
-        self._test_copy_np_to_ggml(a, t)
-
-    def test_copy_np_to_ggml_2d_f16(self, ctx):
-        t = lib.ggml_new_tensor_2d(ctx, lib.GGML_TYPE_F16, 2, 3)
-        a = np.arange(2 * 3, dtype=np.float16).reshape((2, 3))
-        self._test_copy_np_to_ggml(a, t)
-
-    def test_copy_np_to_ggml_3d_f16(self, ctx):
-        t = lib.ggml_new_tensor_3d(ctx, lib.GGML_TYPE_F16, 2, 3, 4)
-        a = np.arange(2 * 3 * 4, dtype=np.float16).reshape((2, 3, 4))
-        self._test_copy_np_to_ggml(a, t)
-
-    def test_copy_np_to_ggml_4d_f16(self, ctx):
-        t = lib.ggml_new_tensor_4d(ctx, lib.GGML_TYPE_F16, 2, 3, 4, 5)
-        a = np.arange(2 * 3 * 4 * 5, dtype=np.float16).reshape((2, 3, 4, 5))
-        self._test_copy_np_to_ggml(a, t)
-
-    def test_copy_np_to_ggml_4d_n_f16(self, ctx):
-        dims = [2, 3, 4, 5] # GGML_MAX_DIMS is 4, going beyond would crash
-        pdims = ffi.new('int64_t[]', len(dims))
-        for i, d in enumerate(dims): pdims[i] = d
-        t = lib.ggml_new_tensor(ctx, lib.GGML_TYPE_F16, len(dims), pdims)
-        a = np.arange(np.prod(dims), dtype=np.float16).reshape(tuple(pdims))
-        self._test_copy_np_to_ggml(a, t)
-
-    # Mismatching shapes
-
-    def test_copy_mismatching_shapes_1d(self, ctx):
-        t = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_F32, 10)
-        a = np.arange(10, dtype=np.float32)
-        copy(a, t) # OK
-        
-        a = a.reshape((5, 2))
-        with raises(AssertionError): copy(a, t)
-        with raises(AssertionError): copy(t, a)
-            
-    def test_copy_mismatching_shapes_2d(self, ctx):
-        t = lib.ggml_new_tensor_2d(ctx, lib.GGML_TYPE_F32, 2, 3)
-        a = np.arange(6, dtype=np.float32)
-        copy(a.reshape((2, 3)), t) # OK
-        
-        a = a.reshape((3, 2))
-        with raises(AssertionError): copy(a, t)
-        with raises(AssertionError): copy(t, a)
-
-    def test_copy_mismatching_shapes_3d(self, ctx):
-        t = lib.ggml_new_tensor_3d(ctx, lib.GGML_TYPE_F32, 2, 3, 4)
-        a = np.arange(24, dtype=np.float32)
-        copy(a.reshape((2, 3, 4)), t) # OK
-        
-        a = a.reshape((2, 4, 3))
-        with raises(AssertionError): copy(a, t)
-        with raises(AssertionError): copy(t, a)
-
-    def test_copy_mismatching_shapes_4d(self, ctx):
-        t = lib.ggml_new_tensor_4d(ctx, lib.GGML_TYPE_F32, 2, 3, 4, 5)
-        a = np.arange(24*5, dtype=np.float32)
-        copy(a.reshape((2, 3, 4, 5)), t) # OK
-        
-        a = a.reshape((2, 3, 5, 4))
-        with raises(AssertionError): copy(a, t)
-        with raises(AssertionError): copy(t, a)
-
-    def test_copy_f16_to_f32(self, ctx):
-        t = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_F32, 1)
-        a = np.array([123.45], dtype=np.float16)
-        copy(a, t)
-        np.testing.assert_allclose(lib.ggml_get_f32_1d(t, 0), 123.45, rtol=1e-3)
-
-    def test_copy_f32_to_f16(self, ctx):
-        t = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_F16, 1)
-        a = np.array([123.45], dtype=np.float32)
-        copy(a, t)
-        np.testing.assert_allclose(lib.ggml_get_f32_1d(t, 0), 123.45, rtol=1e-3)
-
-    def test_copy_f16_to_Q5_K(self, ctx):
-        n = 256
-        t = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_Q5_K, n)
-        a = np.arange(n, dtype=np.float16)
-        copy(a, t)
-        np.testing.assert_allclose(a, numpy(t, allow_copy=True), rtol=0.05)
-
-    def test_copy_Q5_K_to_f16(self, ctx):
-        n = 256
-        t = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_Q5_K, n)
-        copy(np.arange(n, dtype=np.float32), t)
-        a = np.arange(n, dtype=np.float16)
-        copy(t, a)
-        np.testing.assert_allclose(a, numpy(t, allow_copy=True), rtol=0.05)
-
-    def test_copy_i16_f32_mismatching_types(self, ctx):
-        t = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_F32, 1)
-        a = np.arange(1, dtype=np.int16)
-        with raises(NotImplementedError): copy(a, t)
-        with raises(NotImplementedError): copy(t, a)
-
-class TestTensorCopy:
-
-    def test_copy_self(self, ctx):
-        t = lib.ggml_new_i32(ctx, 42)
-        copy(t, t)
-        assert lib.ggml_get_i32_1d(t, 0) == 42
-
-    def test_copy_1d(self, ctx):
-        t1 = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_F32, 10)
-        t2 = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_F32, 10)
-        a = np.arange(10, dtype=np.float32)
-        copy(a, t1)
-        copy(t1, t2)
-        assert np.allclose(a, numpy(t2))
-        assert np.allclose(numpy(t1), numpy(t2))
-
-class TestGraph:
-
-    def test_add(self, ctx):
-        n = 256
-        ta = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_F32, n)
-        tb = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_F32, n)
-        tsum = lib.ggml_add(ctx, ta, tb)
-        assert tsum.type == lib.GGML_TYPE_F32
-
-        gf = ffi.new('struct ggml_cgraph*')
-        lib.ggml_build_forward_expand(gf, tsum)
-
-        a = np.arange(0, n, dtype=np.float32)
-        b = np.arange(n, 0, -1, dtype=np.float32)
-        copy(a, ta)
-        copy(b, tb)
-
-        lib.ggml_graph_compute_with_ctx(ctx, gf, 1)
-
-        assert np.allclose(numpy(tsum, allow_copy=True), a + b)
-
-class TestQuantization:
-
-    def test_quantized_add(self, ctx):
-        n = 256
-        ta = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_Q5_K, n)
-        tb = lib.ggml_new_tensor_1d(ctx, lib.GGML_TYPE_F32, n)
-        tsum = lib.ggml_add(ctx, ta, tb)
-        assert tsum.type == lib.GGML_TYPE_Q5_K
-
-        gf = ffi.new('struct ggml_cgraph*')
-        lib.ggml_build_forward_expand(gf, tsum)
-
-        a = np.arange(0, n, dtype=np.float32)
-        b = np.arange(n, 0, -1, dtype=np.float32)
-        copy(a, ta)
-        copy(b, tb)
-
-        lib.ggml_graph_compute_with_ctx(ctx, gf, 1)
-
-        unquantized_sum = a + b
-        sum = numpy(tsum, allow_copy=True)
-
-        diff = np.linalg.norm(unquantized_sum - sum, np.inf)
-        assert diff > 4
-        assert diff < 5
diff --git a/ggml/examples/replit/CMakeLists.txt b/ggml/examples/replit/CMakeLists.txt
deleted file mode 100644
index 696b7f9..0000000
--- a/ggml/examples/replit/CMakeLists.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-#
-# replit
-
-set(TEST_TARGET replit)
-add_executable(${TEST_TARGET} main.cpp)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)
-
-#
-# replit-quantize
-
-set(TEST_TARGET replit-quantize)
-add_executable(${TEST_TARGET} quantize.cpp)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)
diff --git a/ggml/examples/replit/convert-h5-to-ggml.py b/ggml/examples/replit/convert-h5-to-ggml.py
deleted file mode 100644
index 4fc15a9..0000000
--- a/ggml/examples/replit/convert-h5-to-ggml.py
+++ /dev/null
@@ -1,117 +0,0 @@
-from pathlib import Path
-import sys
-import struct
-import json
-import numpy as np
-from transformers import AutoModelForCausalLM, AutoTokenizer
-import sentencepiece.sentencepiece_model_pb2 as model
-
-if len(sys.argv) < 3:
-    print("Usage: convert-h5-to-ggml.py dir-model [use-f32]\n")
-    print("  ftype == 0 -> float32")
-    print("  ftype == 1 -> float16")
-    sys.exit(1)
-
-
-# output in the same directory as the model
-dir_model = sys.argv[1]
-fname_out = sys.argv[1] + "/ggml-model.bin"
-
-
-with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
-    hparams = json.load(f)
-
-sp_proto = model.ModelProto()
-sp_proto.ParseFromString(open(Path(sys.argv[1]) / "spiece.model", "rb").read())
-
-
-# possible data types
-#   ftype == 0 -> float32
-#   ftype == 1 -> float16
-#
-# map from ftype to string
-ftype_str = ["f32", "f16"]
-
-ftype = 1
-if len(sys.argv) > 2:
-    ftype = int(sys.argv[2])
-    if ftype < 0 or ftype > 1:
-        print("Invalid ftype: " + str(ftype))
-        sys.exit(1)
-    fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin"
-
-
-tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
-model = AutoModelForCausalLM.from_pretrained(
-    dir_model, low_cpu_mem_usage=True, trust_remote_code=True
-)
-# print (model)
-
-# print(tokenizer.encode('I believe the meaning of life is'))
-
-list_vars = model.state_dict()
-for name in list_vars.keys():
-    print(name, list_vars[name].shape, list_vars[name].dtype)
-
-fout = open(fname_out, "wb")
-
-print(hparams)
-
-fout.write(struct.pack("i", 0x67676D6C))  # magic: ggml in hex
-fout.write(struct.pack("i", hparams["d_model"]))
-fout.write(struct.pack("i", hparams["max_seq_len"]))
-fout.write(struct.pack("i", hparams["n_heads"]))
-fout.write(struct.pack("i", hparams["n_layers"]))
-fout.write(struct.pack("i", hparams["vocab_size"]))
-fout.write(struct.pack("i", ftype))
-
-
-# TODO: temporary hack to not deal with implementing the tokenizer
-for piece in sp_proto.pieces:
-    encoded_piece = piece.piece.encode("utf-8")
-    fout.write(struct.pack("i", len(encoded_piece)))
-    fout.write(encoded_piece)
-    fout.write(struct.pack("f", piece.score))
-
-if hparams["vocab_size"] > len(sp_proto.pieces):
-    for i in range(hparams["vocab_size"] - len(sp_proto.pieces)):
-        fout.write(struct.pack("i", 0))
-        fout.write(struct.pack("f", 0))
-
-for name in list_vars.keys():
-    data = list_vars[name].squeeze().numpy()
-    print("Processing variable: " + name + " with shape: ", data.shape)
-
-    n_dims = len(data.shape)
-
-    # ftype == 0 -> float32, ftype == 1 -> float16
-    ftype_cur = 0
-    if ftype != 0:
-        if name[-7:] == ".weight" and n_dims == 2:
-            print("  Converting to float16")
-            data = data.astype(np.float16)
-            ftype_cur = 1
-        else:
-            print("  Converting to float32")
-            data = data.astype(np.float32)
-            ftype_cur = 0
-    else:
-        if data.dtype != np.float32:
-            print("  Converting to float32")
-            data = data.astype(np.float32)
-            ftype_cur = 0
-
-    # header
-    str = name.encode("utf-8")
-    fout.write(struct.pack("iii", n_dims, len(str), ftype_cur))
-    for i in range(n_dims):
-        fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
-    fout.write(str)
-
-    # data
-    data.tofile(fout)
-
-fout.close()
-
-print("Done. Output file: " + fname_out)
-print("")
diff --git a/ggml/examples/replit/main.cpp b/ggml/examples/replit/main.cpp
deleted file mode 100644
index acd1cbb..0000000
--- a/ggml/examples/replit/main.cpp
+++ /dev/null
@@ -1,798 +0,0 @@
-#include "ggml/ggml.h"
-
-#include "common-ggml.h"
-#include "common.h"
-
-#include <cassert>
-#include <cmath>
-#include <cinttypes>
-#include <cstddef>
-#include <cstdio>
-#include <cstring>
-#include <fstream>
-#include <iostream>
-#include <map>
-#include <stdint.h>
-#include <string>
-#include <unordered_map>
-#include <utility>
-#include <vector>
-
-#if defined(_WIN32)
-#define NOMINMAX
-#include <Windows.h>
-bool is_stdin_terminal() {
-    auto in = GetStdHandle(STD_INPUT_HANDLE);
-    return GetFileType(in) == FILE_TYPE_CHAR;
-}
-#else
-#include <unistd.h>
-bool is_stdin_terminal() {
-    return isatty(STDIN_FILENO);
-}
-#endif
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-using piece_t = std::pair<std::size_t, float>;
-using piece_map_t = std::unordered_map<std::string, piece_t>;
-
-struct replit_tokenizer {
-    gpt_vocab raw_vocab;
-    piece_map_t piece_map;
-    std::vector<std::string> vocab;
-};
-
-std::pair<std::vector<std::size_t>, float> encode_word(const std::string & word, const piece_map_t & model) {
-    std::vector<int> best_segmentations_starts(word.length() + 1, -1);
-    best_segmentations_starts[0] = 0;
-
-    std::vector<float> best_segmentations_scores(word.length() + 1, -std::numeric_limits<float>::infinity());
-    best_segmentations_scores[0] = 1.0;
-
-    for (size_t start_idx = 0; start_idx < word.length(); ++start_idx) {
-        float best_score_at_start = best_segmentations_scores[start_idx];
-        for (size_t end_idx = start_idx + 1; end_idx <= word.length(); ++end_idx) {
-            std::string token = word.substr(start_idx, end_idx - start_idx);
-            if (model.count(token) && best_score_at_start != -std::numeric_limits<float>::infinity()) {
-                float token_score = model.at(token).second;
-                float score = token_score + best_score_at_start;
-                if (best_segmentations_scores[end_idx] == -std::numeric_limits<float>::infinity() ||
-                    best_segmentations_scores[end_idx] > score) {
-                    best_segmentations_starts[end_idx] = start_idx;
-                    best_segmentations_scores[end_idx] = score;
-                }
-            }
-        }
-    }
-
-    if (best_segmentations_scores.back() == -std::numeric_limits<float>::infinity()) {
-        return std::make_pair(std::vector<std::size_t>{0}, 0.0f);
-    }
-
-    float score = best_segmentations_scores.back();
-    int start = best_segmentations_starts.back();
-    int end = word.length();
-    std::vector<std::size_t> tokens;
-    while (start != 0) {
-        const auto token_id = model.at(word.substr(start, end - start)).first;
-        tokens.insert(tokens.begin(), token_id);
-        int next_start = best_segmentations_starts[start];
-        end = start;
-        start = next_start;
-    }
-    const auto token_id = model.at(word.substr(start, end - start)).first;
-    tokens.insert(tokens.begin(), token_id);
-    return std::make_pair(tokens, score);
-}
-
-bool replit_tokenizer_load(replit_tokenizer & tokenizer, std::istream & fin, int max_vocab_size) {
-    std::string word;
-    std::vector<char> buf(128);
-
-    for (int i = 0; i < max_vocab_size; i++) {
-        uint32_t len;
-        fin.read((char *)&len, sizeof(len));
-
-        buf.resize(len);
-        fin.read((char *)buf.data(), len);
-        word.assign(buf.data(), len);
-
-        float score;
-        fin.read((char *)&score, sizeof(score));
-
-        tokenizer.piece_map[word] = std::make_pair(i, -score);
-        tokenizer.raw_vocab.id_to_token[i] = word;
-    }
-
-    return true;
-}
-
-std::string replace_all(const std::string & str,    // where to work
-                        const std::string & find,   // substitute 'find'
-                        const std::string & replace //      by 'replace'
-) {
-    using namespace std;
-    string result;
-    size_t find_len = find.size();
-    size_t pos, from = 0;
-    while (string::npos != (pos = str.find(find, from))) {
-        result.append(str, from, pos - from);
-        result.append(replace);
-        from = pos + find_len;
-    }
-    result.append(str, from, string::npos);
-    return result;
-}
-
-std::string ws_symbol = "\342\226\201";
-std::vector<std::size_t> replit_tokenizer_tokenize(replit_tokenizer & tokenizer, const std::string & text) {
-    std::vector<std::size_t> tokens;
-    auto normalized_text = replace_all(text, " ", ws_symbol);
-    auto tokenized = encode_word(normalized_text, tokenizer.piece_map);
-
-    return tokenized.first;
-}
-
-std::string replit_tokenizer_detokenize(replit_tokenizer & tokenizer, const std::vector<std::size_t> & tokens) {
-    std::string text;
-    for (auto token : tokens) {
-        text += tokenizer.raw_vocab.id_to_token[token];
-    }
-    auto denormalized_text = replace_all(text, ws_symbol, " ");
-    return denormalized_text;
-}
-
-// no defaults for now
-struct replit_hparams {
-    int32_t d_model = 0;
-    int32_t max_seq_len = 0;
-    int32_t n_heads = 0;
-    int32_t n_layers = 0;
-    int32_t n_vocab = 0;
-    int32_t ftype = 0;
-};
-
-struct replit_layer {
-    // pre normalization
-    struct ggml_tensor * norm_1_weight;
-
-    // attention
-    struct ggml_tensor * c_attn_wqkv_weight;
-    struct ggml_tensor * c_attn_out_proj_weight;
-
-    // post normalization
-    struct ggml_tensor * norm_2_weight;
-
-    // ff
-    struct ggml_tensor * ffn_up_proj;
-    struct ggml_tensor * ffn_down_proj;
-};
-
-struct replit_model {
-    replit_hparams hparams;
-
-    struct ggml_tensor * wte_weight;    // position embedding
-    struct ggml_tensor * norm_f_weight; // language model head
-
-    std::vector<replit_layer> layers;
-
-    // key + value memory
-    struct ggml_tensor * memory_k;
-    struct ggml_tensor * memory_v;
-
-    struct ggml_context * ctx;
-    std::map<std::string, struct ggml_tensor *> tensors;
-};
-
-// load the model's weights from a file
-bool replit_model_load(const std::string & fname, replit_model & model, replit_tokenizer & vocab) {
-    printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
-
-    auto fin = std::ifstream(fname, std::ios::binary);
-    if (!fin) {
-        fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
-        return false;
-    }
-
-    // verify magic
-    {
-        uint32_t magic;
-        fin.read((char *)&magic, sizeof(magic));
-        if (magic != GGML_FILE_MAGIC) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
-            return false;
-        }
-    }
-
-    // load hparams
-    {
-        auto & hparams = model.hparams;
-
-        fin.read((char *)&hparams.d_model, sizeof(hparams.d_model));
-        fin.read((char *)&hparams.max_seq_len, sizeof(hparams.max_seq_len));
-        fin.read((char *)&hparams.n_heads, sizeof(hparams.n_heads));
-        fin.read((char *)&hparams.n_layers, sizeof(hparams.n_layers));
-        fin.read((char *)&hparams.n_vocab, sizeof(hparams.n_vocab));
-        fin.read((char *)&hparams.ftype, sizeof(hparams.ftype));
-
-        const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
-
-        printf("%s: d_model      = %d\n", __func__, hparams.d_model);
-        printf("%s: max_seq_len  = %d\n", __func__, hparams.max_seq_len);
-        printf("%s: n_heads      = %d\n", __func__, hparams.n_heads);
-        printf("%s: n_layers     = %d\n", __func__, hparams.n_layers);
-        printf("%s: n_vocab      = %d\n", __func__, hparams.n_vocab);
-        printf("%s: ftype        = %d\n", __func__, hparams.ftype);
-        printf("%s: qntvr        = %d\n", __func__, qntvr);
-
-        hparams.ftype %= GGML_QNT_VERSION_FACTOR;
-    }
-
-    // load vocab
-    replit_tokenizer_load(vocab, fin, model.hparams.n_vocab);
-
-    // for the big tensors, we have the option to store the data in 16-bit
-    // floats or quantized in order to save memory and also to speed up the
-    // computation
-    ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype)(model.hparams.ftype));
-    if (wtype == GGML_TYPE_COUNT) {
-        fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n", __func__, fname.c_str(),
-                model.hparams.ftype);
-        return false;
-    }
-
-    auto & ctx = model.ctx;
-
-    size_t ctx_size = 0;
-
-    {
-        const auto & hparams = model.hparams;
-
-        const int n_embd = hparams.d_model;
-        const int n_layer = hparams.n_layers;
-        const int n_ctx = hparams.max_seq_len;
-        const int n_vocab = hparams.n_vocab;
-
-        ctx_size += ggml_row_size(wtype,         n_embd*n_vocab); // wte_weight
-        ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd);         // ln_f_weight
-
-        ctx_size += n_layer * (ggml_row_size(GGML_TYPE_F32, n_embd));       // ln_1_weight
-
-        ctx_size += n_layer * (ggml_row_size(wtype, 3 * n_embd * n_embd));  // attn_Wqkv_weight
-        ctx_size += n_layer * (ggml_row_size(wtype,     n_embd * n_embd)); // attn_out_proj_weight
-
-        ctx_size += n_layer * (ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_weight
-
-        ctx_size += n_layer * (ggml_row_size(wtype, 4 * n_embd * n_embd)); // mlp_mlp_up_weight
-        ctx_size += n_layer * (ggml_row_size(wtype, 4 * n_embd * n_embd)); // mlp_mlp_down_weight
-
-        ctx_size += n_ctx * n_layer * ggml_row_size(GGML_TYPE_F16, n_embd); // memory_k
-        ctx_size += n_ctx * n_layer * ggml_row_size(GGML_TYPE_F16, n_embd); // memory_v
-
-        ctx_size += (1 + 6 * n_layer) * 512; // object overhead
-
-        printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size / (1024.0 * 1024.0));
-    }
-
-    // create the ggml context
-    {
-        struct ggml_init_params params = {
-            /*.mem_size   =*/ ctx_size,
-            /*.mem_buffer =*/ NULL,
-            /*.no_alloc   =*/ false,
-        };
-
-        model.ctx = ggml_init(params);
-        if (!model.ctx) {
-            fprintf(stderr, "%s: ggml_init() failed\n", __func__);
-            return false;
-        }
-    }
-
-    // prepare memory for the weights
-    {
-        const auto & hparams = model.hparams;
-
-        const size_t n_embd = hparams.d_model;
-        const size_t n_layer = hparams.n_layers;
-        const size_t n_vocab = hparams.n_vocab;
-
-        model.layers.resize(n_layer);
-
-        model.wte_weight = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
-        model.norm_f_weight = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-
-        // map by name
-        model.tensors["transformer.wte.weight"] = model.wte_weight;
-        model.tensors["transformer.norm_f.weight"] = model.norm_f_weight;
-
-        for (int i = 0; i < (int)n_layer; ++i) {
-            auto & layer = model.layers[i];
-
-            layer.norm_1_weight = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-            layer.c_attn_wqkv_weight = ggml_new_tensor_2d(ctx, wtype, n_embd, 3 * n_embd);
-            layer.c_attn_out_proj_weight = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
-            layer.norm_2_weight = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-            layer.ffn_up_proj = ggml_new_tensor_2d(ctx, wtype, n_embd, 4 * n_embd);
-            layer.ffn_down_proj = ggml_new_tensor_2d(ctx, wtype, 4 * n_embd, n_embd);
-
-            // map by name
-            model.tensors["transformer.blocks." + std::to_string(i) + ".norm_1.weight"] = layer.norm_1_weight;
-            model.tensors["transformer.blocks." + std::to_string(i) + ".attn.Wqkv.weight"] = layer.c_attn_wqkv_weight;
-            model.tensors["transformer.blocks." + std::to_string(i) + ".attn.out_proj.weight"] =
-                layer.c_attn_out_proj_weight;
-            model.tensors["transformer.blocks." + std::to_string(i) + ".norm_2.weight"] = layer.norm_2_weight;
-            model.tensors["transformer.blocks." + std::to_string(i) + ".ffn.up_proj.weight"] = layer.ffn_up_proj;
-            model.tensors["transformer.blocks." + std::to_string(i) + ".ffn.down_proj.weight"] = layer.ffn_down_proj;
-        }
-    }
-
-    // key + value memory
-    {
-        const auto & hparams = model.hparams;
-
-        const int n_embd = hparams.d_model;
-        const int n_layer = hparams.n_layers;
-        const int n_ctx = hparams.max_seq_len;
-
-        const int64_t n_mem = n_layer * n_ctx;
-        const int64_t n_elements = n_embd * n_mem;
-
-        model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
-        model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
-
-        const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
-
-        printf("%s: memory_size = %8.2f MB, n_mem = %" PRIu64 "\n", __func__, memory_size / 1024.0 / 1024.0, n_mem);
-    }
-
-    // load weights
-    {
-        int n_tensors = 0;
-        size_t total_size = 0;
-
-        printf("%s: ", __func__);
-
-        while (true) {
-            int32_t n_dims;
-            int32_t length;
-            int32_t ttype;
-
-            fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
-            fin.read(reinterpret_cast<char *>(&length), sizeof(length));
-            fin.read(reinterpret_cast<char *>(&ttype), sizeof(ttype));
-
-            if (fin.eof()) {
-                break;
-            }
-
-            int32_t nelements = 1;
-            int32_t ne[2] = {1, 1};
-            for (int i = 0; i < n_dims; ++i) {
-                fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
-                nelements *= ne[i];
-            }
-
-            std::string name(length, 0);
-            fin.read(&name[0], length);
-
-            if (model.tensors.find(name) == model.tensors.end()) {
-                fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str());
-                return false;
-            }
-
-            auto tensor = model.tensors[name];
-            if (ggml_nelements(tensor) != nelements) {
-                fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.c_str());
-                return false;
-            }
-
-            if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
-                fprintf(stderr,
-                        "%s: tensor '%s' has wrong shape in model file: got [%5d, "
-                        "%5d], expected [%5d, %5d]\n",
-                        __func__, name.c_str(), (int)tensor->ne[0], (int)tensor->ne[1], ne[0], ne[1]);
-                return false;
-            }
-
-            // for debugging
-            if (0) {
-                printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), ne[0], ne[1],
-                       ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor) / 1024.0 / 1024.0, ggml_nbytes(tensor));
-            }
-
-            const size_t bpe = ggml_type_size(ggml_type(ttype));
-
-            if ((nelements * bpe) / ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
-                fprintf(stderr,
-                        "%s: tensor '%s' has wrong size in model file: got %zu, "
-                        "expected %zu\n",
-                        __func__, name.c_str(), ggml_nbytes(tensor), nelements * bpe);
-                return false;
-            }
-
-            fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
-
-            total_size += ggml_nbytes(tensor);
-            if (++n_tensors % 8 == 0) {
-                printf(".");
-                fflush(stdout);
-            }
-        }
-
-        printf(" done\n");
-
-        printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size / 1024.0 / 1024.0, n_tensors);
-    }
-
-    fin.close();
-
-    return true;
-}
-
-// evaluate the transformer
-//
-//   - model:     the model
-//   - n_threads: number of threads to use
-//   - n_past:    the context size so far
-//   - embd_inp:  the embeddings of the tokens in the context
-//   - embd_w:    the predicted logits for the next token
-//
-bool replit_eval(const replit_model & model, const int n_threads, const int n_past,
-                 const std::vector<gpt_vocab::id> & embd_inp, std::vector<float> & embd_w, bool logits_all,
-                 size_t & mem_per_token) {
-    const int N = embd_inp.size();
-
-    const auto & hparams = model.hparams;
-
-    const int n_embd = hparams.d_model;
-    const int n_layer = hparams.n_layers;
-    const int n_head = hparams.n_heads;
-    const int n_vocab = hparams.n_vocab;
-    const int n_ctx = hparams.max_seq_len;
-    const float eps = 1e-5f;
-
-    static size_t buf_size = 256u * 1024 * 1024;
-    static void * buf = malloc(buf_size);
-
-    if (mem_per_token > 0 && mem_per_token * N > buf_size) {
-        const size_t buf_size_new = 1.1 * (mem_per_token * N); // add 10% to account for ggml object overhead
-        // printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__,
-        // buf_size, buf_size_new);
-
-        // reallocate
-        buf_size = buf_size_new;
-        buf = realloc(buf, buf_size);
-        if (buf == nullptr) {
-            fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
-            return false;
-        }
-    }
-
-    struct ggml_init_params params = {
-        /*.mem_size   =*/ buf_size,
-        /*.mem_buffer =*/ buf,
-        /*.no_alloc   =*/ false,
-    };
-
-    struct ggml_context * ctx0 = ggml_init(params);
-    struct ggml_cgraph * gf = ggml_new_graph(ctx0);
-
-    struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
-    memcpy(embd->data, embd_inp.data(), N * ggml_element_size(embd));
-
-    struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.wte_weight, embd);
-
-    for (int il = 0; il < n_layer; ++il) {
-
-        struct ggml_tensor * cur;
-
-        // a = self.ln_1(x)
-        {
-            cur = ggml_norm(ctx0, inpL, eps);
-
-            cur = ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].norm_1_weight, cur), cur);
-        }
-
-        // self-attention
-        //  b, _, past_key_value = self.attn(a, past_key_value=past_key_value,
-        //  attn_bias=attn_bias, attention_mask=attention_mask,
-        //  is_causal=is_causal)
-        {
-            // compute QKV
-            cur = ggml_mul_mat(ctx0, model.layers[il].c_attn_wqkv_weight, cur);
-
-            struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0 * sizeof(float) * n_embd);
-            struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1 * sizeof(float) * n_embd);
-            struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2 * sizeof(float) * n_embd);
-
-            // store key and value to memory
-            {
-                struct ggml_tensor * k =
-                    ggml_view_1d(ctx0, model.memory_k, N * n_embd,
-                                 (ggml_element_size(model.memory_k) * n_embd) * (il * n_ctx + n_past));
-                struct ggml_tensor * v =
-                    ggml_view_1d(ctx0, model.memory_v, N * n_embd,
-                                 (ggml_element_size(model.memory_v) * n_embd) * (il * n_ctx + n_past));
-
-                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
-                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
-            }
-
-            // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0,
-            // 2, 1, 3) [64, N, 12]
-            struct ggml_tensor * Q = ggml_permute(
-                ctx0, ggml_cpy(ctx0, Qcur, ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd / n_head, n_head, N)), 0, 2,
-                1, 3);
-
-            // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1,
-            // 3) [64, n_past + N, 12]
-            struct ggml_tensor * K =
-                ggml_permute(ctx0,
-                             ggml_reshape_3d(ctx0,
-                                             ggml_view_1d(ctx0, model.memory_k, (n_past + N) * n_embd,
-                                                          il * n_ctx * ggml_element_size(model.memory_k) * n_embd),
-                                             n_embd / n_head, n_head, n_past + N),
-                             0, 2, 1, 3);
-            // K * Q
-            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
-
-            // KQ_scaled = KQ / sqrt(n_embd/n_head)
-            struct ggml_tensor * KQ_scaled =
-                ggml_scale(ctx0, KQ, 1.0f / sqrt(float(n_embd) / n_head));
-
-            struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, n_past, n_head, 8.0f);
-
-            // KQ_masked = mask_past(KQ_scaled)
-            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past);
-
-            // KQ = soft_max(KQ_masked)
-            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
-
-            // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1,
-            // 2, 0, 3).contiguous() [n_past + N, 64, 12]
-            struct ggml_tensor * V_trans = ggml_cpy(
-                ctx0,
-                ggml_permute(ctx0,
-                             ggml_reshape_3d(ctx0,
-                                             ggml_view_1d(ctx0, model.memory_v, (n_past + N) * n_embd,
-                                                          il * n_ctx * ggml_element_size(model.memory_v) * n_embd),
-                                             n_embd / n_head, n_head, n_past + N),
-                             1, 2, 0, 3),
-                ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd / n_head, n_head));
-
-            // KQV = transpose(V) * KQ_soft_max
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
-
-            // KQV_merged = KQV.permute(0, 2, 1, 3)
-            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
-
-            // cur = KQV_merged.contiguous().view(n_embd, N)
-            cur = ggml_cpy(ctx0, KQV_merged, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
-
-            // projection
-            { cur = ggml_mul_mat(ctx0, model.layers[il].c_attn_out_proj_weight, cur); }
-        }
-
-        inpL = ggml_add(ctx0, inpL, cur);
-
-        // m = self.ln_2(x)
-        {
-            cur = ggml_norm(ctx0, inpL, eps);
-
-            cur = ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].norm_2_weight, cur), cur);
-        }
-
-        // n = self.mlp(m)
-        {
-
-            cur = ggml_mul_mat(ctx0, model.layers[il].ffn_up_proj, cur);
-
-            // GELU activation
-            cur = ggml_gelu(ctx0, cur);
-
-            // projection
-            // cur = proj_w*cur + proj_b
-            cur = ggml_mul_mat(ctx0, model.layers[il].ffn_down_proj, cur);
-        }
-
-        // x = x + n
-        inpL = ggml_add(ctx0, inpL, cur);
-    }
-
-    // norm
-    {
-        inpL = ggml_norm(ctx0, inpL, eps);
-        // inpL = ln_f_g*inpL
-        inpL = ggml_mul(ctx0, ggml_repeat(ctx0, model.norm_f_weight, inpL), inpL);
-    }
-
-    // output embedding weight tied to input embedding
-    inpL = ggml_mul_mat(ctx0, model.wte_weight, inpL);
-
-    // logits -> probs
-    // inpL = ggml_soft_max(ctx0, inpL);
-
-    // run the computation
-    ggml_build_forward_expand(gf, inpL);
-    ggml_graph_compute_with_ctx(ctx0, gf, n_threads);
-
-    // std::cout << "Qcur" << std::endl;
-    // print_tensor(Qcur);
-
-    // if (n_past%100 == 0) {
-    // ggml_graph_print(&gf);
-    // ggml_graph_dump_dot(&gf, NULL, "mpt-model.dot");
-    // }
-
-    if (logits_all) {
-        // return result for all tokens
-        embd_w.resize(n_vocab * N);
-        memcpy(embd_w.data(), (float *)ggml_get_data(inpL), sizeof(float) * n_vocab * N);
-    } else {
-        // return result for just the last token
-        embd_w.resize(n_vocab);
-        memcpy(embd_w.data(), (float *)ggml_get_data(inpL) + (n_vocab * (N - 1)), sizeof(float) * n_vocab);
-    }
-
-    if (mem_per_token == 0) {
-        mem_per_token = ggml_used_mem(ctx0) / N;
-    }
-    // printf("used_mem = %zu\n", ggml_used_mem(ctx0));
-
-    ggml_free(ctx0);
-
-    return true;
-}
-
-int main(int argc, char ** argv) {
-    const int64_t t_main_start_us = ggml_time_us();
-
-    gpt_params params;
-    params.model = "";
-
-    if (gpt_params_parse(argc, argv, params) == false) {
-        return 1;
-    }
-
-    if (params.seed < 0) {
-        params.seed = time(NULL);
-    }
-
-    printf("%s: seed = %d\n", __func__, params.seed);
-
-    std::mt19937 rng(params.seed);
-    if (params.prompt.empty()) {
-        if (!is_stdin_terminal()) {
-            std::string line;
-            while (std::getline(std::cin, line)) {
-                params.prompt = params.prompt + "\n" + line;
-            }
-        } else {
-            params.prompt = gpt_random_prompt(rng);
-        }
-    }
-
-    int64_t t_load_us = 0;
-
-    replit_tokenizer vocab;
-    replit_model model;
-
-    // load the model
-    {
-        const int64_t t_start_us = ggml_time_us();
-
-        if (!replit_model_load(params.model, model, vocab)) {
-            fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
-            return 1;
-        }
-
-        t_load_us = ggml_time_us() - t_start_us;
-    }
-
-    int n_past = 0;
-
-    int64_t t_sample_us = 0;
-    int64_t t_predict_us = 0;
-
-    std::vector<float> logits;
-
-    // tokenize the prompt
-    std::vector<std::size_t> embd_inp = replit_tokenizer_tokenize(vocab, params.prompt);
-
-    printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
-
-    for (size_t i = 0; i < embd_inp.size(); i++) {
-        printf("%s: token[%zu] = %6zu\n", __func__, i, embd_inp[i]);
-        // vocab.id_to_token.at(embd_inp[i]).c_str()
-    }
-    printf("\n");
-
-    params.n_predict = std::min(params.n_predict, model.hparams.max_seq_len - (int)embd_inp.size());
-
-    std::vector<gpt_vocab::id> embd;
-
-    // determine the required inference memory per token:
-    size_t mem_per_token = 0;
-    replit_eval(model, params.n_threads, 0, {0, 1, 2, 3}, logits, false, mem_per_token);
-
-    for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) {
-        // predict
-        if (embd.size() > 0) {
-            const int64_t t_start_us = ggml_time_us();
-
-            if (!replit_eval(model, params.n_threads, n_past, embd, logits, false, mem_per_token)) {
-                printf("Failed to predict\n");
-                return 1;
-            }
-
-            t_predict_us += ggml_time_us() - t_start_us;
-        }
-
-        n_past += embd.size();
-        embd.clear();
-
-        if (i >= embd_inp.size()) {
-            // sample next token
-            const int top_k = params.top_k;
-            const float top_p = params.top_p;
-            const float temp = params.temp;
-
-            const int n_vocab = model.hparams.n_vocab;
-
-            gpt_vocab::id id = 0;
-
-            {
-                const int64_t t_start_sample_us = ggml_time_us();
-
-                id = gpt_sample_top_k_top_p(vocab.raw_vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p,
-                                            temp, rng);
-
-                t_sample_us += ggml_time_us() - t_start_sample_us;
-            }
-
-            // add it to the context
-            embd.push_back(id);
-        } else {
-            // if here, it means we are still processing the input prompt
-            for (size_t k = i; k < embd_inp.size(); k++) {
-                embd.push_back(embd_inp[k]);
-                if (int32_t(embd.size()) > params.n_batch) {
-                    break;
-                }
-            }
-            i += embd.size() - 1;
-        }
-
-        // display text
-        for (auto id : embd) {
-            printf("%s", replit_tokenizer_detokenize(vocab, {static_cast<std::size_t>(id)}).c_str());
-        }
-        fflush(stdout);
-
-        // end of text token
-        if (embd.back() == 0) {
-            break;
-        }
-    }
-
-    // report timing
-    {
-        const int64_t t_main_end_us = ggml_time_us();
-
-        printf("\n\n");
-        printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token);
-        printf("%s:     load time = %8.2f ms\n", __func__, t_load_us / 1000.0f);
-        printf("%s:   sample time = %8.2f ms\n", __func__, t_sample_us / 1000.0f);
-        printf("%s:  predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us / 1000.0f,
-               t_predict_us / 1000.0f / n_past);
-        printf("%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us) / 1000.0f);
-    }
-
-    ggml_free(model.ctx);
-
-    return 0;
-}
diff --git a/ggml/examples/replit/quantize.cpp b/ggml/examples/replit/quantize.cpp
deleted file mode 100644
index f274074..0000000
--- a/ggml/examples/replit/quantize.cpp
+++ /dev/null
@@ -1,182 +0,0 @@
-#include "ggml/ggml.h"
-
-#include "common-ggml.h"
-#include "common.h"
-
-#include <cassert>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <fstream>
-#include <map>
-#include <regex>
-#include <string>
-#include <vector>
-
-struct mpt_hparams {
-    int32_t d_model     = 0;
-    int32_t max_seq_len = 0;
-    int32_t n_heads     = 0;
-    int32_t n_layers    = 0;
-    int32_t n_vocab     = 0;
-    int32_t ftype       = 0;
-};
-
-// quantize a model
-bool mpt_model_quantize(const std::string & fname_inp,
-                        const std::string & fname_out, ggml_ftype ftype) {
-
-    printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
-
-    auto finp = std::ifstream(fname_inp, std::ios::binary);
-    if (!finp) {
-        fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__,
-                fname_inp.c_str());
-        return false;
-    }
-
-    auto fout = std::ofstream(fname_out, std::ios::binary);
-    if (!fout) {
-        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__,
-                fname_out.c_str());
-        return false;
-    }
-
-    // verify magic
-    {
-        uint32_t magic;
-        finp.read((char *)&magic, sizeof(magic));
-        if (magic != GGML_FILE_MAGIC) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n",
-                    __func__, fname_inp.c_str());
-            return false;
-        }
-
-        fout.write((char *)&magic, sizeof(magic));
-    }
-
-    mpt_hparams hparams;
-
-    // load hparams
-    {
-        finp.read((char *) &hparams.d_model,     sizeof(hparams.d_model));
-        finp.read((char *) &hparams.max_seq_len, sizeof(hparams.max_seq_len));
-        finp.read((char *) &hparams.n_heads,     sizeof(hparams.n_heads));
-        finp.read((char *) &hparams.n_layers,    sizeof(hparams.n_layers));
-        finp.read((char *) &hparams.n_vocab,     sizeof(hparams.n_vocab));
-        finp.read((char *) &hparams.ftype,       sizeof(hparams.ftype));
-
-        const int32_t qntvr_src =    hparams.ftype / GGML_QNT_VERSION_FACTOR;
-        const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
-
-        printf("%s: d_model     = %d\n", __func__, hparams.d_model);
-        printf("%s: max_seq_len = %d\n", __func__, hparams.max_seq_len);
-        printf("%s: n_heads     = %d\n", __func__, hparams.n_heads);
-        printf("%s: n_layers    = %d\n", __func__, hparams.n_layers);
-        printf("%s: n_vocab     = %d\n", __func__, hparams.n_vocab);
-        printf("%s: ftype (src) = %d\n", __func__, hparams.ftype);
-        printf("%s: qntvr (src) = %d\n", __func__, qntvr_src);
-        printf("%s: ftype (dst) = %d\n", __func__, ftype_dst);
-        printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
-
-        fout.write((char *) &hparams.d_model,     sizeof(hparams.d_model));
-        fout.write((char *) &hparams.max_seq_len, sizeof(hparams.max_seq_len));
-        fout.write((char *) &hparams.n_heads,     sizeof(hparams.n_heads));
-        fout.write((char *) &hparams.n_layers,    sizeof(hparams.n_layers));
-        fout.write((char *) &hparams.n_vocab,     sizeof(hparams.n_vocab));
-        fout.write((char *) &ftype_dst,           sizeof(ftype_dst));
-    }
-
-    // load vocab
-    {
-        const int32_t n_vocab = hparams.n_vocab;
-
-        std::string word;
-        for (int i = 0; i < n_vocab; i++) {
-            uint32_t len;
-            finp.read((char *)&len, sizeof(len));
-            fout.write((char *)&len, sizeof(len));
-
-            word.resize(len);
-            finp.read((char *)word.data(), len);
-            fout.write((char *)word.data(), len);
-
-            float prob;
-            finp.read((char *)&prob, sizeof(prob));
-            fout.write((char *)&prob, sizeof(prob));
-        }
-    }
-
-    printf("%s: quantizing tensors\n", __func__);
-
-    // regexes of tensor names to be quantized
-    const std::vector<std::string> to_quant = {
-        ".*weight",
-    };
-
-    if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) {
-        fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__,
-                fname_inp.c_str());
-        return false;
-    }
-
-    finp.close();
-    fout.close();
-
-    return true;
-}
-
-// usage:
-//  ./replit-quantize models/replit/ggml-model.bin
-//  models/replit/ggml-model-quant.bin type
-//
-int main(int argc, char ** argv) {
-    if (argc != 4) {
-        fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n",
-                argv[0]);
-        ggml_print_ftypes(stderr);
-        return 1;
-    }
-
-    // needed to initialize f16 tables
-    {
-        struct ggml_init_params params = {0, NULL, false};
-        struct ggml_context * ctx = ggml_init(params);
-        ggml_free(ctx);
-    }
-
-    const std::string fname_inp = argv[1];
-    const std::string fname_out = argv[2];
-
-    const ggml_ftype ftype = ggml_parse_ftype(argv[3]);
-
-    const int64_t t_main_start_us = ggml_time_us();
-
-    int64_t t_quantize_us = 0;
-
-    // load the model
-    {
-        const int64_t t_start_us = ggml_time_us();
-
-        if (!mpt_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
-            fprintf(stderr, "%s: failed to quantize model from '%s'\n",
-                    __func__, fname_inp.c_str());
-            return 1;
-        }
-
-        t_quantize_us = ggml_time_us() - t_start_us;
-    }
-
-    // report timing
-    {
-        const int64_t t_main_end_us = ggml_time_us();
-
-        printf("\n");
-        printf("%s: quantize time = %8.2f ms\n", __func__,
-               t_quantize_us / 1000.0f);
-        printf("%s:    total time = %8.2f ms\n", __func__,
-               (t_main_end_us - t_main_start_us) / 1000.0f);
-    }
-
-    return 0;
-}
diff --git a/ggml/examples/sam/CMakeLists.txt b/ggml/examples/sam/CMakeLists.txt
deleted file mode 100644
index fa302bb..0000000
--- a/ggml/examples/sam/CMakeLists.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-#
-# sam
-
-set(TEST_TARGET sam)
-add_executable(${TEST_TARGET} main.cpp)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml common)
-
-#
-# sam-quantize
-
-#set(TEST_TARGET sam-quantize)
-#add_executable(${TEST_TARGET} quantize.cpp)
-#target_link_libraries(${TEST_TARGET} PRIVATE ggml common)
diff --git a/ggml/examples/sam/README.md b/ggml/examples/sam/README.md
deleted file mode 100644
index 1eef7e7..0000000
--- a/ggml/examples/sam/README.md
+++ /dev/null
@@ -1,103 +0,0 @@
-# SAM.cpp
-
-Inference of Meta's [Segment Anything Model](https://github.com/facebookresearch/segment-anything/) in pure C/C++
-
-## Description
-
-The example currently supports only the [ViT-B SAM model checkpoint](https://huggingface.co/facebook/sam-vit-base).
-
-## Next steps
-
-- [X] Reduce memory usage by utilizing the new ggml-alloc
-- [X] Remove redundant graph nodes
-- [ ] Make inference faster
-- [X] Fix the difference in output masks compared to the PyTorch implementation
-- [X] Filter masks based on stability score
-- [ ] Add support for user input
-- [ ] Support F16 for heavy F32 ops
-- [ ] Test quantization
-- [X] Support bigger model checkpoints
-- [ ] GPU support
-
-## Quick start
-```bash
-git clone https://github.com/ggerganov/ggml
-cd ggml
-
-# Install Python dependencies
-python3 -m pip install -r requirements.txt
-
-# Download PTH model
-wget -P examples/sam/ https://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pth
-
-# Convert PTH model to ggml
-python examples/sam/convert-pth-to-ggml.py examples/sam/sam_vit_b_01ec64.pth examples/sam/ 1
-
-# Build ggml + examples
-mkdir build && cd build
-cmake .. && make -j4
-
-# run inference
-./bin/sam -t 16 -i ../examples/sam/example.jpg -m ../examples/sam/ggml-model-f16.bin
-```
-
-## Downloading and converting the model checkpoints
-
-You can download a [model checkpoint](https://github.com/facebookresearch/segment-anything/tree/main#model-checkpoints) and convert it to `ggml` format using the script `convert-pth-to-ggml.py`:
-
-## Example output on M2 Ultra
-```
- $ ▶ make -j sam && time ./bin/sam -t 8 -i img.jpg
-[ 28%] Built target common
-[ 71%] Built target ggml
-[100%] Built target sam
-main: seed = 1693224265
-main: loaded image 'img.jpg' (680 x 453)
-sam_image_preprocess: scale = 0.664062
-main: preprocessed image (1024 x 1024)
-sam_model_load: loading model from 'models/sam-vit-b/ggml-model-f16.bin' - please wait ...
-sam_model_load: n_enc_state      = 768
-sam_model_load: n_enc_layer      = 12
-sam_model_load: n_enc_head       = 12
-sam_model_load: n_enc_out_chans  = 256
-sam_model_load: n_pt_embd        = 4
-sam_model_load: ftype            = 1
-sam_model_load: qntvr            = 0
-operator(): ggml ctx size = 202.32 MB
-sam_model_load: ...................................... done
-sam_model_load: model size =   185.05 MB / num tensors = 304
-embd_img
-dims: 64 64 256 1 f32
-First & Last 10 elements:
--0.05117 -0.06408 -0.07154 -0.06991 -0.07212 -0.07690 -0.07508 -0.07281 -0.07383 -0.06779
-0.01589 0.01775 0.02250 0.01675 0.01766 0.01661 0.01811 0.02051 0.02103 0.03382
-sum:  12736.272313
-
-Skipping mask 0 with iou 0.705935 below threshold 0.880000
-Skipping mask 1 with iou 0.762136 below threshold 0.880000
-Mask 2: iou = 0.947081, stability_score = 0.955437, bbox (371, 436), (144, 168)
-
-
-main:     load time =    51.28 ms
-main:    total time =  2047.49 ms
-
-real	0m2.068s
-user	0m16.343s
-sys	0m0.214s
-```
-
-Input point is (414.375, 162.796875) (currently hardcoded)
-
-Input image:
-
-![llamas](https://user-images.githubusercontent.com/8558655/261301565-37b7bf4b-bf91-40cf-8ec1-1532316e1612.jpg)
-
-Output mask (mask_out_2.png in build folder):
-
-![mask_glasses](https://user-images.githubusercontent.com/8558655/263706800-47eeea30-1457-4c87-938b-8f11536c5aa7.png)
-
-## References
-
-- [ggml](https://github.com/ggerganov/ggml)
-- [SAM](https://segment-anything.com/)
-- [SAM demo](https://segment-anything.com/demo)
diff --git a/ggml/examples/sam/convert-pth-to-ggml.py b/ggml/examples/sam/convert-pth-to-ggml.py
deleted file mode 100644
index 0de422e..0000000
--- a/ggml/examples/sam/convert-pth-to-ggml.py
+++ /dev/null
@@ -1,147 +0,0 @@
-# Convert a SAM model checkpoint to a ggml compatible file
-#
-
-import sys
-import torch
-import struct
-import numpy as np
-
-if len(sys.argv) < 3:
-    print("Usage: convert-pth-to-ggml.py file-model dir-output [ftype]\n")
-    print("  ftype == 0 -> float32")
-    print("  ftype == 1 -> float16")
-    sys.exit(1)
-
-# output in the same directory as the model
-fname_model = sys.argv[1]
-dir_out     = sys.argv[2]
-fname_out   = dir_out + "/ggml-model.bin"
-
-# possible data types
-#   ftype == 0 -> float32
-#   ftype == 1 -> float16
-#
-# map from ftype to string
-ftype_str = ["f32", "f16"]
-
-ftype = 1
-if len(sys.argv) > 3:
-    ftype = int(sys.argv[3])
-
-if ftype < 0 or ftype > 1:
-    print("Invalid ftype: " + str(ftype))
-    sys.exit(1)
-
-fname_out = fname_out.replace(".bin", "-" + ftype_str[ftype] + ".bin")
-
-# Default params are set to sam_vit_b checkpoint
-n_enc_state = 768
-n_enc_layers = 12
-n_enc_heads = 12
-n_enc_out_chans = 256
-n_pt_embd = 4
-
-model = torch.load(fname_model, map_location="cpu")
-for k, v in model.items():
-    print(k, v.shape)
-    if k == "image_encoder.blocks.0.norm1.weight":
-        n_enc_state = v.shape[0]
-
-if n_enc_state == 1024: # sam_vit_l
-    n_enc_layers = 24
-    n_enc_heads  = 16
-elif n_enc_state == 1280: # sam_vit_h
-    n_enc_layers = 32
-    n_enc_heads  = 16
-
-hparams = {
-    "n_enc_state":      n_enc_state,
-    "n_enc_layers":     n_enc_layers,
-    "n_enc_heads":      n_enc_heads,
-    "n_enc_out_chans":  n_enc_out_chans,
-    "n_pt_embd":        n_pt_embd,
-}
-
-print(hparams)
-
-for k, v in model.items():
-    print(k, v.shape)
-
-#exit()
-#code.interact(local=locals())
-
-fout = open(fname_out, "wb")
-
-fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
-fout.write(struct.pack("i", hparams["n_enc_state"]))
-fout.write(struct.pack("i", hparams["n_enc_layers"]))
-fout.write(struct.pack("i", hparams["n_enc_heads"]))
-fout.write(struct.pack("i", hparams["n_enc_out_chans"]))
-fout.write(struct.pack("i", hparams["n_pt_embd"]))
-fout.write(struct.pack("i", ftype))
-
-for k, v in model.items():
-    name = k
-    shape = v.shape
-
-    if name[:19] == "prompt_encoder.mask":
-        continue
-
-    print("Processing variable: " + name + " with shape: ", shape, " and type: ", v.dtype)
-
-    #data = tf.train.load_variable(dir_model, name).squeeze()
-    #data = v.numpy().squeeze()
-    data = v.numpy()
-    n_dims = len(data.shape)
-
-    # for efficiency - transpose some matrices
-    # "model/h.*/attn/c_attn/w"
-    # "model/h.*/attn/c_proj/w"
-    # "model/h.*/mlp/c_fc/w"
-    # "model/h.*/mlp/c_proj/w"
-    #if name[-14:] == "/attn/c_attn/w" or \
-    #   name[-14:] == "/attn/c_proj/w" or \
-    #   name[-11:] == "/mlp/c_fc/w" or \
-    #   name[-13:] == "/mlp/c_proj/w":
-    #    print("  Transposing")
-    #    data = data.transpose()
-
-    dshape = data.shape
-
-    # default type is fp16
-    ftype_cur = 1
-    if ftype == 0 or n_dims == 1 or \
-            name == "image_encoder.pos_embed" or \
-            name.startswith("prompt_encoder") or \
-            name.startswith("mask_decoder.iou_token") or \
-            name.startswith("mask_decoder.mask_tokens"):
-        print("  Converting to float32")
-        data = data.astype(np.float32)
-        ftype_cur = 0
-    else:
-        print("  Converting to float16")
-        data = data.astype(np.float16)
-
-    # reshape the 1D bias into a 4D tensor so we can use ggml_repeat
-    # keep it in F32 since the data is small
-    if name == "image_encoder.patch_embed.proj.bias":
-        data = data.reshape(1, data.shape[0], 1, 1)
-        n_dims = len(data.shape)
-        dshape = data.shape
-
-    print("  New shape: ", dshape)
-
-    # header
-    str = name.encode('utf-8')
-    fout.write(struct.pack("iii", n_dims, len(str), ftype_cur))
-    for i in range(n_dims):
-        fout.write(struct.pack("i", dshape[n_dims - 1 - i]))
-    fout.write(str)
-
-    # data
-    data.tofile(fout)
-
-fout.close()
-
-print("Done. Output file: " + fname_out)
-print("")
diff --git a/ggml/examples/sam/example.jpg b/ggml/examples/sam/example.jpg
deleted file mode 100644
index 9d5116e..0000000
--- a/ggml/examples/sam/example.jpg
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/sam/main.cpp b/ggml/examples/sam/main.cpp
deleted file mode 100644
index d8dedf8..0000000
--- a/ggml/examples/sam/main.cpp
+++ /dev/null
@@ -1,2260 +0,0 @@
-#define _USE_MATH_DEFINES // for M_PI
-#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows
-
-#include "ggml.h"
-#include "ggml-alloc.h"
-#define STB_IMAGE_IMPLEMENTATION
-#include "stb_image.h"
-#define STB_IMAGE_WRITE_IMPLEMENTATION
-#include "stb_image_write.h"
-
-#include <cassert>
-#include <cmath>
-#include <cstddef>
-#include <cstdio>
-#include <cstring>
-#include <fstream>
-#include <map>
-#include <string>
-#include <vector>
-#include <thread>
-#include <cinttypes>
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-// default hparams (ViT-B SAM)
-struct sam_hparams {
-    int32_t n_enc_state               = 768;
-    int32_t n_enc_layer               = 12;
-    int32_t n_enc_head                = 12;
-    int32_t n_enc_out_chans           = 256;
-    int32_t n_pt_embd                 = 4;
-    int32_t n_dec_heads               = 8;
-    int32_t ftype                     = 1;
-    float   mask_threshold            = 0.f;
-    float   iou_threshold             = 0.88f;
-    float   stability_score_threshold = 0.95f;
-    float   stability_score_offset    = 1.0f;
-    float   eps                       = 1e-6f;
-    float   eps_decoder_transformer   = 1e-5f;
-
-    int32_t n_enc_head_dim() const { return n_enc_state / n_enc_head; }
-    int32_t n_img_size()     const { return 1024; }
-    int32_t n_window_size()  const { return 14; }
-    int32_t n_patch_size()   const { return 16; }
-    int32_t n_img_embd()     const { return n_img_size() / n_patch_size(); }
-
-    std::vector<int32_t> global_attn_indices() const {
-        switch (n_enc_state) {
-            case  768: return {  2,  5,  8, 11 };
-            case 1024: return {  5, 11, 17, 23 };
-            case 1280: return {  7, 15, 23, 31 };
-            default:
-                {
-                    fprintf(stderr, "%s: unsupported n_enc_state = %d\n", __func__, n_enc_state);
-                } break;
-        };
-
-        return {};
-    }
-
-    bool is_global_attn(int32_t layer) const {
-        const auto indices = global_attn_indices();
-
-        for (const auto & idx : indices) {
-            if (layer == idx) {
-                return true;
-            }
-        }
-
-        return false;
-    }
-};
-
-struct sam_layer_enc {
-    struct ggml_tensor * norm1_w;
-    struct ggml_tensor * norm1_b;
-
-    struct ggml_tensor * rel_pos_w;
-    struct ggml_tensor * rel_pos_h;
-
-    struct ggml_tensor * qkv_w;
-    struct ggml_tensor * qkv_b;
-
-    struct ggml_tensor * proj_w;
-    struct ggml_tensor * proj_b;
-
-    struct ggml_tensor * norm2_w;
-    struct ggml_tensor * norm2_b;
-
-    struct ggml_tensor * mlp_lin1_w;
-    struct ggml_tensor * mlp_lin1_b;
-
-    struct ggml_tensor * mlp_lin2_w;
-    struct ggml_tensor * mlp_lin2_b;
-};
-
-struct sam_encoder_image {
-    struct ggml_tensor * pe;
-
-    struct ggml_tensor * proj_w;
-    struct ggml_tensor * proj_b;
-
-    struct ggml_tensor * neck_conv_0;
-    struct ggml_tensor * neck_norm_0_w;
-    struct ggml_tensor * neck_norm_0_b;
-    struct ggml_tensor * neck_conv_1;
-    struct ggml_tensor * neck_norm_1_w;
-    struct ggml_tensor * neck_norm_1_b;
-
-    std::vector<sam_layer_enc> layers;
-};
-
-struct sam_encoder_prompt {
-    struct ggml_tensor * pe;
-
-    struct ggml_tensor * not_a_pt_embd_w;
-    std::vector<struct ggml_tensor *> pt_embd;
-
-    struct ggml_tensor * no_mask_embd_w;
-    //std::vector<struct ggml_tensor *> mask_down_w;
-    //std::vector<struct ggml_tensor *> mask_down_b;
-};
-
-struct  sam_layer_dec_transformer_attn {
-    // q_proj
-    struct ggml_tensor * q_w;
-    struct ggml_tensor * q_b;
-
-    // k_proj
-    struct ggml_tensor * k_w;
-    struct ggml_tensor * k_b;
-
-    // v_proj
-    struct ggml_tensor * v_w;
-    struct ggml_tensor * v_b;
-
-    // out_proj
-    struct ggml_tensor * out_w;
-    struct ggml_tensor * out_b;
-};
-
-struct sam_layer_dec_transformer {
-    sam_layer_dec_transformer_attn self_attn;
-
-    // norm1
-    struct ggml_tensor * norm1_w;
-    struct ggml_tensor * norm1_b;
-
-    sam_layer_dec_transformer_attn cross_attn_token_to_img;
-
-    // norm2
-    struct ggml_tensor * norm2_w;
-    struct ggml_tensor * norm2_b;
-
-    // mlp.lin1
-    struct ggml_tensor * mlp_lin1_w;
-    struct ggml_tensor * mlp_lin1_b;
-
-    // mlp.lin2
-    struct ggml_tensor * mlp_lin2_w;
-    struct ggml_tensor * mlp_lin2_b;
-
-    // norm3
-    struct ggml_tensor * norm3_w;
-    struct ggml_tensor * norm3_b;
-
-    // norm4
-    struct ggml_tensor * norm4_w;
-    struct ggml_tensor * norm4_b;
-
-    sam_layer_dec_transformer_attn cross_attn_img_to_token;
-};
-
-struct sam_layer_dec_output_hypernet_mlps {
-    // mlps_*.layers.0
-    struct ggml_tensor * w_0;
-    struct ggml_tensor * b_0;
-
-    // mlps_*.layers.1
-    struct ggml_tensor * w_1;
-    struct ggml_tensor * b_1;
-
-    // mlps_*.layers.2
-    struct ggml_tensor * w_2;
-    struct ggml_tensor * b_2;
-};
-
-struct sam_decoder_mask {
-    std::vector<sam_layer_dec_transformer> transformer_layers;
-
-    // trasnformer.final_attn_token_to_image
-    sam_layer_dec_transformer_attn transformer_final_attn_token_to_img;
-
-    // transformer.norm_final
-    struct ggml_tensor * transformer_norm_final_w;
-    struct ggml_tensor * transformer_norm_final_b;
-
-    // output_upscaling.0
-    struct ggml_tensor * output_upscaling_0_w;
-    struct ggml_tensor * output_upscaling_0_b;
-
-    // output_upscaling.1
-    struct ggml_tensor * output_upscaling_1_w;
-    struct ggml_tensor * output_upscaling_1_b;
-
-    // output_upscaling.3
-    struct ggml_tensor * output_upscaling_3_w;
-    struct ggml_tensor * output_upscaling_3_b;
-
-    // output_hypernetworks_mlps
-    std::vector<sam_layer_dec_output_hypernet_mlps> output_hypernet_mlps;
-
-    // iou_prediction_head.0
-    struct ggml_tensor * iou_prediction_head_0_w;
-    struct ggml_tensor * iou_prediction_head_0_b;
-
-    // iou_prediction_head.1
-    struct ggml_tensor * iou_prediction_head_1_w;
-    struct ggml_tensor * iou_prediction_head_1_b;
-
-    // iou_prediction_head.2
-    struct ggml_tensor * iou_prediction_head_2_w;
-    struct ggml_tensor * iou_prediction_head_2_b;
-
-    // iou_token.weight
-    struct ggml_tensor * iou_token_w;
-
-    // mask_tokens.weight
-    struct ggml_tensor * mask_tokens_w;
-};
-
-
-struct sam_state {
-    struct ggml_tensor * embd_img;
-
-    struct ggml_tensor * low_res_masks;
-    struct ggml_tensor * iou_predictions;
-
-    //struct ggml_tensor * tmp_save = {};
-
-    struct ggml_context * ctx;
-
-    // buffer for `ggml_graph_plan.work_data`
-    std::vector<uint8_t> work_buffer;
-    // buffers to evaluate the model
-    std::vector<uint8_t> buf_alloc_img_enc;
-    std::vector<uint8_t> buf_compute_img_enc;
-
-    std::vector<uint8_t> buf_alloc_fast;
-    std::vector<uint8_t> buf_compute_fast;
-
-    struct ggml_allocr  * allocr = {};
-};
-
-// void save_tensor(sam_state& state, struct ggml_tensor * t, struct ggml_cgraph * gf) {
-//     if (!state.tmp_save) {
-//         state.tmp_save = ggml_new_tensor(state.ctx, t->type, t->n_dims, t->ne);
-//     }
-//     struct ggml_tensor * tmp0 = ggml_cpy(state.ctx, t, state.tmp_save);
-//     ggml_build_forward_expand(gf, tmp0);
-// }
-
-struct sam_model {
-    sam_hparams hparams;
-
-    sam_encoder_image  enc_img;
-    sam_encoder_prompt enc_prompt;
-    sam_decoder_mask   dec;
-
-    //
-    struct ggml_context * ctx;
-    std::map<std::string, struct ggml_tensor *> tensors;
-};
-
-struct sam_point {
-    float x;
-    float y;
-};
-
-// RGB uint8 image
-struct sam_image_u8 {
-    int nx;
-    int ny;
-
-    std::vector<uint8_t> data;
-};
-
-// RGB float32 image
-// Memory layout: RGBRGBRGB...
-struct sam_image_f32 {
-    int nx;
-    int ny;
-
-    std::vector<float> data;
-};
-
-struct sam_params {
-    int32_t seed      = -1; // RNG seed
-    int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
-
-    std::string model     = "models/sam-vit-b/ggml-model-f16.bin"; // model path
-    std::string fname_inp = "img.jpg";
-    std::string fname_out = "img.out";
-    float   mask_threshold            = 0.f;
-    float   iou_threshold             = 0.88f;
-    float   stability_score_threshold = 0.95f;
-    float   stability_score_offset    = 1.0f;
-    float   eps                       = 1e-6f;
-    float   eps_decoder_transformer   = 1e-5f;
-    sam_point pt = { 414.375f, 162.796875f, };
-};
-
-void print_t_f32(const char* title, struct ggml_tensor * t, int n = 10) {
-    printf("%s\n", title);
-    float * data = (float *)t->data;
-    printf("dims: % " PRId64 " % " PRId64 " % " PRId64 " % " PRId64 " f32\n", t->ne[0], t->ne[1], t->ne[2], t->ne[3]);
-    printf("First & Last %d elements:\n", n);
-    for (int i = 0; i < std::min((int) (t->ne[0]*t->ne[1]), n); i++) {
-        printf("%.5f ", data[i]);
-        if (i != 0 && i % t->ne[0] == 0) {
-            printf("\n");
-        }
-    }
-    printf("\n");
-    for (int i = 0; i < std::min((int) (t->ne[0]*t->ne[1]), n); i++) {
-        printf("%.5f ", data[ggml_nelements(t) - n + i]);
-        if ((ggml_nelements(t) - n + i) % t->ne[0] == 0) {
-            printf("\n");
-        }
-    }
-    printf("\n");
-    double sum = 0.0;
-    for (int i = 0; i < ggml_nelements(t); i++) {
-        sum += data[i];
-    }
-    printf("sum:  %f\n\n", sum);
-}
-
-static void ggml_disconnect_node_from_graph(ggml_tensor * t) {
-    t->op = GGML_OP_NONE;
-    for (int i = 0; i < GGML_MAX_SRC; i++) {
-        t->src[i] = NULL;
-    }
-}
-
-static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
-    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
-
-    if (plan.work_size > 0) {
-        buf.resize(plan.work_size);
-        plan.work_data = buf.data();
-    }
-
-    ggml_graph_compute(graph, &plan);
-}
-
-static void ggml_sam_sin(struct ggml_tensor * dst , const struct ggml_tensor * src, int ith, int nth, void * userdata) {
-    GGML_ASSERT(userdata == NULL);
-    GGML_ASSERT(ggml_are_same_shape(dst, src));
-    GGML_ASSERT(ggml_is_contiguous(dst));
-    GGML_ASSERT(ggml_is_contiguous(src));
-
-    const float * src_data = ggml_get_data_f32(src);
-    float * dst_data = ggml_get_data_f32(dst);
-
-    const int ne = (int)ggml_nelements(dst);
-    const int dr = (ne + nth - 1) / nth;
-    const int ie0 = dr * ith;
-    const int ie1 = std::min(ie0 + dr, ne);
-
-    for (int i = ie0; i < ie1; ++i) {
-        dst_data[i] = sinf(src_data[i]);
-    }
-}
-
-static void ggml_sam_cos(struct ggml_tensor * dst , const struct ggml_tensor * src, int ith, int nth, void * userdata) {
-    GGML_ASSERT(userdata == NULL);
-    GGML_ASSERT(ggml_are_same_shape(dst, src));
-    GGML_ASSERT(ggml_is_contiguous(dst));
-    GGML_ASSERT(ggml_is_contiguous(src));
-
-    const float * src_data = ggml_get_data_f32(src);
-    float * dst_data = ggml_get_data_f32(dst);
-
-    const int ne = (int)ggml_nelements(dst);
-    const int dr = (ne + nth - 1) / nth;
-    const int ie0 = dr * ith;
-    const int ie1 = std::min(ie0 + dr, ne);
-
-    for (int i = ie0; i < ie1; ++i) {
-        dst_data[i] = cosf(src_data[i]);
-    }
-}
-
-bool sam_image_load_from_file(const std::string & fname, sam_image_u8 & img) {
-    int nx, ny, nc;
-    auto data = stbi_load(fname.c_str(), &nx, &ny, &nc, 3);
-    if (!data) {
-        fprintf(stderr, "%s: failed to load '%s'\n", __func__, fname.c_str());
-        return false;
-    }
-
-    img.nx = nx;
-    img.ny = ny;
-    img.data.resize(nx * ny * 3);
-    memcpy(img.data.data(), data, nx * ny * 3);
-
-    stbi_image_free(data);
-
-    return true;
-}
-
-// ref: https://github.com/facebookresearch/segment-anything/blob/efeab7296ab579d4a261e554eca80faf6b33924a/segment_anything/modeling/sam.py#L164
-// resize largest dimension to 1024
-// normalize: x = (x - mean) / std
-//     mean = [123.675, 116.28, 103.53]
-//     std  = [58.395, 57.12, 57.375]
-//     TODO: why are these hardcoded !?
-// pad to 1024x1024
-// TODO: for some reason, this is not numerically identical to pytorch's interpolation
-bool sam_image_preprocess(const sam_image_u8 & img, sam_image_f32 & res) {
-    const int nx = img.nx;
-    const int ny = img.ny;
-
-    const int nx2 = 1024;
-    const int ny2 = 1024;
-
-    res.nx = nx2;
-    res.ny = ny2;
-    res.data.resize(3*nx2*ny2);
-
-    const float scale = std::max(nx, ny) / 1024.0f;
-
-    fprintf(stderr, "%s: scale = %f\n", __func__, scale);
-
-    const int nx3 = int(nx/scale + 0.5f);
-    const int ny3 = int(ny/scale + 0.5f);
-
-    const float m3[3] = { 123.675f, 116.280f, 103.530f };
-    const float s3[3] = {  58.395f,  57.120f,  57.375f };
-
-    for (int y = 0; y < ny3; y++) {
-        for (int x = 0; x < nx3; x++) {
-            for (int c = 0; c < 3; c++) {
-                // linear interpolation
-                const float sx = (x + 0.5f)*scale - 0.5f;
-                const float sy = (y + 0.5f)*scale - 0.5f;
-
-                const int x0 = std::max(0, (int) std::floor(sx));
-                const int y0 = std::max(0, (int) std::floor(sy));
-
-                const int x1 = std::min(x0 + 1, nx - 1);
-                const int y1 = std::min(y0 + 1, ny - 1);
-
-                const float dx = sx - x0;
-                const float dy = sy - y0;
-
-                const int j00 = 3*(y0*nx + x0) + c;
-                const int j01 = 3*(y0*nx + x1) + c;
-                const int j10 = 3*(y1*nx + x0) + c;
-                const int j11 = 3*(y1*nx + x1) + c;
-
-                const float v00 = img.data[j00];
-                const float v01 = img.data[j01];
-                const float v10 = img.data[j10];
-                const float v11 = img.data[j11];
-
-                const float v0 = v00*(1.0f - dx) + v01*dx;
-                const float v1 = v10*(1.0f - dx) + v11*dx;
-
-                const float v = v0*(1.0f - dy) + v1*dy;
-
-                const uint8_t v2 = std::min(std::max(std::round(v), 0.0f), 255.0f);
-
-                const int i = 3*(y*nx3 + x) + c;
-
-                res.data[i] = (float(v2) - m3[c]) / s3[c];
-            }
-        }
-    }
-
-    return true;
-}
-
-// load the model's weights from a file
-bool sam_model_load(const sam_params & params, sam_model & model) {
-    fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, params.model.c_str());
-
-    auto fin = std::ifstream(params.model, std::ios::binary);
-    if (!fin) {
-        fprintf(stderr, "%s: failed to open '%s'\n", __func__, params.model.c_str());
-        return false;
-    }
-
-    // verify magic
-    {
-        uint32_t magic;
-        fin.read((char *) &magic, sizeof(magic));
-        if (magic != 0x67676d6c) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, params.model.c_str());
-            return false;
-        }
-    }
-
-    // load hparams
-    {
-        // Override defaults with user choices
-        model.hparams.mask_threshold            = params.mask_threshold;
-        model.hparams.iou_threshold             = params.iou_threshold;
-        model.hparams.stability_score_threshold = params.stability_score_threshold;
-        model.hparams.stability_score_offset    = params.stability_score_offset;
-        model.hparams.eps                       = params.eps;
-        model.hparams.eps_decoder_transformer   = params.eps_decoder_transformer;
-
-        auto & hparams = model.hparams;
-
-        fin.read((char *) &hparams.n_enc_state,     sizeof(hparams.n_enc_state));
-        fin.read((char *) &hparams.n_enc_layer,     sizeof(hparams.n_enc_layer));
-        fin.read((char *) &hparams.n_enc_head,      sizeof(hparams.n_enc_head));
-        fin.read((char *) &hparams.n_enc_out_chans, sizeof(hparams.n_enc_out_chans));
-        fin.read((char *) &hparams.n_pt_embd,       sizeof(hparams.n_pt_embd));
-        fin.read((char *) &hparams.ftype,           sizeof(hparams.ftype));
-
-        const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
-
-        printf("%s: n_enc_state      = %d\n", __func__, hparams.n_enc_state);
-        printf("%s: n_enc_layer      = %d\n", __func__, hparams.n_enc_layer);
-        printf("%s: n_enc_head       = %d\n", __func__, hparams.n_enc_head);
-        printf("%s: n_enc_out_chans  = %d\n", __func__, hparams.n_enc_out_chans);
-        printf("%s: n_pt_embd        = %d\n", __func__, hparams.n_pt_embd);
-        printf("%s: ftype            = %d\n", __func__, hparams.ftype);
-        printf("%s: qntvr            = %d\n", __func__, qntvr);
-
-        hparams.ftype %= GGML_QNT_VERSION_FACTOR;
-
-    }
-
-    // for the big tensors, we have the option to store the data in 16-bit floats or quantized
-    // in order to save memory and also to speed up the computation
-    ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
-    if (wtype == GGML_TYPE_COUNT) {
-        fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
-                __func__, params.model.c_str(), model.hparams.ftype);
-        return false;
-    }
-
-    auto & ctx = model.ctx;
-
-    const size_t ctx_size = [&]() {
-        size_t ctx_size = 0;
-
-        const auto & hparams = model.hparams;
-
-        const int32_t n_enc_state     = hparams.n_enc_state;
-        const int32_t n_enc_layer     = hparams.n_enc_layer;
-        const int32_t n_enc_head_dim  = hparams.n_enc_head_dim();
-        const int32_t n_enc_out_chans = hparams.n_enc_out_chans;
-        const int32_t n_pt_embd       = hparams.n_pt_embd;
-
-        const int32_t n_enc_layer_local  = hparams.global_attn_indices().size();
-        const int32_t n_enc_layer_global = n_enc_layer - n_enc_layer_local;
-
-        const int32_t n_img_embd    = hparams.n_img_embd();
-        const int32_t n_window_size = hparams.n_window_size();
-        const int32_t n_patch_size  = hparams.n_patch_size();
-
-        // image encoder
-        {
-            ctx_size += n_enc_state*n_img_embd*n_img_embd*ggml_type_size(GGML_TYPE_F32);
-
-            ctx_size += n_enc_state*3*n_patch_size*n_patch_size*ggml_type_size(GGML_TYPE_F16);
-            ctx_size += n_enc_state*ggml_type_size(GGML_TYPE_F32);
-
-            ctx_size +=     n_enc_state*n_enc_out_chans*1*1*ggml_type_size(GGML_TYPE_F16);
-            ctx_size += n_enc_out_chans*n_enc_out_chans*3*3*ggml_type_size(GGML_TYPE_F16);
-
-            ctx_size += n_enc_out_chans*ggml_type_size(GGML_TYPE_F32);
-            ctx_size += n_enc_out_chans*ggml_type_size(GGML_TYPE_F32);
-
-            ctx_size += n_enc_out_chans*ggml_type_size(GGML_TYPE_F32);
-            ctx_size += n_enc_out_chans*ggml_type_size(GGML_TYPE_F32);
-        }
-
-        // image encoder layers
-        {
-            ctx_size += n_enc_layer*n_enc_state*ggml_type_size(GGML_TYPE_F32);
-            ctx_size += n_enc_layer*n_enc_state*ggml_type_size(GGML_TYPE_F32);
-
-            ctx_size += n_enc_layer_global*n_enc_head_dim*(2*n_img_embd - 1)*ggml_type_size(GGML_TYPE_F16);
-            ctx_size += n_enc_layer_global*n_enc_head_dim*(2*n_img_embd - 1)*ggml_type_size(GGML_TYPE_F16);
-
-            ctx_size += n_enc_layer_local*n_enc_head_dim*(2*n_window_size - 1)*ggml_type_size(GGML_TYPE_F16);
-            ctx_size += n_enc_layer_local*n_enc_head_dim*(2*n_window_size - 1)*ggml_type_size(GGML_TYPE_F16);
-
-            ctx_size += n_enc_layer*3*n_enc_state*n_enc_state*ggml_type_size(GGML_TYPE_F16);
-            ctx_size += n_enc_layer*3*n_enc_state*            ggml_type_size(GGML_TYPE_F32);
-
-            ctx_size += n_enc_layer*n_enc_state*n_enc_state*ggml_type_size(GGML_TYPE_F16);
-            ctx_size += n_enc_layer*n_enc_state*            ggml_type_size(GGML_TYPE_F32);
-
-            ctx_size += n_enc_layer*n_enc_state*ggml_type_size(GGML_TYPE_F32);
-            ctx_size += n_enc_layer*n_enc_state*ggml_type_size(GGML_TYPE_F32);
-
-            ctx_size += n_enc_layer*4*n_enc_state*n_enc_state*ggml_type_size(GGML_TYPE_F16);
-            ctx_size += n_enc_layer*4*n_enc_state*            ggml_type_size(GGML_TYPE_F32);
-
-            ctx_size += n_enc_layer*4*n_enc_state*n_enc_state*ggml_type_size(GGML_TYPE_F16);
-            ctx_size += n_enc_layer*4*n_enc_state*            ggml_type_size(GGML_TYPE_F32);
-        }
-
-        ctx_size += (8 + 14*n_enc_layer)*ggml_tensor_overhead();
-
-        // prompt encoder
-        {
-            ctx_size += n_enc_out_chans*ggml_type_size(GGML_TYPE_F16); // 2*(n_enc_out_chans/2)
-
-            ctx_size += n_enc_out_chans*ggml_type_size(GGML_TYPE_F32);
-            ctx_size += n_pt_embd*n_enc_out_chans*ggml_type_size(GGML_TYPE_F32);
-        }
-
-        ctx_size += (2 + n_pt_embd)*ggml_tensor_overhead();
-
-        // mask decoder
-        {
-            //transformer
-            {
-                const int tfm_layers_count = 2;
-                const int qkv_count = 3;
-                const int norm_count = 4;
-                const int n_hypernet_mpls_count = 4;
-
-                // self_attn
-                ctx_size += tfm_layers_count*qkv_count*n_enc_state*n_enc_state*ggml_type_size(GGML_TYPE_F16);
-                ctx_size += tfm_layers_count*qkv_count*n_enc_state*            ggml_type_size(GGML_TYPE_F32);
-                ctx_size += tfm_layers_count*n_enc_state*                      ggml_type_size(GGML_TYPE_F32);
-
-                // all norms
-                ctx_size += tfm_layers_count*norm_count*n_enc_state*ggml_type_size(GGML_TYPE_F32);
-                ctx_size += tfm_layers_count*norm_count*n_enc_state*ggml_type_size(GGML_TYPE_F32);
-
-                // cross_attn_token_to_img
-                ctx_size += tfm_layers_count*qkv_count*n_enc_state*(n_enc_state/2)*ggml_type_size(GGML_TYPE_F16);
-                ctx_size += tfm_layers_count*qkv_count*(n_enc_state/2)*            ggml_type_size(GGML_TYPE_F32);
-                ctx_size += tfm_layers_count*n_enc_state*                          ggml_type_size(GGML_TYPE_F32);
-
-                // mlp
-                ctx_size += tfm_layers_count*8*n_enc_out_chans*n_enc_out_chans*ggml_type_size(GGML_TYPE_F16);
-                ctx_size += tfm_layers_count*8*n_enc_out_chans*                ggml_type_size(GGML_TYPE_F32);
-                ctx_size += tfm_layers_count*n_enc_out_chans*8*n_enc_out_chans*ggml_type_size(GGML_TYPE_F16);
-                ctx_size += tfm_layers_count*n_enc_out_chans*                  ggml_type_size(GGML_TYPE_F32);
-
-                // cross_attn_img_to_token
-                ctx_size += tfm_layers_count*qkv_count*n_enc_state*(n_enc_state/2)*ggml_type_size(GGML_TYPE_F16);
-                ctx_size += tfm_layers_count*qkv_count*(n_enc_state/2)*            ggml_type_size(GGML_TYPE_F32);
-                ctx_size += tfm_layers_count*n_enc_state*                          ggml_type_size(GGML_TYPE_F32);
-
-                // transformer_final_attn_token_to_img
-                ctx_size += qkv_count*n_enc_state*(n_enc_state/2)*ggml_type_size(GGML_TYPE_F16);
-                ctx_size += qkv_count*(n_enc_state/2)*            ggml_type_size(GGML_TYPE_F32);
-                ctx_size += n_enc_state*                          ggml_type_size(GGML_TYPE_F32);
-
-                // transformer_norm_final
-                ctx_size += norm_count*n_enc_state*ggml_type_size(GGML_TYPE_F32);
-                ctx_size += norm_count*n_enc_state*ggml_type_size(GGML_TYPE_F32);
-
-                // output_upscaling
-                ctx_size += n_enc_out_chans*n_img_embd*2*2*ggml_type_size(GGML_TYPE_F16);
-                ctx_size += 3*n_img_embd*                  ggml_type_size(GGML_TYPE_F32);
-                ctx_size += n_enc_out_chans*n_img_embd*(n_img_embd/2)*2*2*ggml_type_size(GGML_TYPE_F16);
-                ctx_size += (n_img_embd/2)*                               ggml_type_size(GGML_TYPE_F32);
-
-                // output_hypernetworks_mlps
-                ctx_size += n_hypernet_mpls_count*2*n_enc_out_chans*n_enc_out_chans*ggml_type_size(GGML_TYPE_F16);
-                ctx_size += n_hypernet_mpls_count*2*n_enc_out_chans*                ggml_type_size(GGML_TYPE_F32);
-                ctx_size += n_hypernet_mpls_count*n_enc_out_chans*(n_img_embd/2)*ggml_type_size(GGML_TYPE_F16);
-                ctx_size += n_hypernet_mpls_count*(n_img_embd/2)*                ggml_type_size(GGML_TYPE_F32);
-
-                // iou_prediction_head
-                ctx_size += 2*n_enc_out_chans*n_enc_out_chans*ggml_type_size(GGML_TYPE_F16);
-                ctx_size += 2*n_enc_out_chans*                ggml_type_size(GGML_TYPE_F32);
-                ctx_size += n_pt_embd*n_enc_out_chans*ggml_type_size(GGML_TYPE_F16);
-                ctx_size += n_pt_embd*                ggml_type_size(GGML_TYPE_F32);
-
-                // iou_token_w
-                ctx_size += n_enc_out_chans*ggml_type_size(GGML_TYPE_F32);
-
-                // mask_tokens_w
-                ctx_size += n_pt_embd*n_enc_out_chans*ggml_type_size(GGML_TYPE_F32);
-            }
-        }
-        fprintf(stderr, "%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
-
-        return ctx_size;
-    }();
-
-    // create the ggml context
-    {
-        struct ggml_init_params params = {
-            /*.mem_size   =*/ ctx_size,
-            /*.mem_buffer =*/ NULL,
-            /*.no_alloc   =*/ false,
-        };
-
-        ctx = ggml_init(params);
-        if (!ctx) {
-            fprintf(stderr, "%s: ggml_init() failed\n", __func__);
-            return false;
-        }
-    }
-
-    // prepare memory for the weights
-    {
-        const auto & hparams = model.hparams;
-
-        const int32_t n_enc_state      = hparams.n_enc_state;
-        const int32_t n_enc_layer      = hparams.n_enc_layer;
-        const int32_t n_enc_head_dim   = hparams.n_enc_head_dim();
-        const int32_t n_enc_out_chans  = hparams.n_enc_out_chans;
-        const int32_t n_pt_embd        = hparams.n_pt_embd;
-
-        const int32_t n_img_embd    = hparams.n_img_embd();
-        const int32_t n_window_size = hparams.n_window_size();
-        const int32_t n_patch_size  = hparams.n_patch_size();
-
-        model.enc_img.layers.resize(n_enc_layer);
-
-        // image encoder
-        {
-            auto & enc = model.enc_img;
-
-            enc.pe = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_enc_state, n_img_embd, n_img_embd, 1);
-
-            enc.proj_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, n_patch_size, n_patch_size,           3, n_enc_state);
-            enc.proj_b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32,            1,            1, n_enc_state);
-
-            enc.neck_conv_0 = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 1, 1, n_enc_state,     n_enc_out_chans);
-            enc.neck_conv_1 = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, n_enc_out_chans, n_enc_out_chans);
-
-            enc.neck_norm_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans);
-            enc.neck_norm_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans);
-
-            enc.neck_norm_1_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans);
-            enc.neck_norm_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans);
-
-            model.tensors["image_encoder.pos_embed"] = enc.pe;
-
-            model.tensors["image_encoder.patch_embed.proj.weight"] = enc.proj_w;
-            model.tensors["image_encoder.patch_embed.proj.bias"]   = enc.proj_b;
-
-            model.tensors["image_encoder.neck.0.weight"] = enc.neck_conv_0;
-            model.tensors["image_encoder.neck.2.weight"] = enc.neck_conv_1;
-
-            model.tensors["image_encoder.neck.1.weight"] = enc.neck_norm_0_w;
-            model.tensors["image_encoder.neck.1.bias"]   = enc.neck_norm_0_b;
-
-            model.tensors["image_encoder.neck.3.weight"] = enc.neck_norm_1_w;
-            model.tensors["image_encoder.neck.3.bias"]   = enc.neck_norm_1_b;
-
-            for (int i = 0; i < n_enc_layer; ++i) {
-                auto & layer = enc.layers[i];
-
-                layer.norm1_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_state);
-                layer.norm1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_state);
-
-                if (hparams.is_global_attn(i)) {
-                    layer.rel_pos_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_head_dim, 2*n_img_embd - 1);
-                    layer.rel_pos_h = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_head_dim, 2*n_img_embd - 1);
-                } else {
-                    layer.rel_pos_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_head_dim, 2*n_window_size - 1);
-                    layer.rel_pos_h = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_head_dim, 2*n_window_size - 1);
-                }
-
-                layer.qkv_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16,   n_enc_state, 3*n_enc_state);
-                layer.qkv_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_enc_state);
-
-                layer.proj_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16,  n_enc_state,   n_enc_state);
-                layer.proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,  n_enc_state);
-
-                layer.norm2_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_state);
-                layer.norm2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_state);
-
-                layer.mlp_lin1_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16,   n_enc_state, 4*n_enc_state);
-                layer.mlp_lin1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_enc_state);
-
-                layer.mlp_lin2_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, 4*n_enc_state,   n_enc_state);
-                layer.mlp_lin2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_enc_state);
-
-                model.tensors["image_encoder.blocks." + std::to_string(i) + ".norm1.weight"] = layer.norm1_w;
-                model.tensors["image_encoder.blocks." + std::to_string(i) + ".norm1.bias"]   = layer.norm1_b;
-
-                model.tensors["image_encoder.blocks." + std::to_string(i) + ".attn.rel_pos_w"] = layer.rel_pos_w;
-                model.tensors["image_encoder.blocks." + std::to_string(i) + ".attn.rel_pos_h"] = layer.rel_pos_h;
-
-                model.tensors["image_encoder.blocks." + std::to_string(i) + ".attn.qkv.weight"] = layer.qkv_w;
-                model.tensors["image_encoder.blocks." + std::to_string(i) + ".attn.qkv.bias"]   = layer.qkv_b;
-
-                model.tensors["image_encoder.blocks." + std::to_string(i) + ".attn.proj.weight"] = layer.proj_w;
-                model.tensors["image_encoder.blocks." + std::to_string(i) + ".attn.proj.bias"]   = layer.proj_b;
-
-                model.tensors["image_encoder.blocks." + std::to_string(i) + ".norm2.weight"] = layer.norm2_w;
-                model.tensors["image_encoder.blocks." + std::to_string(i) + ".norm2.bias"]   = layer.norm2_b;
-
-                model.tensors["image_encoder.blocks." + std::to_string(i) + ".mlp.lin1.weight"] = layer.mlp_lin1_w;
-                model.tensors["image_encoder.blocks." + std::to_string(i) + ".mlp.lin1.bias"]   = layer.mlp_lin1_b;
-
-                model.tensors["image_encoder.blocks." + std::to_string(i) + ".mlp.lin2.weight"] = layer.mlp_lin2_w;
-                model.tensors["image_encoder.blocks." + std::to_string(i) + ".mlp.lin2.bias"]   = layer.mlp_lin2_b;
-            }
-        }
-
-        // prompt encoder
-        {
-            auto & enc = model.enc_prompt;
-
-            enc.pe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_enc_out_chans/2, 2);
-
-            enc.not_a_pt_embd_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans);
-            enc.no_mask_embd_w  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans);
-
-            model.tensors["prompt_encoder.pe_layer.positional_encoding_gaussian_matrix"] = enc.pe;
-            model.tensors["prompt_encoder.not_a_point_embed.weight"] = enc.not_a_pt_embd_w;
-            model.tensors["prompt_encoder.no_mask_embed.weight"]     = enc.no_mask_embd_w;
-
-            enc.pt_embd.resize(n_pt_embd);
-            for (int i = 0; i < n_pt_embd; i++) {
-                enc.pt_embd[i] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans);
-
-                model.tensors["prompt_encoder.point_embeddings." + std::to_string(i) + ".weight"] = enc.pt_embd[i];
-            }
-        }
-
-        // mask decoder
-        {
-            auto & dec = model.dec;
-            auto & tfm_layers = dec.transformer_layers;
-
-            const int tfm_layers_count = 2;
-            tfm_layers.resize(tfm_layers_count);
-            for (int i = 0; i < tfm_layers_count; ++i) {
-                auto& l = tfm_layers[i];
-                l.self_attn.q_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans, n_enc_out_chans);
-                l.self_attn.q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans);
-                l.self_attn.k_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans, n_enc_out_chans);
-                l.self_attn.k_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans);
-                l.self_attn.v_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans, n_enc_out_chans);
-                l.self_attn.v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans);
-                l.self_attn.out_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans, n_enc_out_chans);
-                l.self_attn.out_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans);
-
-                l.norm1_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans);
-                l.norm1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans);
-
-                l.cross_attn_token_to_img.q_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans, n_enc_out_chans/2);
-                l.cross_attn_token_to_img.q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans/2);
-                l.cross_attn_token_to_img.k_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans, n_enc_out_chans/2);
-                l.cross_attn_token_to_img.k_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans/2);
-                l.cross_attn_token_to_img.v_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans, n_enc_out_chans/2);
-                l.cross_attn_token_to_img.v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans/2);
-                l.cross_attn_token_to_img.out_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans/2, n_enc_out_chans);
-                l.cross_attn_token_to_img.out_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans);
-
-                l.norm2_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans);
-                l.norm2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans);
-
-                l.mlp_lin1_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans, 8*n_enc_out_chans);
-                l.mlp_lin1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8*n_enc_out_chans);
-                l.mlp_lin2_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, 8*n_enc_out_chans, n_enc_out_chans);
-                l.mlp_lin2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans);
-
-                l.norm3_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans);
-                l.norm3_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans);
-
-                l.norm4_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans);
-                l.norm4_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans);
-
-                l.cross_attn_img_to_token.q_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans, n_enc_out_chans/2);
-                l.cross_attn_img_to_token.q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans/2);
-                l.cross_attn_img_to_token.k_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans, n_enc_out_chans/2);
-                l.cross_attn_img_to_token.k_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans/2);
-                l.cross_attn_img_to_token.v_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans, n_enc_out_chans/2);
-                l.cross_attn_img_to_token.v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans/2);
-                l.cross_attn_img_to_token.out_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans/2, n_enc_out_chans);
-                l.cross_attn_img_to_token.out_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans);
-
-                const auto prefix = "mask_decoder.transformer.layers." + std::to_string(i) + ".";
-                model.tensors[prefix + "self_attn.q_proj.weight"] = l.self_attn.q_w;
-                model.tensors[prefix + "self_attn.q_proj.bias"]   = l.self_attn.q_b;
-                model.tensors[prefix + "self_attn.k_proj.weight"] = l.self_attn.k_w;
-                model.tensors[prefix + "self_attn.k_proj.bias"]   = l.self_attn.k_b;
-                model.tensors[prefix + "self_attn.v_proj.weight"] = l.self_attn.v_w;
-                model.tensors[prefix + "self_attn.v_proj.bias"]   = l.self_attn.v_b;
-                model.tensors[prefix + "self_attn.out_proj.weight"] = l.self_attn.out_w;
-                model.tensors[prefix + "self_attn.out_proj.bias"]   = l.self_attn.out_b;
-
-                model.tensors[prefix + "norm1.weight"] = l.norm1_w;
-                model.tensors[prefix + "norm1.bias"]   = l.norm1_b;
-
-                model.tensors[prefix + "cross_attn_token_to_image.q_proj.weight"] = l.cross_attn_token_to_img.q_w;
-                model.tensors[prefix + "cross_attn_token_to_image.q_proj.bias"]   = l.cross_attn_token_to_img.q_b;
-                model.tensors[prefix + "cross_attn_token_to_image.k_proj.weight"] = l.cross_attn_token_to_img.k_w;
-                model.tensors[prefix + "cross_attn_token_to_image.k_proj.bias"]   = l.cross_attn_token_to_img.k_b;
-                model.tensors[prefix + "cross_attn_token_to_image.v_proj.weight"] = l.cross_attn_token_to_img.v_w;
-                model.tensors[prefix + "cross_attn_token_to_image.v_proj.bias"]   = l.cross_attn_token_to_img.v_b;
-                model.tensors[prefix + "cross_attn_token_to_image.out_proj.weight"] = l.cross_attn_token_to_img.out_w;
-                model.tensors[prefix + "cross_attn_token_to_image.out_proj.bias"]   = l.cross_attn_token_to_img.out_b;
-
-                model.tensors[prefix + "norm2.weight"] = l.norm2_w;
-                model.tensors[prefix + "norm2.bias"]   = l.norm2_b;
-
-                model.tensors[prefix + "mlp.lin1.weight"] = l.mlp_lin1_w;
-                model.tensors[prefix + "mlp.lin1.bias"]   = l.mlp_lin1_b;
-                model.tensors[prefix + "mlp.lin2.weight"] = l.mlp_lin2_w;
-                model.tensors[prefix + "mlp.lin2.bias"]   = l.mlp_lin2_b;
-
-                model.tensors[prefix + "norm3.weight"] = l.norm3_w;
-                model.tensors[prefix + "norm3.bias"]   = l.norm3_b;
-                model.tensors[prefix + "norm4.weight"] = l.norm4_w;
-                model.tensors[prefix + "norm4.bias"]   = l.norm4_b;
-
-                model.tensors[prefix + "cross_attn_image_to_token.q_proj.weight"] = l.cross_attn_img_to_token.q_w;
-                model.tensors[prefix + "cross_attn_image_to_token.q_proj.bias"]   = l.cross_attn_img_to_token.q_b;
-                model.tensors[prefix + "cross_attn_image_to_token.k_proj.weight"] = l.cross_attn_img_to_token.k_w;
-                model.tensors[prefix + "cross_attn_image_to_token.k_proj.bias"]   = l.cross_attn_img_to_token.k_b;
-                model.tensors[prefix + "cross_attn_image_to_token.v_proj.weight"] = l.cross_attn_img_to_token.v_w;
-                model.tensors[prefix + "cross_attn_image_to_token.v_proj.bias"]   = l.cross_attn_img_to_token.v_b;
-                model.tensors[prefix + "cross_attn_image_to_token.out_proj.weight"] = l.cross_attn_img_to_token.out_w;
-                model.tensors[prefix + "cross_attn_image_to_token.out_proj.bias"]   = l.cross_attn_img_to_token.out_b;
-            }
-
-            dec.transformer_final_attn_token_to_img.q_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans, n_enc_out_chans/2);
-            dec.transformer_final_attn_token_to_img.q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans/2);
-            dec.transformer_final_attn_token_to_img.k_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans, n_enc_out_chans/2);
-            dec.transformer_final_attn_token_to_img.k_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans/2);
-            dec.transformer_final_attn_token_to_img.v_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans, n_enc_out_chans/2);
-            dec.transformer_final_attn_token_to_img.v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans/2);
-            dec.transformer_final_attn_token_to_img.out_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans/2, n_enc_out_chans);
-            dec.transformer_final_attn_token_to_img.out_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans);
-
-            model.tensors["mask_decoder.transformer.final_attn_token_to_image.q_proj.weight"] = dec.transformer_final_attn_token_to_img.q_w;
-            model.tensors["mask_decoder.transformer.final_attn_token_to_image.q_proj.bias"]   = dec.transformer_final_attn_token_to_img.q_b;
-            model.tensors["mask_decoder.transformer.final_attn_token_to_image.k_proj.weight"] = dec.transformer_final_attn_token_to_img.k_w;
-            model.tensors["mask_decoder.transformer.final_attn_token_to_image.k_proj.bias"]   = dec.transformer_final_attn_token_to_img.k_b;
-            model.tensors["mask_decoder.transformer.final_attn_token_to_image.v_proj.weight"] = dec.transformer_final_attn_token_to_img.v_w;
-            model.tensors["mask_decoder.transformer.final_attn_token_to_image.v_proj.bias"]   = dec.transformer_final_attn_token_to_img.v_b;
-            model.tensors["mask_decoder.transformer.final_attn_token_to_image.out_proj.weight"] = dec.transformer_final_attn_token_to_img.out_w;
-            model.tensors["mask_decoder.transformer.final_attn_token_to_image.out_proj.bias"]   = dec.transformer_final_attn_token_to_img.out_b;
-
-            dec.transformer_norm_final_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans);
-            dec.transformer_norm_final_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans);
-
-            model.tensors["mask_decoder.transformer.norm_final_attn.weight"] = dec.transformer_norm_final_w;
-            model.tensors["mask_decoder.transformer.norm_final_attn.bias"]   = dec.transformer_norm_final_b;
-
-            dec.output_upscaling_0_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 2, 2, n_img_embd, n_enc_out_chans);
-            dec.output_upscaling_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_img_embd);
-            dec.output_upscaling_1_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_img_embd);
-            dec.output_upscaling_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_img_embd);
-            dec.output_upscaling_3_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16,  2, 2, n_img_embd/2, n_img_embd);
-            dec.output_upscaling_3_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_img_embd/2);
-
-            model.tensors["mask_decoder.output_upscaling.0.weight"] = dec.output_upscaling_0_w;
-            model.tensors["mask_decoder.output_upscaling.0.bias"]   = dec.output_upscaling_0_b;
-            model.tensors["mask_decoder.output_upscaling.1.weight"] = dec.output_upscaling_1_w;
-            model.tensors["mask_decoder.output_upscaling.1.bias"]   = dec.output_upscaling_1_b;
-            model.tensors["mask_decoder.output_upscaling.3.weight"] = dec.output_upscaling_3_w;
-            model.tensors["mask_decoder.output_upscaling.3.bias"]   = dec.output_upscaling_3_b;
-
-            const int n_hypernet_mpls_count = 4;
-            dec.output_hypernet_mlps.resize(n_hypernet_mpls_count);
-            for (int i = 0; i < n_hypernet_mpls_count; ++i) {
-                auto& mlp = dec.output_hypernet_mlps[i];
-
-                mlp.w_0 = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans, n_enc_out_chans);
-                mlp.b_0 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans);
-                mlp.w_1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans, n_enc_out_chans);
-                mlp.b_1 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans);
-                mlp.w_2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans, n_img_embd/2);
-                mlp.b_2 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_img_embd/2);
-
-                const auto prefix = "mask_decoder.output_hypernetworks_mlps." + std::to_string(i) + ".";
-                model.tensors[prefix + "layers.0.weight"] = mlp.w_0;
-                model.tensors[prefix + "layers.0.bias"]   = mlp.b_0;
-                model.tensors[prefix + "layers.1.weight"] = mlp.w_1;
-                model.tensors[prefix + "layers.1.bias"]   = mlp.b_1;
-                model.tensors[prefix + "layers.2.weight"] = mlp.w_2;
-                model.tensors[prefix + "layers.2.bias"]   = mlp.b_2;
-            }
-
-            dec.iou_prediction_head_0_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans, n_enc_out_chans);
-            dec.iou_prediction_head_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans);
-            dec.iou_prediction_head_1_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans, n_enc_out_chans);
-            dec.iou_prediction_head_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_enc_out_chans);
-            dec.iou_prediction_head_2_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_enc_out_chans, n_pt_embd);
-            dec.iou_prediction_head_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_pt_embd);
-
-            dec.iou_token_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_enc_out_chans, 1);
-            dec.mask_tokens_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_enc_out_chans, n_pt_embd);
-
-            model.tensors["mask_decoder.iou_prediction_head.layers.0.weight"] = dec.iou_prediction_head_0_w;
-            model.tensors["mask_decoder.iou_prediction_head.layers.0.bias"]   = dec.iou_prediction_head_0_b;
-            model.tensors["mask_decoder.iou_prediction_head.layers.1.weight"] = dec.iou_prediction_head_1_w;
-            model.tensors["mask_decoder.iou_prediction_head.layers.1.bias"]   = dec.iou_prediction_head_1_b;
-            model.tensors["mask_decoder.iou_prediction_head.layers.2.weight"] = dec.iou_prediction_head_2_w;
-            model.tensors["mask_decoder.iou_prediction_head.layers.2.bias"]   = dec.iou_prediction_head_2_b;
-
-            model.tensors["mask_decoder.iou_token.weight"] = dec.iou_token_w;
-            model.tensors["mask_decoder.mask_tokens.weight"] = dec.mask_tokens_w;
-        }
-    }
-
-    // load weights
-    {
-        int n_tensors = 0;
-        size_t total_size = 0;
-
-        fprintf(stderr, "%s: ", __func__);
-
-        while (true) {
-            int32_t n_dims;
-            int32_t length;
-            int32_t ftype;
-
-            fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
-            fin.read(reinterpret_cast<char *>(&length), sizeof(length));
-            fin.read(reinterpret_cast<char *>(&ftype),  sizeof(ftype));
-
-            if (fin.eof()) {
-                break;
-            }
-
-            int64_t nelements = 1;
-            int64_t ne[4] = { 1, 1, 1, 1 };
-            for (int i = 0; i < n_dims; ++i) {
-                int32_t ne_cur;
-                fin.read(reinterpret_cast<char *>(&ne_cur), sizeof(ne_cur));
-                ne[i] = ne_cur;
-                nelements *= ne[i];
-            }
-
-            std::string name(length, 0);
-            fin.read(&name[0], length);
-
-            if (model.tensors.find(name.data()) == model.tensors.end()) {
-                fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
-                return false;
-            }
-
-            auto tensor = model.tensors[name.data()];
-            //printf("ne0 = %jd, ne1 = %jd, ne2 = %jd, ne3 = %jd\n", ne[0], ne[1], ne[2], ne[3]);
-
-            if (ggml_nelements(tensor) != nelements) {
-                fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %d, expected %d\n",
-                        __func__, name.data(), (int) nelements, (int) ggml_nelements(tensor));
-                return false;
-            }
-
-            if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1] || tensor->ne[2] != ne[2] || tensor->ne[3] != ne[3]) {
-                fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d, %d, %d], expected [%d, %d, %d, %d]\n",
-                        __func__, name.data(),
-                        (int) ne[0], (int) ne[1], (int) ne[2], (int) ne[3],
-                        (int) tensor->ne[0], (int) tensor->ne[1], (int) tensor->ne[2], (int) tensor->ne[3]);
-                return false;
-            }
-
-            size_t bpe = 0;
-
-            switch (ftype) {
-                case 0: bpe = ggml_type_size(GGML_TYPE_F32);  break;
-                case 1: bpe = ggml_type_size(GGML_TYPE_F16);  break;
-                case 2: bpe = ggml_type_size(GGML_TYPE_Q4_0); assert(ne[0] % 64 == 0); break;
-                case 3: bpe = ggml_type_size(GGML_TYPE_Q4_1); assert(ne[0] % 64 == 0); break;
-                default:
-                        {
-                            fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
-                            return false;
-                        }
-            };
-
-            if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
-                fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
-                        __func__, name.data(), ggml_nbytes(tensor), (size_t) nelements*bpe);
-                return false;
-            }
-
-            fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
-
-            total_size += ggml_nbytes(tensor);
-            if (++n_tensors % 8 == 0) {
-                fprintf(stderr, ".");
-                fflush(stdout);
-            }
-        }
-
-        if (n_tensors != int(model.tensors.size())) {
-            fprintf(stderr, "%s: model file has %d tensors, but %d tensors were expected\n", __func__, n_tensors, (int) model.tensors.size());
-            return false;
-        }
-
-        fprintf(stderr, " done\n");
-
-        fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors);
-    }
-
-    fin.close();
-
-    return true;
-}
-
-struct ggml_tensor * sam_fill_dense_pe(
-            const sam_model   & model,
-          struct ggml_context * ctx0,
-          struct ggml_cgraph  * gf,
-                  sam_state   & state) {
-    const auto & hparams = model.hparams;
-    const auto & enc     = model.enc_prompt;
-
-    const int32_t n_img_embd = hparams.n_img_embd();
-    const float n_img_embd_inv = 1.0f / n_img_embd;
-
-    struct ggml_tensor * xy_embed_stacked = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 2, n_img_embd, n_img_embd);
-    ggml_allocr_alloc(state.allocr, xy_embed_stacked);
-
-    if (!ggml_allocr_is_measure(state.allocr)) {
-        float * data = (float *) ggml_get_data(xy_embed_stacked);
-        for (int i = 0; i < n_img_embd; ++i) {
-            const int row = 2*i*n_img_embd;
-            const float y_val = 2 * (i + 0.5f) * n_img_embd_inv - 1;
-            for (int j = 0; j < n_img_embd; ++j) {
-                const float x_val = 2 * (j + 0.5f) * n_img_embd_inv - 1;
-                data[row + 2*j + 0] = x_val;
-                data[row + 2*j + 1] = y_val;
-            }
-        }
-    }
-
-    struct ggml_tensor * cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, enc.pe)), xy_embed_stacked);
-
-    cur = ggml_scale(ctx0, cur, float(2.0*M_PI));
-
-    // concat
-    // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/prompt_encoder.py#L192
-    {
-        struct ggml_tensor * t_sin = ggml_map_custom1(ctx0, cur, ggml_sam_sin, GGML_N_TASKS_MAX, NULL);
-        struct ggml_tensor * t_cos = ggml_map_custom1(ctx0, cur, ggml_sam_cos, GGML_N_TASKS_MAX, NULL);
-
-        cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, t_sin->ne[0] + t_cos->ne[0], cur->ne[1], cur->ne[2]);
-
-        ggml_build_forward_expand(gf, ggml_cpy(ctx0, t_sin, ggml_view_3d(ctx0, cur, t_sin->ne[0], t_sin->ne[1], t_sin->ne[2], cur->nb[1], cur->nb[2], 0)));
-        ggml_build_forward_expand(gf, ggml_cpy(ctx0, t_cos, ggml_view_3d(ctx0, cur, t_sin->ne[0], t_sin->ne[1], t_sin->ne[2], cur->nb[1], cur->nb[2], t_sin->nb[1])));
-    }
-
-    struct ggml_tensor * pe_img_dense = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3));
-    ggml_build_forward_expand(gf, pe_img_dense);
-
-    return pe_img_dense;
-}
-
-struct ggml_tensor* sam_layer_norm_2d(
-                    struct ggml_context * ctx0,
-                    struct ggml_tensor  * layer,
-                    int                   n_channels,
-                    struct ggml_tensor  * w,
-                    struct ggml_tensor  * b,
-                    float                 eps) {
-    // LayerNorm2d
-    // normalize along channel dimmension
-    // TODO: better implementation
-    layer = ggml_permute(ctx0,
-                ggml_norm(ctx0, ggml_cont(ctx0, ggml_permute(ctx0, layer, 1, 2, 0, 3)), eps),
-                2, 0, 1, 3);
-
-    layer = ggml_add(ctx0,
-              ggml_mul(ctx0,
-                  ggml_repeat(ctx0, ggml_reshape_3d(ctx0, w, 1, 1, n_channels), layer),
-                  layer),
-              ggml_repeat(ctx0, ggml_reshape_3d(ctx0, b, 1, 1, n_channels), layer));
-
-    return layer;
-}
-
-struct ggml_cgraph  * sam_encode_image(
-            const sam_model & model,
-                  sam_state & state,
-        const sam_image_f32 & img) {
-
-    const auto & hparams = model.hparams;
-    const auto & enc     = model.enc_img;
-
-    const int32_t n_enc_state     = hparams.n_enc_state;
-    const int32_t n_enc_layer     = hparams.n_enc_layer;
-    const int32_t n_enc_head      = hparams.n_enc_head;
-    const int32_t n_enc_head_dim  = hparams.n_enc_head_dim();
-    const int32_t n_enc_out_chans = hparams.n_enc_out_chans;
-    const int32_t n_img_size    = hparams.n_img_size();
-    const int32_t n_window_size = hparams.n_window_size();
-
-    struct ggml_init_params ggml_params = {
-        /*.mem_size   =*/ state.buf_compute_img_enc.size(),
-        /*.mem_buffer =*/ state.buf_compute_img_enc.data(),
-        /*.no_alloc   =*/ true, // skip allocating as we use ggml_alloc to allocate exact memory requirements
-    };
-
-    struct ggml_context * ctx0   = ggml_init(ggml_params);
-    struct ggml_cgraph  * gf     = ggml_new_graph(ctx0);
-
-    struct ggml_tensor * inp = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_img_size, n_img_size, 3, 1);
-    ggml_allocr_alloc(state.allocr, inp);
-    if (!ggml_allocr_is_measure(state.allocr)) {
-        float * data = (float *) ggml_get_data(inp);
-
-        const int nx = img.nx;
-        const int ny = img.ny;
-        const int n  = nx*ny;
-
-        GGML_ASSERT(nx == n_img_size && ny == n_img_size);
-
-        for (int k = 0; k < 3; k++) {
-            for (int y = 0; y < ny; y++) {
-                for (int x = 0; x < nx; x++) {
-                    data[k*n + y*nx + x] = img.data[3*(y*nx + x) + k];
-                }
-            }
-        }
-    }
-
-    // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L392
-    struct ggml_tensor * cur = ggml_conv_2d_sk_p0(ctx0, enc.proj_w, inp);
-    cur = ggml_add_inplace(ctx0,
-            cur,
-            ggml_repeat(ctx0, enc.proj_b, cur));
-
-    // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L394
-    // keep in F32
-    cur = ggml_cont(ctx0,
-            ggml_permute(ctx0, cur, 1, 2, 0, 3));
-
-    // convert to F16
-    //cur = ggml_cpy(ctx0,
-    //        ggml_permute(ctx0, cur, 1, 2, 0, 3),
-    //        ggml_new_tensor_3d(ctx0, GGML_TYPE_F16, n_enc_state, n_img_embd, n_img_embd));
-
-    // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L108-L109
-    cur = ggml_add_inplace(ctx0, cur, enc.pe);
-
-    struct ggml_tensor * inpL = cur;
-
-    for (int il = 0; il < n_enc_layer; ++il) {
-        const auto & layer = enc.layers[il];
-
-        // norm
-        // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L168
-        {
-            cur = ggml_norm(ctx0, inpL, hparams.eps);
-
-            // cur = ln_0_w*cur + ln_0_b
-            cur = ggml_mul(ctx0, cur, layer.norm1_w);
-            cur = ggml_add_inplace(ctx0, cur, layer.norm1_b);
-        }
-
-        const int64_t w0 = cur->ne[1];
-        const int64_t h0 = cur->ne[2];
-
-        if (hparams.is_global_attn(il) == false) {
-            // local attention layer - apply window partition
-            // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L169-L172
-            cur = ggml_win_part(ctx0, cur, n_window_size);
-        }
-
-        const int64_t W = cur->ne[1];
-        const int64_t H = cur->ne[2];
-
-        // self-attention
-        {
-            cur = ggml_mul_mat(ctx0, layer.qkv_w, cur);
-            cur = ggml_add_inplace(ctx0, cur, layer.qkv_b);
-
-            // split qkv into separate tensors
-            // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L225-L229
-            const int B = cur->ne[3];
-
-            cur = ggml_reshape_4d(ctx0, cur, n_enc_state, 3, W*H, B);
-            cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 0, 3, 1, 2));
-
-            struct ggml_tensor * Q;
-            struct ggml_tensor * K;
-            struct ggml_tensor * V;
-
-            Q = ggml_view_3d   (ctx0, cur, n_enc_state, W*H, B, cur->nb[1], cur->nb[2], 0*cur->nb[3]);
-            Q = ggml_reshape_4d(ctx0, Q,   n_enc_head_dim, n_enc_head, W*H, B);
-            Q = ggml_cont      (ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
-            Q = ggml_reshape_3d(ctx0, Q,   n_enc_head_dim, W*H, B*n_enc_head);
-
-            K = ggml_view_3d   (ctx0, cur, n_enc_state, W*H, B, cur->nb[1], cur->nb[2], 1*cur->nb[3]);
-            K = ggml_reshape_4d(ctx0, K,   n_enc_head_dim, n_enc_head, W*H, B);
-            K = ggml_cont      (ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
-            K = ggml_reshape_3d(ctx0, K,   n_enc_head_dim, W*H, B*n_enc_head);
-
-            V = ggml_view_3d   (ctx0, cur, n_enc_state, W*H, B, cur->nb[1], cur->nb[2], 2*cur->nb[3]);
-            V = ggml_reshape_4d(ctx0, V,   n_enc_head_dim, n_enc_head, W*H, B);
-            V = ggml_cont      (ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3)); // transposed
-            V = ggml_reshape_3d(ctx0, V,   W*H, n_enc_head_dim, B*n_enc_head);
-
-            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
-
-            struct ggml_tensor * KQ_scaled =
-                ggml_scale_inplace(ctx0,
-                        KQ,
-                        1.0f/sqrtf(n_enc_head_dim));
-
-            struct ggml_tensor * rw = ggml_get_rel_pos(ctx0, layer.rel_pos_w, W, W);
-            struct ggml_tensor * rh = ggml_get_rel_pos(ctx0, layer.rel_pos_h, H, H);
-
-            struct ggml_tensor * q_r = ggml_reshape_4d(ctx0, Q, n_enc_head_dim, W, H, B*n_enc_head);
-
-            struct ggml_tensor * rel_w = ggml_cont(ctx0, ggml_permute(ctx0,
-                        ggml_mul_mat(ctx0,
-                            rw,
-                            ggml_cont(ctx0, ggml_permute(ctx0, q_r, 0, 2, 1, 3))),
-                        0, 2, 1, 3));
-            struct ggml_tensor * rel_h = ggml_mul_mat(ctx0, rh, q_r);
-
-            struct ggml_tensor * attn = ggml_add_rel_pos_inplace(ctx0, KQ_scaled, rel_w, rel_h);
-
-            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, attn);
-
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
-
-            cur =
-                ggml_reshape_4d(ctx0,
-                        ggml_cont(ctx0,
-                            ggml_permute(ctx0,
-                                ggml_reshape_4d(ctx0, KQV, n_enc_head_dim, W*H, n_enc_head, B),
-                                0, 2, 1, 3)),
-                        n_enc_state, W, H, B);
-
-            cur = ggml_mul_mat(ctx0, layer.proj_w, cur);
-            cur = ggml_add_inplace(ctx0, cur, layer.proj_b);
-        }
-
-        if (hparams.is_global_attn(il) == false) {
-            // local attention layer - reverse window partition
-            cur = ggml_win_unpart(ctx0, cur, w0, h0, n_window_size);
-        }
-
-        cur = ggml_add_inplace(ctx0, cur, inpL);
-
-        struct ggml_tensor * inpFF = cur;
-
-        // feed-forward network
-        {
-            // norm
-            {
-                cur = ggml_norm(ctx0, inpFF, hparams.eps);
-
-                // cur = mlp_ln_w*cur + mlp_ln_b
-                cur = ggml_mul(ctx0, cur, layer.norm2_w);
-                cur = ggml_add_inplace(ctx0, cur, layer.norm2_b);
-            }
-
-            // fully connected
-            cur = ggml_mul_mat(ctx0, layer.mlp_lin1_w, cur);
-            cur = ggml_add_inplace(ctx0, cur, layer.mlp_lin1_b);
-
-            // GELU activation
-            cur = ggml_gelu(ctx0, cur);
-
-            // projection
-            cur = ggml_mul_mat(ctx0, layer.mlp_lin2_w, cur);
-            cur = ggml_add_inplace(ctx0, cur, layer.mlp_lin2_b);
-        }
-
-        inpL = ggml_add(ctx0, cur, inpFF);
-    }
-
-    cur = ggml_cont(ctx0, ggml_permute(ctx0, inpL, 2, 0, 1, 3));
-
-    cur = ggml_conv_2d_sk_p0(ctx0, enc.neck_conv_0, cur);
-
-    cur = sam_layer_norm_2d(ctx0, cur, n_enc_out_chans, enc.neck_norm_0_w, enc.neck_norm_0_b, hparams.eps);
-
-    cur = ggml_conv_2d_s1_ph(ctx0, enc.neck_conv_1, cur);
-
-    cur = sam_layer_norm_2d(ctx0, cur, n_enc_out_chans, enc.neck_norm_1_w, enc.neck_norm_1_b, hparams.eps);
-
-    cur = ggml_cpy(ctx0, cur, state.embd_img);
-
-    ggml_build_forward_expand(gf, cur);
-    ggml_disconnect_node_from_graph(state.embd_img);
-
-    //ggml_graph_print(&gf);
-
-    ggml_free(ctx0);
-
-    return gf;
-}
-
-
-struct prompt_encoder_result {
-    struct ggml_tensor * embd_prompt_sparse = {};
-    struct ggml_tensor * embd_prompt_dense = {};
-};
-
-// encode a prompt
-//
-// - points
-// - boxes
-// - masks
-//
-// TODO: currently just encode a single point for simplicity
-//
-prompt_encoder_result sam_encode_prompt(
-        const sam_model     & model,
-        struct ggml_context * ctx0,
-        struct ggml_cgraph  * gf,
-                  sam_state & state,
-                        int   nx,
-                        int   ny,
-                  sam_point   point) {
-
-    const auto & hparams = model.hparams;
-    const auto & enc = model.enc_prompt;
-
-    // transform points
-    // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/automatic_mask_generator.py#L276
-    {
-        const int nmax = std::max(nx, ny);
-
-        const float scale = hparams.n_img_size() / (float) nmax;
-
-        const int nx_new = int(nx*scale + 0.5f);
-        const int ny_new = int(ny*scale + 0.5f);
-
-        point.x = point.x*(float(nx_new)/nx) + 0.5f;
-        point.y = point.y*(float(ny_new)/ny) + 0.5f;
-    }
-
-    struct ggml_tensor * inp = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 2, 2);
-
-    ggml_allocr_alloc(state.allocr, inp);
-    if (!ggml_allocr_is_measure(state.allocr)) {
-        // set the input by converting the [0, 1] coordinates to [-1, 1]
-        float * data = (float *) inp->data;
-
-        data[0] = 2.0f*(point.x / hparams.n_img_size()) - 1.0f;
-        data[1] = 2.0f*(point.y / hparams.n_img_size()) - 1.0f;
-
-        // padding
-        // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/prompt_encoder.py#L81-L85
-        data[2] = 2.0f*(0.0f) - 1.0f;
-        data[3] = 2.0f*(0.0f) - 1.0f;
-    }
-
-    struct ggml_tensor * cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, enc.pe)), inp);
-
-    cur = ggml_scale(ctx0, cur, float(2.0*M_PI));
-
-    // concat
-    // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/prompt_encoder.py#L192
-    {
-        struct ggml_tensor * t_sin = ggml_map_custom1(ctx0, cur, ggml_sam_sin, GGML_N_TASKS_MAX, NULL);
-        struct ggml_tensor * t_cos = ggml_map_custom1(ctx0, cur, ggml_sam_cos, GGML_N_TASKS_MAX, NULL);
-
-        cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, t_sin->ne[0] + t_cos->ne[0], cur->ne[1]);
-
-        ggml_build_forward_expand(gf, ggml_cpy(ctx0, t_sin, ggml_view_2d(ctx0, cur, t_sin->ne[0], t_sin->ne[1], cur->nb[1], 0)));
-        ggml_build_forward_expand(gf, ggml_cpy(ctx0, t_cos, ggml_view_2d(ctx0, cur, t_sin->ne[0], t_sin->ne[1], cur->nb[1], t_sin->nb[1])));
-
-        // overwrite label == -1 with not_a_point_embed.weight
-        // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/prompt_encoder.py#L86
-        // TODO: extend for multiple points
-        ggml_build_forward_expand(gf, ggml_cpy(ctx0, enc.not_a_pt_embd_w, ggml_view_2d(ctx0, cur, cur->ne[0], 1, cur->nb[1], cur->nb[1])));
-    }
-
-    // add point_embeddings[1] to label == 1
-    // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/prompt_encoder.py#L90
-    struct ggml_tensor * v = ggml_view_2d(ctx0, cur, cur->ne[0], 1, cur->nb[1], 0);
-    ggml_build_forward_expand(gf, ggml_cpy(ctx0, ggml_add_inplace(ctx0, v, enc.pt_embd[1]), v));
-
-    struct ggml_tensor * embd_prompt_sparse = cur;
-    ggml_build_forward_expand(gf, embd_prompt_sparse);
-
-    struct ggml_tensor * embd_prompt_dense = ggml_repeat(ctx0,
-            ggml_cont(ctx0,
-                ggml_view_3d(ctx0, enc.no_mask_embd_w,
-                    1, 1, enc.no_mask_embd_w->ne[0], enc.no_mask_embd_w->nb[0], enc.no_mask_embd_w->nb[0], 0)),
-            ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hparams.n_img_embd(), hparams.n_img_embd(), hparams.n_enc_out_chans));
-
-    ggml_build_forward_expand(gf, embd_prompt_dense);
-
-    //printf("used_mem = %zu\n", ggml_used_mem(ctx0));
-
-    prompt_encoder_result res;
-    res.embd_prompt_sparse = embd_prompt_sparse;
-    res.embd_prompt_dense  = embd_prompt_dense;
-    return res;
-}
-
-struct ggml_tensor* sam_decode_mask_transformer_attn(
-    const sam_layer_dec_transformer_attn & attn,
-                      struct ggml_tensor * queries,
-                      struct ggml_tensor * keys,
-                      struct ggml_tensor * values,
-                     struct ggml_context * ctx0,
-                         const sam_model & model) {
-    const auto & hparams = model.hparams;
-    const int n_head = hparams.n_dec_heads;
-
-    struct ggml_tensor * Qcur = {};
-    struct ggml_tensor * Kcur = {};
-    struct ggml_tensor * Vcur = {};
-
-    Qcur = ggml_mul_mat(ctx0, attn.q_w, queries);
-    Qcur = ggml_add_inplace(ctx0, Qcur, attn.q_b);
-
-    Kcur = ggml_mul_mat(ctx0, attn.k_w, keys);
-    Kcur = ggml_add_inplace(ctx0, Kcur, attn.k_b);
-
-    Vcur = ggml_mul_mat(ctx0, attn.v_w, values);
-    Vcur = ggml_add_inplace(ctx0, Vcur, attn.v_b);
-
-    struct ggml_tensor * Q = {};
-    struct ggml_tensor * K = {};
-    struct ggml_tensor * V = {};
-
-    Q = ggml_reshape_4d(ctx0, Qcur, Qcur->ne[0]/n_head, n_head, Qcur->ne[1], Qcur->ne[2]);
-    Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
-
-    K = ggml_reshape_4d(ctx0, Kcur, Kcur->ne[0]/n_head, n_head, Kcur->ne[1], Kcur->ne[2]);
-    K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
-
-    V = ggml_reshape_4d(ctx0, Vcur, Vcur->ne[0]/n_head, n_head, Vcur->ne[1], Vcur->ne[2]);
-    V = ggml_cont(ctx0, ggml_permute(ctx0, V, 0, 2, 1, 3));
-
-    // Q * K
-    struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
-
-    struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, 1.0f/sqrt(float(Q->ne[0])));
-
-    struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_scaled);
-
-    struct ggml_tensor * KQV = ggml_mul_mat(ctx0, KQ_soft_max, ggml_cont(ctx0, ggml_transpose(ctx0, V)));
-
-    struct ggml_tensor * KQV_merged = ggml_cont(ctx0, ggml_transpose(ctx0, KQV));
-    KQV_merged = ggml_cont(ctx0, ggml_permute(ctx0, KQV_merged, 0, 2, 1, 3));
-    KQV_merged = ggml_reshape_3d(ctx0, KQV_merged, KQV_merged->ne[0]*KQV_merged->ne[1], KQV_merged->ne[2], KQV_merged->ne[3]);
-    KQV_merged = ggml_mul_mat(ctx0, attn.out_w, KQV_merged);
-    KQV_merged = ggml_add_inplace(ctx0, KQV_merged, attn.out_b);
-
-    return KQV_merged;
-}
-
-struct ggml_tensor * sam_decode_mask_mlp_relu_3(
-     struct ggml_tensor * in,
-     struct ggml_tensor * w_0,
-     struct ggml_tensor * b_0,
-     struct ggml_tensor * w_1,
-     struct ggml_tensor * b_1,
-     struct ggml_tensor * w_2,
-     struct ggml_tensor * b_2,
-    struct ggml_context * ctx0) {
-
-    struct ggml_tensor * cur = {};
-    cur = ggml_mul_mat(ctx0, w_0, in);
-    cur = ggml_add_inplace(ctx0, cur, b_0);
-
-    cur = ggml_relu_inplace(ctx0, cur);
-
-    cur = ggml_mul_mat(ctx0, w_1, cur);
-    cur = ggml_add_inplace(ctx0, cur, b_1);
-
-    cur = ggml_relu_inplace(ctx0, cur);
-
-    cur = ggml_mul_mat(ctx0, w_2, cur);
-    cur = ggml_add_inplace(ctx0, cur, b_2);
-
-    return cur;
-}
-
-bool sam_decode_mask(
-                    const sam_model & model,
-        const prompt_encoder_result & prompt,
-                 struct ggml_tensor * pe_img,
-                struct ggml_context * ctx0,
-                struct ggml_cgraph  * gf,
-                          sam_state & state) {
-
-    const auto & hparams = model.hparams;
-    const auto & dec = model.dec;
-    const int n_img_embd = hparams.n_img_embd();
-
-    struct ggml_tensor * tokens = {};
-    {
-        // Concatenate output tokens
-        // ref: https://github.com/facebookresearch/segment-anything/blob/6fdee8f2727f4506cfbbe553e23b895e27956588/segment_anything/modeling/mask_decoder.py#L120
-        const auto& sparse = prompt.embd_prompt_sparse;
-
-        tokens = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, dec.iou_token_w->ne[0], dec.iou_token_w->ne[1] + dec.mask_tokens_w->ne[1] + sparse->ne[1], sparse->ne[2]);
-
-        const size_t offsets[3] = { 0, dec.iou_token_w->ne[1]*tokens->nb[1], dec.iou_token_w->ne[1]*tokens->nb[1] + dec.mask_tokens_w->ne[1]*tokens->nb[1] };
-        ggml_build_forward_expand(gf, ggml_cpy(ctx0, dec.iou_token_w,   ggml_view_2d(ctx0, tokens, tokens->ne[0], dec.iou_token_w->ne[1],   tokens->nb[1], offsets[0])));
-        ggml_build_forward_expand(gf, ggml_cpy(ctx0, dec.mask_tokens_w, ggml_view_2d(ctx0, tokens, tokens->ne[0], dec.mask_tokens_w->ne[1], tokens->nb[1], offsets[1])));
-        ggml_build_forward_expand(gf, ggml_cpy(ctx0, sparse,            ggml_view_2d(ctx0, tokens, tokens->ne[0], sparse->ne[1],            tokens->nb[1], offsets[2])));
-        // TODO: Sparse prompt embeddings can have more than one point
-    }
-
-
-    struct ggml_tensor * src = {};
-    struct ggml_tensor * pos_src = {};
-    int srcNE[4] = { 0, 0, 0, 0 };
-    {
-        // Expand per-image data in the batch direction to be per-mask
-        // ref: https://github.com/facebookresearch/segment-anything/blob/6fdee8f2727f4506cfbbe553e23b895e27956588/segment_anything/modeling/mask_decoder.py#L125
-        src = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, state.embd_img->ne[0], state.embd_img->ne[1], state.embd_img->ne[2], tokens->ne[2]);
-
-        src = ggml_add(ctx0,
-            ggml_repeat(ctx0,
-                state.embd_img,
-                src),
-            prompt.embd_prompt_dense);
-
-        srcNE[0] = src->ne[0];
-        srcNE[1] = src->ne[1];
-        srcNE[2] = src->ne[2];
-        srcNE[3] = src->ne[3];
-
-        // flatten & permute
-        // ref: https://github.com/facebookresearch/segment-anything/blob/6fdee8f2727f4506cfbbe553e23b895e27956588/segment_anything/modeling/transformer.py#L83
-        src = ggml_cont(ctx0, ggml_permute(ctx0,
-            ggml_view_3d(ctx0,
-                src,
-                src->ne[0]*src->ne[1],
-                src->ne[2],
-                src->ne[3],
-                src->nb[2],
-                src->nb[3],
-                0),
-            1, 0, 2, 3));
-
-        pos_src = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, pe_img->ne[0], pe_img->ne[1], pe_img->ne[2], tokens->ne[2]);
-        pos_src = ggml_repeat(ctx0,
-            pe_img,
-            pos_src);
-
-        // flatten & permute
-        // ref: https://github.com/facebookresearch/segment-anything/blob/6fdee8f2727f4506cfbbe553e23b895e27956588/segment_anything/modeling/transformer.py#L83
-        pos_src = ggml_cont(ctx0, ggml_permute(ctx0,
-            ggml_view_3d(ctx0,
-                pos_src,
-                pos_src->ne[0]*pos_src->ne[1],
-                pos_src->ne[2],
-                pos_src->ne[3],
-                pos_src->nb[2],
-                pos_src->nb[3],
-                0),
-            1, 0, 2, 3));
-    }
-
-    struct ggml_tensor * queries = tokens;
-    struct ggml_tensor * keys = src;
-    {
-        // Run the transformer
-        // ref: https://github.com/facebookresearch/segment-anything/blob/6fdee8f2727f4506cfbbe553e23b895e27956588/segment_anything/modeling/transformer.py#L62
-        for (int i = 0; i < int(model.dec.transformer_layers.size()); ++i) {
-            const auto& tfm_layer = model.dec.transformer_layers[i];
-
-            // Self attention block
-            // ref: https://github.com/facebookresearch/segment-anything/blob/6fdee8f2727f4506cfbbe553e23b895e27956588/segment_anything/modeling/transformer.py#L154
-            const bool skip_first_layer_pe = i == 0;
-            if (skip_first_layer_pe) {
-                queries = sam_decode_mask_transformer_attn(tfm_layer.self_attn, queries, queries, queries, ctx0, model);
-            }
-            else {
-                struct ggml_tensor * q_0 = ggml_add(ctx0, queries, tokens);
-
-                struct ggml_tensor * self_attn = sam_decode_mask_transformer_attn(tfm_layer.self_attn, q_0, q_0, queries, ctx0, model);
-                queries = ggml_add(ctx0, queries, self_attn);
-            }
-
-            queries = ggml_norm(ctx0, queries, hparams.eps_decoder_transformer);
-            queries = ggml_add_inplace(ctx0,
-                    ggml_mul(ctx0, queries, tfm_layer.norm1_w),
-                    tfm_layer.norm1_b);
-
-            // Cross attention block, tokens attending to image embedding
-            // ref: https://github.com/facebookresearch/segment-anything/blob/6fdee8f2727f4506cfbbe553e23b895e27956588/segment_anything/modeling/transformer.py#L163
-            struct ggml_tensor * q_1 = ggml_add(ctx0, queries, tokens);
-            struct ggml_tensor * k_1 = ggml_add(ctx0, keys, pos_src);
-
-            struct ggml_tensor * cross_attn_token_to_img = sam_decode_mask_transformer_attn(tfm_layer.cross_attn_token_to_img, q_1, k_1, keys, ctx0, model);
-
-            queries = ggml_add_inplace(ctx0, queries, cross_attn_token_to_img);
-            queries = ggml_norm_inplace(ctx0, queries, hparams.eps_decoder_transformer);
-            queries = ggml_add_inplace(ctx0,
-                    ggml_mul(ctx0, queries, tfm_layer.norm2_w),
-                    tfm_layer.norm2_b);
-
-            // MLP block
-            // ref: https://github.com/facebookresearch/segment-anything/blob/6fdee8f2727f4506cfbbe553e23b895e27956588/segment_anything/modeling/transformer.py#L170
-            struct ggml_tensor * mlp_out = ggml_mul_mat(ctx0,
-                tfm_layer.mlp_lin1_w,
-                queries);
-
-            mlp_out = ggml_add_inplace(ctx0, mlp_out, tfm_layer.mlp_lin1_b);
-
-            // RELU activation
-            mlp_out = ggml_relu_inplace(ctx0, mlp_out);
-            mlp_out = ggml_mul_mat(ctx0, tfm_layer.mlp_lin2_w, mlp_out);
-
-            mlp_out = ggml_add_inplace(ctx0, mlp_out, tfm_layer.mlp_lin2_b);
-
-            queries = ggml_add_inplace(ctx0, queries, mlp_out);
-            queries = ggml_norm_inplace(ctx0, queries, hparams.eps_decoder_transformer);
-            queries = ggml_add_inplace(ctx0,
-                    ggml_mul(ctx0, queries, tfm_layer.norm3_w),
-                    tfm_layer.norm3_b);
-
-            // Cross attention block, image embedding attending to tokens
-            // ref: https://github.com/facebookresearch/segment-anything/blob/6fdee8f2727f4506cfbbe553e23b895e27956588/segment_anything/modeling/transformer.py#L175
-            struct ggml_tensor * q_2 = ggml_add(ctx0, queries, tokens);
-            struct ggml_tensor * k_2 = ggml_add(ctx0, keys, pos_src);
-
-            struct ggml_tensor * cross_attn_img_to_token = sam_decode_mask_transformer_attn(tfm_layer.cross_attn_img_to_token, k_2, q_2, queries, ctx0, model);
-            keys = ggml_add_inplace(ctx0, keys, cross_attn_img_to_token);
-            keys = ggml_norm_inplace(ctx0, keys, hparams.eps_decoder_transformer);
-            keys = ggml_add_inplace(ctx0,
-                    ggml_mul(ctx0, keys, tfm_layer.norm4_w),
-                    tfm_layer.norm4_b);
-        }
-
-        // Apply the final attention layer from the points to the image
-        // ref: https://github.com/facebookresearch/segment-anything/blob/6fdee8f2727f4506cfbbe553e23b895e27956588/segment_anything/modeling/transformer.py#L99
-        struct ggml_tensor * q = ggml_add(ctx0, queries, tokens);
-        struct ggml_tensor * k = ggml_add(ctx0, keys, pos_src);
-
-        struct ggml_tensor * final_attn_token_to_img = sam_decode_mask_transformer_attn(dec.transformer_final_attn_token_to_img, q, k, keys, ctx0, model);
-
-        queries = ggml_add_inplace(ctx0, queries, final_attn_token_to_img);
-        queries = ggml_norm_inplace(ctx0, queries, hparams.eps_decoder_transformer);
-        queries = ggml_add_inplace(ctx0,
-                ggml_mul(ctx0, queries, dec.transformer_norm_final_w),
-                dec.transformer_norm_final_b);
-    }
-
-
-    struct ggml_tensor * iou_pred = ggml_view_2d(ctx0, queries, queries->ne[0], queries->ne[2], queries->nb[2], 0);
-    const int num_mask_tokens = 4; // num_multimask_outputs + 1
-    struct ggml_tensor * mask_tokens_out = ggml_view_3d(ctx0, queries, queries->ne[0], num_mask_tokens, queries->ne[2], queries->nb[1], num_mask_tokens*queries->nb[1], queries->nb[1]);
-
-    // Upscale mask embeddings and predict masks using the mask tokens
-    // ref: https://github.com/facebookresearch/segment-anything/blob/6fdee8f2727f4506cfbbe553e23b895e27956588/segment_anything/modeling/mask_decoder.py#L136
-    keys = ggml_cont(ctx0, ggml_transpose(ctx0, keys));
-    keys = ggml_view_4d(ctx0, keys, srcNE[0], srcNE[1], srcNE[2], srcNE[3], srcNE[0]*keys->nb[0], keys->nb[1], keys->nb[2], 0);
-    // ggml_build_forward_expand(gf, keys);
-    struct ggml_tensor * upscaled_embedding = {};
-    {
-        // ConvTranspose2d
-        keys = ggml_conv_transpose_2d_p0(ctx0, dec.output_upscaling_0_w, keys, 2);
-        ggml_allocr_alloc(state.allocr, keys); // TODO: This alloc shouldn't be needed
-        keys = ggml_add_inplace(ctx0, keys, ggml_repeat(ctx0,
-                                     ggml_reshape_3d(ctx0, dec.output_upscaling_0_b, 1, 1, dec.output_upscaling_0_b->ne[0]),
-                                     keys));
-
-        keys = sam_layer_norm_2d(ctx0, keys, n_img_embd, dec.output_upscaling_1_w, dec.output_upscaling_1_b, hparams.eps);
-
-        // GELU activation
-        keys = ggml_gelu_inplace(ctx0, keys);
-
-        // ConvTranspose2d
-        keys = ggml_conv_transpose_2d_p0(ctx0, dec.output_upscaling_3_w, keys, 2);
-        ggml_allocr_alloc(state.allocr, keys); // TODO: This alloc shouldn't be needed
-        keys = ggml_add_inplace(ctx0, ggml_repeat(ctx0,
-                                ggml_reshape_3d(ctx0, dec.output_upscaling_3_b, 1, 1, dec.output_upscaling_3_b->ne[0]),
-                                keys), keys);
-        // GELU activation
-        keys = ggml_gelu_inplace(ctx0, keys);
-        upscaled_embedding = ggml_reshape_3d(ctx0, keys, keys->ne[0]*keys->ne[1], keys->ne[2], keys->ne[3]);
-        upscaled_embedding = ggml_cont(ctx0, ggml_transpose(ctx0, upscaled_embedding)); // TODO: Shouldn't be needed
-    }
-
-    struct ggml_tensor * hyper_in = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_img_embd/2, num_mask_tokens, mask_tokens_out->ne[2]);
-
-    for (int i = 0; i < num_mask_tokens; ++i) {
-        const auto& mlp = dec.output_hypernet_mlps[i];
-        struct ggml_tensor * in = ggml_view_2d(ctx0, mask_tokens_out, mask_tokens_out->ne[0], mask_tokens_out->ne[2], mask_tokens_out->nb[1], i*mask_tokens_out->nb[1]);
-        struct ggml_tensor * out = sam_decode_mask_mlp_relu_3(in, mlp.w_0, mlp.b_0, mlp.w_1, mlp.b_1, mlp.w_2, mlp.b_2, ctx0);
-        ggml_build_forward_expand(gf, ggml_cpy(ctx0, out, ggml_view_2d(ctx0, hyper_in, hyper_in->ne[0], hyper_in->ne[2], hyper_in->nb[1], i*hyper_in->nb[1])));
-    }
-
-    struct ggml_tensor * masks = ggml_mul_mat(ctx0, hyper_in, upscaled_embedding);
-    masks = ggml_cont(ctx0, ggml_transpose(ctx0, masks)); // TODO: Shouldn't be needed
-    masks = ggml_reshape_4d(ctx0, masks, keys->ne[0], keys->ne[1], masks->ne[1], keys->ne[3]);
-
-    // Generate mask quality predictions
-    // ref: https://github.com/facebookresearch/segment-anything/blob/6fdee8f2727f4506cfbbe553e23b895e27956588/segment_anything/modeling/mask_decoder.py#L146
-    iou_pred = sam_decode_mask_mlp_relu_3(iou_pred, dec.iou_prediction_head_0_w, dec.iou_prediction_head_0_b, dec.iou_prediction_head_1_w, dec.iou_prediction_head_1_b, dec.iou_prediction_head_2_w, dec.iou_prediction_head_2_b, ctx0);
-
-    // Select the correct mask or masks for output
-    // ref: https://github.com/facebookresearch/segment-anything/blob/6fdee8f2727f4506cfbbe553e23b895e27956588/segment_anything/modeling/mask_decoder.py#L101
-    iou_pred = ggml_cpy(state.ctx, ggml_view_1d(ctx0, iou_pred, iou_pred->ne[0] - 1, iou_pred->nb[0]), state.iou_predictions);
-    masks = ggml_view_4d(ctx0, masks, masks->ne[0], masks->ne[1], masks->ne[2] - 1, masks->ne[3],
-                                      masks->nb[1], masks->nb[2], masks->nb[3], masks->nb[2] /* offset*/);
-    masks = ggml_cpy(state.ctx, masks, state.low_res_masks);
-
-    ggml_build_forward_expand(gf, masks);
-    ggml_build_forward_expand(gf, iou_pred);
-
-    ggml_disconnect_node_from_graph(state.low_res_masks);
-    ggml_disconnect_node_from_graph(state.iou_predictions);
-
-    return true;
-}
-
-bool sam_write_masks(const sam_hparams& hparams, int nx, int ny, const sam_state & state, const std::string & fname) {
-    if (state.low_res_masks->ne[2] == 0) return true;
-    if (state.low_res_masks->ne[2] != state.iou_predictions->ne[0]) {
-        printf("Error: number of masks (%d) does not match number of iou predictions (%d)\n", (int)state.low_res_masks->ne[2], (int)state.iou_predictions->ne[0]);
-        return false;
-    }
-
-    const int n_img_size = hparams.n_img_size();
-    const float mask_threshold = hparams.mask_threshold;
-    const float iou_threshold = hparams.iou_threshold;
-    const float stability_score_threshold = hparams.stability_score_threshold;
-    const float intersection_threshold = mask_threshold + hparams.stability_score_offset;
-    const float union_threshold = mask_threshold - hparams.stability_score_offset;
-
-    const int ne0 = state.low_res_masks->ne[0];
-    const int ne1 = state.low_res_masks->ne[1];
-    const int ne2 = state.low_res_masks->ne[2];
-
-    // Remove padding and upscale masks to the original image size.
-    // ref: https://github.com/facebookresearch/segment-anything/blob/efeab7296ab579d4a261e554eca80faf6b33924a/segment_anything/modeling/sam.py#L140
-
-    const float preprocess_scale = std::max(nx, ny) / float(n_img_size);
-    const int cropped_nx = int(nx / preprocess_scale + 0.5f);
-    const int cropped_ny = int(ny / preprocess_scale + 0.5f);
-
-    const float scale_x_1 = (float)ne0 / (float)n_img_size;
-    const float scale_y_1 = (float)ne1 / (float)n_img_size;
-
-    const float scale_x_2 = float(cropped_nx) / float(nx);
-    const float scale_y_2 = float(cropped_ny) / float(ny);
-
-    const auto iou_data = (float*)state.iou_predictions->data;
-
-    for (int i = 0; i < ne2; ++i) {
-        if (iou_threshold > 0.f && iou_data[i] < iou_threshold) {
-            printf("Skipping mask %d with iou %f below threshold %f\n", i, iou_data[i], iou_threshold);
-            continue; // Filtering masks with iou below the threshold
-        }
-
-        std::vector<float> mask_data(n_img_size*n_img_size);
-        {
-            const float* data = (float *) state.low_res_masks->data + i*ne0*ne1;
-
-            for (int iy = 0; iy < n_img_size; ++iy) {
-                for (int ix = 0; ix < n_img_size; ++ix) {
-                    const float sx = std::max(scale_x_1*(ix + 0.5f) - 0.5f, 0.0f);
-                    const float sy = std::max(scale_y_1*(iy + 0.5f) - 0.5f, 0.0f);
-
-                    const int x0 = std::max(0, (int)sx);
-                    const int y0 = std::max(0, (int)sy);
-
-                    const int x1 = std::min(x0 + 1, ne0 - 1);
-                    const int y1 = std::min(y0 + 1, ne1 - 1);
-
-                    const float dx = sx - x0;
-                    const float dy = sy - y0;
-
-                    const int j00 = y0*ne0 + x0;
-                    const int j01 = y0*ne0 + x1;
-                    const int j10 = y1*ne0 + x0;
-                    const int j11 = y1*ne0 + x1;
-
-                    const float v00 = data[j00];
-                    const float v01 = data[j01];
-                    const float v10 = data[j10];
-                    const float v11 = data[j11];
-
-                    const float v0 = (1-dx)*v00 + dx*v01;
-                    const float v1 = (1-dx)*v10 + dx*v11;
-
-                    const float v = (1-dy)*v0 + dy*v1;
-
-                    mask_data[iy*n_img_size + ix] = v;
-                }
-            }
-        }
-
-        int intersections = 0;
-        int unions = 0;
-        sam_image_u8 res;
-        int min_iy = ny;
-        int max_iy = 0;
-        int min_ix = nx;
-        int max_ix = 0;
-        {
-            const float* data = mask_data.data();
-
-            res.nx = nx;
-            res.ny = ny;
-            res.data.resize(nx*ny);
-
-            for (int iy = 0; iy < ny; ++iy) {
-                for (int ix = 0; ix < nx; ++ix) {
-                    const float sx = std::max(scale_x_2*(ix + 0.5f) - 0.5f, 0.0f);
-                    const float sy = std::max(scale_y_2*(iy + 0.5f) - 0.5f, 0.0f);
-
-                    const int x0 = std::max(0, (int)sx);
-                    const int y0 = std::max(0, (int)sy);
-
-                    const int x1 = std::min(x0 + 1, cropped_nx - 1);
-                    const int y1 = std::min(y0 + 1, cropped_ny - 1);
-
-                    const float dx = sx - x0;
-                    const float dy = sy - y0;
-
-                    const int j00 = y0*n_img_size + x0;
-                    const int j01 = y0*n_img_size + x1;
-                    const int j10 = y1*n_img_size + x0;
-                    const int j11 = y1*n_img_size + x1;
-
-                    const float v00 = data[j00];
-                    const float v01 = data[j01];
-                    const float v10 = data[j10];
-                    const float v11 = data[j11];
-
-                    const float v0 = (1-dx)*v00 + dx*v01;
-                    const float v1 = (1-dx)*v10 + dx*v11;
-
-                    const float v = (1-dy)*v0 + dy*v1;
-
-                    if (v > intersection_threshold) {
-                        intersections++;
-                    }
-                    if (v > union_threshold) {
-                        unions++;
-                    }
-                    if (v > mask_threshold) {
-                        min_iy = std::min(min_iy, iy);
-                        max_iy = std::max(max_iy, iy);
-                        min_ix = std::min(min_ix, ix);
-                        max_ix = std::max(max_ix, ix);
-
-                        res.data[iy*nx + ix] = 255;
-                    }
-                }
-            }
-        }
-
-        const float stability_score = float(intersections) / float(unions);
-        if (stability_score_threshold > 0.f && stability_score < stability_score_threshold) {
-            printf("Skipping mask %d with stability score %f below threshold %f\n", i, stability_score, stability_score_threshold);
-            continue; // Filtering masks with stability score below the threshold
-        }
-
-        printf("Mask %d: iou = %f, stability_score = %f, bbox (%d, %d), (%d, %d)\n",
-                i, iou_data[i], stability_score, min_ix, max_ix, min_iy, max_iy);
-
-        std::string filename = fname + std::to_string(i) + ".png";
-        if (!stbi_write_png(filename.c_str(), res.nx, res.ny, 1, res.data.data(), res.nx)) {
-            printf("%s: failed to write mask %s\n", __func__, filename.c_str());
-            return false;
-        }
-    }
-
-
-    return true;
-}
-
-struct ggml_cgraph  * sam_build_fast_graph(
-        const sam_model     & model,
-                  sam_state & state,
-                        int   nx,
-                        int   ny,
-                  sam_point   point) {
-
-    struct ggml_init_params ggml_params = {
-        /*.mem_size   =*/ state.buf_compute_fast.size(),
-        /*.mem_buffer =*/ state.buf_compute_fast.data(),
-        /*.no_alloc   =*/ true, // skip allocating as we use ggml_alloc to allocate exact memory requirements
-    };
-
-    struct ggml_context * ctx0   = ggml_init(ggml_params);
-    struct ggml_cgraph  * gf     = ggml_new_graph(ctx0);
-
-    prompt_encoder_result enc_res = sam_encode_prompt(model, ctx0, gf, state, nx, ny, point);
-    if (!enc_res.embd_prompt_sparse || !enc_res.embd_prompt_dense) {
-        fprintf(stderr, "%s: failed to encode prompt (%f, %f)\n", __func__, point.x, point.y);
-        return {};
-    }
-
-    struct ggml_tensor * pe_img_dense = sam_fill_dense_pe(model, ctx0, gf, state);
-    if (!pe_img_dense) {
-        fprintf(stderr, "%s: failed to get dense positional encoding\n", __func__);
-        return {};
-    }
-
-    if (!sam_decode_mask(model, enc_res, pe_img_dense, ctx0, gf, state)) {
-         fprintf(stderr, "%s: failed to decode mask\n", __func__);
-         return {};
-    }
-
-    ggml_free(ctx0);
-
-    return gf;
-}
-
-void sam_print_usage(int argc, char ** argv, const sam_params & params) {
-    fprintf(stderr, "usage: %s [options]\n", argv[0]);
-    fprintf(stderr, "\n");
-    fprintf(stderr, "options:\n");
-    fprintf(stderr, "  -h, --help            show this help message and exit\n");
-    fprintf(stderr, "  -s SEED, --seed SEED  RNG seed (default: -1)\n");
-    fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
-    fprintf(stderr, "  -m FNAME, --model FNAME\n");
-    fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
-    fprintf(stderr, "  -i FNAME, --inp FNAME\n");
-    fprintf(stderr, "                        input file (default: %s)\n", params.fname_inp.c_str());
-    fprintf(stderr, "  -o FNAME, --out FNAME\n");
-    fprintf(stderr, "                        mask file name prefix (default: %s)\n", params.fname_out.c_str());
-    fprintf(stderr, "SAM hyperparameters:\n");
-    fprintf(stderr, "  -mt FLOAT, --mask-threshold\n");
-    fprintf(stderr, "                        mask threshold (default: %f)\n", params.mask_threshold);
-    fprintf(stderr, "  -it FLOAT, --iou-threshold\n");
-    fprintf(stderr, "                        iou threshold (default: %f)\n", params.iou_threshold);
-    fprintf(stderr, "  -st FLOAT, --score-threshold\n");
-    fprintf(stderr, "                        score threshold (default: %f)\n", params.stability_score_threshold);
-    fprintf(stderr, "  -so FLOAT, --score-offset\n");
-    fprintf(stderr, "                        score offset (default: %f)\n", params.stability_score_offset);
-    fprintf(stderr, "  -e FLOAT, --epsilon\n");
-    fprintf(stderr, "                        epsilon (default: %f)\n", params.eps);
-    fprintf(stderr, "  -ed FLOAT, --epsilon-decoder-transformer\n");
-    fprintf(stderr, "                        epsilon decoder transformer (default: %f)\n", params.eps_decoder_transformer);
-    fprintf(stderr, "SAM prompt:\n");
-    fprintf(stderr, "  -p TUPLE, --point-prompt\n");
-    fprintf(stderr, "                        point to be used as prompt for SAM (default: %f,%f). Must be in a format FLOAT,FLOAT \n", params.pt.x, params.pt.y);
-    fprintf(stderr, "\n");
-}
-
-bool sam_params_parse(int argc, char ** argv, sam_params & params) {
-    for (int i = 1; i < argc; i++) {
-        std::string arg = argv[i];
-
-        if (arg == "-s" || arg == "--seed") {
-            params.seed = std::stoi(argv[++i]);
-        } else if (arg == "-t" || arg == "--threads") {
-            params.n_threads = std::stoi(argv[++i]);
-        } else if (arg == "-m" || arg == "--model") {
-            params.model = argv[++i];
-        } else if (arg == "-i" || arg == "--inp") {
-            params.fname_inp = argv[++i];
-        } else if (arg == "-o" || arg == "--out") {
-            params.fname_out = argv[++i];
-        } else if (arg == "-mt" || arg == "--mask-threshold") {
-            params.mask_threshold = std::stof(argv[++i]);
-        } else if (arg == "-it" || arg == "--iou-threshold") {
-            params.iou_threshold = std::stof(argv[++i]);
-        } else if (arg == "-st" || arg == "--score-threshold") {
-            params.stability_score_threshold = std::stof(argv[++i]);
-        } else if (arg == "-so" || arg == "--score-offset") {
-            params.stability_score_offset = std::stof(argv[++i]);
-        } else if (arg == "-e" || arg == "--epsilon") {
-            params.eps = std::stof(argv[++i]);
-        } else if (arg == "-ed" || arg == "--epsilon-decoder-transformer") {
-            params.eps_decoder_transformer = std::stof(argv[++i]);
-        } else if (arg == "-p" || arg == "--point-prompt") {
-            // TODO multiple points per model invocation
-            char* point = argv[++i];
-
-            char* coord = strtok(point, ",");
-            if (!coord){
-                fprintf(stderr, "Error while parsing prompt!\n");
-                exit(1);
-            }
-            params.pt.x = std::stof(coord);
-            coord = strtok(NULL, ",");
-            if (!coord){
-                fprintf(stderr, "Error while parsing prompt!\n");
-                exit(1);
-            }
-            params.pt.y = std::stof(coord);
-        } else if (arg == "-h" || arg == "--help") {
-            sam_print_usage(argc, argv, params);
-            exit(0);
-        } else {
-            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
-            sam_print_usage(argc, argv, params);
-            exit(0);
-        }
-    }
-
-    return true;
-}
-
-int main(int argc, char ** argv) {
-    const int64_t t_main_start_us = ggml_time_us();
-
-    sam_params params;
-    params.model = "models/sam-vit-b/ggml-model-f16.bin";
-
-    sam_model model;
-    sam_state state;
-    int64_t t_load_us = 0;
-
-    if (sam_params_parse(argc, argv, params) == false) {
-        return 1;
-    }
-
-    if (params.seed < 0) {
-        params.seed = time(NULL);
-    }
-    fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
-
-    // load the image
-    sam_image_u8 img0;
-    if (!sam_image_load_from_file(params.fname_inp, img0)) {
-        fprintf(stderr, "%s: failed to load image from '%s'\n", __func__, params.fname_inp.c_str());
-        return 1;
-    }
-    fprintf(stderr, "%s: loaded image '%s' (%d x %d)\n", __func__, params.fname_inp.c_str(), img0.nx, img0.ny);
-
-    // preprocess to f32
-    sam_image_f32 img1;
-    if (!sam_image_preprocess(img0, img1)) {
-        fprintf(stderr, "%s: failed to preprocess image\n", __func__);
-        return 1;
-    }
-    fprintf(stderr, "%s: preprocessed image (%d x %d)\n", __func__, img1.nx, img1.ny);
-
-
-    // load the model
-    {
-        const int64_t t_start_us = ggml_time_us();
-
-        if (!sam_model_load(params, model)) {
-            fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
-            return 1;
-        }
-
-        t_load_us = ggml_time_us() - t_start_us;
-    }
-
-    {
-        static size_t buf_size = 256u*1024*1024;
-
-        struct ggml_init_params ggml_params = {
-            /*.mem_size   =*/ buf_size,
-            /*.mem_buffer =*/ NULL,
-            /*.no_alloc   =*/ false,
-        };
-
-        state.ctx = ggml_init(ggml_params);
-
-        state.embd_img = ggml_new_tensor_3d(state.ctx, GGML_TYPE_F32,
-                model.hparams.n_img_embd(), model.hparams.n_img_embd(), model.hparams.n_enc_out_chans);
-
-        state.low_res_masks = ggml_new_tensor_3d(state.ctx, GGML_TYPE_F32,
-                model.hparams.n_enc_out_chans, model.hparams.n_enc_out_chans, 3);
-
-        state.iou_predictions = ggml_new_tensor_1d(state.ctx, GGML_TYPE_F32, 3);
-    }
-
-
-    static const size_t tensor_alignment = 32;
-    {
-        state.buf_compute_img_enc.resize(ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead());
-        state.allocr = ggml_allocr_new_measure(tensor_alignment);
-        struct ggml_cgraph * gf_measure = sam_encode_image(model, state, img1);
-        if (!gf_measure) {
-            fprintf(stderr, "%s: failed to encode image\n", __func__);
-            return 1;
-        }
-
-        size_t alloc_size = ggml_allocr_alloc_graph(state.allocr, gf_measure) + tensor_alignment;
-        ggml_allocr_free(state.allocr);
-
-        // recreate allocator with exact memory requirements
-        state.buf_alloc_img_enc.resize(alloc_size);
-        state.allocr = ggml_allocr_new(state.buf_alloc_img_enc.data(), state.buf_alloc_img_enc.size(), tensor_alignment);
-
-        // compute the graph with the measured exact memory requirements from above
-        ggml_allocr_reset(state.allocr);
-
-        struct ggml_cgraph  * gf = sam_encode_image(model, state, img1);
-        if (!gf) {
-            fprintf(stderr, "%s: failed to encode image\n", __func__);
-            return 1;
-        }
-
-        ggml_allocr_alloc_graph(state.allocr, gf);
-
-        ggml_graph_compute_helper(state.work_buffer, gf, params.n_threads);
-
-        print_t_f32("embd_img", state.embd_img);
-
-        ggml_allocr_free(state.allocr);
-        state.allocr = NULL;
-        state.work_buffer.clear();
-    }
-    {
-        state.buf_compute_fast.resize(ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead());
-        state.allocr = ggml_allocr_new_measure(tensor_alignment);
-
-        // TODO: more varied prompts
-        fprintf(stderr, "prompt: (%f, %f)\n", params.pt.x, params.pt.y);
-
-        // measure memory requirements for the graph
-        struct ggml_cgraph  * gf_measure = sam_build_fast_graph(model, state, img0.nx, img0.ny, params.pt);
-        if (!gf_measure) {
-            fprintf(stderr, "%s: failed to build fast graph to measure\n", __func__);
-            return 1;
-        }
-
-        size_t alloc_size = ggml_allocr_alloc_graph(state.allocr, gf_measure) + tensor_alignment;
-        ggml_allocr_free(state.allocr);
-
-        // recreate allocator with exact memory requirements
-        state.buf_alloc_fast.resize(alloc_size);
-        state.allocr = ggml_allocr_new(state.buf_alloc_fast.data(), state.buf_alloc_fast.size(), tensor_alignment);
-
-        // compute the graph with the measured exact memory requirements from above
-        ggml_allocr_reset(state.allocr);
-
-        struct ggml_cgraph  * gf = sam_build_fast_graph(model, state, img0.nx, img0.ny, params.pt);
-        if (!gf) {
-            fprintf(stderr, "%s: failed to build fast graph\n", __func__);
-            return 1;
-        }
-
-        ggml_allocr_alloc_graph(state.allocr, gf);
-
-        ggml_graph_compute_helper(state.work_buffer, gf, params.n_threads);
-
-        //print_t_f32("iou_predictions", state.iou_predictions);
-        //print_t_f32("low_res_masks", state.low_res_masks);
-        ggml_allocr_free(state.allocr);
-        state.allocr = NULL;
-    }
-
-    if (!sam_write_masks(model.hparams, img0.nx, img0.ny, state, params.fname_out)) {
-        fprintf(stderr, "%s: failed to write masks\n", __func__);
-        return 1;
-    }
-
-    // report timing
-    {
-        const int64_t t_main_end_us = ggml_time_us();
-
-        fprintf(stderr, "\n\n");
-        fprintf(stderr, "%s:     load time = %8.2f ms\n", __func__, t_load_us/1000.0f);
-        fprintf(stderr, "%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
-    }
-
-    ggml_free(model.ctx);
-
-    return 0;
-}
diff --git a/ggml/examples/starcoder/CMakeLists.txt b/ggml/examples/starcoder/CMakeLists.txt
deleted file mode 100644
index f7b849e..0000000
--- a/ggml/examples/starcoder/CMakeLists.txt
+++ /dev/null
@@ -1,24 +0,0 @@
-#
-# starcoder
-
-set(TEST_TARGET starcoder)
-add_executable(${TEST_TARGET} main.cpp)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)
-
-#
-# starcoder-quantize
-
-set(TEST_TARGET starcoder-quantize)
-add_executable(${TEST_TARGET} quantize.cpp)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)
-
-#
-# For GPU offloading
-
-if (GGML_CUBLAS)
-    add_compile_definitions(GGML_USE_CUBLAS)
-endif()
-if (GGML_CLBLAST)
-    add_compile_definitions(GGML_USE_CLBLAST)
-endif()
-
diff --git a/ggml/examples/starcoder/README.md b/ggml/examples/starcoder/README.md
deleted file mode 100644
index ea64c4d..0000000
--- a/ggml/examples/starcoder/README.md
+++ /dev/null
@@ -1,115 +0,0 @@
-# 💫 StarCoder
-
-This is a C++ example running 💫 StarCoder inference using the [ggml](https://github.com/ggerganov/ggml) library.
-
-The program runs on the CPU - no video card is required.
-
-The example supports the following 💫 StarCoder models:
-
-- `bigcode/starcoder`
-- `bigcode/gpt_bigcode-santacoder` aka the smol StarCoder
-
-Sample performance on MacBook M1 Pro:
-
-TODO
-
-
-Sample output:
-
-```
-$ ./bin/starcoder -h
-usage: ./bin/starcoder [options]
-
-options:
-  -h, --help            show this help message and exit
-  -s SEED, --seed SEED  RNG seed (default: -1)
-  -t N, --threads N     number of threads to use during computation (default: 8)
-  -p PROMPT, --prompt PROMPT
-                        prompt to start generation with (default: random)
-  -n N, --n_predict N   number of tokens to predict (default: 200)
-  --top_k N             top-k sampling (default: 40)
-  --top_p N             top-p sampling (default: 0.9)
-  --temp N              temperature (default: 1.0)
-  -b N, --batch_size N  batch size for prompt processing (default: 8)
-  -m FNAME, --model FNAME
-                        model path (default: models/starcoder-117M/ggml-model.bin)
-
-$ ./bin/starcoder -m ../models/bigcode/gpt_bigcode-santacoder-ggml-q4_1.bin -p "def fibonnaci(" -t 4 --top_k 0 --top_p 0.95 --temp 0.2
-main: seed = 1683881276
-starcoder_model_load: loading model from '../models/bigcode/gpt_bigcode-santacoder-ggml-q4_1.bin'
-starcoder_model_load: n_vocab = 49280
-starcoder_model_load: n_ctx   = 2048
-starcoder_model_load: n_embd  = 2048
-starcoder_model_load: n_head  = 16
-starcoder_model_load: n_layer = 24
-starcoder_model_load: ftype   = 3
-starcoder_model_load: ggml ctx size = 1794.90 MB
-starcoder_model_load: memory size =   768.00 MB, n_mem = 49152
-starcoder_model_load: model size  =  1026.83 MB
-main: prompt: 'def fibonnaci('
-main: number of tokens in prompt = 7, first 8 tokens: 563 24240 78 2658 64 2819 7
-
-def fibonnaci(n):
-    if n == 0:
-        return 0
-    elif n == 1:
-        return 1
-    else:
-        return fibonacci(n-1) + fibonacci(n-2)
-
-print(fibo(10))
-
-main: mem per token =  9597928 bytes
-main:     load time =   480.43 ms
-main:   sample time =    26.21 ms
-main:  predict time =  3987.95 ms / 19.36 ms per token
-main:    total time =  4580.56 ms
-```
-
-## Quick start
-```bash
-git clone https://github.com/ggerganov/ggml
-cd ggml
-
-# Install Python dependencies
-python3 -m pip install -r requirements.txt
-
-# Convert HF model to ggml
-python examples/starcoder/convert-hf-to-ggml.py bigcode/gpt_bigcode-santacoder
-
-# Build ggml + examples
-mkdir build && cd build
-cmake .. && make -j4 starcoder starcoder-quantize
-
-# quantize the model
-./bin/starcoder-quantize ../models/bigcode/gpt_bigcode-santacoder-ggml.bin ../models/bigcode/gpt_bigcode-santacoder-ggml-q4_1.bin 3
-
-# run inference
-./bin/starcoder -m ../models/bigcode/gpt_bigcode-santacoder-ggml-q4_1.bin -p "def fibonnaci(" --top_k 0 --top_p 0.95 --temp 0.2
-```
-
-
-## Downloading and converting the original models (💫 StarCoder)
-
-You can download the original model and convert it to `ggml` format using the script `convert-hf-to-ggml.py`:
-
-```
-# Convert HF model to ggml
-python examples/starcoder/convert-hf-to-ggml.py bigcode/gpt_bigcode-santacoder
-```
-
-This conversion requires that you have python and Transformers installed on your computer.
-
-## Quantizing the models
-
-You can also try to quantize the `ggml` models via 4-bit integer quantization.
-
-```
-# quantize the model
-./bin/starcoder-quantize ../models/bigcode/gpt_bigcode-santacoder-ggml.bin ../models/bigcode/gpt_bigcode-santacoder-ggml-q4_1.bin 3
-```
-
-| Model | Original size | Quantized size | Quantization type |
-| --- | --- | --- | --- |
-| `bigcode/gpt_bigcode-santacoder` | 5396.45 MB | 1026.83 MB | 4-bit integer (q4_1) |
-| `bigcode/starcoder` | 71628.23 MB | 13596.23 MB | 4-bit integer (q4_1) |
diff --git a/ggml/examples/starcoder/convert-hf-to-ggml.py b/ggml/examples/starcoder/convert-hf-to-ggml.py
deleted file mode 100644
index 30af75c..0000000
--- a/ggml/examples/starcoder/convert-hf-to-ggml.py
+++ /dev/null
@@ -1,208 +0,0 @@
-# Convert HF models to ggml format
-#
-
-import sys
-import struct
-import json
-import torch
-import numpy as np
-import re
-import os
-import argparse
-
-from transformers import AutoModelForCausalLM
-from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, BloomForCausalLM
-
-# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
-def bytes_to_unicode():
-    """
-    Returns list of utf-8 byte and a corresponding list of unicode strings.
-    The reversible bpe codes work on unicode strings.
-    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
-    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
-    This is a signficant percentage of your normal, say, 32K bpe vocab.
-    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
-    And avoids mapping to whitespace/control characters the bpe code barfs on.
-    """
-    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
-    cs = bs[:]
-    n = 0
-    for b in range(2**8):
-        if b not in bs:
-            bs.append(b)
-            cs.append(2**8+n)
-            n += 1
-    cs = [chr(n) for n in cs]
-    return dict(zip(bs, cs))
-
-parser = argparse.ArgumentParser(description='Convert starcoder HF model to GGML')
-parser.add_argument('model_name_or_path', type=str, help='Name of model on HF hub, or local model folder')
-parser.add_argument('--outfile', type=str, default='ggml-model.bin', help='Path of GGML file to write.')
-parser.add_argument('--use_f32', action="store_true", help='Save GGML file in fp32')
-
-args = parser.parse_args()
-
-# use 16-bit or 32-bit floats
-use_f16 = not args.use_f32
-
-fname_out = args.outfile
-fname_dir = os.path.dirname(fname_out)
-if fname_dir:
-    os.makedirs(fname_dir, exist_ok=True)
-
-print("Loading model: ", args.model_name_or_path)
-tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
-config = AutoConfig.from_pretrained(args.model_name_or_path, trust_remote_code=True)
-hparams = config.to_dict()
-model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path, config=config, torch_dtype=torch.float16 if use_f16 else torch.float32, low_cpu_mem_usage=True, trust_remote_code=True, offload_state_dict=True)
-print("Model loaded: ", args.model_name_or_path)
-
-list_vars = model.state_dict()
-
-encoder = tokenizer.vocab
-# Add added_tokens (special tokens) to the encoder
-encoder.update(tokenizer.get_added_vocab())
-print(hparams)
-
-print("Saving ggml model to: ", fname_out)
-fout = open(fname_out, "wb")
-
-fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
-vocab_size = hparams["vocab_size"]
-fout.write(struct.pack("i", vocab_size))
-# fout.write(struct.pack("i", len(encoder)))
-fout.write(struct.pack("i", hparams["n_positions"]))
-fout.write(struct.pack("i", hparams["n_embd"]))
-fout.write(struct.pack("i", hparams["n_head"]))
-fout.write(struct.pack("i", hparams["n_layer"]))
-fout.write(struct.pack("i", use_f16))
-
-byte_encoder = bytes_to_unicode()
-byte_decoder = {v:k for k, v in byte_encoder.items()}
-
-fout.write(struct.pack("i", vocab_size))
-
-counter = 0
-# sort by value
-for key in sorted(encoder, key=encoder.get):
-    text = bytearray([byte_decoder[c] for c in key])
-    fout.write(struct.pack("i", len(text)))
-    fout.write(text)
-    counter += 1
-
-# TODO: Repeat last token until vocab_size
-while counter < vocab_size:
-    fout.write(struct.pack("i", len(text)))
-    fout.write(text)
-    counter += 1
-# assert counter == config.vocab_size
-
-for name in list_vars.keys():
-    data = list_vars[name].squeeze().numpy()
-    print("Processing variable: " + name + " with shape: ", data.shape)
-
-    # rename headers to keep compatibility
-    if name == "transformer.ln_f.weight":
-        name = "model/ln_f/g"
-    elif name == "transformer.ln_f.bias":
-        name = "model/ln_f/b"
-    elif name == "transformer.wte.weight":
-        name = "model/wte"
-    elif name == "transformer.wpe.weight":
-        name = "model/wpe"
-    elif name == "lm_head.weight":
-        name = "model/lm_head"
-    elif re.match(r"transformer.h\.\d+\.ln_1\.weight", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/ln_1/g"
-    elif re.match(r"transformer.h\.\d+\.ln_1\.bias", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/ln_1/b"
-    elif re.match(r"transformer.h\.\d+\.attn\.c_attn\.weight", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/attn/c_attn/w"
-    elif re.match(r"transformer.h\.\d+\.attn\.c_attn\.bias", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/attn/c_attn/b"
-    elif re.match(r"transformer.h\.\d+\.attn\.c_proj\.weight", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/attn/c_proj/w"
-    elif re.match(r"transformer.h.\d+.attn.c_proj.bias", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/attn/c_proj/b"
-    elif re.match(r"transformer.h.\d+.ln_2.weight", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/ln_2/g"
-    elif re.match(r"transformer.h.\d+.ln_2.bias", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/ln_2/b"
-    elif re.match(r"transformer.h.\d+.mlp.c_fc.weight", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/mlp/c_fc/w"
-    elif re.match(r"transformer.h.\d+.mlp.c_fc.bias", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/mlp/c_fc/b"
-    elif re.match(r"transformer.h.\d+.mlp.c_proj.weight", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/mlp/c_proj/w"
-    elif re.match(r"transformer.h.\d+.mlp.c_proj.bias", name):
-        i = re.findall("\d+", name)[0]
-        name = f"model/h{i}/mlp/c_proj/b"
-    else:
-        print("Unrecognized variable name. %s", name)
-
-    # we don't need these
-    if name.endswith("attn.masked_bias") or name.endswith(".attn.bias"):
-        print("  Skipping variable: " + name)
-        continue
-
-    n_dims = len(data.shape);
-
-    # ftype == 0 -> float32, ftype == 1 -> float16
-    ftype = 0;
-    if use_f16:
-        if (name == "model/wte" or name == "model/lm_head" or name[-2:] == "/g" or name[-2:] == "/w") and n_dims == 2:
-            print("  Converting to float16")
-            data = data.astype(np.float16)
-            ftype = 1
-        else:
-            print("  Converting to float32")
-            data = data.astype(np.float32)
-            ftype = 0
-
-    "model/h.*/attn/c_attn/w"
-    "model/h.*/attn/c_proj/w"
-    "model/h.*/mlp/c_fc/w"
-    "model/h.*/mlp/c_proj/w"
-    if name[-14:] == "/attn/c_attn/w" or name[-14:] == "/attn/c_attn/b":
-        print("  Duplicate K,V heads to use MHA instead of MQA")
-
-        embed_dim = hparams["n_embd"]
-        head_dim = embed_dim // hparams["n_head"]
-
-        # ((n_heads + 2) * head_dim, hidden_dim) -> (3 * n_heads * head_dim, hidden_dim)
-        q, k ,v = np.split(data, (hparams["n_head"] * head_dim, (hparams["n_head"] + 1) * head_dim), axis=0)
-        # duplicate k, v along the first axis (head_dim, hidden_dim) -> (n_heads * head_dim, hidden_dim)
-        if len(k.shape) == 2:
-            k = np.tile(k, (hparams["n_head"], 1))
-            v = np.tile(v, (hparams["n_head"], 1))
-        elif len(k.shape) == 1:
-            k = np.tile(k, (hparams["n_head"]))
-            v = np.tile(v, (hparams["n_head"]))
-        # concat q, k, v along the first axis (n_heads * head_dim, hidden_dim) -> (3 * n_heads * head_dim, hidden_dim)
-        data = np.concatenate((q, k, v), axis=0)
-
-    # header
-    str = name.encode('utf-8')
-    fout.write(struct.pack("iii", n_dims, len(str), ftype))
-    for i in range(n_dims):
-        fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
-    fout.write(str);
-
-    # data
-    data.tofile(fout)
-
-fout.close()
-
-print("Done. Output file: " + fname_out)
-print("")
diff --git a/ggml/examples/starcoder/main.cpp b/ggml/examples/starcoder/main.cpp
deleted file mode 100644
index b11cbb7..0000000
--- a/ggml/examples/starcoder/main.cpp
+++ /dev/null
@@ -1,924 +0,0 @@
-#include "ggml/ggml.h"
-
-#include "common.h"
-#include "common-ggml.h"
-
-#include <cassert>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <fstream>
-#include <map>
-#include <string>
-#include <vector>
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-// default hparams (GPT-2 117M)
-// https://huggingface.co/bigcode/gpt_bigcode-santacoder/blob/main/config.json
-struct starcoder_hparams {
-    int32_t n_vocab = 49280;
-    int32_t n_ctx   = 2048;
-    int32_t n_embd  = 2048;
-    int32_t n_head  = 16;
-    int32_t n_layer = 24;
-    int32_t ftype   = 1;
-    float   eps     = 1e-5f;
-};
-
-struct starcoder_layer {
-    // normalization
-    struct ggml_tensor * ln_1_g;
-    struct ggml_tensor * ln_1_b;
-
-    struct ggml_tensor * ln_2_g;
-    struct ggml_tensor * ln_2_b;
-
-    // attention
-    struct ggml_tensor * c_attn_attn_w;
-    struct ggml_tensor * c_attn_attn_b;
-
-    struct ggml_tensor * c_attn_proj_w;
-    struct ggml_tensor * c_attn_proj_b;
-
-    // mlp
-    struct ggml_tensor * c_mlp_fc_w;
-    struct ggml_tensor * c_mlp_fc_b;
-
-    struct ggml_tensor * c_mlp_proj_w;
-    struct ggml_tensor * c_mlp_proj_b;
-};
-
-struct starcoder_model {
-    starcoder_hparams hparams;
-
-    // normalization
-    struct ggml_tensor * ln_f_g;
-    struct ggml_tensor * ln_f_b;
-
-    struct ggml_tensor * wte;     // position embedding
-    struct ggml_tensor * wpe;     //    token embedding
-    struct ggml_tensor * lm_head; // language model head
-
-    std::vector<starcoder_layer> layers;
-
-    // key + value memory
-    struct ggml_tensor * memory_k;
-    struct ggml_tensor * memory_v;
-
-    //
-    struct ggml_context * ctx;
-    std::map<std::string, struct ggml_tensor *> tensors;
-};
-
-// load the model's weights from a file
-bool starcoder_model_load(const std::string & fname, starcoder_model & model, gpt_vocab & vocab) {
-    printf("%s: loading model from '%s'\n", __func__, fname.c_str());
-
-    auto fin = std::ifstream(fname, std::ios::binary);
-    if (!fin) {
-        fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
-        return false;
-    }
-
-    // verify magic
-    {
-        uint32_t magic;
-        fin.read((char *) &magic, sizeof(magic));
-        if (magic != GGML_FILE_MAGIC) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
-            return false;
-        }
-    }
-
-    // load hparams
-    {
-        auto & hparams = model.hparams;
-
-        fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
-        fin.read((char *) &hparams.n_ctx,   sizeof(hparams.n_ctx));
-        fin.read((char *) &hparams.n_embd,  sizeof(hparams.n_embd));
-        fin.read((char *) &hparams.n_head,  sizeof(hparams.n_head));
-        fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-        fin.read((char *) &hparams.ftype,   sizeof(hparams.ftype));
-
-        const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
-
-        printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
-        printf("%s: n_ctx   = %d\n", __func__, hparams.n_ctx);
-        printf("%s: n_embd  = %d\n", __func__, hparams.n_embd);
-        printf("%s: n_head  = %d\n", __func__, hparams.n_head);
-        printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
-        printf("%s: ftype   = %d\n", __func__, hparams.ftype);
-        printf("%s: qntvr   = %d\n", __func__, qntvr);
-
-        hparams.ftype %= GGML_QNT_VERSION_FACTOR;
-    }
-
-    // load vocab
-    {
-        int32_t n_vocab = 0;
-        fin.read((char *) &n_vocab, sizeof(n_vocab));
-
-        if (n_vocab != model.hparams.n_vocab) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
-                    __func__, fname.c_str(), n_vocab, model.hparams.n_vocab);
-            return false;
-        }
-
-        std::string word;
-        std::vector<char> buf(128);
-
-        for (int i = 0; i < n_vocab; i++) {
-            uint32_t len;
-            fin.read((char *) &len, sizeof(len));
-
-            buf.resize(len);
-            fin.read((char *) buf.data(), len);
-            word.assign(buf.data(), len);
-
-            vocab.token_to_id[word] = i;
-            vocab.id_to_token[i] = word;
-
-            // if (i < 10) fprintf(stderr, "%.s: vocab[%d] = '%s'\n", __func__, i, word.c_str());
-        }
-
-        // Add StarChat special tokens.
-        for (std::string token : {
-                "<|system|>",
-                "<|user|>",
-                "<|assistant|>",
-                "<|end|>",
-                "<fim-prefix>",
-                "<fim-middle>",
-                "<fim-suffix>",
-                "<fim-pad>",
-                "<|end_of_turn|>"
-            }) {
-            if (vocab.token_to_id.find(token) != vocab.token_to_id.end()) {
-                vocab.add_special_token(token);
-            }
-        }
-    }
-
-    // for the big tensors, we have the option to store the data in 16-bit floats or quantized
-    // in order to save memory and also to speed up the computation
-    ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
-    if (wtype == GGML_TYPE_COUNT) {
-        fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
-                __func__, fname.c_str(), model.hparams.ftype);
-        return false;
-    }
-
-    auto & ctx = model.ctx;
-
-    size_t ctx_size = 0;
-
-    {
-        const auto & hparams = model.hparams;
-
-        const int n_embd  = hparams.n_embd;
-        const int n_layer = hparams.n_layer;
-        const int n_ctx   = hparams.n_ctx;
-        const int n_vocab = hparams.n_vocab;
-
-        const int head_dim = n_embd / hparams.n_head;
-        const int kv_heads = hparams.n_head; // 1 if MQA else hparams.n_head
-        const int kv_dim   = kv_heads * head_dim;
-
-        ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_g
-        ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_b
-
-        ctx_size += n_vocab*ggml_row_size(wtype,         n_embd); // wte
-        ctx_size +=   n_ctx*ggml_row_size(GGML_TYPE_F32, n_embd); // wpe
-        ctx_size += n_vocab*ggml_row_size(wtype,         n_embd); // lm_head
-
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_g
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_b
-
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_g
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_b
-
-        ctx_size += n_layer*((n_embd + 2*kv_dim)*ggml_row_size(wtype,         n_embd)); // c_attn_attn_w // TODO:
-        ctx_size += n_layer*((n_embd + 2*kv_dim)*ggml_row_size(GGML_TYPE_F32, 1));      // c_attn_attn_b
-
-        ctx_size += n_layer*(ggml_row_size(wtype,         n_embd*n_embd)); // c_attn_proj_w
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd));        // c_attn_proj_b
-
-        ctx_size += n_layer*(ggml_row_size(wtype,         4*n_embd*n_embd)); // c_mlp_fc_w
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 4*n_embd));        // c_mlp_fc_b
-
-        ctx_size += n_layer*(ggml_row_size(wtype,         4*n_embd*n_embd));         // c_mlp_proj_w
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32,   n_embd)); // c_mlp_proj_b
-
-        ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_k
-        ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_v
-
-        ctx_size += (6 + 12*n_layer)*512; // object overhead
-
-        printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
-    }
-
-    // create the ggml context
-    {
-        struct ggml_init_params params = {
-            /*.mem_size   =*/ ctx_size,
-            /*.mem_buffer =*/ NULL,
-            /*.no_alloc   =*/ false,
-        };
-
-        model.ctx = ggml_init(params);
-        if (!model.ctx) {
-            fprintf(stderr, "%s: ggml_init() failed\n", __func__);
-            return false;
-        }
-    }
-
-    // prepare memory for the weights
-    {
-        const auto & hparams = model.hparams;
-
-        const int n_embd  = hparams.n_embd;
-        const int n_layer = hparams.n_layer;
-        const int n_ctx   = hparams.n_ctx;
-        const int n_vocab = hparams.n_vocab;
-
-        const int head_dim = n_embd / hparams.n_head;
-        const int kv_heads = hparams.n_head; // 1 if MQA else hparams.n_head
-        const int kv_dim   = kv_heads * head_dim;
-
-        model.layers.resize(n_layer);
-
-        model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-        model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-
-        model.wte     = ggml_new_tensor_2d(ctx, wtype,         n_embd, n_vocab);
-        model.wpe     = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx);
-        model.lm_head = ggml_new_tensor_2d(ctx, wtype,         n_embd, n_vocab);
-
-        // map by name
-        model.tensors["model/ln_f/g"] = model.ln_f_g;
-        model.tensors["model/ln_f/b"] = model.ln_f_b;
-
-        model.tensors["model/wte"]     = model.wte;
-        model.tensors["model/wpe"]     = model.wpe;
-        model.tensors["model/lm_head"] = model.lm_head;
-
-        for (int i = 0; i < n_layer; ++i) {
-            auto & layer = model.layers[i];
-
-            layer.ln_1_g        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-            layer.ln_1_b        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            layer.ln_2_g        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-            layer.ln_2_b        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype,           n_embd, n_embd + 2*kv_dim);
-            layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd + 2*kv_dim);
-
-            layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype,           n_embd, n_embd);
-            layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            layer.c_mlp_fc_w    = ggml_new_tensor_2d(ctx, wtype,           n_embd, 4*n_embd); //TODO: 4*n_embd = config.n_inner
-            layer.c_mlp_fc_b    = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
-
-            layer.c_mlp_proj_w  = ggml_new_tensor_2d(ctx, wtype,         4*n_embd, n_embd);
-            layer.c_mlp_proj_b  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            // map by name
-            model.tensors["model/h" + std::to_string(i) + "/ln_1/g"]        = layer.ln_1_g;
-            model.tensors["model/h" + std::to_string(i) + "/ln_1/b"]        = layer.ln_1_b;
-
-            model.tensors["model/h" + std::to_string(i) + "/ln_2/g"]        = layer.ln_2_g;
-            model.tensors["model/h" + std::to_string(i) + "/ln_2/b"]        = layer.ln_2_b;
-
-            model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/w"] = layer.c_attn_attn_w;
-            model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/b"] = layer.c_attn_attn_b;
-
-            model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/w"] = layer.c_attn_proj_w;
-            model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/b"] = layer.c_attn_proj_b;
-
-            model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"]    = layer.c_mlp_fc_w;
-            model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"]    = layer.c_mlp_fc_b;
-
-            model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"]  = layer.c_mlp_proj_w;
-            model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"]  = layer.c_mlp_proj_b;
-        }
-    }
-
-    // key + value memory
-    {
-        const auto & hparams = model.hparams;
-
-        const int n_embd  = hparams.n_embd;
-        const int n_layer = hparams.n_layer;
-        const int n_ctx   = hparams.n_ctx;
-
-        const int n_mem      = n_layer*n_ctx;
-        const int n_elements = n_embd*n_mem;
-
-        model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
-        model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
-
-        const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
-
-        printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem);
-    }
-
-    // load weights
-    {
-        size_t total_size = 0;
-
-        bool has_lm_head = false;
-
-        while (true) {
-            int32_t n_dims;
-            int32_t length;
-            int32_t ttype;
-
-            fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
-            fin.read(reinterpret_cast<char *>(&length), sizeof(length));
-            fin.read(reinterpret_cast<char *>(&ttype),  sizeof(ttype));
-
-            if (fin.eof()) {
-                break;
-            }
-
-            int32_t nelements = 1;
-            int32_t ne[2] = { 1, 1 };
-            for (int i = 0; i < n_dims; ++i) {
-                fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
-                nelements *= ne[i];
-            }
-
-            std::string name(length, 0);
-            fin.read(&name[0], length);
-
-            if (model.tensors.find(name) == model.tensors.end()) {
-                fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str());
-                return false;
-            }
-
-            auto tensor = model.tensors[name];
-            if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
-                fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
-                        __func__, name.c_str(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]);
-                return false;
-            }
-            if (ggml_nelements(tensor) != nelements) {
-                fprintf(stderr, "%s: tensor '%s' has wrong size in model file. got %d, expected %d\n",
-                        __func__, name.c_str(), (int) ggml_nelements(tensor), nelements);
-                return false;
-            }
-
-            // for debugging
-            if (0) {
-                printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
-            }
-
-            const size_t bpe = ggml_type_size(ggml_type(ttype));
-
-            if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
-                fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
-                        __func__, name.c_str(), ggml_nbytes(tensor), nelements*bpe);
-                return false;
-            }
-
-            fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
-
-            // GPT-2 models share the WTE tensor as the LM head
-            if (name == "model/wte" && has_lm_head == false) {
-                memcpy(model.lm_head->data, tensor->data, ggml_nbytes(tensor));
-            }
-
-            if (name == "model/lm_head") {
-                has_lm_head = true;
-            }
-
-            total_size += ggml_nbytes(tensor);
-        }
-
-        printf("%s: model size  = %8.2f MB\n", __func__, total_size/1024.0/1024.0);
-    }
-
-    fin.close();
-
-    return true;
-}
-
-// evaluate the transformer
-//
-//   - model:     the model
-//   - n_threads: number of threads to use
-//   - n_past:    the context size so far
-//   - embd_inp:  the embeddings of the tokens in the context
-//   - embd_w:    the predicted logits for the next token
-//
-bool starcoder_eval(
-        const starcoder_model & model,
-        const int n_threads,
-        const int n_past,
-        const std::vector<gpt_vocab::id> & embd_inp,
-              std::vector<float>         & embd_w,
-              size_t                     & mem_per_token) {
-    const int N = embd_inp.size();
-
-    const auto & hparams = model.hparams;
-
-    const int n_embd  = hparams.n_embd;
-    const int n_layer = hparams.n_layer;
-    const int n_ctx   = hparams.n_ctx;
-    const int n_head  = hparams.n_head;
-    const int n_vocab = hparams.n_vocab;
-
-    static size_t buf_size = 256u*1024*1024;
-    static void * buf = malloc(buf_size);
-
-    // use 2 scratch buffers
-    // TODO: very hacky solution - reimplement in a more elegant way
-    static size_t scr0_size = 256u*1024*1024;
-    static void * scr0 = malloc(scr0_size);
-
-    static size_t scr1_size = 256u*1024*1024;
-    static void * scr1 = malloc(scr1_size);
-
-    if (mem_per_token > 0 && mem_per_token*N > buf_size) {
-        const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
-        //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
-
-        // reallocate
-        buf_size = buf_size_new;
-        buf = realloc(buf, buf_size);
-        if (buf == nullptr) {
-            fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
-            return false;
-        }
-    }
-
-    struct ggml_init_params params = {
-        /*.mem_size   =*/ buf_size,
-        /*.mem_buffer =*/ buf,
-        /*.no_alloc   =*/ false,
-    };
-
-    struct ggml_context * ctx0 = ggml_init(params);
-    struct ggml_cgraph * gf = ggml_new_graph(ctx0);
-
-    struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
-    memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
-
-    struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
-    for (int i = 0; i < N; ++i) {
-        ((int32_t *) position->data)[i] = n_past + i;
-    }
-
-    // wte + wpe
-    struct ggml_tensor * inpL =
-        ggml_add(ctx0,
-                ggml_get_rows(ctx0, model.wte, embd),
-                ggml_get_rows(ctx0, model.wpe, position));
-
-    for (int il = 0; il < n_layer; ++il) {
-        struct ggml_tensor * cur;
-
-        ggml_set_scratch(ctx0, { 0, scr0_size, scr0, });
-
-        // norm
-        {
-            // [ 768, N]
-            cur = ggml_norm(ctx0, inpL, hparams.eps);
-
-            // cur = ln_1_g*cur + ln_1_b
-            // [ 768, N]
-            cur = ggml_add(ctx0,
-                    ggml_mul(ctx0,
-                        ggml_repeat(ctx0, model.layers[il].ln_1_g, cur),
-                        cur),
-                    ggml_repeat(ctx0, model.layers[il].ln_1_b, cur));
-        }
-
-        // attn
-        // [2304, 768] - model.layers[il].c_attn_attn_w
-        // [2304,   1] - model.layers[il].c_attn_attn_b
-        // [ 768,   N] - cur (in)
-        // [2304,   N] - cur (out)
-        //
-        // cur = attn_w*cur + attn_b
-        // [2304, N]
-        {
-            cur = ggml_mul_mat(ctx0,
-                    model.layers[il].c_attn_attn_w,
-                    cur);
-
-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, model.layers[il].c_attn_attn_b, cur),
-                    cur);
-        }
-
-        // self-attention
-        {
-            struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
-            struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd);
-            struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd);
-
-            // store key and value to memory
-            if (N >= 1) {
-                struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
-                struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past));
-
-                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
-                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
-            }
-
-            // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
-            // [64, N, 12]
-            struct ggml_tensor * Q =
-                ggml_permute(ctx0,
-                        ggml_cpy(ctx0,
-                            Qcur,
-                            ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
-                        0, 2, 1, 3);
-
-            // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
-            // [64, n_past + N, 12]
-            struct ggml_tensor * K =
-                ggml_permute(ctx0,
-                        ggml_reshape_3d(ctx0,
-                            ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
-                            n_embd/n_head, n_head, n_past + N),
-                        0, 2, 1, 3); //TODO: need to be tiled
-
-            // GG: flash attention
-            //struct ggml_tensor * V =
-            //    ggml_cpy(ctx0,
-            //            ggml_permute(ctx0,
-            //                ggml_reshape_3d(ctx0,
-            //                    ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
-            //                    n_embd/n_head, n_head, n_past + N),
-            //                1, 2, 0, 3),
-            //            ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head));
-
-            //struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true);
-
-            // K * Q
-            // [n_past + N, N, 12]
-            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); //TODO: check if it broadcasts
-
-            // KQ_scaled = KQ / sqrt(n_embd/n_head)
-            // [n_past + N, N, 12]
-            struct ggml_tensor * KQ_scaled =
-                ggml_scale_inplace(ctx0, KQ, 1.0f/sqrt(float(n_embd)/n_head));
-
-            // KQ_masked = mask_past(KQ_scaled)
-            // [n_past + N, N, 12]
-            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
-
-            // KQ = soft_max(KQ_masked)
-            // [n_past + N, N, 12]
-            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
-
-            // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
-            // [n_past + N, 64, 12]
-            struct ggml_tensor * V_trans =
-                ggml_cpy(ctx0,
-                        ggml_permute(ctx0,
-                            ggml_reshape_3d(ctx0,
-                                ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
-                                n_embd/n_head, n_head, n_past + N),
-                            1, 2, 0, 3),
-                        ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd/n_head, n_head));
-
-            // KQV = transpose(V) * KQ_soft_max
-            // [64, N, 12]
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
-
-            // KQV_merged = KQV.permute(0, 2, 1, 3)
-            // [64, 12, N]
-            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
-
-            // cur = KQV_merged.contiguous().view(n_embd, N)
-            // [768, N]
-            cur = ggml_cpy(ctx0,
-                    KQV_merged,
-                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
-        }
-
-        // projection
-        // [ 768, 768] - model.layers[il].c_attn_proj_w
-        // [ 768,   1] - model.layers[il].c_attn_proj_b
-        // [ 768,   N] - cur (in)
-        // [ 768,   N] - cur (out)
-        //
-        // cur = proj_w*cur + proj_b
-        // [768, N]
-        {
-            cur = ggml_mul_mat(ctx0,
-                    model.layers[il].c_attn_proj_w,
-                    cur);
-
-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, model.layers[il].c_attn_proj_b, cur),
-                    cur);
-        }
-
-        // add the input
-        cur = ggml_add(ctx0, cur, inpL);
-
-        struct ggml_tensor * inpFF = cur;
-
-        ggml_set_scratch(ctx0, { 0, scr1_size, scr1, });
-
-        // feed-forward network
-        {
-            // norm
-            {
-                cur = ggml_norm(ctx0, inpFF, hparams.eps);
-
-                // cur = ln_2_g*cur + ln_2_b
-                // [ 768, N]
-                cur = ggml_add(ctx0,
-                        ggml_mul(ctx0,
-                            ggml_repeat(ctx0, model.layers[il].ln_2_g, cur),
-                            cur),
-                        ggml_repeat(ctx0, model.layers[il].ln_2_b, cur));
-            }
-
-            // fully connected
-            // [3072, 768] - model.layers[il].c_mlp_fc_w
-            // [3072,   1] - model.layers[il].c_mlp_fc_b
-            // [ 768,   N] - cur (in)
-            // [3072,   N] - cur (out)
-            //
-            // cur = fc_w*cur + fc_b
-            // [3072, N]
-            cur = ggml_mul_mat(ctx0,
-                    model.layers[il].c_mlp_fc_w,
-                    cur);
-
-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur),
-                    cur);
-
-            // GELU activation
-            // [3072, N]
-            cur = ggml_gelu(ctx0, cur);
-
-            // projection
-            // [ 768, 3072] - model.layers[il].c_mlp_proj_w
-            // [ 768,    1] - model.layers[il].c_mlp_proj_b
-            // [3072,    N] - cur (in)
-            // [ 768,    N] - cur (out)
-            //
-            // cur = proj_w*cur + proj_b
-            // [768, N]
-            cur = ggml_mul_mat(ctx0,
-                    model.layers[il].c_mlp_proj_w,
-                    cur);
-
-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur),
-                    cur);
-        }
-
-        // input for next layer
-        inpL = ggml_add(ctx0, cur, inpFF);
-    }
-
-    ggml_set_scratch(ctx0, { 0, scr0_size, scr0, });
-
-    // norm
-    {
-        // [ 768, N]
-        inpL = ggml_norm(ctx0, inpL, hparams.eps);
-
-        // inpL = ln_f_g*inpL + ln_f_b
-        // [ 768, N]
-        inpL = ggml_add(ctx0,
-                ggml_mul(ctx0,
-                    ggml_repeat(ctx0, model.ln_f_g, inpL),
-                    inpL),
-                ggml_repeat(ctx0, model.ln_f_b, inpL));
-    }
-
-    ggml_set_scratch(ctx0, { 0, 0, nullptr, });
-
-    // inpL = WTE * inpL
-    // [ 768, 50257] - model.lm_head
-    // [ 768, N]     - inpL
-    inpL = ggml_mul_mat(ctx0, model.lm_head, inpL);
-
-    // logits -> probs
-    //inpL = ggml_soft_max_inplace(ctx0, inpL);
-
-    // run the computation
-    ggml_build_forward_expand(gf, inpL);
-    ggml_graph_compute_with_ctx(ctx0, gf, n_threads);
-
-    //if (n_past%100 == 0) {
-    //    ggml_graph_print   (&gf);
-    //    ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
-    //}
-
-    //embd_w.resize(n_vocab*N);
-    //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
-
-    // return result just for the last token
-    embd_w.resize(n_vocab);
-    memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
-
-    if (mem_per_token == 0) {
-        mem_per_token = ggml_used_mem(ctx0)/N;
-    }
-    //printf("used_mem = %zu MB\n", ggml_used_mem(ctx0)/(1024*1024));
-
-    ggml_free(ctx0);
-
-    return true;
-}
-
-int main(int argc, char ** argv) {
-    ggml_time_init();
-
-    const int64_t t_main_start_us = ggml_time_us();
-
-    gpt_params params;
-
-    if (gpt_params_parse(argc, argv, params) == false) {
-        return 1;
-    }
-
-    if (params.seed < 0) {
-        params.seed = time(NULL);
-    }
-
-    printf("%s: seed = %d\n", __func__, params.seed);
-
-    std::mt19937 rng(params.seed);
-    if (params.prompt.empty()) {
-        params.prompt = gpt_random_prompt(rng);
-    }
-
-    int64_t t_load_us = 0;
-
-    gpt_vocab vocab;
-    starcoder_model model;
-
-    // load the model
-    {
-        const int64_t t_start_us = ggml_time_us();
-
-        if (!starcoder_model_load(params.model, model, vocab)) {
-            fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
-            return 1;
-        }
-
-        t_load_us = ggml_time_us() - t_start_us;
-
-        test_gpt_tokenizer(vocab, params.token_test);
-    }
-
-    if (params.repeat_last_n == -1) {
-        params.repeat_last_n = model.hparams.n_ctx;
-    }
-    printf("\n");
-    printf("%s: temp           = %.3f\n", __func__, params.temp);
-    printf("%s: top_k          = %d\n",   __func__, params.top_k);
-    printf("%s: top_p          = %.3f\n", __func__, params.top_p);
-    printf("%s: repeat_last_n  = %d\n",   __func__, params.repeat_last_n);
-    printf("%s: repeat_penalty = %.3f\n", __func__, params.repeat_penalty);
-
-    int n_past = 0;
-
-    int64_t t_sample_us  = 0;
-    int64_t t_predict_us = 0;
-
-    std::vector<float> logits;
-
-    std::vector<int32_t> last_n_tokens(model.hparams.n_ctx);
-    std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
-
-    // tokenize the prompt
-    std::vector<gpt_vocab::id> embd_inp = ::gpt_tokenize(vocab, params.prompt);
-
-    params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());
-
-    printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
-    printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
-    for (size_t i = 0; i < embd_inp.size(); i++) {
-        printf("%s: token[%zu] = %6d, %s\n", __func__, i, embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str());
-    }
-    printf("\n\n");
-
-    // Handle StarChat "<|end|>" and OpenCoder "<|end_of_turn>" tokens.
-    gpt_vocab::id starchat_end_token = -1;
-    {
-        const auto it = vocab.token_to_id.find("<|end|>");
-        if (it != vocab.token_to_id.end()) {
-            starchat_end_token = it->second;
-        } else {
-            const auto eot_token_id = vocab.token_to_id.find("<|end_of_turn|>");
-            if (eot_token_id != vocab.token_to_id.end()) {
-              starchat_end_token = eot_token_id->second;
-            }
-        }
-    }
-
-    // submit the input prompt token-by-token
-    // this reduces the memory usage during inference, at the cost of a bit of speed at the beginning
-    std::vector<gpt_vocab::id> embd;
-
-    // determine the required inference memory per token:
-    size_t mem_per_token = 0;
-    starcoder_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
-
-    for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) {
-        // predict
-        if (embd.size() > 0) {
-            const int64_t t_start_us = ggml_time_us();
-
-            if (!starcoder_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) {
-                printf("Failed to predict\n");
-                return 1;
-            }
-
-            t_predict_us += ggml_time_us() - t_start_us;
-        }
-
-        n_past += embd.size();
-        embd.clear();
-
-        if (i >= embd_inp.size()) {
-            // sample next token
-            const int   top_k = params.top_k;
-            const float top_p = params.top_p;
-            const float temp  = params.temp;
-
-            const int n_vocab = model.hparams.n_vocab;
-
-            gpt_vocab::id id = 0;
-
-            {
-                const int64_t t_start_sample_us = ggml_time_us();
-
-                id = gpt_sample_top_k_top_p_repeat(vocab, logits.data() + (logits.size() - n_vocab), last_n_tokens.data(), last_n_tokens.size(), top_k, top_p, temp, params.repeat_last_n, params.repeat_penalty, rng);
-                t_sample_us += ggml_time_us() - t_start_sample_us;
-            }
-
-            // add it to the context
-            embd.push_back(id);
-
-            last_n_tokens.erase(last_n_tokens.begin());
-            last_n_tokens.push_back(id);
-        } else {
-            // if here, it means we are still processing the input prompt
-            for (size_t k = i; k < embd_inp.size(); k++) {
-                embd.push_back(embd_inp[k]);
-
-                last_n_tokens.erase(last_n_tokens.begin());
-                last_n_tokens.push_back(embd_inp[k]);
-
-                if (int32_t(embd.size()) >= params.n_batch) {
-                    break;
-                }
-            }
-            i += embd.size() - 1;
-        }
-
-        // display text
-        for (auto id : embd) {
-            printf("%s", vocab.id_to_token[id].c_str());
-        }
-        fflush(stdout);
-
-        // check if model is santacoder
-        if (model.hparams.n_layer <= 30 && embd.back() == 49152) {
-            break;
-        }
-        // check if model is starcoder
-        else if (embd.back() == 0) { //TODO: this is only for starcoder
-            break;
-        }
-        // Handle StarChat "<|end|>" token.
-        else if (embd.back() == starchat_end_token && i >= embd_inp.size()) {
-            break;
-        }
-    }
-
-    // report timing
-    {
-        const int64_t t_main_end_us = ggml_time_us();
-
-        printf("\n\n");
-        printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token);
-        printf("%s:     load time = %8.2f ms\n", __func__, t_load_us/1000.0f);
-        printf("%s:   sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f);
-        printf("%s:  predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past);
-        printf("%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
-    }
-
-    ggml_free(model.ctx);
-
-    return 0;
-}
diff --git a/ggml/examples/starcoder/quantize.cpp b/ggml/examples/starcoder/quantize.cpp
deleted file mode 100644
index d3aee3f..0000000
--- a/ggml/examples/starcoder/quantize.cpp
+++ /dev/null
@@ -1,184 +0,0 @@
-#include "ggml/ggml.h"
-
-#include "common.h"
-#include "common-ggml.h"
-
-#include <cassert>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <fstream>
-#include <map>
-#include <string>
-#include <vector>
-#include <regex>
-
-// default hparams (GPT-2 117M)
-struct starcoder_hparams {
-    int32_t n_vocab = 49280;
-    int32_t n_ctx   = 2048;
-    int32_t n_embd  = 2048;
-    int32_t n_head  = 16;
-    int32_t n_layer = 24;
-    int32_t ftype   = 1;
-};
-
-// quantize a model
-bool starcoder_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) {
-    gpt_vocab vocab;
-
-    printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
-
-    auto finp = std::ifstream(fname_inp, std::ios::binary);
-    if (!finp) {
-        fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__, fname_inp.c_str());
-        return false;
-    }
-
-    auto fout = std::ofstream(fname_out, std::ios::binary);
-    if (!fout) {
-        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname_out.c_str());
-        return false;
-    }
-
-    // verify magic
-    {
-        uint32_t magic;
-        finp.read((char *) &magic, sizeof(magic));
-        if (magic != GGML_FILE_MAGIC) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str());
-            return false;
-        }
-
-        fout.write((char *) &magic, sizeof(magic));
-    }
-
-    starcoder_hparams hparams;
-
-    // load hparams
-    {
-        finp.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
-        finp.read((char *) &hparams.n_ctx,   sizeof(hparams.n_ctx));
-        finp.read((char *) &hparams.n_embd,  sizeof(hparams.n_embd));
-        finp.read((char *) &hparams.n_head,  sizeof(hparams.n_head));
-        finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-        finp.read((char *) &hparams.ftype,   sizeof(hparams.ftype));
-
-        const int32_t qntvr_src =    hparams.ftype / GGML_QNT_VERSION_FACTOR;
-        const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
-
-        printf("%s: n_vocab     = %d\n", __func__, hparams.n_vocab);
-        printf("%s: n_ctx       = %d\n", __func__, hparams.n_ctx);
-        printf("%s: n_embd      = %d\n", __func__, hparams.n_embd);
-        printf("%s: n_head      = %d\n", __func__, hparams.n_head);
-        printf("%s: n_layer     = %d\n", __func__, hparams.n_layer);
-        printf("%s: ftype (src) = %d\n", __func__, hparams.ftype);
-        printf("%s: qntvr (src) = %d\n", __func__, qntvr_src);
-        printf("%s: ftype (dst) = %d\n", __func__, ftype_dst);
-        printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
-
-        fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
-        fout.write((char *) &hparams.n_ctx,   sizeof(hparams.n_ctx));
-        fout.write((char *) &hparams.n_embd,  sizeof(hparams.n_embd));
-        fout.write((char *) &hparams.n_head,  sizeof(hparams.n_head));
-        fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-        fout.write((char *) &ftype_dst,       sizeof(ftype_dst));
-    }
-
-    // load vocab
-    {
-        int32_t n_vocab = 0;
-        finp.read ((char *) &n_vocab, sizeof(n_vocab));
-        fout.write((char *) &n_vocab, sizeof(n_vocab));
-
-        if (n_vocab != hparams.n_vocab) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
-                    __func__, fname_inp.c_str(), n_vocab, hparams.n_vocab);
-            return false;
-        }
-
-        std::string word;
-        for (int i = 0; i < n_vocab; i++) {
-            uint32_t len;
-            finp.read ((char *) &len, sizeof(len));
-            fout.write((char *) &len, sizeof(len));
-
-            word.resize(len);
-            finp.read ((char *) word.data(), len);
-            fout.write((char *) word.data(), len);
-
-            vocab.token_to_id[word] = i;
-            vocab.id_to_token[i] = word;
-        }
-    }
-
-    // regexes of tensor names to be quantized
-    const std::vector<std::string> to_quant = {
-        "model/wte",
-        "model/lm_head",
-        "model/h.*/attn/c_attn/w",
-        "model/h.*/attn/c_proj/w",
-        "model/h.*/mlp/c_fc/w",
-        "model/h.*/mlp/c_proj/w",
-    };
-
-    if (!ggml_common_quantize_0(finp, fout, ftype, to_quant, {})) {
-        fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str());
-        return false;
-    }
-
-    finp.close();
-    fout.close();
-
-    return true;
-}
-
-// usage:
-//  ./gpt-2-quantize models/gpt-2-117M/ggml-model.bin models/gpt-2-117M/ggml-model-quant.bin type
-//
-int main(int argc, char ** argv) {
-    if (argc != 4) {
-        fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
-        ggml_print_ftypes(stderr);
-        return 1;
-    }
-
-    // needed to initialize f16 tables
-    {
-        struct ggml_init_params params = { 0, NULL, false };
-        struct ggml_context * ctx = ggml_init(params);
-        ggml_free(ctx);
-    }
-
-    const std::string fname_inp = argv[1];
-    const std::string fname_out = argv[2];
-
-    const ggml_ftype ftype = ggml_parse_ftype(argv[3]);
-
-    const int64_t t_main_start_us = ggml_time_us();
-
-    int64_t t_quantize_us = 0;
-
-    // load the model
-    {
-        const int64_t t_start_us = ggml_time_us();
-
-        if (!starcoder_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
-            fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
-            return 1;
-        }
-
-        t_quantize_us = ggml_time_us() - t_start_us;
-    }
-
-    // report timing
-    {
-        const int64_t t_main_end_us = ggml_time_us();
-
-        printf("\n");
-        printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f);
-        printf("%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
-    }
-
-    return 0;
-}
diff --git a/ggml/examples/stb_image.h b/ggml/examples/stb_image.h
deleted file mode 100644
index 5e807a0..0000000
--- a/ggml/examples/stb_image.h
+++ /dev/null
@@ -1,7987 +0,0 @@
-/* stb_image - v2.28 - public domain image loader - http://nothings.org/stb
-                                  no warranty implied; use at your own risk
-
-   Do this:
-      #define STB_IMAGE_IMPLEMENTATION
-   before you include this file in *one* C or C++ file to create the implementation.
-
-   // i.e. it should look like this:
-   #include ...
-   #include ...
-   #include ...
-   #define STB_IMAGE_IMPLEMENTATION
-   #include "stb_image.h"
-
-   You can #define STBI_ASSERT(x) before the #include to avoid using assert.h.
-   And #define STBI_MALLOC, STBI_REALLOC, and STBI_FREE to avoid using malloc,realloc,free
-
-
-   QUICK NOTES:
-      Primarily of interest to game developers and other people who can
-          avoid problematic images and only need the trivial interface
-
-      JPEG baseline & progressive (12 bpc/arithmetic not supported, same as stock IJG lib)
-      PNG 1/2/4/8/16-bit-per-channel
-
-      TGA (not sure what subset, if a subset)
-      BMP non-1bpp, non-RLE
-      PSD (composited view only, no extra channels, 8/16 bit-per-channel)
-
-      GIF (*comp always reports as 4-channel)
-      HDR (radiance rgbE format)
-      PIC (Softimage PIC)
-      PNM (PPM and PGM binary only)
-
-      Animated GIF still needs a proper API, but here's one way to do it:
-          http://gist.github.com/urraka/685d9a6340b26b830d49
-
-      - decode from memory or through FILE (define STBI_NO_STDIO to remove code)
-      - decode from arbitrary I/O callbacks
-      - SIMD acceleration on x86/x64 (SSE2) and ARM (NEON)
-
-   Full documentation under "DOCUMENTATION" below.
-
-
-LICENSE
-
-  See end of file for license information.
-
-RECENT REVISION HISTORY:
-
-      2.28  (2023-01-29) many error fixes, security errors, just tons of stuff
-      2.27  (2021-07-11) document stbi_info better, 16-bit PNM support, bug fixes
-      2.26  (2020-07-13) many minor fixes
-      2.25  (2020-02-02) fix warnings
-      2.24  (2020-02-02) fix warnings; thread-local failure_reason and flip_vertically
-      2.23  (2019-08-11) fix clang static analysis warning
-      2.22  (2019-03-04) gif fixes, fix warnings
-      2.21  (2019-02-25) fix typo in comment
-      2.20  (2019-02-07) support utf8 filenames in Windows; fix warnings and platform ifdefs
-      2.19  (2018-02-11) fix warning
-      2.18  (2018-01-30) fix warnings
-      2.17  (2018-01-29) bugfix, 1-bit BMP, 16-bitness query, fix warnings
-      2.16  (2017-07-23) all functions have 16-bit variants; optimizations; bugfixes
-      2.15  (2017-03-18) fix png-1,2,4; all Imagenet JPGs; no runtime SSE detection on GCC
-      2.14  (2017-03-03) remove deprecated STBI_JPEG_OLD; fixes for Imagenet JPGs
-      2.13  (2016-12-04) experimental 16-bit API, only for PNG so far; fixes
-      2.12  (2016-04-02) fix typo in 2.11 PSD fix that caused crashes
-      2.11  (2016-04-02) 16-bit PNGS; enable SSE2 in non-gcc x64
-                         RGB-format JPEG; remove white matting in PSD;
-                         allocate large structures on the stack;
-                         correct channel count for PNG & BMP
-      2.10  (2016-01-22) avoid warning introduced in 2.09
-      2.09  (2016-01-16) 16-bit TGA; comments in PNM files; STBI_REALLOC_SIZED
-
-   See end of file for full revision history.
-
-
- ============================    Contributors    =========================
-
- Image formats                          Extensions, features
-    Sean Barrett (jpeg, png, bmp)          Jetro Lauha (stbi_info)
-    Nicolas Schulz (hdr, psd)              Martin "SpartanJ" Golini (stbi_info)
-    Jonathan Dummer (tga)                  James "moose2000" Brown (iPhone PNG)
-    Jean-Marc Lienher (gif)                Ben "Disch" Wenger (io callbacks)
-    Tom Seddon (pic)                       Omar Cornut (1/2/4-bit PNG)
-    Thatcher Ulrich (psd)                  Nicolas Guillemot (vertical flip)
-    Ken Miller (pgm, ppm)                  Richard Mitton (16-bit PSD)
-    github:urraka (animated gif)           Junggon Kim (PNM comments)
-    Christopher Forseth (animated gif)     Daniel Gibson (16-bit TGA)
-                                           socks-the-fox (16-bit PNG)
-                                           Jeremy Sawicki (handle all ImageNet JPGs)
- Optimizations & bugfixes                  Mikhail Morozov (1-bit BMP)
-    Fabian "ryg" Giesen                    Anael Seghezzi (is-16-bit query)
-    Arseny Kapoulkine                      Simon Breuss (16-bit PNM)
-    John-Mark Allen
-    Carmelo J Fdez-Aguera
-
- Bug & warning fixes
-    Marc LeBlanc            David Woo          Guillaume George     Martins Mozeiko
-    Christpher Lloyd        Jerry Jansson      Joseph Thomson       Blazej Dariusz Roszkowski
-    Phil Jordan                                Dave Moore           Roy Eltham
-    Hayaki Saito            Nathan Reed        Won Chun
-    Luke Graham             Johan Duparc       Nick Verigakis       the Horde3D community
-    Thomas Ruf              Ronny Chevalier                         github:rlyeh
-    Janez Zemva             John Bartholomew   Michal Cichon        github:romigrou
-    Jonathan Blow           Ken Hamada         Tero Hanninen        github:svdijk
-    Eugene Golushkov        Laurent Gomila     Cort Stratton        github:snagar
-    Aruelien Pocheville     Sergio Gonzalez    Thibault Reuille     github:Zelex
-    Cass Everitt            Ryamond Barbiero                        github:grim210
-    Paul Du Bois            Engin Manap        Aldo Culquicondor    github:sammyhw
-    Philipp Wiesemann       Dale Weiler        Oriol Ferrer Mesia   github:phprus
-    Josh Tobin              Neil Bickford      Matthew Gregan       github:poppolopoppo
-    Julian Raschke          Gregory Mullen     Christian Floisand   github:darealshinji
-    Baldur Karlsson         Kevin Schmidt      JR Smith             github:Michaelangel007
-                            Brad Weinberger    Matvey Cherevko      github:mosra
-    Luca Sas                Alexander Veselov  Zack Middleton       [reserved]
-    Ryan C. Gordon          [reserved]                              [reserved]
-                     DO NOT ADD YOUR NAME HERE
-
-                     Jacko Dirks
-
-  To add your name to the credits, pick a random blank space in the middle and fill it.
-  80% of merge conflicts on stb PRs are due to people adding their name at the end
-  of the credits.
-*/
-
-#ifndef STBI_INCLUDE_STB_IMAGE_H
-#define STBI_INCLUDE_STB_IMAGE_H
-
-// DOCUMENTATION
-//
-// Limitations:
-//    - no 12-bit-per-channel JPEG
-//    - no JPEGs with arithmetic coding
-//    - GIF always returns *comp=4
-//
-// Basic usage (see HDR discussion below for HDR usage):
-//    int x,y,n;
-//    unsigned char *data = stbi_load(filename, &x, &y, &n, 0);
-//    // ... process data if not NULL ...
-//    // ... x = width, y = height, n = # 8-bit components per pixel ...
-//    // ... replace '0' with '1'..'4' to force that many components per pixel
-//    // ... but 'n' will always be the number that it would have been if you said 0
-//    stbi_image_free(data);
-//
-// Standard parameters:
-//    int *x                 -- outputs image width in pixels
-//    int *y                 -- outputs image height in pixels
-//    int *channels_in_file  -- outputs # of image components in image file
-//    int desired_channels   -- if non-zero, # of image components requested in result
-//
-// The return value from an image loader is an 'unsigned char *' which points
-// to the pixel data, or NULL on an allocation failure or if the image is
-// corrupt or invalid. The pixel data consists of *y scanlines of *x pixels,
-// with each pixel consisting of N interleaved 8-bit components; the first
-// pixel pointed to is top-left-most in the image. There is no padding between
-// image scanlines or between pixels, regardless of format. The number of
-// components N is 'desired_channels' if desired_channels is non-zero, or
-// *channels_in_file otherwise. If desired_channels is non-zero,
-// *channels_in_file has the number of components that _would_ have been
-// output otherwise. E.g. if you set desired_channels to 4, you will always
-// get RGBA output, but you can check *channels_in_file to see if it's trivially
-// opaque because e.g. there were only 3 channels in the source image.
-//
-// An output image with N components has the following components interleaved
-// in this order in each pixel:
-//
-//     N=#comp     components
-//       1           grey
-//       2           grey, alpha
-//       3           red, green, blue
-//       4           red, green, blue, alpha
-//
-// If image loading fails for any reason, the return value will be NULL,
-// and *x, *y, *channels_in_file will be unchanged. The function
-// stbi_failure_reason() can be queried for an extremely brief, end-user
-// unfriendly explanation of why the load failed. Define STBI_NO_FAILURE_STRINGS
-// to avoid compiling these strings at all, and STBI_FAILURE_USERMSG to get slightly
-// more user-friendly ones.
-//
-// Paletted PNG, BMP, GIF, and PIC images are automatically depalettized.
-//
-// To query the width, height and component count of an image without having to
-// decode the full file, you can use the stbi_info family of functions:
-//
-//   int x,y,n,ok;
-//   ok = stbi_info(filename, &x, &y, &n);
-//   // returns ok=1 and sets x, y, n if image is a supported format,
-//   // 0 otherwise.
-//
-// Note that stb_image pervasively uses ints in its public API for sizes,
-// including sizes of memory buffers. This is now part of the API and thus
-// hard to change without causing breakage. As a result, the various image
-// loaders all have certain limits on image size; these differ somewhat
-// by format but generally boil down to either just under 2GB or just under
-// 1GB. When the decoded image would be larger than this, stb_image decoding
-// will fail.
-//
-// Additionally, stb_image will reject image files that have any of their
-// dimensions set to a larger value than the configurable STBI_MAX_DIMENSIONS,
-// which defaults to 2**24 = 16777216 pixels. Due to the above memory limit,
-// the only way to have an image with such dimensions load correctly
-// is for it to have a rather extreme aspect ratio. Either way, the
-// assumption here is that such larger images are likely to be malformed
-// or malicious. If you do need to load an image with individual dimensions
-// larger than that, and it still fits in the overall size limit, you can
-// #define STBI_MAX_DIMENSIONS on your own to be something larger.
-//
-// ===========================================================================
-//
-// UNICODE:
-//
-//   If compiling for Windows and you wish to use Unicode filenames, compile
-//   with
-//       #define STBI_WINDOWS_UTF8
-//   and pass utf8-encoded filenames. Call stbi_convert_wchar_to_utf8 to convert
-//   Windows wchar_t filenames to utf8.
-//
-// ===========================================================================
-//
-// Philosophy
-//
-// stb libraries are designed with the following priorities:
-//
-//    1. easy to use
-//    2. easy to maintain
-//    3. good performance
-//
-// Sometimes I let "good performance" creep up in priority over "easy to maintain",
-// and for best performance I may provide less-easy-to-use APIs that give higher
-// performance, in addition to the easy-to-use ones. Nevertheless, it's important
-// to keep in mind that from the standpoint of you, a client of this library,
-// all you care about is #1 and #3, and stb libraries DO NOT emphasize #3 above all.
-//
-// Some secondary priorities arise directly from the first two, some of which
-// provide more explicit reasons why performance can't be emphasized.
-//
-//    - Portable ("ease of use")
-//    - Small source code footprint ("easy to maintain")
-//    - No dependencies ("ease of use")
-//
-// ===========================================================================
-//
-// I/O callbacks
-//
-// I/O callbacks allow you to read from arbitrary sources, like packaged
-// files or some other source. Data read from callbacks are processed
-// through a small internal buffer (currently 128 bytes) to try to reduce
-// overhead.
-//
-// The three functions you must define are "read" (reads some bytes of data),
-// "skip" (skips some bytes of data), "eof" (reports if the stream is at the end).
-//
-// ===========================================================================
-//
-// SIMD support
-//
-// The JPEG decoder will try to automatically use SIMD kernels on x86 when
-// supported by the compiler. For ARM Neon support, you must explicitly
-// request it.
-//
-// (The old do-it-yourself SIMD API is no longer supported in the current
-// code.)
-//
-// On x86, SSE2 will automatically be used when available based on a run-time
-// test; if not, the generic C versions are used as a fall-back. On ARM targets,
-// the typical path is to have separate builds for NEON and non-NEON devices
-// (at least this is true for iOS and Android). Therefore, the NEON support is
-// toggled by a build flag: define STBI_NEON to get NEON loops.
-//
-// If for some reason you do not want to use any of SIMD code, or if
-// you have issues compiling it, you can disable it entirely by
-// defining STBI_NO_SIMD.
-//
-// ===========================================================================
-//
-// HDR image support   (disable by defining STBI_NO_HDR)
-//
-// stb_image supports loading HDR images in general, and currently the Radiance
-// .HDR file format specifically. You can still load any file through the existing
-// interface; if you attempt to load an HDR file, it will be automatically remapped
-// to LDR, assuming gamma 2.2 and an arbitrary scale factor defaulting to 1;
-// both of these constants can be reconfigured through this interface:
-//
-//     stbi_hdr_to_ldr_gamma(2.2f);
-//     stbi_hdr_to_ldr_scale(1.0f);
-//
-// (note, do not use _inverse_ constants; stbi_image will invert them
-// appropriately).
-//
-// Additionally, there is a new, parallel interface for loading files as
-// (linear) floats to preserve the full dynamic range:
-//
-//    float *data = stbi_loadf(filename, &x, &y, &n, 0);
-//
-// If you load LDR images through this interface, those images will
-// be promoted to floating point values, run through the inverse of
-// constants corresponding to the above:
-//
-//     stbi_ldr_to_hdr_scale(1.0f);
-//     stbi_ldr_to_hdr_gamma(2.2f);
-//
-// Finally, given a filename (or an open file or memory block--see header
-// file for details) containing image data, you can query for the "most
-// appropriate" interface to use (that is, whether the image is HDR or
-// not), using:
-//
-//     stbi_is_hdr(char *filename);
-//
-// ===========================================================================
-//
-// iPhone PNG support:
-//
-// We optionally support converting iPhone-formatted PNGs (which store
-// premultiplied BGRA) back to RGB, even though they're internally encoded
-// differently. To enable this conversion, call
-// stbi_convert_iphone_png_to_rgb(1).
-//
-// Call stbi_set_unpremultiply_on_load(1) as well to force a divide per
-// pixel to remove any premultiplied alpha *only* if the image file explicitly
-// says there's premultiplied data (currently only happens in iPhone images,
-// and only if iPhone convert-to-rgb processing is on).
-//
-// ===========================================================================
-//
-// ADDITIONAL CONFIGURATION
-//
-//  - You can suppress implementation of any of the decoders to reduce
-//    your code footprint by #defining one or more of the following
-//    symbols before creating the implementation.
-//
-//        STBI_NO_JPEG
-//        STBI_NO_PNG
-//        STBI_NO_BMP
-//        STBI_NO_PSD
-//        STBI_NO_TGA
-//        STBI_NO_GIF
-//        STBI_NO_HDR
-//        STBI_NO_PIC
-//        STBI_NO_PNM   (.ppm and .pgm)
-//
-//  - You can request *only* certain decoders and suppress all other ones
-//    (this will be more forward-compatible, as addition of new decoders
-//    doesn't require you to disable them explicitly):
-//
-//        STBI_ONLY_JPEG
-//        STBI_ONLY_PNG
-//        STBI_ONLY_BMP
-//        STBI_ONLY_PSD
-//        STBI_ONLY_TGA
-//        STBI_ONLY_GIF
-//        STBI_ONLY_HDR
-//        STBI_ONLY_PIC
-//        STBI_ONLY_PNM   (.ppm and .pgm)
-//
-//   - If you use STBI_NO_PNG (or _ONLY_ without PNG), and you still
-//     want the zlib decoder to be available, #define STBI_SUPPORT_ZLIB
-//
-//  - If you define STBI_MAX_DIMENSIONS, stb_image will reject images greater
-//    than that size (in either width or height) without further processing.
-//    This is to let programs in the wild set an upper bound to prevent
-//    denial-of-service attacks on untrusted data, as one could generate a
-//    valid image of gigantic dimensions and force stb_image to allocate a
-//    huge block of memory and spend disproportionate time decoding it. By
-//    default this is set to (1 << 24), which is 16777216, but that's still
-//    very big.
-
-#ifndef STBI_NO_STDIO
-#include <stdio.h>
-#endif // STBI_NO_STDIO
-
-#define STBI_VERSION 1
-
-enum
-{
-   STBI_default = 0, // only used for desired_channels
-
-   STBI_grey       = 1,
-   STBI_grey_alpha = 2,
-   STBI_rgb        = 3,
-   STBI_rgb_alpha  = 4
-};
-
-#include <stdlib.h>
-typedef unsigned char stbi_uc;
-typedef unsigned short stbi_us;
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#ifndef STBIDEF
-#ifdef STB_IMAGE_STATIC
-#define STBIDEF static
-#else
-#define STBIDEF extern
-#endif
-#endif
-
-//////////////////////////////////////////////////////////////////////////////
-//
-// PRIMARY API - works on images of any type
-//
-
-//
-// load image by filename, open file, or memory buffer
-//
-
-typedef struct
-{
-   int      (*read)  (void *user,char *data,int size);   // fill 'data' with 'size' bytes.  return number of bytes actually read
-   void     (*skip)  (void *user,int n);                 // skip the next 'n' bytes, or 'unget' the last -n bytes if negative
-   int      (*eof)   (void *user);                       // returns nonzero if we are at end of file/data
-} stbi_io_callbacks;
-
-////////////////////////////////////
-//
-// 8-bits-per-channel interface
-//
-
-STBIDEF stbi_uc *stbi_load_from_memory   (stbi_uc           const *buffer, int len   , int *x, int *y, int *channels_in_file, int desired_channels);
-STBIDEF stbi_uc *stbi_load_from_callbacks(stbi_io_callbacks const *clbk  , void *user, int *x, int *y, int *channels_in_file, int desired_channels);
-
-#ifndef STBI_NO_STDIO
-STBIDEF stbi_uc *stbi_load            (char const *filename, int *x, int *y, int *channels_in_file, int desired_channels);
-STBIDEF stbi_uc *stbi_load_from_file  (FILE *f, int *x, int *y, int *channels_in_file, int desired_channels);
-// for stbi_load_from_file, file pointer is left pointing immediately after image
-#endif
-
-#ifndef STBI_NO_GIF
-STBIDEF stbi_uc *stbi_load_gif_from_memory(stbi_uc const *buffer, int len, int **delays, int *x, int *y, int *z, int *comp, int req_comp);
-#endif
-
-#ifdef STBI_WINDOWS_UTF8
-STBIDEF int stbi_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input);
-#endif
-
-////////////////////////////////////
-//
-// 16-bits-per-channel interface
-//
-
-STBIDEF stbi_us *stbi_load_16_from_memory   (stbi_uc const *buffer, int len, int *x, int *y, int *channels_in_file, int desired_channels);
-STBIDEF stbi_us *stbi_load_16_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *channels_in_file, int desired_channels);
-
-#ifndef STBI_NO_STDIO
-STBIDEF stbi_us *stbi_load_16          (char const *filename, int *x, int *y, int *channels_in_file, int desired_channels);
-STBIDEF stbi_us *stbi_load_from_file_16(FILE *f, int *x, int *y, int *channels_in_file, int desired_channels);
-#endif
-
-////////////////////////////////////
-//
-// float-per-channel interface
-//
-#ifndef STBI_NO_LINEAR
-   STBIDEF float *stbi_loadf_from_memory     (stbi_uc const *buffer, int len, int *x, int *y, int *channels_in_file, int desired_channels);
-   STBIDEF float *stbi_loadf_from_callbacks  (stbi_io_callbacks const *clbk, void *user, int *x, int *y,  int *channels_in_file, int desired_channels);
-
-   #ifndef STBI_NO_STDIO
-   STBIDEF float *stbi_loadf            (char const *filename, int *x, int *y, int *channels_in_file, int desired_channels);
-   STBIDEF float *stbi_loadf_from_file  (FILE *f, int *x, int *y, int *channels_in_file, int desired_channels);
-   #endif
-#endif
-
-#ifndef STBI_NO_HDR
-   STBIDEF void   stbi_hdr_to_ldr_gamma(float gamma);
-   STBIDEF void   stbi_hdr_to_ldr_scale(float scale);
-#endif // STBI_NO_HDR
-
-#ifndef STBI_NO_LINEAR
-   STBIDEF void   stbi_ldr_to_hdr_gamma(float gamma);
-   STBIDEF void   stbi_ldr_to_hdr_scale(float scale);
-#endif // STBI_NO_LINEAR
-
-// stbi_is_hdr is always defined, but always returns false if STBI_NO_HDR
-STBIDEF int    stbi_is_hdr_from_callbacks(stbi_io_callbacks const *clbk, void *user);
-STBIDEF int    stbi_is_hdr_from_memory(stbi_uc const *buffer, int len);
-#ifndef STBI_NO_STDIO
-STBIDEF int      stbi_is_hdr          (char const *filename);
-STBIDEF int      stbi_is_hdr_from_file(FILE *f);
-#endif // STBI_NO_STDIO
-
-
-// get a VERY brief reason for failure
-// on most compilers (and ALL modern mainstream compilers) this is threadsafe
-STBIDEF const char *stbi_failure_reason  (void);
-
-// free the loaded image -- this is just free()
-STBIDEF void     stbi_image_free      (void *retval_from_stbi_load);
-
-// get image dimensions & components without fully decoding
-STBIDEF int      stbi_info_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp);
-STBIDEF int      stbi_info_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp);
-STBIDEF int      stbi_is_16_bit_from_memory(stbi_uc const *buffer, int len);
-STBIDEF int      stbi_is_16_bit_from_callbacks(stbi_io_callbacks const *clbk, void *user);
-
-#ifndef STBI_NO_STDIO
-STBIDEF int      stbi_info               (char const *filename,     int *x, int *y, int *comp);
-STBIDEF int      stbi_info_from_file     (FILE *f,                  int *x, int *y, int *comp);
-STBIDEF int      stbi_is_16_bit          (char const *filename);
-STBIDEF int      stbi_is_16_bit_from_file(FILE *f);
-#endif
-
-
-
-// for image formats that explicitly notate that they have premultiplied alpha,
-// we just return the colors as stored in the file. set this flag to force
-// unpremultiplication. results are undefined if the unpremultiply overflow.
-STBIDEF void stbi_set_unpremultiply_on_load(int flag_true_if_should_unpremultiply);
-
-// indicate whether we should process iphone images back to canonical format,
-// or just pass them through "as-is"
-STBIDEF void stbi_convert_iphone_png_to_rgb(int flag_true_if_should_convert);
-
-// flip the image vertically, so the first pixel in the output array is the bottom left
-STBIDEF void stbi_set_flip_vertically_on_load(int flag_true_if_should_flip);
-
-// as above, but only applies to images loaded on the thread that calls the function
-// this function is only available if your compiler supports thread-local variables;
-// calling it will fail to link if your compiler doesn't
-STBIDEF void stbi_set_unpremultiply_on_load_thread(int flag_true_if_should_unpremultiply);
-STBIDEF void stbi_convert_iphone_png_to_rgb_thread(int flag_true_if_should_convert);
-STBIDEF void stbi_set_flip_vertically_on_load_thread(int flag_true_if_should_flip);
-
-// ZLIB client - used by PNG, available for other purposes
-
-STBIDEF char *stbi_zlib_decode_malloc_guesssize(const char *buffer, int len, int initial_size, int *outlen);
-STBIDEF char *stbi_zlib_decode_malloc_guesssize_headerflag(const char *buffer, int len, int initial_size, int *outlen, int parse_header);
-STBIDEF char *stbi_zlib_decode_malloc(const char *buffer, int len, int *outlen);
-STBIDEF int   stbi_zlib_decode_buffer(char *obuffer, int olen, const char *ibuffer, int ilen);
-
-STBIDEF char *stbi_zlib_decode_noheader_malloc(const char *buffer, int len, int *outlen);
-STBIDEF int   stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const char *ibuffer, int ilen);
-
-
-#ifdef __cplusplus
-}
-#endif
-
-//
-//
-////   end header file   /////////////////////////////////////////////////////
-#endif // STBI_INCLUDE_STB_IMAGE_H
-
-#ifdef STB_IMAGE_IMPLEMENTATION
-
-#if defined(STBI_ONLY_JPEG) || defined(STBI_ONLY_PNG) || defined(STBI_ONLY_BMP) \
-  || defined(STBI_ONLY_TGA) || defined(STBI_ONLY_GIF) || defined(STBI_ONLY_PSD) \
-  || defined(STBI_ONLY_HDR) || defined(STBI_ONLY_PIC) || defined(STBI_ONLY_PNM) \
-  || defined(STBI_ONLY_ZLIB)
-   #ifndef STBI_ONLY_JPEG
-   #define STBI_NO_JPEG
-   #endif
-   #ifndef STBI_ONLY_PNG
-   #define STBI_NO_PNG
-   #endif
-   #ifndef STBI_ONLY_BMP
-   #define STBI_NO_BMP
-   #endif
-   #ifndef STBI_ONLY_PSD
-   #define STBI_NO_PSD
-   #endif
-   #ifndef STBI_ONLY_TGA
-   #define STBI_NO_TGA
-   #endif
-   #ifndef STBI_ONLY_GIF
-   #define STBI_NO_GIF
-   #endif
-   #ifndef STBI_ONLY_HDR
-   #define STBI_NO_HDR
-   #endif
-   #ifndef STBI_ONLY_PIC
-   #define STBI_NO_PIC
-   #endif
-   #ifndef STBI_ONLY_PNM
-   #define STBI_NO_PNM
-   #endif
-#endif
-
-#if defined(STBI_NO_PNG) && !defined(STBI_SUPPORT_ZLIB) && !defined(STBI_NO_ZLIB)
-#define STBI_NO_ZLIB
-#endif
-
-
-#include <stdarg.h>
-#include <stddef.h> // ptrdiff_t on osx
-#include <stdlib.h>
-#include <string.h>
-#include <limits.h>
-
-#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR)
-#include <math.h>  // ldexp, pow
-#endif
-
-#ifndef STBI_NO_STDIO
-#include <stdio.h>
-#endif
-
-#ifndef STBI_ASSERT
-#include <assert.h>
-#define STBI_ASSERT(x) assert(x)
-#endif
-
-#ifdef __cplusplus
-#define STBI_EXTERN extern "C"
-#else
-#define STBI_EXTERN extern
-#endif
-
-
-#ifndef _MSC_VER
-   #ifdef __cplusplus
-   #define stbi_inline inline
-   #else
-   #define stbi_inline
-   #endif
-#else
-   #define stbi_inline __forceinline
-#endif
-
-#ifndef STBI_NO_THREAD_LOCALS
-   #if defined(__cplusplus) &&  __cplusplus >= 201103L
-      #define STBI_THREAD_LOCAL       thread_local
-   #elif defined(__GNUC__) && __GNUC__ < 5
-      #define STBI_THREAD_LOCAL       __thread
-   #elif defined(_MSC_VER)
-      #define STBI_THREAD_LOCAL       __declspec(thread)
-   #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 201112L && !defined(__STDC_NO_THREADS__)
-      #define STBI_THREAD_LOCAL       _Thread_local
-   #endif
-
-   #ifndef STBI_THREAD_LOCAL
-      #if defined(__GNUC__)
-        #define STBI_THREAD_LOCAL       __thread
-      #endif
-   #endif
-#endif
-
-#if defined(_MSC_VER) || defined(__SYMBIAN32__)
-typedef unsigned short stbi__uint16;
-typedef   signed short stbi__int16;
-typedef unsigned int   stbi__uint32;
-typedef   signed int   stbi__int32;
-#else
-#include <stdint.h>
-typedef uint16_t stbi__uint16;
-typedef int16_t  stbi__int16;
-typedef uint32_t stbi__uint32;
-typedef int32_t  stbi__int32;
-#endif
-
-// should produce compiler error if size is wrong
-typedef unsigned char validate_uint32[sizeof(stbi__uint32)==4 ? 1 : -1];
-
-#ifdef _MSC_VER
-#define STBI_NOTUSED(v)  (void)(v)
-#else
-#define STBI_NOTUSED(v)  (void)sizeof(v)
-#endif
-
-#ifdef _MSC_VER
-#define STBI_HAS_LROTL
-#endif
-
-#ifdef STBI_HAS_LROTL
-   #define stbi_lrot(x,y)  _lrotl(x,y)
-#else
-   #define stbi_lrot(x,y)  (((x) << (y)) | ((x) >> (-(y) & 31)))
-#endif
-
-#if defined(STBI_MALLOC) && defined(STBI_FREE) && (defined(STBI_REALLOC) || defined(STBI_REALLOC_SIZED))
-// ok
-#elif !defined(STBI_MALLOC) && !defined(STBI_FREE) && !defined(STBI_REALLOC) && !defined(STBI_REALLOC_SIZED)
-// ok
-#else
-#error "Must define all or none of STBI_MALLOC, STBI_FREE, and STBI_REALLOC (or STBI_REALLOC_SIZED)."
-#endif
-
-#ifndef STBI_MALLOC
-#define STBI_MALLOC(sz)           malloc(sz)
-#define STBI_REALLOC(p,newsz)     realloc(p,newsz)
-#define STBI_FREE(p)              free(p)
-#endif
-
-#ifndef STBI_REALLOC_SIZED
-#define STBI_REALLOC_SIZED(p,oldsz,newsz) STBI_REALLOC(p,newsz)
-#endif
-
-// x86/x64 detection
-#if defined(__x86_64__) || defined(_M_X64)
-#define STBI__X64_TARGET
-#elif defined(__i386) || defined(_M_IX86)
-#define STBI__X86_TARGET
-#endif
-
-#if defined(__GNUC__) && defined(STBI__X86_TARGET) && !defined(__SSE2__) && !defined(STBI_NO_SIMD)
-// gcc doesn't support sse2 intrinsics unless you compile with -msse2,
-// which in turn means it gets to use SSE2 everywhere. This is unfortunate,
-// but previous attempts to provide the SSE2 functions with runtime
-// detection caused numerous issues. The way architecture extensions are
-// exposed in GCC/Clang is, sadly, not really suited for one-file libs.
-// New behavior: if compiled with -msse2, we use SSE2 without any
-// detection; if not, we don't use it at all.
-#define STBI_NO_SIMD
-#endif
-
-#if defined(__MINGW32__) && defined(STBI__X86_TARGET) && !defined(STBI_MINGW_ENABLE_SSE2) && !defined(STBI_NO_SIMD)
-// Note that __MINGW32__ doesn't actually mean 32-bit, so we have to avoid STBI__X64_TARGET
-//
-// 32-bit MinGW wants ESP to be 16-byte aligned, but this is not in the
-// Windows ABI and VC++ as well as Windows DLLs don't maintain that invariant.
-// As a result, enabling SSE2 on 32-bit MinGW is dangerous when not
-// simultaneously enabling "-mstackrealign".
-//
-// See https://github.com/nothings/stb/issues/81 for more information.
-//
-// So default to no SSE2 on 32-bit MinGW. If you've read this far and added
-// -mstackrealign to your build settings, feel free to #define STBI_MINGW_ENABLE_SSE2.
-#define STBI_NO_SIMD
-#endif
-
-#if !defined(STBI_NO_SIMD) && (defined(STBI__X86_TARGET) || defined(STBI__X64_TARGET))
-#define STBI_SSE2
-#include <emmintrin.h>
-
-#ifdef _MSC_VER
-
-#if _MSC_VER >= 1400  // not VC6
-#include <intrin.h> // __cpuid
-static int stbi__cpuid3(void)
-{
-   int info[4];
-   __cpuid(info,1);
-   return info[3];
-}
-#else
-static int stbi__cpuid3(void)
-{
-   int res;
-   __asm {
-      mov  eax,1
-      cpuid
-      mov  res,edx
-   }
-   return res;
-}
-#endif
-
-#define STBI_SIMD_ALIGN(type, name) __declspec(align(16)) type name
-
-#if !defined(STBI_NO_JPEG) && defined(STBI_SSE2)
-static int stbi__sse2_available(void)
-{
-   int info3 = stbi__cpuid3();
-   return ((info3 >> 26) & 1) != 0;
-}
-#endif
-
-#else // assume GCC-style if not VC++
-#define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
-
-#if !defined(STBI_NO_JPEG) && defined(STBI_SSE2)
-static int stbi__sse2_available(void)
-{
-   // If we're even attempting to compile this on GCC/Clang, that means
-   // -msse2 is on, which means the compiler is allowed to use SSE2
-   // instructions at will, and so are we.
-   return 1;
-}
-#endif
-
-#endif
-#endif
-
-// ARM NEON
-#if defined(STBI_NO_SIMD) && defined(STBI_NEON)
-#undef STBI_NEON
-#endif
-
-#ifdef STBI_NEON
-#include <arm_neon.h>
-#ifdef _MSC_VER
-#define STBI_SIMD_ALIGN(type, name) __declspec(align(16)) type name
-#else
-#define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
-#endif
-#endif
-
-#ifndef STBI_SIMD_ALIGN
-#define STBI_SIMD_ALIGN(type, name) type name
-#endif
-
-#ifndef STBI_MAX_DIMENSIONS
-#define STBI_MAX_DIMENSIONS (1 << 24)
-#endif
-
-///////////////////////////////////////////////
-//
-//  stbi__context struct and start_xxx functions
-
-// stbi__context structure is our basic context used by all images, so it
-// contains all the IO context, plus some basic image information
-typedef struct
-{
-   stbi__uint32 img_x, img_y;
-   int img_n, img_out_n;
-
-   stbi_io_callbacks io;
-   void *io_user_data;
-
-   int read_from_callbacks;
-   int buflen;
-   stbi_uc buffer_start[128];
-   int callback_already_read;
-
-   stbi_uc *img_buffer, *img_buffer_end;
-   stbi_uc *img_buffer_original, *img_buffer_original_end;
-} stbi__context;
-
-
-static void stbi__refill_buffer(stbi__context *s);
-
-// initialize a memory-decode context
-static void stbi__start_mem(stbi__context *s, stbi_uc const *buffer, int len)
-{
-   s->io.read = NULL;
-   s->read_from_callbacks = 0;
-   s->callback_already_read = 0;
-   s->img_buffer = s->img_buffer_original = (stbi_uc *) buffer;
-   s->img_buffer_end = s->img_buffer_original_end = (stbi_uc *) buffer+len;
-}
-
-// initialize a callback-based context
-static void stbi__start_callbacks(stbi__context *s, stbi_io_callbacks *c, void *user)
-{
-   s->io = *c;
-   s->io_user_data = user;
-   s->buflen = sizeof(s->buffer_start);
-   s->read_from_callbacks = 1;
-   s->callback_already_read = 0;
-   s->img_buffer = s->img_buffer_original = s->buffer_start;
-   stbi__refill_buffer(s);
-   s->img_buffer_original_end = s->img_buffer_end;
-}
-
-#ifndef STBI_NO_STDIO
-
-static int stbi__stdio_read(void *user, char *data, int size)
-{
-   return (int) fread(data,1,size,(FILE*) user);
-}
-
-static void stbi__stdio_skip(void *user, int n)
-{
-   int ch;
-   fseek((FILE*) user, n, SEEK_CUR);
-   ch = fgetc((FILE*) user);  /* have to read a byte to reset feof()'s flag */
-   if (ch != EOF) {
-      ungetc(ch, (FILE *) user);  /* push byte back onto stream if valid. */
-   }
-}
-
-static int stbi__stdio_eof(void *user)
-{
-   return feof((FILE*) user) || ferror((FILE *) user);
-}
-
-static stbi_io_callbacks stbi__stdio_callbacks =
-{
-   stbi__stdio_read,
-   stbi__stdio_skip,
-   stbi__stdio_eof,
-};
-
-static void stbi__start_file(stbi__context *s, FILE *f)
-{
-   stbi__start_callbacks(s, &stbi__stdio_callbacks, (void *) f);
-}
-
-//static void stop_file(stbi__context *s) { }
-
-#endif // !STBI_NO_STDIO
-
-static void stbi__rewind(stbi__context *s)
-{
-   // conceptually rewind SHOULD rewind to the beginning of the stream,
-   // but we just rewind to the beginning of the initial buffer, because
-   // we only use it after doing 'test', which only ever looks at at most 92 bytes
-   s->img_buffer = s->img_buffer_original;
-   s->img_buffer_end = s->img_buffer_original_end;
-}
-
-enum
-{
-   STBI_ORDER_RGB,
-   STBI_ORDER_BGR
-};
-
-typedef struct
-{
-   int bits_per_channel;
-   int num_channels;
-   int channel_order;
-} stbi__result_info;
-
-#ifndef STBI_NO_JPEG
-static int      stbi__jpeg_test(stbi__context *s);
-static void    *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
-static int      stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp);
-#endif
-
-#ifndef STBI_NO_PNG
-static int      stbi__png_test(stbi__context *s);
-static void    *stbi__png_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
-static int      stbi__png_info(stbi__context *s, int *x, int *y, int *comp);
-static int      stbi__png_is16(stbi__context *s);
-#endif
-
-#ifndef STBI_NO_BMP
-static int      stbi__bmp_test(stbi__context *s);
-static void    *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
-static int      stbi__bmp_info(stbi__context *s, int *x, int *y, int *comp);
-#endif
-
-#ifndef STBI_NO_TGA
-static int      stbi__tga_test(stbi__context *s);
-static void    *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
-static int      stbi__tga_info(stbi__context *s, int *x, int *y, int *comp);
-#endif
-
-#ifndef STBI_NO_PSD
-static int      stbi__psd_test(stbi__context *s);
-static void    *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri, int bpc);
-static int      stbi__psd_info(stbi__context *s, int *x, int *y, int *comp);
-static int      stbi__psd_is16(stbi__context *s);
-#endif
-
-#ifndef STBI_NO_HDR
-static int      stbi__hdr_test(stbi__context *s);
-static float   *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
-static int      stbi__hdr_info(stbi__context *s, int *x, int *y, int *comp);
-#endif
-
-#ifndef STBI_NO_PIC
-static int      stbi__pic_test(stbi__context *s);
-static void    *stbi__pic_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
-static int      stbi__pic_info(stbi__context *s, int *x, int *y, int *comp);
-#endif
-
-#ifndef STBI_NO_GIF
-static int      stbi__gif_test(stbi__context *s);
-static void    *stbi__gif_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
-static void    *stbi__load_gif_main(stbi__context *s, int **delays, int *x, int *y, int *z, int *comp, int req_comp);
-static int      stbi__gif_info(stbi__context *s, int *x, int *y, int *comp);
-#endif
-
-#ifndef STBI_NO_PNM
-static int      stbi__pnm_test(stbi__context *s);
-static void    *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
-static int      stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp);
-static int      stbi__pnm_is16(stbi__context *s);
-#endif
-
-static
-#ifdef STBI_THREAD_LOCAL
-STBI_THREAD_LOCAL
-#endif
-const char *stbi__g_failure_reason;
-
-STBIDEF const char *stbi_failure_reason(void)
-{
-   return stbi__g_failure_reason;
-}
-
-#ifndef STBI_NO_FAILURE_STRINGS
-static int stbi__err(const char *str)
-{
-   stbi__g_failure_reason = str;
-   return 0;
-}
-#endif
-
-static void *stbi__malloc(size_t size)
-{
-    return STBI_MALLOC(size);
-}
-
-// stb_image uses ints pervasively, including for offset calculations.
-// therefore the largest decoded image size we can support with the
-// current code, even on 64-bit targets, is INT_MAX. this is not a
-// significant limitation for the intended use case.
-//
-// we do, however, need to make sure our size calculations don't
-// overflow. hence a few helper functions for size calculations that
-// multiply integers together, making sure that they're non-negative
-// and no overflow occurs.
-
-// return 1 if the sum is valid, 0 on overflow.
-// negative terms are considered invalid.
-static int stbi__addsizes_valid(int a, int b)
-{
-   if (b < 0) return 0;
-   // now 0 <= b <= INT_MAX, hence also
-   // 0 <= INT_MAX - b <= INTMAX.
-   // And "a + b <= INT_MAX" (which might overflow) is the
-   // same as a <= INT_MAX - b (no overflow)
-   return a <= INT_MAX - b;
-}
-
-// returns 1 if the product is valid, 0 on overflow.
-// negative factors are considered invalid.
-static int stbi__mul2sizes_valid(int a, int b)
-{
-   if (a < 0 || b < 0) return 0;
-   if (b == 0) return 1; // mul-by-0 is always safe
-   // portable way to check for no overflows in a*b
-   return a <= INT_MAX/b;
-}
-
-#if !defined(STBI_NO_JPEG) || !defined(STBI_NO_PNG) || !defined(STBI_NO_TGA) || !defined(STBI_NO_HDR)
-// returns 1 if "a*b + add" has no negative terms/factors and doesn't overflow
-static int stbi__mad2sizes_valid(int a, int b, int add)
-{
-   return stbi__mul2sizes_valid(a, b) && stbi__addsizes_valid(a*b, add);
-}
-#endif
-
-// returns 1 if "a*b*c + add" has no negative terms/factors and doesn't overflow
-static int stbi__mad3sizes_valid(int a, int b, int c, int add)
-{
-   return stbi__mul2sizes_valid(a, b) && stbi__mul2sizes_valid(a*b, c) &&
-      stbi__addsizes_valid(a*b*c, add);
-}
-
-// returns 1 if "a*b*c*d + add" has no negative terms/factors and doesn't overflow
-#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR) || !defined(STBI_NO_PNM)
-static int stbi__mad4sizes_valid(int a, int b, int c, int d, int add)
-{
-   return stbi__mul2sizes_valid(a, b) && stbi__mul2sizes_valid(a*b, c) &&
-      stbi__mul2sizes_valid(a*b*c, d) && stbi__addsizes_valid(a*b*c*d, add);
-}
-#endif
-
-#if !defined(STBI_NO_JPEG) || !defined(STBI_NO_PNG) || !defined(STBI_NO_TGA) || !defined(STBI_NO_HDR)
-// mallocs with size overflow checking
-static void *stbi__malloc_mad2(int a, int b, int add)
-{
-   if (!stbi__mad2sizes_valid(a, b, add)) return NULL;
-   return stbi__malloc(a*b + add);
-}
-#endif
-
-static void *stbi__malloc_mad3(int a, int b, int c, int add)
-{
-   if (!stbi__mad3sizes_valid(a, b, c, add)) return NULL;
-   return stbi__malloc(a*b*c + add);
-}
-
-#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR) || !defined(STBI_NO_PNM)
-static void *stbi__malloc_mad4(int a, int b, int c, int d, int add)
-{
-   if (!stbi__mad4sizes_valid(a, b, c, d, add)) return NULL;
-   return stbi__malloc(a*b*c*d + add);
-}
-#endif
-
-// returns 1 if the sum of two signed ints is valid (between -2^31 and 2^31-1 inclusive), 0 on overflow.
-static int stbi__addints_valid(int a, int b)
-{
-   if ((a >= 0) != (b >= 0)) return 1; // a and b have different signs, so no overflow
-   if (a < 0 && b < 0) return a >= INT_MIN - b; // same as a + b >= INT_MIN; INT_MIN - b cannot overflow since b < 0.
-   return a <= INT_MAX - b;
-}
-
-// returns 1 if the product of two signed shorts is valid, 0 on overflow.
-static int stbi__mul2shorts_valid(short a, short b)
-{
-   if (b == 0 || b == -1) return 1; // multiplication by 0 is always 0; check for -1 so SHRT_MIN/b doesn't overflow
-   if ((a >= 0) == (b >= 0)) return a <= SHRT_MAX/b; // product is positive, so similar to mul2sizes_valid
-   if (b < 0) return a <= SHRT_MIN / b; // same as a * b >= SHRT_MIN
-   return a >= SHRT_MIN / b;
-}
-
-// stbi__err - error
-// stbi__errpf - error returning pointer to float
-// stbi__errpuc - error returning pointer to unsigned char
-
-#ifdef STBI_NO_FAILURE_STRINGS
-   #define stbi__err(x,y)  0
-#elif defined(STBI_FAILURE_USERMSG)
-   #define stbi__err(x,y)  stbi__err(y)
-#else
-   #define stbi__err(x,y)  stbi__err(x)
-#endif
-
-#define stbi__errpf(x,y)   ((float *)(size_t) (stbi__err(x,y)?NULL:NULL))
-#define stbi__errpuc(x,y)  ((unsigned char *)(size_t) (stbi__err(x,y)?NULL:NULL))
-
-STBIDEF void stbi_image_free(void *retval_from_stbi_load)
-{
-   STBI_FREE(retval_from_stbi_load);
-}
-
-#ifndef STBI_NO_LINEAR
-static float   *stbi__ldr_to_hdr(stbi_uc *data, int x, int y, int comp);
-#endif
-
-#ifndef STBI_NO_HDR
-static stbi_uc *stbi__hdr_to_ldr(float   *data, int x, int y, int comp);
-#endif
-
-static int stbi__vertically_flip_on_load_global = 0;
-
-STBIDEF void stbi_set_flip_vertically_on_load(int flag_true_if_should_flip)
-{
-   stbi__vertically_flip_on_load_global = flag_true_if_should_flip;
-}
-
-#ifndef STBI_THREAD_LOCAL
-#define stbi__vertically_flip_on_load  stbi__vertically_flip_on_load_global
-#else
-static STBI_THREAD_LOCAL int stbi__vertically_flip_on_load_local, stbi__vertically_flip_on_load_set;
-
-STBIDEF void stbi_set_flip_vertically_on_load_thread(int flag_true_if_should_flip)
-{
-   stbi__vertically_flip_on_load_local = flag_true_if_should_flip;
-   stbi__vertically_flip_on_load_set = 1;
-}
-
-#define stbi__vertically_flip_on_load  (stbi__vertically_flip_on_load_set       \
-                                         ? stbi__vertically_flip_on_load_local  \
-                                         : stbi__vertically_flip_on_load_global)
-#endif // STBI_THREAD_LOCAL
-
-static void *stbi__load_main(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri, int bpc)
-{
-   memset(ri, 0, sizeof(*ri)); // make sure it's initialized if we add new fields
-   ri->bits_per_channel = 8; // default is 8 so most paths don't have to be changed
-   ri->channel_order = STBI_ORDER_RGB; // all current input & output are this, but this is here so we can add BGR order
-   ri->num_channels = 0;
-
-   // test the formats with a very explicit header first (at least a FOURCC
-   // or distinctive magic number first)
-   #ifndef STBI_NO_PNG
-   if (stbi__png_test(s))  return stbi__png_load(s,x,y,comp,req_comp, ri);
-   #endif
-   #ifndef STBI_NO_BMP
-   if (stbi__bmp_test(s))  return stbi__bmp_load(s,x,y,comp,req_comp, ri);
-   #endif
-   #ifndef STBI_NO_GIF
-   if (stbi__gif_test(s))  return stbi__gif_load(s,x,y,comp,req_comp, ri);
-   #endif
-   #ifndef STBI_NO_PSD
-   if (stbi__psd_test(s))  return stbi__psd_load(s,x,y,comp,req_comp, ri, bpc);
-   #else
-   STBI_NOTUSED(bpc);
-   #endif
-   #ifndef STBI_NO_PIC
-   if (stbi__pic_test(s))  return stbi__pic_load(s,x,y,comp,req_comp, ri);
-   #endif
-
-   // then the formats that can end up attempting to load with just 1 or 2
-   // bytes matching expectations; these are prone to false positives, so
-   // try them later
-   #ifndef STBI_NO_JPEG
-   if (stbi__jpeg_test(s)) return stbi__jpeg_load(s,x,y,comp,req_comp, ri);
-   #endif
-   #ifndef STBI_NO_PNM
-   if (stbi__pnm_test(s))  return stbi__pnm_load(s,x,y,comp,req_comp, ri);
-   #endif
-
-   #ifndef STBI_NO_HDR
-   if (stbi__hdr_test(s)) {
-      float *hdr = stbi__hdr_load(s, x,y,comp,req_comp, ri);
-      return stbi__hdr_to_ldr(hdr, *x, *y, req_comp ? req_comp : *comp);
-   }
-   #endif
-
-   #ifndef STBI_NO_TGA
-   // test tga last because it's a crappy test!
-   if (stbi__tga_test(s))
-      return stbi__tga_load(s,x,y,comp,req_comp, ri);
-   #endif
-
-   return stbi__errpuc("unknown image type", "Image not of any known type, or corrupt");
-}
-
-static stbi_uc *stbi__convert_16_to_8(stbi__uint16 *orig, int w, int h, int channels)
-{
-   int i;
-   int img_len = w * h * channels;
-   stbi_uc *reduced;
-
-   reduced = (stbi_uc *) stbi__malloc(img_len);
-   if (reduced == NULL) return stbi__errpuc("outofmem", "Out of memory");
-
-   for (i = 0; i < img_len; ++i)
-      reduced[i] = (stbi_uc)((orig[i] >> 8) & 0xFF); // top half of each byte is sufficient approx of 16->8 bit scaling
-
-   STBI_FREE(orig);
-   return reduced;
-}
-
-static stbi__uint16 *stbi__convert_8_to_16(stbi_uc *orig, int w, int h, int channels)
-{
-   int i;
-   int img_len = w * h * channels;
-   stbi__uint16 *enlarged;
-
-   enlarged = (stbi__uint16 *) stbi__malloc(img_len*2);
-   if (enlarged == NULL) return (stbi__uint16 *) stbi__errpuc("outofmem", "Out of memory");
-
-   for (i = 0; i < img_len; ++i)
-      enlarged[i] = (stbi__uint16)((orig[i] << 8) + orig[i]); // replicate to high and low byte, maps 0->0, 255->0xffff
-
-   STBI_FREE(orig);
-   return enlarged;
-}
-
-static void stbi__vertical_flip(void *image, int w, int h, int bytes_per_pixel)
-{
-   int row;
-   size_t bytes_per_row = (size_t)w * bytes_per_pixel;
-   stbi_uc temp[2048];
-   stbi_uc *bytes = (stbi_uc *)image;
-
-   for (row = 0; row < (h>>1); row++) {
-      stbi_uc *row0 = bytes + row*bytes_per_row;
-      stbi_uc *row1 = bytes + (h - row - 1)*bytes_per_row;
-      // swap row0 with row1
-      size_t bytes_left = bytes_per_row;
-      while (bytes_left) {
-         size_t bytes_copy = (bytes_left < sizeof(temp)) ? bytes_left : sizeof(temp);
-         memcpy(temp, row0, bytes_copy);
-         memcpy(row0, row1, bytes_copy);
-         memcpy(row1, temp, bytes_copy);
-         row0 += bytes_copy;
-         row1 += bytes_copy;
-         bytes_left -= bytes_copy;
-      }
-   }
-}
-
-#ifndef STBI_NO_GIF
-static void stbi__vertical_flip_slices(void *image, int w, int h, int z, int bytes_per_pixel)
-{
-   int slice;
-   int slice_size = w * h * bytes_per_pixel;
-
-   stbi_uc *bytes = (stbi_uc *)image;
-   for (slice = 0; slice < z; ++slice) {
-      stbi__vertical_flip(bytes, w, h, bytes_per_pixel);
-      bytes += slice_size;
-   }
-}
-#endif
-
-static unsigned char *stbi__load_and_postprocess_8bit(stbi__context *s, int *x, int *y, int *comp, int req_comp)
-{
-   stbi__result_info ri;
-   void *result = stbi__load_main(s, x, y, comp, req_comp, &ri, 8);
-
-   if (result == NULL)
-      return NULL;
-
-   // it is the responsibility of the loaders to make sure we get either 8 or 16 bit.
-   STBI_ASSERT(ri.bits_per_channel == 8 || ri.bits_per_channel == 16);
-
-   if (ri.bits_per_channel != 8) {
-      result = stbi__convert_16_to_8((stbi__uint16 *) result, *x, *y, req_comp == 0 ? *comp : req_comp);
-      ri.bits_per_channel = 8;
-   }
-
-   // @TODO: move stbi__convert_format to here
-
-   if (stbi__vertically_flip_on_load) {
-      int channels = req_comp ? req_comp : *comp;
-      stbi__vertical_flip(result, *x, *y, channels * sizeof(stbi_uc));
-   }
-
-   return (unsigned char *) result;
-}
-
-static stbi__uint16 *stbi__load_and_postprocess_16bit(stbi__context *s, int *x, int *y, int *comp, int req_comp)
-{
-   stbi__result_info ri;
-   void *result = stbi__load_main(s, x, y, comp, req_comp, &ri, 16);
-
-   if (result == NULL)
-      return NULL;
-
-   // it is the responsibility of the loaders to make sure we get either 8 or 16 bit.
-   STBI_ASSERT(ri.bits_per_channel == 8 || ri.bits_per_channel == 16);
-
-   if (ri.bits_per_channel != 16) {
-      result = stbi__convert_8_to_16((stbi_uc *) result, *x, *y, req_comp == 0 ? *comp : req_comp);
-      ri.bits_per_channel = 16;
-   }
-
-   // @TODO: move stbi__convert_format16 to here
-   // @TODO: special case RGB-to-Y (and RGBA-to-YA) for 8-bit-to-16-bit case to keep more precision
-
-   if (stbi__vertically_flip_on_load) {
-      int channels = req_comp ? req_comp : *comp;
-      stbi__vertical_flip(result, *x, *y, channels * sizeof(stbi__uint16));
-   }
-
-   return (stbi__uint16 *) result;
-}
-
-#if !defined(STBI_NO_HDR) && !defined(STBI_NO_LINEAR)
-static void stbi__float_postprocess(float *result, int *x, int *y, int *comp, int req_comp)
-{
-   if (stbi__vertically_flip_on_load && result != NULL) {
-      int channels = req_comp ? req_comp : *comp;
-      stbi__vertical_flip(result, *x, *y, channels * sizeof(float));
-   }
-}
-#endif
-
-#ifndef STBI_NO_STDIO
-
-#if defined(_WIN32) && defined(STBI_WINDOWS_UTF8)
-STBI_EXTERN __declspec(dllimport) int __stdcall MultiByteToWideChar(unsigned int cp, unsigned long flags, const char *str, int cbmb, wchar_t *widestr, int cchwide);
-STBI_EXTERN __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int cp, unsigned long flags, const wchar_t *widestr, int cchwide, char *str, int cbmb, const char *defchar, int *used_default);
-#endif
-
-#if defined(_WIN32) && defined(STBI_WINDOWS_UTF8)
-STBIDEF int stbi_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input)
-{
-	return WideCharToMultiByte(65001 /* UTF8 */, 0, input, -1, buffer, (int) bufferlen, NULL, NULL);
-}
-#endif
-
-static FILE *stbi__fopen(char const *filename, char const *mode)
-{
-   FILE *f;
-#if defined(_WIN32) && defined(STBI_WINDOWS_UTF8)
-   wchar_t wMode[64];
-   wchar_t wFilename[1024];
-	if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, filename, -1, wFilename, sizeof(wFilename)/sizeof(*wFilename)))
-      return 0;
-
-	if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, mode, -1, wMode, sizeof(wMode)/sizeof(*wMode)))
-      return 0;
-
-#if defined(_MSC_VER) && _MSC_VER >= 1400
-	if (0 != _wfopen_s(&f, wFilename, wMode))
-		f = 0;
-#else
-   f = _wfopen(wFilename, wMode);
-#endif
-
-#elif defined(_MSC_VER) && _MSC_VER >= 1400
-   if (0 != fopen_s(&f, filename, mode))
-      f=0;
-#else
-   f = fopen(filename, mode);
-#endif
-   return f;
-}
-
-
-STBIDEF stbi_uc *stbi_load(char const *filename, int *x, int *y, int *comp, int req_comp)
-{
-   FILE *f = stbi__fopen(filename, "rb");
-   unsigned char *result;
-   if (!f) return stbi__errpuc("can't fopen", "Unable to open file");
-   result = stbi_load_from_file(f,x,y,comp,req_comp);
-   fclose(f);
-   return result;
-}
-
-STBIDEF stbi_uc *stbi_load_from_file(FILE *f, int *x, int *y, int *comp, int req_comp)
-{
-   unsigned char *result;
-   stbi__context s;
-   stbi__start_file(&s,f);
-   result = stbi__load_and_postprocess_8bit(&s,x,y,comp,req_comp);
-   if (result) {
-      // need to 'unget' all the characters in the IO buffer
-      fseek(f, - (int) (s.img_buffer_end - s.img_buffer), SEEK_CUR);
-   }
-   return result;
-}
-
-STBIDEF stbi__uint16 *stbi_load_from_file_16(FILE *f, int *x, int *y, int *comp, int req_comp)
-{
-   stbi__uint16 *result;
-   stbi__context s;
-   stbi__start_file(&s,f);
-   result = stbi__load_and_postprocess_16bit(&s,x,y,comp,req_comp);
-   if (result) {
-      // need to 'unget' all the characters in the IO buffer
-      fseek(f, - (int) (s.img_buffer_end - s.img_buffer), SEEK_CUR);
-   }
-   return result;
-}
-
-STBIDEF stbi_us *stbi_load_16(char const *filename, int *x, int *y, int *comp, int req_comp)
-{
-   FILE *f = stbi__fopen(filename, "rb");
-   stbi__uint16 *result;
-   if (!f) return (stbi_us *) stbi__errpuc("can't fopen", "Unable to open file");
-   result = stbi_load_from_file_16(f,x,y,comp,req_comp);
-   fclose(f);
-   return result;
-}
-
-
-#endif //!STBI_NO_STDIO
-
-STBIDEF stbi_us *stbi_load_16_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *channels_in_file, int desired_channels)
-{
-   stbi__context s;
-   stbi__start_mem(&s,buffer,len);
-   return stbi__load_and_postprocess_16bit(&s,x,y,channels_in_file,desired_channels);
-}
-
-STBIDEF stbi_us *stbi_load_16_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *channels_in_file, int desired_channels)
-{
-   stbi__context s;
-   stbi__start_callbacks(&s, (stbi_io_callbacks *)clbk, user);
-   return stbi__load_and_postprocess_16bit(&s,x,y,channels_in_file,desired_channels);
-}
-
-STBIDEF stbi_uc *stbi_load_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp)
-{
-   stbi__context s;
-   stbi__start_mem(&s,buffer,len);
-   return stbi__load_and_postprocess_8bit(&s,x,y,comp,req_comp);
-}
-
-STBIDEF stbi_uc *stbi_load_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp, int req_comp)
-{
-   stbi__context s;
-   stbi__start_callbacks(&s, (stbi_io_callbacks *) clbk, user);
-   return stbi__load_and_postprocess_8bit(&s,x,y,comp,req_comp);
-}
-
-#ifndef STBI_NO_GIF
-STBIDEF stbi_uc *stbi_load_gif_from_memory(stbi_uc const *buffer, int len, int **delays, int *x, int *y, int *z, int *comp, int req_comp)
-{
-   unsigned char *result;
-   stbi__context s;
-   stbi__start_mem(&s,buffer,len);
-
-   result = (unsigned char*) stbi__load_gif_main(&s, delays, x, y, z, comp, req_comp);
-   if (stbi__vertically_flip_on_load) {
-      stbi__vertical_flip_slices( result, *x, *y, *z, *comp );
-   }
-
-   return result;
-}
-#endif
-
-#ifndef STBI_NO_LINEAR
-static float *stbi__loadf_main(stbi__context *s, int *x, int *y, int *comp, int req_comp)
-{
-   unsigned char *data;
-   #ifndef STBI_NO_HDR
-   if (stbi__hdr_test(s)) {
-      stbi__result_info ri;
-      float *hdr_data = stbi__hdr_load(s,x,y,comp,req_comp, &ri);
-      if (hdr_data)
-         stbi__float_postprocess(hdr_data,x,y,comp,req_comp);
-      return hdr_data;
-   }
-   #endif
-   data = stbi__load_and_postprocess_8bit(s, x, y, comp, req_comp);
-   if (data)
-      return stbi__ldr_to_hdr(data, *x, *y, req_comp ? req_comp : *comp);
-   return stbi__errpf("unknown image type", "Image not of any known type, or corrupt");
-}
-
-STBIDEF float *stbi_loadf_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp)
-{
-   stbi__context s;
-   stbi__start_mem(&s,buffer,len);
-   return stbi__loadf_main(&s,x,y,comp,req_comp);
-}
-
-STBIDEF float *stbi_loadf_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp, int req_comp)
-{
-   stbi__context s;
-   stbi__start_callbacks(&s, (stbi_io_callbacks *) clbk, user);
-   return stbi__loadf_main(&s,x,y,comp,req_comp);
-}
-
-#ifndef STBI_NO_STDIO
-STBIDEF float *stbi_loadf(char const *filename, int *x, int *y, int *comp, int req_comp)
-{
-   float *result;
-   FILE *f = stbi__fopen(filename, "rb");
-   if (!f) return stbi__errpf("can't fopen", "Unable to open file");
-   result = stbi_loadf_from_file(f,x,y,comp,req_comp);
-   fclose(f);
-   return result;
-}
-
-STBIDEF float *stbi_loadf_from_file(FILE *f, int *x, int *y, int *comp, int req_comp)
-{
-   stbi__context s;
-   stbi__start_file(&s,f);
-   return stbi__loadf_main(&s,x,y,comp,req_comp);
-}
-#endif // !STBI_NO_STDIO
-
-#endif // !STBI_NO_LINEAR
-
-// these is-hdr-or-not is defined independent of whether STBI_NO_LINEAR is
-// defined, for API simplicity; if STBI_NO_LINEAR is defined, it always
-// reports false!
-
-STBIDEF int stbi_is_hdr_from_memory(stbi_uc const *buffer, int len)
-{
-   #ifndef STBI_NO_HDR
-   stbi__context s;
-   stbi__start_mem(&s,buffer,len);
-   return stbi__hdr_test(&s);
-   #else
-   STBI_NOTUSED(buffer);
-   STBI_NOTUSED(len);
-   return 0;
-   #endif
-}
-
-#ifndef STBI_NO_STDIO
-STBIDEF int      stbi_is_hdr          (char const *filename)
-{
-   FILE *f = stbi__fopen(filename, "rb");
-   int result=0;
-   if (f) {
-      result = stbi_is_hdr_from_file(f);
-      fclose(f);
-   }
-   return result;
-}
-
-STBIDEF int stbi_is_hdr_from_file(FILE *f)
-{
-   #ifndef STBI_NO_HDR
-   long pos = ftell(f);
-   int res;
-   stbi__context s;
-   stbi__start_file(&s,f);
-   res = stbi__hdr_test(&s);
-   fseek(f, pos, SEEK_SET);
-   return res;
-   #else
-   STBI_NOTUSED(f);
-   return 0;
-   #endif
-}
-#endif // !STBI_NO_STDIO
-
-STBIDEF int      stbi_is_hdr_from_callbacks(stbi_io_callbacks const *clbk, void *user)
-{
-   #ifndef STBI_NO_HDR
-   stbi__context s;
-   stbi__start_callbacks(&s, (stbi_io_callbacks *) clbk, user);
-   return stbi__hdr_test(&s);
-   #else
-   STBI_NOTUSED(clbk);
-   STBI_NOTUSED(user);
-   return 0;
-   #endif
-}
-
-#ifndef STBI_NO_LINEAR
-static float stbi__l2h_gamma=2.2f, stbi__l2h_scale=1.0f;
-
-STBIDEF void   stbi_ldr_to_hdr_gamma(float gamma) { stbi__l2h_gamma = gamma; }
-STBIDEF void   stbi_ldr_to_hdr_scale(float scale) { stbi__l2h_scale = scale; }
-#endif
-
-static float stbi__h2l_gamma_i=1.0f/2.2f, stbi__h2l_scale_i=1.0f;
-
-STBIDEF void   stbi_hdr_to_ldr_gamma(float gamma) { stbi__h2l_gamma_i = 1/gamma; }
-STBIDEF void   stbi_hdr_to_ldr_scale(float scale) { stbi__h2l_scale_i = 1/scale; }
-
-
-//////////////////////////////////////////////////////////////////////////////
-//
-// Common code used by all image loaders
-//
-
-enum
-{
-   STBI__SCAN_load=0,
-   STBI__SCAN_type,
-   STBI__SCAN_header
-};
-
-static void stbi__refill_buffer(stbi__context *s)
-{
-   int n = (s->io.read)(s->io_user_data,(char*)s->buffer_start,s->buflen);
-   s->callback_already_read += (int) (s->img_buffer - s->img_buffer_original);
-   if (n == 0) {
-      // at end of file, treat same as if from memory, but need to handle case
-      // where s->img_buffer isn't pointing to safe memory, e.g. 0-byte file
-      s->read_from_callbacks = 0;
-      s->img_buffer = s->buffer_start;
-      s->img_buffer_end = s->buffer_start+1;
-      *s->img_buffer = 0;
-   } else {
-      s->img_buffer = s->buffer_start;
-      s->img_buffer_end = s->buffer_start + n;
-   }
-}
-
-stbi_inline static stbi_uc stbi__get8(stbi__context *s)
-{
-   if (s->img_buffer < s->img_buffer_end)
-      return *s->img_buffer++;
-   if (s->read_from_callbacks) {
-      stbi__refill_buffer(s);
-      return *s->img_buffer++;
-   }
-   return 0;
-}
-
-#if defined(STBI_NO_JPEG) && defined(STBI_NO_HDR) && defined(STBI_NO_PIC) && defined(STBI_NO_PNM)
-// nothing
-#else
-stbi_inline static int stbi__at_eof(stbi__context *s)
-{
-   if (s->io.read) {
-      if (!(s->io.eof)(s->io_user_data)) return 0;
-      // if feof() is true, check if buffer = end
-      // special case: we've only got the special 0 character at the end
-      if (s->read_from_callbacks == 0) return 1;
-   }
-
-   return s->img_buffer >= s->img_buffer_end;
-}
-#endif
-
-#if defined(STBI_NO_JPEG) && defined(STBI_NO_PNG) && defined(STBI_NO_BMP) && defined(STBI_NO_PSD) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF) && defined(STBI_NO_PIC)
-// nothing
-#else
-static void stbi__skip(stbi__context *s, int n)
-{
-   if (n == 0) return;  // already there!
-   if (n < 0) {
-      s->img_buffer = s->img_buffer_end;
-      return;
-   }
-   if (s->io.read) {
-      int blen = (int) (s->img_buffer_end - s->img_buffer);
-      if (blen < n) {
-         s->img_buffer = s->img_buffer_end;
-         (s->io.skip)(s->io_user_data, n - blen);
-         return;
-      }
-   }
-   s->img_buffer += n;
-}
-#endif
-
-#if defined(STBI_NO_PNG) && defined(STBI_NO_TGA) && defined(STBI_NO_HDR) && defined(STBI_NO_PNM)
-// nothing
-#else
-static int stbi__getn(stbi__context *s, stbi_uc *buffer, int n)
-{
-   if (s->io.read) {
-      int blen = (int) (s->img_buffer_end - s->img_buffer);
-      if (blen < n) {
-         int res, count;
-
-         memcpy(buffer, s->img_buffer, blen);
-
-         count = (s->io.read)(s->io_user_data, (char*) buffer + blen, n - blen);
-         res = (count == (n-blen));
-         s->img_buffer = s->img_buffer_end;
-         return res;
-      }
-   }
-
-   if (s->img_buffer+n <= s->img_buffer_end) {
-      memcpy(buffer, s->img_buffer, n);
-      s->img_buffer += n;
-      return 1;
-   } else
-      return 0;
-}
-#endif
-
-#if defined(STBI_NO_JPEG) && defined(STBI_NO_PNG) && defined(STBI_NO_PSD) && defined(STBI_NO_PIC)
-// nothing
-#else
-static int stbi__get16be(stbi__context *s)
-{
-   int z = stbi__get8(s);
-   return (z << 8) + stbi__get8(s);
-}
-#endif
-
-#if defined(STBI_NO_PNG) && defined(STBI_NO_PSD) && defined(STBI_NO_PIC)
-// nothing
-#else
-static stbi__uint32 stbi__get32be(stbi__context *s)
-{
-   stbi__uint32 z = stbi__get16be(s);
-   return (z << 16) + stbi__get16be(s);
-}
-#endif
-
-#if defined(STBI_NO_BMP) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF)
-// nothing
-#else
-static int stbi__get16le(stbi__context *s)
-{
-   int z = stbi__get8(s);
-   return z + (stbi__get8(s) << 8);
-}
-#endif
-
-#ifndef STBI_NO_BMP
-static stbi__uint32 stbi__get32le(stbi__context *s)
-{
-   stbi__uint32 z = stbi__get16le(s);
-   z += (stbi__uint32)stbi__get16le(s) << 16;
-   return z;
-}
-#endif
-
-#define STBI__BYTECAST(x)  ((stbi_uc) ((x) & 255))  // truncate int to byte without warnings
-
-#if defined(STBI_NO_JPEG) && defined(STBI_NO_PNG) && defined(STBI_NO_BMP) && defined(STBI_NO_PSD) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF) && defined(STBI_NO_PIC) && defined(STBI_NO_PNM)
-// nothing
-#else
-//////////////////////////////////////////////////////////////////////////////
-//
-//  generic converter from built-in img_n to req_comp
-//    individual types do this automatically as much as possible (e.g. jpeg
-//    does all cases internally since it needs to colorspace convert anyway,
-//    and it never has alpha, so very few cases ). png can automatically
-//    interleave an alpha=255 channel, but falls back to this for other cases
-//
-//  assume data buffer is malloced, so malloc a new one and free that one
-//  only failure mode is malloc failing
-
-static stbi_uc stbi__compute_y(int r, int g, int b)
-{
-   return (stbi_uc) (((r*77) + (g*150) +  (29*b)) >> 8);
-}
-#endif
-
-#if defined(STBI_NO_PNG) && defined(STBI_NO_BMP) && defined(STBI_NO_PSD) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF) && defined(STBI_NO_PIC) && defined(STBI_NO_PNM)
-// nothing
-#else
-static unsigned char *stbi__convert_format(unsigned char *data, int img_n, int req_comp, unsigned int x, unsigned int y)
-{
-   int i,j;
-   unsigned char *good;
-
-   if (req_comp == img_n) return data;
-   STBI_ASSERT(req_comp >= 1 && req_comp <= 4);
-
-   good = (unsigned char *) stbi__malloc_mad3(req_comp, x, y, 0);
-   if (good == NULL) {
-      STBI_FREE(data);
-      return stbi__errpuc("outofmem", "Out of memory");
-   }
-
-   for (j=0; j < (int) y; ++j) {
-      unsigned char *src  = data + j * x * img_n   ;
-      unsigned char *dest = good + j * x * req_comp;
-
-      #define STBI__COMBO(a,b)  ((a)*8+(b))
-      #define STBI__CASE(a,b)   case STBI__COMBO(a,b): for(i=x-1; i >= 0; --i, src += a, dest += b)
-      // convert source image with img_n components to one with req_comp components;
-      // avoid switch per pixel, so use switch per scanline and massive macros
-      switch (STBI__COMBO(img_n, req_comp)) {
-         STBI__CASE(1,2) { dest[0]=src[0]; dest[1]=255;                                     } break;
-         STBI__CASE(1,3) { dest[0]=dest[1]=dest[2]=src[0];                                  } break;
-         STBI__CASE(1,4) { dest[0]=dest[1]=dest[2]=src[0]; dest[3]=255;                     } break;
-         STBI__CASE(2,1) { dest[0]=src[0];                                                  } break;
-         STBI__CASE(2,3) { dest[0]=dest[1]=dest[2]=src[0];                                  } break;
-         STBI__CASE(2,4) { dest[0]=dest[1]=dest[2]=src[0]; dest[3]=src[1];                  } break;
-         STBI__CASE(3,4) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2];dest[3]=255;        } break;
-         STBI__CASE(3,1) { dest[0]=stbi__compute_y(src[0],src[1],src[2]);                   } break;
-         STBI__CASE(3,2) { dest[0]=stbi__compute_y(src[0],src[1],src[2]); dest[1] = 255;    } break;
-         STBI__CASE(4,1) { dest[0]=stbi__compute_y(src[0],src[1],src[2]);                   } break;
-         STBI__CASE(4,2) { dest[0]=stbi__compute_y(src[0],src[1],src[2]); dest[1] = src[3]; } break;
-         STBI__CASE(4,3) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2];                    } break;
-         default: STBI_ASSERT(0); STBI_FREE(data); STBI_FREE(good); return stbi__errpuc("unsupported", "Unsupported format conversion");
-      }
-      #undef STBI__CASE
-   }
-
-   STBI_FREE(data);
-   return good;
-}
-#endif
-
-#if defined(STBI_NO_PNG) && defined(STBI_NO_PSD)
-// nothing
-#else
-static stbi__uint16 stbi__compute_y_16(int r, int g, int b)
-{
-   return (stbi__uint16) (((r*77) + (g*150) +  (29*b)) >> 8);
-}
-#endif
-
-#if defined(STBI_NO_PNG) && defined(STBI_NO_PSD)
-// nothing
-#else
-static stbi__uint16 *stbi__convert_format16(stbi__uint16 *data, int img_n, int req_comp, unsigned int x, unsigned int y)
-{
-   int i,j;
-   stbi__uint16 *good;
-
-   if (req_comp == img_n) return data;
-   STBI_ASSERT(req_comp >= 1 && req_comp <= 4);
-
-   good = (stbi__uint16 *) stbi__malloc(req_comp * x * y * 2);
-   if (good == NULL) {
-      STBI_FREE(data);
-      return (stbi__uint16 *) stbi__errpuc("outofmem", "Out of memory");
-   }
-
-   for (j=0; j < (int) y; ++j) {
-      stbi__uint16 *src  = data + j * x * img_n   ;
-      stbi__uint16 *dest = good + j * x * req_comp;
-
-      #define STBI__COMBO(a,b)  ((a)*8+(b))
-      #define STBI__CASE(a,b)   case STBI__COMBO(a,b): for(i=x-1; i >= 0; --i, src += a, dest += b)
-      // convert source image with img_n components to one with req_comp components;
-      // avoid switch per pixel, so use switch per scanline and massive macros
-      switch (STBI__COMBO(img_n, req_comp)) {
-         STBI__CASE(1,2) { dest[0]=src[0]; dest[1]=0xffff;                                     } break;
-         STBI__CASE(1,3) { dest[0]=dest[1]=dest[2]=src[0];                                     } break;
-         STBI__CASE(1,4) { dest[0]=dest[1]=dest[2]=src[0]; dest[3]=0xffff;                     } break;
-         STBI__CASE(2,1) { dest[0]=src[0];                                                     } break;
-         STBI__CASE(2,3) { dest[0]=dest[1]=dest[2]=src[0];                                     } break;
-         STBI__CASE(2,4) { dest[0]=dest[1]=dest[2]=src[0]; dest[3]=src[1];                     } break;
-         STBI__CASE(3,4) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2];dest[3]=0xffff;        } break;
-         STBI__CASE(3,1) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]);                   } break;
-         STBI__CASE(3,2) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]); dest[1] = 0xffff; } break;
-         STBI__CASE(4,1) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]);                   } break;
-         STBI__CASE(4,2) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]); dest[1] = src[3]; } break;
-         STBI__CASE(4,3) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2];                       } break;
-         default: STBI_ASSERT(0); STBI_FREE(data); STBI_FREE(good); return (stbi__uint16*) stbi__errpuc("unsupported", "Unsupported format conversion");
-      }
-      #undef STBI__CASE
-   }
-
-   STBI_FREE(data);
-   return good;
-}
-#endif
-
-#ifndef STBI_NO_LINEAR
-static float   *stbi__ldr_to_hdr(stbi_uc *data, int x, int y, int comp)
-{
-   int i,k,n;
-   float *output;
-   if (!data) return NULL;
-   output = (float *) stbi__malloc_mad4(x, y, comp, sizeof(float), 0);
-   if (output == NULL) { STBI_FREE(data); return stbi__errpf("outofmem", "Out of memory"); }
-   // compute number of non-alpha components
-   if (comp & 1) n = comp; else n = comp-1;
-   for (i=0; i < x*y; ++i) {
-      for (k=0; k < n; ++k) {
-         output[i*comp + k] = (float) (pow(data[i*comp+k]/255.0f, stbi__l2h_gamma) * stbi__l2h_scale);
-      }
-   }
-   if (n < comp) {
-      for (i=0; i < x*y; ++i) {
-         output[i*comp + n] = data[i*comp + n]/255.0f;
-      }
-   }
-   STBI_FREE(data);
-   return output;
-}
-#endif
-
-#ifndef STBI_NO_HDR
-#define stbi__float2int(x)   ((int) (x))
-static stbi_uc *stbi__hdr_to_ldr(float   *data, int x, int y, int comp)
-{
-   int i,k,n;
-   stbi_uc *output;
-   if (!data) return NULL;
-   output = (stbi_uc *) stbi__malloc_mad3(x, y, comp, 0);
-   if (output == NULL) { STBI_FREE(data); return stbi__errpuc("outofmem", "Out of memory"); }
-   // compute number of non-alpha components
-   if (comp & 1) n = comp; else n = comp-1;
-   for (i=0; i < x*y; ++i) {
-      for (k=0; k < n; ++k) {
-         float z = (float) pow(data[i*comp+k]*stbi__h2l_scale_i, stbi__h2l_gamma_i) * 255 + 0.5f;
-         if (z < 0) z = 0;
-         if (z > 255) z = 255;
-         output[i*comp + k] = (stbi_uc) stbi__float2int(z);
-      }
-      if (k < comp) {
-         float z = data[i*comp+k] * 255 + 0.5f;
-         if (z < 0) z = 0;
-         if (z > 255) z = 255;
-         output[i*comp + k] = (stbi_uc) stbi__float2int(z);
-      }
-   }
-   STBI_FREE(data);
-   return output;
-}
-#endif
-
-//////////////////////////////////////////////////////////////////////////////
-//
-//  "baseline" JPEG/JFIF decoder
-//
-//    simple implementation
-//      - doesn't support delayed output of y-dimension
-//      - simple interface (only one output format: 8-bit interleaved RGB)
-//      - doesn't try to recover corrupt jpegs
-//      - doesn't allow partial loading, loading multiple at once
-//      - still fast on x86 (copying globals into locals doesn't help x86)
-//      - allocates lots of intermediate memory (full size of all components)
-//        - non-interleaved case requires this anyway
-//        - allows good upsampling (see next)
-//    high-quality
-//      - upsampled channels are bilinearly interpolated, even across blocks
-//      - quality integer IDCT derived from IJG's 'slow'
-//    performance
-//      - fast huffman; reasonable integer IDCT
-//      - some SIMD kernels for common paths on targets with SSE2/NEON
-//      - uses a lot of intermediate memory, could cache poorly
-
-#ifndef STBI_NO_JPEG
-
-// huffman decoding acceleration
-#define FAST_BITS   9  // larger handles more cases; smaller stomps less cache
-
-typedef struct
-{
-   stbi_uc  fast[1 << FAST_BITS];
-   // weirdly, repacking this into AoS is a 10% speed loss, instead of a win
-   stbi__uint16 code[256];
-   stbi_uc  values[256];
-   stbi_uc  size[257];
-   unsigned int maxcode[18];
-   int    delta[17];   // old 'firstsymbol' - old 'firstcode'
-} stbi__huffman;
-
-typedef struct
-{
-   stbi__context *s;
-   stbi__huffman huff_dc[4];
-   stbi__huffman huff_ac[4];
-   stbi__uint16 dequant[4][64];
-   stbi__int16 fast_ac[4][1 << FAST_BITS];
-
-// sizes for components, interleaved MCUs
-   int img_h_max, img_v_max;
-   int img_mcu_x, img_mcu_y;
-   int img_mcu_w, img_mcu_h;
-
-// definition of jpeg image component
-   struct
-   {
-      int id;
-      int h,v;
-      int tq;
-      int hd,ha;
-      int dc_pred;
-
-      int x,y,w2,h2;
-      stbi_uc *data;
-      void *raw_data, *raw_coeff;
-      stbi_uc *linebuf;
-      short   *coeff;   // progressive only
-      int      coeff_w, coeff_h; // number of 8x8 coefficient blocks
-   } img_comp[4];
-
-   stbi__uint32   code_buffer; // jpeg entropy-coded buffer
-   int            code_bits;   // number of valid bits
-   unsigned char  marker;      // marker seen while filling entropy buffer
-   int            nomore;      // flag if we saw a marker so must stop
-
-   int            progressive;
-   int            spec_start;
-   int            spec_end;
-   int            succ_high;
-   int            succ_low;
-   int            eob_run;
-   int            jfif;
-   int            app14_color_transform; // Adobe APP14 tag
-   int            rgb;
-
-   int scan_n, order[4];
-   int restart_interval, todo;
-
-// kernels
-   void (*idct_block_kernel)(stbi_uc *out, int out_stride, short data[64]);
-   void (*YCbCr_to_RGB_kernel)(stbi_uc *out, const stbi_uc *y, const stbi_uc *pcb, const stbi_uc *pcr, int count, int step);
-   stbi_uc *(*resample_row_hv_2_kernel)(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs);
-} stbi__jpeg;
-
-static int stbi__build_huffman(stbi__huffman *h, int *count)
-{
-   int i,j,k=0;
-   unsigned int code;
-   // build size list for each symbol (from JPEG spec)
-   for (i=0; i < 16; ++i) {
-      for (j=0; j < count[i]; ++j) {
-         h->size[k++] = (stbi_uc) (i+1);
-         if(k >= 257) return stbi__err("bad size list","Corrupt JPEG");
-      }
-   }
-   h->size[k] = 0;
-
-   // compute actual symbols (from jpeg spec)
-   code = 0;
-   k = 0;
-   for(j=1; j <= 16; ++j) {
-      // compute delta to add to code to compute symbol id
-      h->delta[j] = k - code;
-      if (h->size[k] == j) {
-         while (h->size[k] == j)
-            h->code[k++] = (stbi__uint16) (code++);
-         if (code-1 >= (1u << j)) return stbi__err("bad code lengths","Corrupt JPEG");
-      }
-      // compute largest code + 1 for this size, preshifted as needed later
-      h->maxcode[j] = code << (16-j);
-      code <<= 1;
-   }
-   h->maxcode[j] = 0xffffffff;
-
-   // build non-spec acceleration table; 255 is flag for not-accelerated
-   memset(h->fast, 255, 1 << FAST_BITS);
-   for (i=0; i < k; ++i) {
-      int s = h->size[i];
-      if (s <= FAST_BITS) {
-         int c = h->code[i] << (FAST_BITS-s);
-         int m = 1 << (FAST_BITS-s);
-         for (j=0; j < m; ++j) {
-            h->fast[c+j] = (stbi_uc) i;
-         }
-      }
-   }
-   return 1;
-}
-
-// build a table that decodes both magnitude and value of small ACs in
-// one go.
-static void stbi__build_fast_ac(stbi__int16 *fast_ac, stbi__huffman *h)
-{
-   int i;
-   for (i=0; i < (1 << FAST_BITS); ++i) {
-      stbi_uc fast = h->fast[i];
-      fast_ac[i] = 0;
-      if (fast < 255) {
-         int rs = h->values[fast];
-         int run = (rs >> 4) & 15;
-         int magbits = rs & 15;
-         int len = h->size[fast];
-
-         if (magbits && len + magbits <= FAST_BITS) {
-            // magnitude code followed by receive_extend code
-            int k = ((i << len) & ((1 << FAST_BITS) - 1)) >> (FAST_BITS - magbits);
-            int m = 1 << (magbits - 1);
-            if (k < m) k += (~0U << magbits) + 1;
-            // if the result is small enough, we can fit it in fast_ac table
-            if (k >= -128 && k <= 127)
-               fast_ac[i] = (stbi__int16) ((k * 256) + (run * 16) + (len + magbits));
-         }
-      }
-   }
-}
-
-static void stbi__grow_buffer_unsafe(stbi__jpeg *j)
-{
-   do {
-      unsigned int b = j->nomore ? 0 : stbi__get8(j->s);
-      if (b == 0xff) {
-         int c = stbi__get8(j->s);
-         while (c == 0xff) c = stbi__get8(j->s); // consume fill bytes
-         if (c != 0) {
-            j->marker = (unsigned char) c;
-            j->nomore = 1;
-            return;
-         }
-      }
-      j->code_buffer |= b << (24 - j->code_bits);
-      j->code_bits += 8;
-   } while (j->code_bits <= 24);
-}
-
-// (1 << n) - 1
-static const stbi__uint32 stbi__bmask[17]={0,1,3,7,15,31,63,127,255,511,1023,2047,4095,8191,16383,32767,65535};
-
-// decode a jpeg huffman value from the bitstream
-stbi_inline static int stbi__jpeg_huff_decode(stbi__jpeg *j, stbi__huffman *h)
-{
-   unsigned int temp;
-   int c,k;
-
-   if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
-
-   // look at the top FAST_BITS and determine what symbol ID it is,
-   // if the code is <= FAST_BITS
-   c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
-   k = h->fast[c];
-   if (k < 255) {
-      int s = h->size[k];
-      if (s > j->code_bits)
-         return -1;
-      j->code_buffer <<= s;
-      j->code_bits -= s;
-      return h->values[k];
-   }
-
-   // naive test is to shift the code_buffer down so k bits are
-   // valid, then test against maxcode. To speed this up, we've
-   // preshifted maxcode left so that it has (16-k) 0s at the
-   // end; in other words, regardless of the number of bits, it
-   // wants to be compared against something shifted to have 16;
-   // that way we don't need to shift inside the loop.
-   temp = j->code_buffer >> 16;
-   for (k=FAST_BITS+1 ; ; ++k)
-      if (temp < h->maxcode[k])
-         break;
-   if (k == 17) {
-      // error! code not found
-      j->code_bits -= 16;
-      return -1;
-   }
-
-   if (k > j->code_bits)
-      return -1;
-
-   // convert the huffman code to the symbol id
-   c = ((j->code_buffer >> (32 - k)) & stbi__bmask[k]) + h->delta[k];
-   if(c < 0 || c >= 256) // symbol id out of bounds!
-       return -1;
-   STBI_ASSERT((((j->code_buffer) >> (32 - h->size[c])) & stbi__bmask[h->size[c]]) == h->code[c]);
-
-   // convert the id to a symbol
-   j->code_bits -= k;
-   j->code_buffer <<= k;
-   return h->values[c];
-}
-
-// bias[n] = (-1<<n) + 1
-static const int stbi__jbias[16] = {0,-1,-3,-7,-15,-31,-63,-127,-255,-511,-1023,-2047,-4095,-8191,-16383,-32767};
-
-// combined JPEG 'receive' and JPEG 'extend', since baseline
-// always extends everything it receives.
-stbi_inline static int stbi__extend_receive(stbi__jpeg *j, int n)
-{
-   unsigned int k;
-   int sgn;
-   if (j->code_bits < n) stbi__grow_buffer_unsafe(j);
-   if (j->code_bits < n) return 0; // ran out of bits from stream, return 0s intead of continuing
-
-   sgn = j->code_buffer >> 31; // sign bit always in MSB; 0 if MSB clear (positive), 1 if MSB set (negative)
-   k = stbi_lrot(j->code_buffer, n);
-   j->code_buffer = k & ~stbi__bmask[n];
-   k &= stbi__bmask[n];
-   j->code_bits -= n;
-   return k + (stbi__jbias[n] & (sgn - 1));
-}
-
-// get some unsigned bits
-stbi_inline static int stbi__jpeg_get_bits(stbi__jpeg *j, int n)
-{
-   unsigned int k;
-   if (j->code_bits < n) stbi__grow_buffer_unsafe(j);
-   if (j->code_bits < n) return 0; // ran out of bits from stream, return 0s intead of continuing
-   k = stbi_lrot(j->code_buffer, n);
-   j->code_buffer = k & ~stbi__bmask[n];
-   k &= stbi__bmask[n];
-   j->code_bits -= n;
-   return k;
-}
-
-stbi_inline static int stbi__jpeg_get_bit(stbi__jpeg *j)
-{
-   unsigned int k;
-   if (j->code_bits < 1) stbi__grow_buffer_unsafe(j);
-   if (j->code_bits < 1) return 0; // ran out of bits from stream, return 0s intead of continuing
-   k = j->code_buffer;
-   j->code_buffer <<= 1;
-   --j->code_bits;
-   return k & 0x80000000;
-}
-
-// given a value that's at position X in the zigzag stream,
-// where does it appear in the 8x8 matrix coded as row-major?
-static const stbi_uc stbi__jpeg_dezigzag[64+15] =
-{
-    0,  1,  8, 16,  9,  2,  3, 10,
-   17, 24, 32, 25, 18, 11,  4,  5,
-   12, 19, 26, 33, 40, 48, 41, 34,
-   27, 20, 13,  6,  7, 14, 21, 28,
-   35, 42, 49, 56, 57, 50, 43, 36,
-   29, 22, 15, 23, 30, 37, 44, 51,
-   58, 59, 52, 45, 38, 31, 39, 46,
-   53, 60, 61, 54, 47, 55, 62, 63,
-   // let corrupt input sample past end
-   63, 63, 63, 63, 63, 63, 63, 63,
-   63, 63, 63, 63, 63, 63, 63
-};
-
-// decode one 64-entry block--
-static int stbi__jpeg_decode_block(stbi__jpeg *j, short data[64], stbi__huffman *hdc, stbi__huffman *hac, stbi__int16 *fac, int b, stbi__uint16 *dequant)
-{
-   int diff,dc,k;
-   int t;
-
-   if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
-   t = stbi__jpeg_huff_decode(j, hdc);
-   if (t < 0 || t > 15) return stbi__err("bad huffman code","Corrupt JPEG");
-
-   // 0 all the ac values now so we can do it 32-bits at a time
-   memset(data,0,64*sizeof(data[0]));
-
-   diff = t ? stbi__extend_receive(j, t) : 0;
-   if (!stbi__addints_valid(j->img_comp[b].dc_pred, diff)) return stbi__err("bad delta","Corrupt JPEG");
-   dc = j->img_comp[b].dc_pred + diff;
-   j->img_comp[b].dc_pred = dc;
-   if (!stbi__mul2shorts_valid(dc, dequant[0])) return stbi__err("can't merge dc and ac", "Corrupt JPEG");
-   data[0] = (short) (dc * dequant[0]);
-
-   // decode AC components, see JPEG spec
-   k = 1;
-   do {
-      unsigned int zig;
-      int c,r,s;
-      if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
-      c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
-      r = fac[c];
-      if (r) { // fast-AC path
-         k += (r >> 4) & 15; // run
-         s = r & 15; // combined length
-         if (s > j->code_bits) return stbi__err("bad huffman code", "Combined length longer than code bits available");
-         j->code_buffer <<= s;
-         j->code_bits -= s;
-         // decode into unzigzag'd location
-         zig = stbi__jpeg_dezigzag[k++];
-         data[zig] = (short) ((r >> 8) * dequant[zig]);
-      } else {
-         int rs = stbi__jpeg_huff_decode(j, hac);
-         if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG");
-         s = rs & 15;
-         r = rs >> 4;
-         if (s == 0) {
-            if (rs != 0xf0) break; // end block
-            k += 16;
-         } else {
-            k += r;
-            // decode into unzigzag'd location
-            zig = stbi__jpeg_dezigzag[k++];
-            data[zig] = (short) (stbi__extend_receive(j,s) * dequant[zig]);
-         }
-      }
-   } while (k < 64);
-   return 1;
-}
-
-static int stbi__jpeg_decode_block_prog_dc(stbi__jpeg *j, short data[64], stbi__huffman *hdc, int b)
-{
-   int diff,dc;
-   int t;
-   if (j->spec_end != 0) return stbi__err("can't merge dc and ac", "Corrupt JPEG");
-
-   if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
-
-   if (j->succ_high == 0) {
-      // first scan for DC coefficient, must be first
-      memset(data,0,64*sizeof(data[0])); // 0 all the ac values now
-      t = stbi__jpeg_huff_decode(j, hdc);
-      if (t < 0 || t > 15) return stbi__err("can't merge dc and ac", "Corrupt JPEG");
-      diff = t ? stbi__extend_receive(j, t) : 0;
-
-      if (!stbi__addints_valid(j->img_comp[b].dc_pred, diff)) return stbi__err("bad delta", "Corrupt JPEG");
-      dc = j->img_comp[b].dc_pred + diff;
-      j->img_comp[b].dc_pred = dc;
-      if (!stbi__mul2shorts_valid(dc, 1 << j->succ_low)) return stbi__err("can't merge dc and ac", "Corrupt JPEG");
-      data[0] = (short) (dc * (1 << j->succ_low));
-   } else {
-      // refinement scan for DC coefficient
-      if (stbi__jpeg_get_bit(j))
-         data[0] += (short) (1 << j->succ_low);
-   }
-   return 1;
-}
-
-// @OPTIMIZE: store non-zigzagged during the decode passes,
-// and only de-zigzag when dequantizing
-static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg *j, short data[64], stbi__huffman *hac, stbi__int16 *fac)
-{
-   int k;
-   if (j->spec_start == 0) return stbi__err("can't merge dc and ac", "Corrupt JPEG");
-
-   if (j->succ_high == 0) {
-      int shift = j->succ_low;
-
-      if (j->eob_run) {
-         --j->eob_run;
-         return 1;
-      }
-
-      k = j->spec_start;
-      do {
-         unsigned int zig;
-         int c,r,s;
-         if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
-         c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
-         r = fac[c];
-         if (r) { // fast-AC path
-            k += (r >> 4) & 15; // run
-            s = r & 15; // combined length
-            if (s > j->code_bits) return stbi__err("bad huffman code", "Combined length longer than code bits available");
-            j->code_buffer <<= s;
-            j->code_bits -= s;
-            zig = stbi__jpeg_dezigzag[k++];
-            data[zig] = (short) ((r >> 8) * (1 << shift));
-         } else {
-            int rs = stbi__jpeg_huff_decode(j, hac);
-            if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG");
-            s = rs & 15;
-            r = rs >> 4;
-            if (s == 0) {
-               if (r < 15) {
-                  j->eob_run = (1 << r);
-                  if (r)
-                     j->eob_run += stbi__jpeg_get_bits(j, r);
-                  --j->eob_run;
-                  break;
-               }
-               k += 16;
-            } else {
-               k += r;
-               zig = stbi__jpeg_dezigzag[k++];
-               data[zig] = (short) (stbi__extend_receive(j,s) * (1 << shift));
-            }
-         }
-      } while (k <= j->spec_end);
-   } else {
-      // refinement scan for these AC coefficients
-
-      short bit = (short) (1 << j->succ_low);
-
-      if (j->eob_run) {
-         --j->eob_run;
-         for (k = j->spec_start; k <= j->spec_end; ++k) {
-            short *p = &data[stbi__jpeg_dezigzag[k]];
-            if (*p != 0)
-               if (stbi__jpeg_get_bit(j))
-                  if ((*p & bit)==0) {
-                     if (*p > 0)
-                        *p += bit;
-                     else
-                        *p -= bit;
-                  }
-         }
-      } else {
-         k = j->spec_start;
-         do {
-            int r,s;
-            int rs = stbi__jpeg_huff_decode(j, hac); // @OPTIMIZE see if we can use the fast path here, advance-by-r is so slow, eh
-            if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG");
-            s = rs & 15;
-            r = rs >> 4;
-            if (s == 0) {
-               if (r < 15) {
-                  j->eob_run = (1 << r) - 1;
-                  if (r)
-                     j->eob_run += stbi__jpeg_get_bits(j, r);
-                  r = 64; // force end of block
-               } else {
-                  // r=15 s=0 should write 16 0s, so we just do
-                  // a run of 15 0s and then write s (which is 0),
-                  // so we don't have to do anything special here
-               }
-            } else {
-               if (s != 1) return stbi__err("bad huffman code", "Corrupt JPEG");
-               // sign bit
-               if (stbi__jpeg_get_bit(j))
-                  s = bit;
-               else
-                  s = -bit;
-            }
-
-            // advance by r
-            while (k <= j->spec_end) {
-               short *p = &data[stbi__jpeg_dezigzag[k++]];
-               if (*p != 0) {
-                  if (stbi__jpeg_get_bit(j))
-                     if ((*p & bit)==0) {
-                        if (*p > 0)
-                           *p += bit;
-                        else
-                           *p -= bit;
-                     }
-               } else {
-                  if (r == 0) {
-                     *p = (short) s;
-                     break;
-                  }
-                  --r;
-               }
-            }
-         } while (k <= j->spec_end);
-      }
-   }
-   return 1;
-}
-
-// take a -128..127 value and stbi__clamp it and convert to 0..255
-stbi_inline static stbi_uc stbi__clamp(int x)
-{
-   // trick to use a single test to catch both cases
-   if ((unsigned int) x > 255) {
-      if (x < 0) return 0;
-      if (x > 255) return 255;
-   }
-   return (stbi_uc) x;
-}
-
-#define stbi__f2f(x)  ((int) (((x) * 4096 + 0.5)))
-#define stbi__fsh(x)  ((x) * 4096)
-
-// derived from jidctint -- DCT_ISLOW
-#define STBI__IDCT_1D(s0,s1,s2,s3,s4,s5,s6,s7) \
-   int t0,t1,t2,t3,p1,p2,p3,p4,p5,x0,x1,x2,x3; \
-   p2 = s2;                                    \
-   p3 = s6;                                    \
-   p1 = (p2+p3) * stbi__f2f(0.5411961f);       \
-   t2 = p1 + p3*stbi__f2f(-1.847759065f);      \
-   t3 = p1 + p2*stbi__f2f( 0.765366865f);      \
-   p2 = s0;                                    \
-   p3 = s4;                                    \
-   t0 = stbi__fsh(p2+p3);                      \
-   t1 = stbi__fsh(p2-p3);                      \
-   x0 = t0+t3;                                 \
-   x3 = t0-t3;                                 \
-   x1 = t1+t2;                                 \
-   x2 = t1-t2;                                 \
-   t0 = s7;                                    \
-   t1 = s5;                                    \
-   t2 = s3;                                    \
-   t3 = s1;                                    \
-   p3 = t0+t2;                                 \
-   p4 = t1+t3;                                 \
-   p1 = t0+t3;                                 \
-   p2 = t1+t2;                                 \
-   p5 = (p3+p4)*stbi__f2f( 1.175875602f);      \
-   t0 = t0*stbi__f2f( 0.298631336f);           \
-   t1 = t1*stbi__f2f( 2.053119869f);           \
-   t2 = t2*stbi__f2f( 3.072711026f);           \
-   t3 = t3*stbi__f2f( 1.501321110f);           \
-   p1 = p5 + p1*stbi__f2f(-0.899976223f);      \
-   p2 = p5 + p2*stbi__f2f(-2.562915447f);      \
-   p3 = p3*stbi__f2f(-1.961570560f);           \
-   p4 = p4*stbi__f2f(-0.390180644f);           \
-   t3 += p1+p4;                                \
-   t2 += p2+p3;                                \
-   t1 += p2+p4;                                \
-   t0 += p1+p3;
-
-static void stbi__idct_block(stbi_uc *out, int out_stride, short data[64])
-{
-   int i,val[64],*v=val;
-   stbi_uc *o;
-   short *d = data;
-
-   // columns
-   for (i=0; i < 8; ++i,++d, ++v) {
-      // if all zeroes, shortcut -- this avoids dequantizing 0s and IDCTing
-      if (d[ 8]==0 && d[16]==0 && d[24]==0 && d[32]==0
-           && d[40]==0 && d[48]==0 && d[56]==0) {
-         //    no shortcut                 0     seconds
-         //    (1|2|3|4|5|6|7)==0          0     seconds
-         //    all separate               -0.047 seconds
-         //    1 && 2|3 && 4|5 && 6|7:    -0.047 seconds
-         int dcterm = d[0]*4;
-         v[0] = v[8] = v[16] = v[24] = v[32] = v[40] = v[48] = v[56] = dcterm;
-      } else {
-         STBI__IDCT_1D(d[ 0],d[ 8],d[16],d[24],d[32],d[40],d[48],d[56])
-         // constants scaled things up by 1<<12; let's bring them back
-         // down, but keep 2 extra bits of precision
-         x0 += 512; x1 += 512; x2 += 512; x3 += 512;
-         v[ 0] = (x0+t3) >> 10;
-         v[56] = (x0-t3) >> 10;
-         v[ 8] = (x1+t2) >> 10;
-         v[48] = (x1-t2) >> 10;
-         v[16] = (x2+t1) >> 10;
-         v[40] = (x2-t1) >> 10;
-         v[24] = (x3+t0) >> 10;
-         v[32] = (x3-t0) >> 10;
-      }
-   }
-
-   for (i=0, v=val, o=out; i < 8; ++i,v+=8,o+=out_stride) {
-      // no fast case since the first 1D IDCT spread components out
-      STBI__IDCT_1D(v[0],v[1],v[2],v[3],v[4],v[5],v[6],v[7])
-      // constants scaled things up by 1<<12, plus we had 1<<2 from first
-      // loop, plus horizontal and vertical each scale by sqrt(8) so together
-      // we've got an extra 1<<3, so 1<<17 total we need to remove.
-      // so we want to round that, which means adding 0.5 * 1<<17,
-      // aka 65536. Also, we'll end up with -128 to 127 that we want
-      // to encode as 0..255 by adding 128, so we'll add that before the shift
-      x0 += 65536 + (128<<17);
-      x1 += 65536 + (128<<17);
-      x2 += 65536 + (128<<17);
-      x3 += 65536 + (128<<17);
-      // tried computing the shifts into temps, or'ing the temps to see
-      // if any were out of range, but that was slower
-      o[0] = stbi__clamp((x0+t3) >> 17);
-      o[7] = stbi__clamp((x0-t3) >> 17);
-      o[1] = stbi__clamp((x1+t2) >> 17);
-      o[6] = stbi__clamp((x1-t2) >> 17);
-      o[2] = stbi__clamp((x2+t1) >> 17);
-      o[5] = stbi__clamp((x2-t1) >> 17);
-      o[3] = stbi__clamp((x3+t0) >> 17);
-      o[4] = stbi__clamp((x3-t0) >> 17);
-   }
-}
-
-#ifdef STBI_SSE2
-// sse2 integer IDCT. not the fastest possible implementation but it
-// produces bit-identical results to the generic C version so it's
-// fully "transparent".
-static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64])
-{
-   // This is constructed to match our regular (generic) integer IDCT exactly.
-   __m128i row0, row1, row2, row3, row4, row5, row6, row7;
-   __m128i tmp;
-
-   // dot product constant: even elems=x, odd elems=y
-   #define dct_const(x,y)  _mm_setr_epi16((x),(y),(x),(y),(x),(y),(x),(y))
-
-   // out(0) = c0[even]*x + c0[odd]*y   (c0, x, y 16-bit, out 32-bit)
-   // out(1) = c1[even]*x + c1[odd]*y
-   #define dct_rot(out0,out1, x,y,c0,c1) \
-      __m128i c0##lo = _mm_unpacklo_epi16((x),(y)); \
-      __m128i c0##hi = _mm_unpackhi_epi16((x),(y)); \
-      __m128i out0##_l = _mm_madd_epi16(c0##lo, c0); \
-      __m128i out0##_h = _mm_madd_epi16(c0##hi, c0); \
-      __m128i out1##_l = _mm_madd_epi16(c0##lo, c1); \
-      __m128i out1##_h = _mm_madd_epi16(c0##hi, c1)
-
-   // out = in << 12  (in 16-bit, out 32-bit)
-   #define dct_widen(out, in) \
-      __m128i out##_l = _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), (in)), 4); \
-      __m128i out##_h = _mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), (in)), 4)
-
-   // wide add
-   #define dct_wadd(out, a, b) \
-      __m128i out##_l = _mm_add_epi32(a##_l, b##_l); \
-      __m128i out##_h = _mm_add_epi32(a##_h, b##_h)
-
-   // wide sub
-   #define dct_wsub(out, a, b) \
-      __m128i out##_l = _mm_sub_epi32(a##_l, b##_l); \
-      __m128i out##_h = _mm_sub_epi32(a##_h, b##_h)
-
-   // butterfly a/b, add bias, then shift by "s" and pack
-   #define dct_bfly32o(out0, out1, a,b,bias,s) \
-      { \
-         __m128i abiased_l = _mm_add_epi32(a##_l, bias); \
-         __m128i abiased_h = _mm_add_epi32(a##_h, bias); \
-         dct_wadd(sum, abiased, b); \
-         dct_wsub(dif, abiased, b); \
-         out0 = _mm_packs_epi32(_mm_srai_epi32(sum_l, s), _mm_srai_epi32(sum_h, s)); \
-         out1 = _mm_packs_epi32(_mm_srai_epi32(dif_l, s), _mm_srai_epi32(dif_h, s)); \
-      }
-
-   // 8-bit interleave step (for transposes)
-   #define dct_interleave8(a, b) \
-      tmp = a; \
-      a = _mm_unpacklo_epi8(a, b); \
-      b = _mm_unpackhi_epi8(tmp, b)
-
-   // 16-bit interleave step (for transposes)
-   #define dct_interleave16(a, b) \
-      tmp = a; \
-      a = _mm_unpacklo_epi16(a, b); \
-      b = _mm_unpackhi_epi16(tmp, b)
-
-   #define dct_pass(bias,shift) \
-      { \
-         /* even part */ \
-         dct_rot(t2e,t3e, row2,row6, rot0_0,rot0_1); \
-         __m128i sum04 = _mm_add_epi16(row0, row4); \
-         __m128i dif04 = _mm_sub_epi16(row0, row4); \
-         dct_widen(t0e, sum04); \
-         dct_widen(t1e, dif04); \
-         dct_wadd(x0, t0e, t3e); \
-         dct_wsub(x3, t0e, t3e); \
-         dct_wadd(x1, t1e, t2e); \
-         dct_wsub(x2, t1e, t2e); \
-         /* odd part */ \
-         dct_rot(y0o,y2o, row7,row3, rot2_0,rot2_1); \
-         dct_rot(y1o,y3o, row5,row1, rot3_0,rot3_1); \
-         __m128i sum17 = _mm_add_epi16(row1, row7); \
-         __m128i sum35 = _mm_add_epi16(row3, row5); \
-         dct_rot(y4o,y5o, sum17,sum35, rot1_0,rot1_1); \
-         dct_wadd(x4, y0o, y4o); \
-         dct_wadd(x5, y1o, y5o); \
-         dct_wadd(x6, y2o, y5o); \
-         dct_wadd(x7, y3o, y4o); \
-         dct_bfly32o(row0,row7, x0,x7,bias,shift); \
-         dct_bfly32o(row1,row6, x1,x6,bias,shift); \
-         dct_bfly32o(row2,row5, x2,x5,bias,shift); \
-         dct_bfly32o(row3,row4, x3,x4,bias,shift); \
-      }
-
-   __m128i rot0_0 = dct_const(stbi__f2f(0.5411961f), stbi__f2f(0.5411961f) + stbi__f2f(-1.847759065f));
-   __m128i rot0_1 = dct_const(stbi__f2f(0.5411961f) + stbi__f2f( 0.765366865f), stbi__f2f(0.5411961f));
-   __m128i rot1_0 = dct_const(stbi__f2f(1.175875602f) + stbi__f2f(-0.899976223f), stbi__f2f(1.175875602f));
-   __m128i rot1_1 = dct_const(stbi__f2f(1.175875602f), stbi__f2f(1.175875602f) + stbi__f2f(-2.562915447f));
-   __m128i rot2_0 = dct_const(stbi__f2f(-1.961570560f) + stbi__f2f( 0.298631336f), stbi__f2f(-1.961570560f));
-   __m128i rot2_1 = dct_const(stbi__f2f(-1.961570560f), stbi__f2f(-1.961570560f) + stbi__f2f( 3.072711026f));
-   __m128i rot3_0 = dct_const(stbi__f2f(-0.390180644f) + stbi__f2f( 2.053119869f), stbi__f2f(-0.390180644f));
-   __m128i rot3_1 = dct_const(stbi__f2f(-0.390180644f), stbi__f2f(-0.390180644f) + stbi__f2f( 1.501321110f));
-
-   // rounding biases in column/row passes, see stbi__idct_block for explanation.
-   __m128i bias_0 = _mm_set1_epi32(512);
-   __m128i bias_1 = _mm_set1_epi32(65536 + (128<<17));
-
-   // load
-   row0 = _mm_load_si128((const __m128i *) (data + 0*8));
-   row1 = _mm_load_si128((const __m128i *) (data + 1*8));
-   row2 = _mm_load_si128((const __m128i *) (data + 2*8));
-   row3 = _mm_load_si128((const __m128i *) (data + 3*8));
-   row4 = _mm_load_si128((const __m128i *) (data + 4*8));
-   row5 = _mm_load_si128((const __m128i *) (data + 5*8));
-   row6 = _mm_load_si128((const __m128i *) (data + 6*8));
-   row7 = _mm_load_si128((const __m128i *) (data + 7*8));
-
-   // column pass
-   dct_pass(bias_0, 10);
-
-   {
-      // 16bit 8x8 transpose pass 1
-      dct_interleave16(row0, row4);
-      dct_interleave16(row1, row5);
-      dct_interleave16(row2, row6);
-      dct_interleave16(row3, row7);
-
-      // transpose pass 2
-      dct_interleave16(row0, row2);
-      dct_interleave16(row1, row3);
-      dct_interleave16(row4, row6);
-      dct_interleave16(row5, row7);
-
-      // transpose pass 3
-      dct_interleave16(row0, row1);
-      dct_interleave16(row2, row3);
-      dct_interleave16(row4, row5);
-      dct_interleave16(row6, row7);
-   }
-
-   // row pass
-   dct_pass(bias_1, 17);
-
-   {
-      // pack
-      __m128i p0 = _mm_packus_epi16(row0, row1); // a0a1a2a3...a7b0b1b2b3...b7
-      __m128i p1 = _mm_packus_epi16(row2, row3);
-      __m128i p2 = _mm_packus_epi16(row4, row5);
-      __m128i p3 = _mm_packus_epi16(row6, row7);
-
-      // 8bit 8x8 transpose pass 1
-      dct_interleave8(p0, p2); // a0e0a1e1...
-      dct_interleave8(p1, p3); // c0g0c1g1...
-
-      // transpose pass 2
-      dct_interleave8(p0, p1); // a0c0e0g0...
-      dct_interleave8(p2, p3); // b0d0f0h0...
-
-      // transpose pass 3
-      dct_interleave8(p0, p2); // a0b0c0d0...
-      dct_interleave8(p1, p3); // a4b4c4d4...
-
-      // store
-      _mm_storel_epi64((__m128i *) out, p0); out += out_stride;
-      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p0, 0x4e)); out += out_stride;
-      _mm_storel_epi64((__m128i *) out, p2); out += out_stride;
-      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p2, 0x4e)); out += out_stride;
-      _mm_storel_epi64((__m128i *) out, p1); out += out_stride;
-      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p1, 0x4e)); out += out_stride;
-      _mm_storel_epi64((__m128i *) out, p3); out += out_stride;
-      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p3, 0x4e));
-   }
-
-#undef dct_const
-#undef dct_rot
-#undef dct_widen
-#undef dct_wadd
-#undef dct_wsub
-#undef dct_bfly32o
-#undef dct_interleave8
-#undef dct_interleave16
-#undef dct_pass
-}
-
-#endif // STBI_SSE2
-
-#ifdef STBI_NEON
-
-// NEON integer IDCT. should produce bit-identical
-// results to the generic C version.
-static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64])
-{
-   int16x8_t row0, row1, row2, row3, row4, row5, row6, row7;
-
-   int16x4_t rot0_0 = vdup_n_s16(stbi__f2f(0.5411961f));
-   int16x4_t rot0_1 = vdup_n_s16(stbi__f2f(-1.847759065f));
-   int16x4_t rot0_2 = vdup_n_s16(stbi__f2f( 0.765366865f));
-   int16x4_t rot1_0 = vdup_n_s16(stbi__f2f( 1.175875602f));
-   int16x4_t rot1_1 = vdup_n_s16(stbi__f2f(-0.899976223f));
-   int16x4_t rot1_2 = vdup_n_s16(stbi__f2f(-2.562915447f));
-   int16x4_t rot2_0 = vdup_n_s16(stbi__f2f(-1.961570560f));
-   int16x4_t rot2_1 = vdup_n_s16(stbi__f2f(-0.390180644f));
-   int16x4_t rot3_0 = vdup_n_s16(stbi__f2f( 0.298631336f));
-   int16x4_t rot3_1 = vdup_n_s16(stbi__f2f( 2.053119869f));
-   int16x4_t rot3_2 = vdup_n_s16(stbi__f2f( 3.072711026f));
-   int16x4_t rot3_3 = vdup_n_s16(stbi__f2f( 1.501321110f));
-
-#define dct_long_mul(out, inq, coeff) \
-   int32x4_t out##_l = vmull_s16(vget_low_s16(inq), coeff); \
-   int32x4_t out##_h = vmull_s16(vget_high_s16(inq), coeff)
-
-#define dct_long_mac(out, acc, inq, coeff) \
-   int32x4_t out##_l = vmlal_s16(acc##_l, vget_low_s16(inq), coeff); \
-   int32x4_t out##_h = vmlal_s16(acc##_h, vget_high_s16(inq), coeff)
-
-#define dct_widen(out, inq) \
-   int32x4_t out##_l = vshll_n_s16(vget_low_s16(inq), 12); \
-   int32x4_t out##_h = vshll_n_s16(vget_high_s16(inq), 12)
-
-// wide add
-#define dct_wadd(out, a, b) \
-   int32x4_t out##_l = vaddq_s32(a##_l, b##_l); \
-   int32x4_t out##_h = vaddq_s32(a##_h, b##_h)
-
-// wide sub
-#define dct_wsub(out, a, b) \
-   int32x4_t out##_l = vsubq_s32(a##_l, b##_l); \
-   int32x4_t out##_h = vsubq_s32(a##_h, b##_h)
-
-// butterfly a/b, then shift using "shiftop" by "s" and pack
-#define dct_bfly32o(out0,out1, a,b,shiftop,s) \
-   { \
-      dct_wadd(sum, a, b); \
-      dct_wsub(dif, a, b); \
-      out0 = vcombine_s16(shiftop(sum_l, s), shiftop(sum_h, s)); \
-      out1 = vcombine_s16(shiftop(dif_l, s), shiftop(dif_h, s)); \
-   }
-
-#define dct_pass(shiftop, shift) \
-   { \
-      /* even part */ \
-      int16x8_t sum26 = vaddq_s16(row2, row6); \
-      dct_long_mul(p1e, sum26, rot0_0); \
-      dct_long_mac(t2e, p1e, row6, rot0_1); \
-      dct_long_mac(t3e, p1e, row2, rot0_2); \
-      int16x8_t sum04 = vaddq_s16(row0, row4); \
-      int16x8_t dif04 = vsubq_s16(row0, row4); \
-      dct_widen(t0e, sum04); \
-      dct_widen(t1e, dif04); \
-      dct_wadd(x0, t0e, t3e); \
-      dct_wsub(x3, t0e, t3e); \
-      dct_wadd(x1, t1e, t2e); \
-      dct_wsub(x2, t1e, t2e); \
-      /* odd part */ \
-      int16x8_t sum15 = vaddq_s16(row1, row5); \
-      int16x8_t sum17 = vaddq_s16(row1, row7); \
-      int16x8_t sum35 = vaddq_s16(row3, row5); \
-      int16x8_t sum37 = vaddq_s16(row3, row7); \
-      int16x8_t sumodd = vaddq_s16(sum17, sum35); \
-      dct_long_mul(p5o, sumodd, rot1_0); \
-      dct_long_mac(p1o, p5o, sum17, rot1_1); \
-      dct_long_mac(p2o, p5o, sum35, rot1_2); \
-      dct_long_mul(p3o, sum37, rot2_0); \
-      dct_long_mul(p4o, sum15, rot2_1); \
-      dct_wadd(sump13o, p1o, p3o); \
-      dct_wadd(sump24o, p2o, p4o); \
-      dct_wadd(sump23o, p2o, p3o); \
-      dct_wadd(sump14o, p1o, p4o); \
-      dct_long_mac(x4, sump13o, row7, rot3_0); \
-      dct_long_mac(x5, sump24o, row5, rot3_1); \
-      dct_long_mac(x6, sump23o, row3, rot3_2); \
-      dct_long_mac(x7, sump14o, row1, rot3_3); \
-      dct_bfly32o(row0,row7, x0,x7,shiftop,shift); \
-      dct_bfly32o(row1,row6, x1,x6,shiftop,shift); \
-      dct_bfly32o(row2,row5, x2,x5,shiftop,shift); \
-      dct_bfly32o(row3,row4, x3,x4,shiftop,shift); \
-   }
-
-   // load
-   row0 = vld1q_s16(data + 0*8);
-   row1 = vld1q_s16(data + 1*8);
-   row2 = vld1q_s16(data + 2*8);
-   row3 = vld1q_s16(data + 3*8);
-   row4 = vld1q_s16(data + 4*8);
-   row5 = vld1q_s16(data + 5*8);
-   row6 = vld1q_s16(data + 6*8);
-   row7 = vld1q_s16(data + 7*8);
-
-   // add DC bias
-   row0 = vaddq_s16(row0, vsetq_lane_s16(1024, vdupq_n_s16(0), 0));
-
-   // column pass
-   dct_pass(vrshrn_n_s32, 10);
-
-   // 16bit 8x8 transpose
-   {
-// these three map to a single VTRN.16, VTRN.32, and VSWP, respectively.
-// whether compilers actually get this is another story, sadly.
-#define dct_trn16(x, y) { int16x8x2_t t = vtrnq_s16(x, y); x = t.val[0]; y = t.val[1]; }
-#define dct_trn32(x, y) { int32x4x2_t t = vtrnq_s32(vreinterpretq_s32_s16(x), vreinterpretq_s32_s16(y)); x = vreinterpretq_s16_s32(t.val[0]); y = vreinterpretq_s16_s32(t.val[1]); }
-#define dct_trn64(x, y) { int16x8_t x0 = x; int16x8_t y0 = y; x = vcombine_s16(vget_low_s16(x0), vget_low_s16(y0)); y = vcombine_s16(vget_high_s16(x0), vget_high_s16(y0)); }
-
-      // pass 1
-      dct_trn16(row0, row1); // a0b0a2b2a4b4a6b6
-      dct_trn16(row2, row3);
-      dct_trn16(row4, row5);
-      dct_trn16(row6, row7);
-
-      // pass 2
-      dct_trn32(row0, row2); // a0b0c0d0a4b4c4d4
-      dct_trn32(row1, row3);
-      dct_trn32(row4, row6);
-      dct_trn32(row5, row7);
-
-      // pass 3
-      dct_trn64(row0, row4); // a0b0c0d0e0f0g0h0
-      dct_trn64(row1, row5);
-      dct_trn64(row2, row6);
-      dct_trn64(row3, row7);
-
-#undef dct_trn16
-#undef dct_trn32
-#undef dct_trn64
-   }
-
-   // row pass
-   // vrshrn_n_s32 only supports shifts up to 16, we need
-   // 17. so do a non-rounding shift of 16 first then follow
-   // up with a rounding shift by 1.
-   dct_pass(vshrn_n_s32, 16);
-
-   {
-      // pack and round
-      uint8x8_t p0 = vqrshrun_n_s16(row0, 1);
-      uint8x8_t p1 = vqrshrun_n_s16(row1, 1);
-      uint8x8_t p2 = vqrshrun_n_s16(row2, 1);
-      uint8x8_t p3 = vqrshrun_n_s16(row3, 1);
-      uint8x8_t p4 = vqrshrun_n_s16(row4, 1);
-      uint8x8_t p5 = vqrshrun_n_s16(row5, 1);
-      uint8x8_t p6 = vqrshrun_n_s16(row6, 1);
-      uint8x8_t p7 = vqrshrun_n_s16(row7, 1);
-
-      // again, these can translate into one instruction, but often don't.
-#define dct_trn8_8(x, y) { uint8x8x2_t t = vtrn_u8(x, y); x = t.val[0]; y = t.val[1]; }
-#define dct_trn8_16(x, y) { uint16x4x2_t t = vtrn_u16(vreinterpret_u16_u8(x), vreinterpret_u16_u8(y)); x = vreinterpret_u8_u16(t.val[0]); y = vreinterpret_u8_u16(t.val[1]); }
-#define dct_trn8_32(x, y) { uint32x2x2_t t = vtrn_u32(vreinterpret_u32_u8(x), vreinterpret_u32_u8(y)); x = vreinterpret_u8_u32(t.val[0]); y = vreinterpret_u8_u32(t.val[1]); }
-
-      // sadly can't use interleaved stores here since we only write
-      // 8 bytes to each scan line!
-
-      // 8x8 8-bit transpose pass 1
-      dct_trn8_8(p0, p1);
-      dct_trn8_8(p2, p3);
-      dct_trn8_8(p4, p5);
-      dct_trn8_8(p6, p7);
-
-      // pass 2
-      dct_trn8_16(p0, p2);
-      dct_trn8_16(p1, p3);
-      dct_trn8_16(p4, p6);
-      dct_trn8_16(p5, p7);
-
-      // pass 3
-      dct_trn8_32(p0, p4);
-      dct_trn8_32(p1, p5);
-      dct_trn8_32(p2, p6);
-      dct_trn8_32(p3, p7);
-
-      // store
-      vst1_u8(out, p0); out += out_stride;
-      vst1_u8(out, p1); out += out_stride;
-      vst1_u8(out, p2); out += out_stride;
-      vst1_u8(out, p3); out += out_stride;
-      vst1_u8(out, p4); out += out_stride;
-      vst1_u8(out, p5); out += out_stride;
-      vst1_u8(out, p6); out += out_stride;
-      vst1_u8(out, p7);
-
-#undef dct_trn8_8
-#undef dct_trn8_16
-#undef dct_trn8_32
-   }
-
-#undef dct_long_mul
-#undef dct_long_mac
-#undef dct_widen
-#undef dct_wadd
-#undef dct_wsub
-#undef dct_bfly32o
-#undef dct_pass
-}
-
-#endif // STBI_NEON
-
-#define STBI__MARKER_none  0xff
-// if there's a pending marker from the entropy stream, return that
-// otherwise, fetch from the stream and get a marker. if there's no
-// marker, return 0xff, which is never a valid marker value
-static stbi_uc stbi__get_marker(stbi__jpeg *j)
-{
-   stbi_uc x;
-   if (j->marker != STBI__MARKER_none) { x = j->marker; j->marker = STBI__MARKER_none; return x; }
-   x = stbi__get8(j->s);
-   if (x != 0xff) return STBI__MARKER_none;
-   while (x == 0xff)
-      x = stbi__get8(j->s); // consume repeated 0xff fill bytes
-   return x;
-}
-
-// in each scan, we'll have scan_n components, and the order
-// of the components is specified by order[]
-#define STBI__RESTART(x)     ((x) >= 0xd0 && (x) <= 0xd7)
-
-// after a restart interval, stbi__jpeg_reset the entropy decoder and
-// the dc prediction
-static void stbi__jpeg_reset(stbi__jpeg *j)
-{
-   j->code_bits = 0;
-   j->code_buffer = 0;
-   j->nomore = 0;
-   j->img_comp[0].dc_pred = j->img_comp[1].dc_pred = j->img_comp[2].dc_pred = j->img_comp[3].dc_pred = 0;
-   j->marker = STBI__MARKER_none;
-   j->todo = j->restart_interval ? j->restart_interval : 0x7fffffff;
-   j->eob_run = 0;
-   // no more than 1<<31 MCUs if no restart_interal? that's plenty safe,
-   // since we don't even allow 1<<30 pixels
-}
-
-static int stbi__parse_entropy_coded_data(stbi__jpeg *z)
-{
-   stbi__jpeg_reset(z);
-   if (!z->progressive) {
-      if (z->scan_n == 1) {
-         int i,j;
-         STBI_SIMD_ALIGN(short, data[64]);
-         int n = z->order[0];
-         // non-interleaved data, we just need to process one block at a time,
-         // in trivial scanline order
-         // number of blocks to do just depends on how many actual "pixels" this
-         // component has, independent of interleaved MCU blocking and such
-         int w = (z->img_comp[n].x+7) >> 3;
-         int h = (z->img_comp[n].y+7) >> 3;
-         for (j=0; j < h; ++j) {
-            for (i=0; i < w; ++i) {
-               int ha = z->img_comp[n].ha;
-               if (!stbi__jpeg_decode_block(z, data, z->huff_dc+z->img_comp[n].hd, z->huff_ac+ha, z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq])) return 0;
-               z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*j*8+i*8, z->img_comp[n].w2, data);
-               // every data block is an MCU, so countdown the restart interval
-               if (--z->todo <= 0) {
-                  if (z->code_bits < 24) stbi__grow_buffer_unsafe(z);
-                  // if it's NOT a restart, then just bail, so we get corrupt data
-                  // rather than no data
-                  if (!STBI__RESTART(z->marker)) return 1;
-                  stbi__jpeg_reset(z);
-               }
-            }
-         }
-         return 1;
-      } else { // interleaved
-         int i,j,k,x,y;
-         STBI_SIMD_ALIGN(short, data[64]);
-         for (j=0; j < z->img_mcu_y; ++j) {
-            for (i=0; i < z->img_mcu_x; ++i) {
-               // scan an interleaved mcu... process scan_n components in order
-               for (k=0; k < z->scan_n; ++k) {
-                  int n = z->order[k];
-                  // scan out an mcu's worth of this component; that's just determined
-                  // by the basic H and V specified for the component
-                  for (y=0; y < z->img_comp[n].v; ++y) {
-                     for (x=0; x < z->img_comp[n].h; ++x) {
-                        int x2 = (i*z->img_comp[n].h + x)*8;
-                        int y2 = (j*z->img_comp[n].v + y)*8;
-                        int ha = z->img_comp[n].ha;
-                        if (!stbi__jpeg_decode_block(z, data, z->huff_dc+z->img_comp[n].hd, z->huff_ac+ha, z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq])) return 0;
-                        z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*y2+x2, z->img_comp[n].w2, data);
-                     }
-                  }
-               }
-               // after all interleaved components, that's an interleaved MCU,
-               // so now count down the restart interval
-               if (--z->todo <= 0) {
-                  if (z->code_bits < 24) stbi__grow_buffer_unsafe(z);
-                  if (!STBI__RESTART(z->marker)) return 1;
-                  stbi__jpeg_reset(z);
-               }
-            }
-         }
-         return 1;
-      }
-   } else {
-      if (z->scan_n == 1) {
-         int i,j;
-         int n = z->order[0];
-         // non-interleaved data, we just need to process one block at a time,
-         // in trivial scanline order
-         // number of blocks to do just depends on how many actual "pixels" this
-         // component has, independent of interleaved MCU blocking and such
-         int w = (z->img_comp[n].x+7) >> 3;
-         int h = (z->img_comp[n].y+7) >> 3;
-         for (j=0; j < h; ++j) {
-            for (i=0; i < w; ++i) {
-               short *data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w);
-               if (z->spec_start == 0) {
-                  if (!stbi__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n))
-                     return 0;
-               } else {
-                  int ha = z->img_comp[n].ha;
-                  if (!stbi__jpeg_decode_block_prog_ac(z, data, &z->huff_ac[ha], z->fast_ac[ha]))
-                     return 0;
-               }
-               // every data block is an MCU, so countdown the restart interval
-               if (--z->todo <= 0) {
-                  if (z->code_bits < 24) stbi__grow_buffer_unsafe(z);
-                  if (!STBI__RESTART(z->marker)) return 1;
-                  stbi__jpeg_reset(z);
-               }
-            }
-         }
-         return 1;
-      } else { // interleaved
-         int i,j,k,x,y;
-         for (j=0; j < z->img_mcu_y; ++j) {
-            for (i=0; i < z->img_mcu_x; ++i) {
-               // scan an interleaved mcu... process scan_n components in order
-               for (k=0; k < z->scan_n; ++k) {
-                  int n = z->order[k];
-                  // scan out an mcu's worth of this component; that's just determined
-                  // by the basic H and V specified for the component
-                  for (y=0; y < z->img_comp[n].v; ++y) {
-                     for (x=0; x < z->img_comp[n].h; ++x) {
-                        int x2 = (i*z->img_comp[n].h + x);
-                        int y2 = (j*z->img_comp[n].v + y);
-                        short *data = z->img_comp[n].coeff + 64 * (x2 + y2 * z->img_comp[n].coeff_w);
-                        if (!stbi__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n))
-                           return 0;
-                     }
-                  }
-               }
-               // after all interleaved components, that's an interleaved MCU,
-               // so now count down the restart interval
-               if (--z->todo <= 0) {
-                  if (z->code_bits < 24) stbi__grow_buffer_unsafe(z);
-                  if (!STBI__RESTART(z->marker)) return 1;
-                  stbi__jpeg_reset(z);
-               }
-            }
-         }
-         return 1;
-      }
-   }
-}
-
-static void stbi__jpeg_dequantize(short *data, stbi__uint16 *dequant)
-{
-   int i;
-   for (i=0; i < 64; ++i)
-      data[i] *= dequant[i];
-}
-
-static void stbi__jpeg_finish(stbi__jpeg *z)
-{
-   if (z->progressive) {
-      // dequantize and idct the data
-      int i,j,n;
-      for (n=0; n < z->s->img_n; ++n) {
-         int w = (z->img_comp[n].x+7) >> 3;
-         int h = (z->img_comp[n].y+7) >> 3;
-         for (j=0; j < h; ++j) {
-            for (i=0; i < w; ++i) {
-               short *data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w);
-               stbi__jpeg_dequantize(data, z->dequant[z->img_comp[n].tq]);
-               z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*j*8+i*8, z->img_comp[n].w2, data);
-            }
-         }
-      }
-   }
-}
-
-static int stbi__process_marker(stbi__jpeg *z, int m)
-{
-   int L;
-   switch (m) {
-      case STBI__MARKER_none: // no marker found
-         return stbi__err("expected marker","Corrupt JPEG");
-
-      case 0xDD: // DRI - specify restart interval
-         if (stbi__get16be(z->s) != 4) return stbi__err("bad DRI len","Corrupt JPEG");
-         z->restart_interval = stbi__get16be(z->s);
-         return 1;
-
-      case 0xDB: // DQT - define quantization table
-         L = stbi__get16be(z->s)-2;
-         while (L > 0) {
-            int q = stbi__get8(z->s);
-            int p = q >> 4, sixteen = (p != 0);
-            int t = q & 15,i;
-            if (p != 0 && p != 1) return stbi__err("bad DQT type","Corrupt JPEG");
-            if (t > 3) return stbi__err("bad DQT table","Corrupt JPEG");
-
-            for (i=0; i < 64; ++i)
-               z->dequant[t][stbi__jpeg_dezigzag[i]] = (stbi__uint16)(sixteen ? stbi__get16be(z->s) : stbi__get8(z->s));
-            L -= (sixteen ? 129 : 65);
-         }
-         return L==0;
-
-      case 0xC4: // DHT - define huffman table
-         L = stbi__get16be(z->s)-2;
-         while (L > 0) {
-            stbi_uc *v;
-            int sizes[16],i,n=0;
-            int q = stbi__get8(z->s);
-            int tc = q >> 4;
-            int th = q & 15;
-            if (tc > 1 || th > 3) return stbi__err("bad DHT header","Corrupt JPEG");
-            for (i=0; i < 16; ++i) {
-               sizes[i] = stbi__get8(z->s);
-               n += sizes[i];
-            }
-            if(n > 256) return stbi__err("bad DHT header","Corrupt JPEG"); // Loop over i < n would write past end of values!
-            L -= 17;
-            if (tc == 0) {
-               if (!stbi__build_huffman(z->huff_dc+th, sizes)) return 0;
-               v = z->huff_dc[th].values;
-            } else {
-               if (!stbi__build_huffman(z->huff_ac+th, sizes)) return 0;
-               v = z->huff_ac[th].values;
-            }
-            for (i=0; i < n; ++i)
-               v[i] = stbi__get8(z->s);
-            if (tc != 0)
-               stbi__build_fast_ac(z->fast_ac[th], z->huff_ac + th);
-            L -= n;
-         }
-         return L==0;
-   }
-
-   // check for comment block or APP blocks
-   if ((m >= 0xE0 && m <= 0xEF) || m == 0xFE) {
-      L = stbi__get16be(z->s);
-      if (L < 2) {
-         if (m == 0xFE)
-            return stbi__err("bad COM len","Corrupt JPEG");
-         else
-            return stbi__err("bad APP len","Corrupt JPEG");
-      }
-      L -= 2;
-
-      if (m == 0xE0 && L >= 5) { // JFIF APP0 segment
-         static const unsigned char tag[5] = {'J','F','I','F','\0'};
-         int ok = 1;
-         int i;
-         for (i=0; i < 5; ++i)
-            if (stbi__get8(z->s) != tag[i])
-               ok = 0;
-         L -= 5;
-         if (ok)
-            z->jfif = 1;
-      } else if (m == 0xEE && L >= 12) { // Adobe APP14 segment
-         static const unsigned char tag[6] = {'A','d','o','b','e','\0'};
-         int ok = 1;
-         int i;
-         for (i=0; i < 6; ++i)
-            if (stbi__get8(z->s) != tag[i])
-               ok = 0;
-         L -= 6;
-         if (ok) {
-            stbi__get8(z->s); // version
-            stbi__get16be(z->s); // flags0
-            stbi__get16be(z->s); // flags1
-            z->app14_color_transform = stbi__get8(z->s); // color transform
-            L -= 6;
-         }
-      }
-
-      stbi__skip(z->s, L);
-      return 1;
-   }
-
-   return stbi__err("unknown marker","Corrupt JPEG");
-}
-
-// after we see SOS
-static int stbi__process_scan_header(stbi__jpeg *z)
-{
-   int i;
-   int Ls = stbi__get16be(z->s);
-   z->scan_n = stbi__get8(z->s);
-   if (z->scan_n < 1 || z->scan_n > 4 || z->scan_n > (int) z->s->img_n) return stbi__err("bad SOS component count","Corrupt JPEG");
-   if (Ls != 6+2*z->scan_n) return stbi__err("bad SOS len","Corrupt JPEG");
-   for (i=0; i < z->scan_n; ++i) {
-      int id = stbi__get8(z->s), which;
-      int q = stbi__get8(z->s);
-      for (which = 0; which < z->s->img_n; ++which)
-         if (z->img_comp[which].id == id)
-            break;
-      if (which == z->s->img_n) return 0; // no match
-      z->img_comp[which].hd = q >> 4;   if (z->img_comp[which].hd > 3) return stbi__err("bad DC huff","Corrupt JPEG");
-      z->img_comp[which].ha = q & 15;   if (z->img_comp[which].ha > 3) return stbi__err("bad AC huff","Corrupt JPEG");
-      z->order[i] = which;
-   }
-
-   {
-      int aa;
-      z->spec_start = stbi__get8(z->s);
-      z->spec_end   = stbi__get8(z->s); // should be 63, but might be 0
-      aa = stbi__get8(z->s);
-      z->succ_high = (aa >> 4);
-      z->succ_low  = (aa & 15);
-      if (z->progressive) {
-         if (z->spec_start > 63 || z->spec_end > 63  || z->spec_start > z->spec_end || z->succ_high > 13 || z->succ_low > 13)
-            return stbi__err("bad SOS", "Corrupt JPEG");
-      } else {
-         if (z->spec_start != 0) return stbi__err("bad SOS","Corrupt JPEG");
-         if (z->succ_high != 0 || z->succ_low != 0) return stbi__err("bad SOS","Corrupt JPEG");
-         z->spec_end = 63;
-      }
-   }
-
-   return 1;
-}
-
-static int stbi__free_jpeg_components(stbi__jpeg *z, int ncomp, int why)
-{
-   int i;
-   for (i=0; i < ncomp; ++i) {
-      if (z->img_comp[i].raw_data) {
-         STBI_FREE(z->img_comp[i].raw_data);
-         z->img_comp[i].raw_data = NULL;
-         z->img_comp[i].data = NULL;
-      }
-      if (z->img_comp[i].raw_coeff) {
-         STBI_FREE(z->img_comp[i].raw_coeff);
-         z->img_comp[i].raw_coeff = 0;
-         z->img_comp[i].coeff = 0;
-      }
-      if (z->img_comp[i].linebuf) {
-         STBI_FREE(z->img_comp[i].linebuf);
-         z->img_comp[i].linebuf = NULL;
-      }
-   }
-   return why;
-}
-
-static int stbi__process_frame_header(stbi__jpeg *z, int scan)
-{
-   stbi__context *s = z->s;
-   int Lf,p,i,q, h_max=1,v_max=1,c;
-   Lf = stbi__get16be(s);         if (Lf < 11) return stbi__err("bad SOF len","Corrupt JPEG"); // JPEG
-   p  = stbi__get8(s);            if (p != 8) return stbi__err("only 8-bit","JPEG format not supported: 8-bit only"); // JPEG baseline
-   s->img_y = stbi__get16be(s);   if (s->img_y == 0) return stbi__err("no header height", "JPEG format not supported: delayed height"); // Legal, but we don't handle it--but neither does IJG
-   s->img_x = stbi__get16be(s);   if (s->img_x == 0) return stbi__err("0 width","Corrupt JPEG"); // JPEG requires
-   if (s->img_y > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)");
-   if (s->img_x > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)");
-   c = stbi__get8(s);
-   if (c != 3 && c != 1 && c != 4) return stbi__err("bad component count","Corrupt JPEG");
-   s->img_n = c;
-   for (i=0; i < c; ++i) {
-      z->img_comp[i].data = NULL;
-      z->img_comp[i].linebuf = NULL;
-   }
-
-   if (Lf != 8+3*s->img_n) return stbi__err("bad SOF len","Corrupt JPEG");
-
-   z->rgb = 0;
-   for (i=0; i < s->img_n; ++i) {
-      static const unsigned char rgb[3] = { 'R', 'G', 'B' };
-      z->img_comp[i].id = stbi__get8(s);
-      if (s->img_n == 3 && z->img_comp[i].id == rgb[i])
-         ++z->rgb;
-      q = stbi__get8(s);
-      z->img_comp[i].h = (q >> 4);  if (!z->img_comp[i].h || z->img_comp[i].h > 4) return stbi__err("bad H","Corrupt JPEG");
-      z->img_comp[i].v = q & 15;    if (!z->img_comp[i].v || z->img_comp[i].v > 4) return stbi__err("bad V","Corrupt JPEG");
-      z->img_comp[i].tq = stbi__get8(s);  if (z->img_comp[i].tq > 3) return stbi__err("bad TQ","Corrupt JPEG");
-   }
-
-   if (scan != STBI__SCAN_load) return 1;
-
-   if (!stbi__mad3sizes_valid(s->img_x, s->img_y, s->img_n, 0)) return stbi__err("too large", "Image too large to decode");
-
-   for (i=0; i < s->img_n; ++i) {
-      if (z->img_comp[i].h > h_max) h_max = z->img_comp[i].h;
-      if (z->img_comp[i].v > v_max) v_max = z->img_comp[i].v;
-   }
-
-   // check that plane subsampling factors are integer ratios; our resamplers can't deal with fractional ratios
-   // and I've never seen a non-corrupted JPEG file actually use them
-   for (i=0; i < s->img_n; ++i) {
-      if (h_max % z->img_comp[i].h != 0) return stbi__err("bad H","Corrupt JPEG");
-      if (v_max % z->img_comp[i].v != 0) return stbi__err("bad V","Corrupt JPEG");
-   }
-
-   // compute interleaved mcu info
-   z->img_h_max = h_max;
-   z->img_v_max = v_max;
-   z->img_mcu_w = h_max * 8;
-   z->img_mcu_h = v_max * 8;
-   // these sizes can't be more than 17 bits
-   z->img_mcu_x = (s->img_x + z->img_mcu_w-1) / z->img_mcu_w;
-   z->img_mcu_y = (s->img_y + z->img_mcu_h-1) / z->img_mcu_h;
-
-   for (i=0; i < s->img_n; ++i) {
-      // number of effective pixels (e.g. for non-interleaved MCU)
-      z->img_comp[i].x = (s->img_x * z->img_comp[i].h + h_max-1) / h_max;
-      z->img_comp[i].y = (s->img_y * z->img_comp[i].v + v_max-1) / v_max;
-      // to simplify generation, we'll allocate enough memory to decode
-      // the bogus oversized data from using interleaved MCUs and their
-      // big blocks (e.g. a 16x16 iMCU on an image of width 33); we won't
-      // discard the extra data until colorspace conversion
-      //
-      // img_mcu_x, img_mcu_y: <=17 bits; comp[i].h and .v are <=4 (checked earlier)
-      // so these muls can't overflow with 32-bit ints (which we require)
-      z->img_comp[i].w2 = z->img_mcu_x * z->img_comp[i].h * 8;
-      z->img_comp[i].h2 = z->img_mcu_y * z->img_comp[i].v * 8;
-      z->img_comp[i].coeff = 0;
-      z->img_comp[i].raw_coeff = 0;
-      z->img_comp[i].linebuf = NULL;
-      z->img_comp[i].raw_data = stbi__malloc_mad2(z->img_comp[i].w2, z->img_comp[i].h2, 15);
-      if (z->img_comp[i].raw_data == NULL)
-         return stbi__free_jpeg_components(z, i+1, stbi__err("outofmem", "Out of memory"));
-      // align blocks for idct using mmx/sse
-      z->img_comp[i].data = (stbi_uc*) (((size_t) z->img_comp[i].raw_data + 15) & ~15);
-      if (z->progressive) {
-         // w2, h2 are multiples of 8 (see above)
-         z->img_comp[i].coeff_w = z->img_comp[i].w2 / 8;
-         z->img_comp[i].coeff_h = z->img_comp[i].h2 / 8;
-         z->img_comp[i].raw_coeff = stbi__malloc_mad3(z->img_comp[i].w2, z->img_comp[i].h2, sizeof(short), 15);
-         if (z->img_comp[i].raw_coeff == NULL)
-            return stbi__free_jpeg_components(z, i+1, stbi__err("outofmem", "Out of memory"));
-         z->img_comp[i].coeff = (short*) (((size_t) z->img_comp[i].raw_coeff + 15) & ~15);
-      }
-   }
-
-   return 1;
-}
-
-// use comparisons since in some cases we handle more than one case (e.g. SOF)
-#define stbi__DNL(x)         ((x) == 0xdc)
-#define stbi__SOI(x)         ((x) == 0xd8)
-#define stbi__EOI(x)         ((x) == 0xd9)
-#define stbi__SOF(x)         ((x) == 0xc0 || (x) == 0xc1 || (x) == 0xc2)
-#define stbi__SOS(x)         ((x) == 0xda)
-
-#define stbi__SOF_progressive(x)   ((x) == 0xc2)
-
-static int stbi__decode_jpeg_header(stbi__jpeg *z, int scan)
-{
-   int m;
-   z->jfif = 0;
-   z->app14_color_transform = -1; // valid values are 0,1,2
-   z->marker = STBI__MARKER_none; // initialize cached marker to empty
-   m = stbi__get_marker(z);
-   if (!stbi__SOI(m)) return stbi__err("no SOI","Corrupt JPEG");
-   if (scan == STBI__SCAN_type) return 1;
-   m = stbi__get_marker(z);
-   while (!stbi__SOF(m)) {
-      if (!stbi__process_marker(z,m)) return 0;
-      m = stbi__get_marker(z);
-      while (m == STBI__MARKER_none) {
-         // some files have extra padding after their blocks, so ok, we'll scan
-         if (stbi__at_eof(z->s)) return stbi__err("no SOF", "Corrupt JPEG");
-         m = stbi__get_marker(z);
-      }
-   }
-   z->progressive = stbi__SOF_progressive(m);
-   if (!stbi__process_frame_header(z, scan)) return 0;
-   return 1;
-}
-
-static int stbi__skip_jpeg_junk_at_end(stbi__jpeg *j)
-{
-   // some JPEGs have junk at end, skip over it but if we find what looks
-   // like a valid marker, resume there
-   while (!stbi__at_eof(j->s)) {
-      int x = stbi__get8(j->s);
-      while (x == 255) { // might be a marker
-         if (stbi__at_eof(j->s)) return STBI__MARKER_none;
-         x = stbi__get8(j->s);
-         if (x != 0x00 && x != 0xff) {
-            // not a stuffed zero or lead-in to another marker, looks
-            // like an actual marker, return it
-            return x;
-         }
-         // stuffed zero has x=0 now which ends the loop, meaning we go
-         // back to regular scan loop.
-         // repeated 0xff keeps trying to read the next byte of the marker.
-      }
-   }
-   return STBI__MARKER_none;
-}
-
-// decode image to YCbCr format
-static int stbi__decode_jpeg_image(stbi__jpeg *j)
-{
-   int m;
-   for (m = 0; m < 4; m++) {
-      j->img_comp[m].raw_data = NULL;
-      j->img_comp[m].raw_coeff = NULL;
-   }
-   j->restart_interval = 0;
-   if (!stbi__decode_jpeg_header(j, STBI__SCAN_load)) return 0;
-   m = stbi__get_marker(j);
-   while (!stbi__EOI(m)) {
-      if (stbi__SOS(m)) {
-         if (!stbi__process_scan_header(j)) return 0;
-         if (!stbi__parse_entropy_coded_data(j)) return 0;
-         if (j->marker == STBI__MARKER_none ) {
-         j->marker = stbi__skip_jpeg_junk_at_end(j);
-            // if we reach eof without hitting a marker, stbi__get_marker() below will fail and we'll eventually return 0
-         }
-         m = stbi__get_marker(j);
-         if (STBI__RESTART(m))
-            m = stbi__get_marker(j);
-      } else if (stbi__DNL(m)) {
-         int Ld = stbi__get16be(j->s);
-         stbi__uint32 NL = stbi__get16be(j->s);
-         if (Ld != 4) return stbi__err("bad DNL len", "Corrupt JPEG");
-         if (NL != j->s->img_y) return stbi__err("bad DNL height", "Corrupt JPEG");
-         m = stbi__get_marker(j);
-      } else {
-         if (!stbi__process_marker(j, m)) return 1;
-         m = stbi__get_marker(j);
-      }
-   }
-   if (j->progressive)
-      stbi__jpeg_finish(j);
-   return 1;
-}
-
-// static jfif-centered resampling (across block boundaries)
-
-typedef stbi_uc *(*resample_row_func)(stbi_uc *out, stbi_uc *in0, stbi_uc *in1,
-                                    int w, int hs);
-
-#define stbi__div4(x) ((stbi_uc) ((x) >> 2))
-
-static stbi_uc *resample_row_1(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
-{
-   STBI_NOTUSED(out);
-   STBI_NOTUSED(in_far);
-   STBI_NOTUSED(w);
-   STBI_NOTUSED(hs);
-   return in_near;
-}
-
-static stbi_uc* stbi__resample_row_v_2(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
-{
-   // need to generate two samples vertically for every one in input
-   int i;
-   STBI_NOTUSED(hs);
-   for (i=0; i < w; ++i)
-      out[i] = stbi__div4(3*in_near[i] + in_far[i] + 2);
-   return out;
-}
-
-static stbi_uc*  stbi__resample_row_h_2(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
-{
-   // need to generate two samples horizontally for every one in input
-   int i;
-   stbi_uc *input = in_near;
-
-   if (w == 1) {
-      // if only one sample, can't do any interpolation
-      out[0] = out[1] = input[0];
-      return out;
-   }
-
-   out[0] = input[0];
-   out[1] = stbi__div4(input[0]*3 + input[1] + 2);
-   for (i=1; i < w-1; ++i) {
-      int n = 3*input[i]+2;
-      out[i*2+0] = stbi__div4(n+input[i-1]);
-      out[i*2+1] = stbi__div4(n+input[i+1]);
-   }
-   out[i*2+0] = stbi__div4(input[w-2]*3 + input[w-1] + 2);
-   out[i*2+1] = input[w-1];
-
-   STBI_NOTUSED(in_far);
-   STBI_NOTUSED(hs);
-
-   return out;
-}
-
-#define stbi__div16(x) ((stbi_uc) ((x) >> 4))
-
-static stbi_uc *stbi__resample_row_hv_2(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
-{
-   // need to generate 2x2 samples for every one in input
-   int i,t0,t1;
-   if (w == 1) {
-      out[0] = out[1] = stbi__div4(3*in_near[0] + in_far[0] + 2);
-      return out;
-   }
-
-   t1 = 3*in_near[0] + in_far[0];
-   out[0] = stbi__div4(t1+2);
-   for (i=1; i < w; ++i) {
-      t0 = t1;
-      t1 = 3*in_near[i]+in_far[i];
-      out[i*2-1] = stbi__div16(3*t0 + t1 + 8);
-      out[i*2  ] = stbi__div16(3*t1 + t0 + 8);
-   }
-   out[w*2-1] = stbi__div4(t1+2);
-
-   STBI_NOTUSED(hs);
-
-   return out;
-}
-
-#if defined(STBI_SSE2) || defined(STBI_NEON)
-static stbi_uc *stbi__resample_row_hv_2_simd(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
-{
-   // need to generate 2x2 samples for every one in input
-   int i=0,t0,t1;
-
-   if (w == 1) {
-      out[0] = out[1] = stbi__div4(3*in_near[0] + in_far[0] + 2);
-      return out;
-   }
-
-   t1 = 3*in_near[0] + in_far[0];
-   // process groups of 8 pixels for as long as we can.
-   // note we can't handle the last pixel in a row in this loop
-   // because we need to handle the filter boundary conditions.
-   for (; i < ((w-1) & ~7); i += 8) {
-#if defined(STBI_SSE2)
-      // load and perform the vertical filtering pass
-      // this uses 3*x + y = 4*x + (y - x)
-      __m128i zero  = _mm_setzero_si128();
-      __m128i farb  = _mm_loadl_epi64((__m128i *) (in_far + i));
-      __m128i nearb = _mm_loadl_epi64((__m128i *) (in_near + i));
-      __m128i farw  = _mm_unpacklo_epi8(farb, zero);
-      __m128i nearw = _mm_unpacklo_epi8(nearb, zero);
-      __m128i diff  = _mm_sub_epi16(farw, nearw);
-      __m128i nears = _mm_slli_epi16(nearw, 2);
-      __m128i curr  = _mm_add_epi16(nears, diff); // current row
-
-      // horizontal filter works the same based on shifted vers of current
-      // row. "prev" is current row shifted right by 1 pixel; we need to
-      // insert the previous pixel value (from t1).
-      // "next" is current row shifted left by 1 pixel, with first pixel
-      // of next block of 8 pixels added in.
-      __m128i prv0 = _mm_slli_si128(curr, 2);
-      __m128i nxt0 = _mm_srli_si128(curr, 2);
-      __m128i prev = _mm_insert_epi16(prv0, t1, 0);
-      __m128i next = _mm_insert_epi16(nxt0, 3*in_near[i+8] + in_far[i+8], 7);
-
-      // horizontal filter, polyphase implementation since it's convenient:
-      // even pixels = 3*cur + prev = cur*4 + (prev - cur)
-      // odd  pixels = 3*cur + next = cur*4 + (next - cur)
-      // note the shared term.
-      __m128i bias  = _mm_set1_epi16(8);
-      __m128i curs = _mm_slli_epi16(curr, 2);
-      __m128i prvd = _mm_sub_epi16(prev, curr);
-      __m128i nxtd = _mm_sub_epi16(next, curr);
-      __m128i curb = _mm_add_epi16(curs, bias);
-      __m128i even = _mm_add_epi16(prvd, curb);
-      __m128i odd  = _mm_add_epi16(nxtd, curb);
-
-      // interleave even and odd pixels, then undo scaling.
-      __m128i int0 = _mm_unpacklo_epi16(even, odd);
-      __m128i int1 = _mm_unpackhi_epi16(even, odd);
-      __m128i de0  = _mm_srli_epi16(int0, 4);
-      __m128i de1  = _mm_srli_epi16(int1, 4);
-
-      // pack and write output
-      __m128i outv = _mm_packus_epi16(de0, de1);
-      _mm_storeu_si128((__m128i *) (out + i*2), outv);
-#elif defined(STBI_NEON)
-      // load and perform the vertical filtering pass
-      // this uses 3*x + y = 4*x + (y - x)
-      uint8x8_t farb  = vld1_u8(in_far + i);
-      uint8x8_t nearb = vld1_u8(in_near + i);
-      int16x8_t diff  = vreinterpretq_s16_u16(vsubl_u8(farb, nearb));
-      int16x8_t nears = vreinterpretq_s16_u16(vshll_n_u8(nearb, 2));
-      int16x8_t curr  = vaddq_s16(nears, diff); // current row
-
-      // horizontal filter works the same based on shifted vers of current
-      // row. "prev" is current row shifted right by 1 pixel; we need to
-      // insert the previous pixel value (from t1).
-      // "next" is current row shifted left by 1 pixel, with first pixel
-      // of next block of 8 pixels added in.
-      int16x8_t prv0 = vextq_s16(curr, curr, 7);
-      int16x8_t nxt0 = vextq_s16(curr, curr, 1);
-      int16x8_t prev = vsetq_lane_s16(t1, prv0, 0);
-      int16x8_t next = vsetq_lane_s16(3*in_near[i+8] + in_far[i+8], nxt0, 7);
-
-      // horizontal filter, polyphase implementation since it's convenient:
-      // even pixels = 3*cur + prev = cur*4 + (prev - cur)
-      // odd  pixels = 3*cur + next = cur*4 + (next - cur)
-      // note the shared term.
-      int16x8_t curs = vshlq_n_s16(curr, 2);
-      int16x8_t prvd = vsubq_s16(prev, curr);
-      int16x8_t nxtd = vsubq_s16(next, curr);
-      int16x8_t even = vaddq_s16(curs, prvd);
-      int16x8_t odd  = vaddq_s16(curs, nxtd);
-
-      // undo scaling and round, then store with even/odd phases interleaved
-      uint8x8x2_t o;
-      o.val[0] = vqrshrun_n_s16(even, 4);
-      o.val[1] = vqrshrun_n_s16(odd,  4);
-      vst2_u8(out + i*2, o);
-#endif
-
-      // "previous" value for next iter
-      t1 = 3*in_near[i+7] + in_far[i+7];
-   }
-
-   t0 = t1;
-   t1 = 3*in_near[i] + in_far[i];
-   out[i*2] = stbi__div16(3*t1 + t0 + 8);
-
-   for (++i; i < w; ++i) {
-      t0 = t1;
-      t1 = 3*in_near[i]+in_far[i];
-      out[i*2-1] = stbi__div16(3*t0 + t1 + 8);
-      out[i*2  ] = stbi__div16(3*t1 + t0 + 8);
-   }
-   out[w*2-1] = stbi__div4(t1+2);
-
-   STBI_NOTUSED(hs);
-
-   return out;
-}
-#endif
-
-static stbi_uc *stbi__resample_row_generic(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
-{
-   // resample with nearest-neighbor
-   int i,j;
-   STBI_NOTUSED(in_far);
-   for (i=0; i < w; ++i)
-      for (j=0; j < hs; ++j)
-         out[i*hs+j] = in_near[i];
-   return out;
-}
-
-// this is a reduced-precision calculation of YCbCr-to-RGB introduced
-// to make sure the code produces the same results in both SIMD and scalar
-#define stbi__float2fixed(x)  (((int) ((x) * 4096.0f + 0.5f)) << 8)
-static void stbi__YCbCr_to_RGB_row(stbi_uc *out, const stbi_uc *y, const stbi_uc *pcb, const stbi_uc *pcr, int count, int step)
-{
-   int i;
-   for (i=0; i < count; ++i) {
-      int y_fixed = (y[i] << 20) + (1<<19); // rounding
-      int r,g,b;
-      int cr = pcr[i] - 128;
-      int cb = pcb[i] - 128;
-      r = y_fixed +  cr* stbi__float2fixed(1.40200f);
-      g = y_fixed + (cr*-stbi__float2fixed(0.71414f)) + ((cb*-stbi__float2fixed(0.34414f)) & 0xffff0000);
-      b = y_fixed                                     +   cb* stbi__float2fixed(1.77200f);
-      r >>= 20;
-      g >>= 20;
-      b >>= 20;
-      if ((unsigned) r > 255) { if (r < 0) r = 0; else r = 255; }
-      if ((unsigned) g > 255) { if (g < 0) g = 0; else g = 255; }
-      if ((unsigned) b > 255) { if (b < 0) b = 0; else b = 255; }
-      out[0] = (stbi_uc)r;
-      out[1] = (stbi_uc)g;
-      out[2] = (stbi_uc)b;
-      out[3] = 255;
-      out += step;
-   }
-}
-
-#if defined(STBI_SSE2) || defined(STBI_NEON)
-static void stbi__YCbCr_to_RGB_simd(stbi_uc *out, stbi_uc const *y, stbi_uc const *pcb, stbi_uc const *pcr, int count, int step)
-{
-   int i = 0;
-
-#ifdef STBI_SSE2
-   // step == 3 is pretty ugly on the final interleave, and i'm not convinced
-   // it's useful in practice (you wouldn't use it for textures, for example).
-   // so just accelerate step == 4 case.
-   if (step == 4) {
-      // this is a fairly straightforward implementation and not super-optimized.
-      __m128i signflip  = _mm_set1_epi8(-0x80);
-      __m128i cr_const0 = _mm_set1_epi16(   (short) ( 1.40200f*4096.0f+0.5f));
-      __m128i cr_const1 = _mm_set1_epi16( - (short) ( 0.71414f*4096.0f+0.5f));
-      __m128i cb_const0 = _mm_set1_epi16( - (short) ( 0.34414f*4096.0f+0.5f));
-      __m128i cb_const1 = _mm_set1_epi16(   (short) ( 1.77200f*4096.0f+0.5f));
-      __m128i y_bias = _mm_set1_epi8((char) (unsigned char) 128);
-      __m128i xw = _mm_set1_epi16(255); // alpha channel
-
-      for (; i+7 < count; i += 8) {
-         // load
-         __m128i y_bytes = _mm_loadl_epi64((__m128i *) (y+i));
-         __m128i cr_bytes = _mm_loadl_epi64((__m128i *) (pcr+i));
-         __m128i cb_bytes = _mm_loadl_epi64((__m128i *) (pcb+i));
-         __m128i cr_biased = _mm_xor_si128(cr_bytes, signflip); // -128
-         __m128i cb_biased = _mm_xor_si128(cb_bytes, signflip); // -128
-
-         // unpack to short (and left-shift cr, cb by 8)
-         __m128i yw  = _mm_unpacklo_epi8(y_bias, y_bytes);
-         __m128i crw = _mm_unpacklo_epi8(_mm_setzero_si128(), cr_biased);
-         __m128i cbw = _mm_unpacklo_epi8(_mm_setzero_si128(), cb_biased);
-
-         // color transform
-         __m128i yws = _mm_srli_epi16(yw, 4);
-         __m128i cr0 = _mm_mulhi_epi16(cr_const0, crw);
-         __m128i cb0 = _mm_mulhi_epi16(cb_const0, cbw);
-         __m128i cb1 = _mm_mulhi_epi16(cbw, cb_const1);
-         __m128i cr1 = _mm_mulhi_epi16(crw, cr_const1);
-         __m128i rws = _mm_add_epi16(cr0, yws);
-         __m128i gwt = _mm_add_epi16(cb0, yws);
-         __m128i bws = _mm_add_epi16(yws, cb1);
-         __m128i gws = _mm_add_epi16(gwt, cr1);
-
-         // descale
-         __m128i rw = _mm_srai_epi16(rws, 4);
-         __m128i bw = _mm_srai_epi16(bws, 4);
-         __m128i gw = _mm_srai_epi16(gws, 4);
-
-         // back to byte, set up for transpose
-         __m128i brb = _mm_packus_epi16(rw, bw);
-         __m128i gxb = _mm_packus_epi16(gw, xw);
-
-         // transpose to interleave channels
-         __m128i t0 = _mm_unpacklo_epi8(brb, gxb);
-         __m128i t1 = _mm_unpackhi_epi8(brb, gxb);
-         __m128i o0 = _mm_unpacklo_epi16(t0, t1);
-         __m128i o1 = _mm_unpackhi_epi16(t0, t1);
-
-         // store
-         _mm_storeu_si128((__m128i *) (out + 0), o0);
-         _mm_storeu_si128((__m128i *) (out + 16), o1);
-         out += 32;
-      }
-   }
-#endif
-
-#ifdef STBI_NEON
-   // in this version, step=3 support would be easy to add. but is there demand?
-   if (step == 4) {
-      // this is a fairly straightforward implementation and not super-optimized.
-      uint8x8_t signflip = vdup_n_u8(0x80);
-      int16x8_t cr_const0 = vdupq_n_s16(   (short) ( 1.40200f*4096.0f+0.5f));
-      int16x8_t cr_const1 = vdupq_n_s16( - (short) ( 0.71414f*4096.0f+0.5f));
-      int16x8_t cb_const0 = vdupq_n_s16( - (short) ( 0.34414f*4096.0f+0.5f));
-      int16x8_t cb_const1 = vdupq_n_s16(   (short) ( 1.77200f*4096.0f+0.5f));
-
-      for (; i+7 < count; i += 8) {
-         // load
-         uint8x8_t y_bytes  = vld1_u8(y + i);
-         uint8x8_t cr_bytes = vld1_u8(pcr + i);
-         uint8x8_t cb_bytes = vld1_u8(pcb + i);
-         int8x8_t cr_biased = vreinterpret_s8_u8(vsub_u8(cr_bytes, signflip));
-         int8x8_t cb_biased = vreinterpret_s8_u8(vsub_u8(cb_bytes, signflip));
-
-         // expand to s16
-         int16x8_t yws = vreinterpretq_s16_u16(vshll_n_u8(y_bytes, 4));
-         int16x8_t crw = vshll_n_s8(cr_biased, 7);
-         int16x8_t cbw = vshll_n_s8(cb_biased, 7);
-
-         // color transform
-         int16x8_t cr0 = vqdmulhq_s16(crw, cr_const0);
-         int16x8_t cb0 = vqdmulhq_s16(cbw, cb_const0);
-         int16x8_t cr1 = vqdmulhq_s16(crw, cr_const1);
-         int16x8_t cb1 = vqdmulhq_s16(cbw, cb_const1);
-         int16x8_t rws = vaddq_s16(yws, cr0);
-         int16x8_t gws = vaddq_s16(vaddq_s16(yws, cb0), cr1);
-         int16x8_t bws = vaddq_s16(yws, cb1);
-
-         // undo scaling, round, convert to byte
-         uint8x8x4_t o;
-         o.val[0] = vqrshrun_n_s16(rws, 4);
-         o.val[1] = vqrshrun_n_s16(gws, 4);
-         o.val[2] = vqrshrun_n_s16(bws, 4);
-         o.val[3] = vdup_n_u8(255);
-
-         // store, interleaving r/g/b/a
-         vst4_u8(out, o);
-         out += 8*4;
-      }
-   }
-#endif
-
-   for (; i < count; ++i) {
-      int y_fixed = (y[i] << 20) + (1<<19); // rounding
-      int r,g,b;
-      int cr = pcr[i] - 128;
-      int cb = pcb[i] - 128;
-      r = y_fixed + cr* stbi__float2fixed(1.40200f);
-      g = y_fixed + cr*-stbi__float2fixed(0.71414f) + ((cb*-stbi__float2fixed(0.34414f)) & 0xffff0000);
-      b = y_fixed                                   +   cb* stbi__float2fixed(1.77200f);
-      r >>= 20;
-      g >>= 20;
-      b >>= 20;
-      if ((unsigned) r > 255) { if (r < 0) r = 0; else r = 255; }
-      if ((unsigned) g > 255) { if (g < 0) g = 0; else g = 255; }
-      if ((unsigned) b > 255) { if (b < 0) b = 0; else b = 255; }
-      out[0] = (stbi_uc)r;
-      out[1] = (stbi_uc)g;
-      out[2] = (stbi_uc)b;
-      out[3] = 255;
-      out += step;
-   }
-}
-#endif
-
-// set up the kernels
-static void stbi__setup_jpeg(stbi__jpeg *j)
-{
-   j->idct_block_kernel = stbi__idct_block;
-   j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_row;
-   j->resample_row_hv_2_kernel = stbi__resample_row_hv_2;
-
-#ifdef STBI_SSE2
-   if (stbi__sse2_available()) {
-      j->idct_block_kernel = stbi__idct_simd;
-      j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd;
-      j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd;
-   }
-#endif
-
-#ifdef STBI_NEON
-   j->idct_block_kernel = stbi__idct_simd;
-   j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd;
-   j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd;
-#endif
-}
-
-// clean up the temporary component buffers
-static void stbi__cleanup_jpeg(stbi__jpeg *j)
-{
-   stbi__free_jpeg_components(j, j->s->img_n, 0);
-}
-
-typedef struct
-{
-   resample_row_func resample;
-   stbi_uc *line0,*line1;
-   int hs,vs;   // expansion factor in each axis
-   int w_lores; // horizontal pixels pre-expansion
-   int ystep;   // how far through vertical expansion we are
-   int ypos;    // which pre-expansion row we're on
-} stbi__resample;
-
-// fast 0..255 * 0..255 => 0..255 rounded multiplication
-static stbi_uc stbi__blinn_8x8(stbi_uc x, stbi_uc y)
-{
-   unsigned int t = x*y + 128;
-   return (stbi_uc) ((t + (t >>8)) >> 8);
-}
-
-static stbi_uc *load_jpeg_image(stbi__jpeg *z, int *out_x, int *out_y, int *comp, int req_comp)
-{
-   int n, decode_n, is_rgb;
-   z->s->img_n = 0; // make stbi__cleanup_jpeg safe
-
-   // validate req_comp
-   if (req_comp < 0 || req_comp > 4) return stbi__errpuc("bad req_comp", "Internal error");
-
-   // load a jpeg image from whichever source, but leave in YCbCr format
-   if (!stbi__decode_jpeg_image(z)) { stbi__cleanup_jpeg(z); return NULL; }
-
-   // determine actual number of components to generate
-   n = req_comp ? req_comp : z->s->img_n >= 3 ? 3 : 1;
-
-   is_rgb = z->s->img_n == 3 && (z->rgb == 3 || (z->app14_color_transform == 0 && !z->jfif));
-
-   if (z->s->img_n == 3 && n < 3 && !is_rgb)
-      decode_n = 1;
-   else
-      decode_n = z->s->img_n;
-
-   // nothing to do if no components requested; check this now to avoid
-   // accessing uninitialized coutput[0] later
-   if (decode_n <= 0) { stbi__cleanup_jpeg(z); return NULL; }
-
-   // resample and color-convert
-   {
-      int k;
-      unsigned int i,j;
-      stbi_uc *output;
-      stbi_uc *coutput[4] = { NULL, NULL, NULL, NULL };
-
-      stbi__resample res_comp[4];
-
-      for (k=0; k < decode_n; ++k) {
-         stbi__resample *r = &res_comp[k];
-
-         // allocate line buffer big enough for upsampling off the edges
-         // with upsample factor of 4
-         z->img_comp[k].linebuf = (stbi_uc *) stbi__malloc(z->s->img_x + 3);
-         if (!z->img_comp[k].linebuf) { stbi__cleanup_jpeg(z); return stbi__errpuc("outofmem", "Out of memory"); }
-
-         r->hs      = z->img_h_max / z->img_comp[k].h;
-         r->vs      = z->img_v_max / z->img_comp[k].v;
-         r->ystep   = r->vs >> 1;
-         r->w_lores = (z->s->img_x + r->hs-1) / r->hs;
-         r->ypos    = 0;
-         r->line0   = r->line1 = z->img_comp[k].data;
-
-         if      (r->hs == 1 && r->vs == 1) r->resample = resample_row_1;
-         else if (r->hs == 1 && r->vs == 2) r->resample = stbi__resample_row_v_2;
-         else if (r->hs == 2 && r->vs == 1) r->resample = stbi__resample_row_h_2;
-         else if (r->hs == 2 && r->vs == 2) r->resample = z->resample_row_hv_2_kernel;
-         else                               r->resample = stbi__resample_row_generic;
-      }
-
-      // can't error after this so, this is safe
-      output = (stbi_uc *) stbi__malloc_mad3(n, z->s->img_x, z->s->img_y, 1);
-      if (!output) { stbi__cleanup_jpeg(z); return stbi__errpuc("outofmem", "Out of memory"); }
-
-      // now go ahead and resample
-      for (j=0; j < z->s->img_y; ++j) {
-         stbi_uc *out = output + n * z->s->img_x * j;
-         for (k=0; k < decode_n; ++k) {
-            stbi__resample *r = &res_comp[k];
-            int y_bot = r->ystep >= (r->vs >> 1);
-            coutput[k] = r->resample(z->img_comp[k].linebuf,
-                                     y_bot ? r->line1 : r->line0,
-                                     y_bot ? r->line0 : r->line1,
-                                     r->w_lores, r->hs);
-            if (++r->ystep >= r->vs) {
-               r->ystep = 0;
-               r->line0 = r->line1;
-               if (++r->ypos < z->img_comp[k].y)
-                  r->line1 += z->img_comp[k].w2;
-            }
-         }
-         if (n >= 3) {
-            stbi_uc *y = coutput[0];
-            if (z->s->img_n == 3) {
-               if (is_rgb) {
-                  for (i=0; i < z->s->img_x; ++i) {
-                     out[0] = y[i];
-                     out[1] = coutput[1][i];
-                     out[2] = coutput[2][i];
-                     out[3] = 255;
-                     out += n;
-                  }
-               } else {
-                  z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
-               }
-            } else if (z->s->img_n == 4) {
-               if (z->app14_color_transform == 0) { // CMYK
-                  for (i=0; i < z->s->img_x; ++i) {
-                     stbi_uc m = coutput[3][i];
-                     out[0] = stbi__blinn_8x8(coutput[0][i], m);
-                     out[1] = stbi__blinn_8x8(coutput[1][i], m);
-                     out[2] = stbi__blinn_8x8(coutput[2][i], m);
-                     out[3] = 255;
-                     out += n;
-                  }
-               } else if (z->app14_color_transform == 2) { // YCCK
-                  z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
-                  for (i=0; i < z->s->img_x; ++i) {
-                     stbi_uc m = coutput[3][i];
-                     out[0] = stbi__blinn_8x8(255 - out[0], m);
-                     out[1] = stbi__blinn_8x8(255 - out[1], m);
-                     out[2] = stbi__blinn_8x8(255 - out[2], m);
-                     out += n;
-                  }
-               } else { // YCbCr + alpha?  Ignore the fourth channel for now
-                  z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
-               }
-            } else
-               for (i=0; i < z->s->img_x; ++i) {
-                  out[0] = out[1] = out[2] = y[i];
-                  out[3] = 255; // not used if n==3
-                  out += n;
-               }
-         } else {
-            if (is_rgb) {
-               if (n == 1)
-                  for (i=0; i < z->s->img_x; ++i)
-                     *out++ = stbi__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]);
-               else {
-                  for (i=0; i < z->s->img_x; ++i, out += 2) {
-                     out[0] = stbi__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]);
-                     out[1] = 255;
-                  }
-               }
-            } else if (z->s->img_n == 4 && z->app14_color_transform == 0) {
-               for (i=0; i < z->s->img_x; ++i) {
-                  stbi_uc m = coutput[3][i];
-                  stbi_uc r = stbi__blinn_8x8(coutput[0][i], m);
-                  stbi_uc g = stbi__blinn_8x8(coutput[1][i], m);
-                  stbi_uc b = stbi__blinn_8x8(coutput[2][i], m);
-                  out[0] = stbi__compute_y(r, g, b);
-                  out[1] = 255;
-                  out += n;
-               }
-            } else if (z->s->img_n == 4 && z->app14_color_transform == 2) {
-               for (i=0; i < z->s->img_x; ++i) {
-                  out[0] = stbi__blinn_8x8(255 - coutput[0][i], coutput[3][i]);
-                  out[1] = 255;
-                  out += n;
-               }
-            } else {
-               stbi_uc *y = coutput[0];
-               if (n == 1)
-                  for (i=0; i < z->s->img_x; ++i) out[i] = y[i];
-               else
-                  for (i=0; i < z->s->img_x; ++i) { *out++ = y[i]; *out++ = 255; }
-            }
-         }
-      }
-      stbi__cleanup_jpeg(z);
-      *out_x = z->s->img_x;
-      *out_y = z->s->img_y;
-      if (comp) *comp = z->s->img_n >= 3 ? 3 : 1; // report original components, not output
-      return output;
-   }
-}
-
-static void *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
-{
-   unsigned char* result;
-   stbi__jpeg* j = (stbi__jpeg*) stbi__malloc(sizeof(stbi__jpeg));
-   if (!j) return stbi__errpuc("outofmem", "Out of memory");
-   memset(j, 0, sizeof(stbi__jpeg));
-   STBI_NOTUSED(ri);
-   j->s = s;
-   stbi__setup_jpeg(j);
-   result = load_jpeg_image(j, x,y,comp,req_comp);
-   STBI_FREE(j);
-   return result;
-}
-
-static int stbi__jpeg_test(stbi__context *s)
-{
-   int r;
-   stbi__jpeg* j = (stbi__jpeg*)stbi__malloc(sizeof(stbi__jpeg));
-   if (!j) return stbi__err("outofmem", "Out of memory");
-   memset(j, 0, sizeof(stbi__jpeg));
-   j->s = s;
-   stbi__setup_jpeg(j);
-   r = stbi__decode_jpeg_header(j, STBI__SCAN_type);
-   stbi__rewind(s);
-   STBI_FREE(j);
-   return r;
-}
-
-static int stbi__jpeg_info_raw(stbi__jpeg *j, int *x, int *y, int *comp)
-{
-   if (!stbi__decode_jpeg_header(j, STBI__SCAN_header)) {
-      stbi__rewind( j->s );
-      return 0;
-   }
-   if (x) *x = j->s->img_x;
-   if (y) *y = j->s->img_y;
-   if (comp) *comp = j->s->img_n >= 3 ? 3 : 1;
-   return 1;
-}
-
-static int stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp)
-{
-   int result;
-   stbi__jpeg* j = (stbi__jpeg*) (stbi__malloc(sizeof(stbi__jpeg)));
-   if (!j) return stbi__err("outofmem", "Out of memory");
-   memset(j, 0, sizeof(stbi__jpeg));
-   j->s = s;
-   result = stbi__jpeg_info_raw(j, x, y, comp);
-   STBI_FREE(j);
-   return result;
-}
-#endif
-
-// public domain zlib decode    v0.2  Sean Barrett 2006-11-18
-//    simple implementation
-//      - all input must be provided in an upfront buffer
-//      - all output is written to a single output buffer (can malloc/realloc)
-//    performance
-//      - fast huffman
-
-#ifndef STBI_NO_ZLIB
-
-// fast-way is faster to check than jpeg huffman, but slow way is slower
-#define STBI__ZFAST_BITS  9 // accelerate all cases in default tables
-#define STBI__ZFAST_MASK  ((1 << STBI__ZFAST_BITS) - 1)
-#define STBI__ZNSYMS 288 // number of symbols in literal/length alphabet
-
-// zlib-style huffman encoding
-// (jpegs packs from left, zlib from right, so can't share code)
-typedef struct
-{
-   stbi__uint16 fast[1 << STBI__ZFAST_BITS];
-   stbi__uint16 firstcode[16];
-   int maxcode[17];
-   stbi__uint16 firstsymbol[16];
-   stbi_uc  size[STBI__ZNSYMS];
-   stbi__uint16 value[STBI__ZNSYMS];
-} stbi__zhuffman;
-
-stbi_inline static int stbi__bitreverse16(int n)
-{
-  n = ((n & 0xAAAA) >>  1) | ((n & 0x5555) << 1);
-  n = ((n & 0xCCCC) >>  2) | ((n & 0x3333) << 2);
-  n = ((n & 0xF0F0) >>  4) | ((n & 0x0F0F) << 4);
-  n = ((n & 0xFF00) >>  8) | ((n & 0x00FF) << 8);
-  return n;
-}
-
-stbi_inline static int stbi__bit_reverse(int v, int bits)
-{
-   STBI_ASSERT(bits <= 16);
-   // to bit reverse n bits, reverse 16 and shift
-   // e.g. 11 bits, bit reverse and shift away 5
-   return stbi__bitreverse16(v) >> (16-bits);
-}
-
-static int stbi__zbuild_huffman(stbi__zhuffman *z, const stbi_uc *sizelist, int num)
-{
-   int i,k=0;
-   int code, next_code[16], sizes[17];
-
-   // DEFLATE spec for generating codes
-   memset(sizes, 0, sizeof(sizes));
-   memset(z->fast, 0, sizeof(z->fast));
-   for (i=0; i < num; ++i)
-      ++sizes[sizelist[i]];
-   sizes[0] = 0;
-   for (i=1; i < 16; ++i)
-      if (sizes[i] > (1 << i))
-         return stbi__err("bad sizes", "Corrupt PNG");
-   code = 0;
-   for (i=1; i < 16; ++i) {
-      next_code[i] = code;
-      z->firstcode[i] = (stbi__uint16) code;
-      z->firstsymbol[i] = (stbi__uint16) k;
-      code = (code + sizes[i]);
-      if (sizes[i])
-         if (code-1 >= (1 << i)) return stbi__err("bad codelengths","Corrupt PNG");
-      z->maxcode[i] = code << (16-i); // preshift for inner loop
-      code <<= 1;
-      k += sizes[i];
-   }
-   z->maxcode[16] = 0x10000; // sentinel
-   for (i=0; i < num; ++i) {
-      int s = sizelist[i];
-      if (s) {
-         int c = next_code[s] - z->firstcode[s] + z->firstsymbol[s];
-         stbi__uint16 fastv = (stbi__uint16) ((s << 9) | i);
-         z->size [c] = (stbi_uc     ) s;
-         z->value[c] = (stbi__uint16) i;
-         if (s <= STBI__ZFAST_BITS) {
-            int j = stbi__bit_reverse(next_code[s],s);
-            while (j < (1 << STBI__ZFAST_BITS)) {
-               z->fast[j] = fastv;
-               j += (1 << s);
-            }
-         }
-         ++next_code[s];
-      }
-   }
-   return 1;
-}
-
-// zlib-from-memory implementation for PNG reading
-//    because PNG allows splitting the zlib stream arbitrarily,
-//    and it's annoying structurally to have PNG call ZLIB call PNG,
-//    we require PNG read all the IDATs and combine them into a single
-//    memory buffer
-
-typedef struct
-{
-   stbi_uc *zbuffer, *zbuffer_end;
-   int num_bits;
-   stbi__uint32 code_buffer;
-
-   char *zout;
-   char *zout_start;
-   char *zout_end;
-   int   z_expandable;
-
-   stbi__zhuffman z_length, z_distance;
-} stbi__zbuf;
-
-stbi_inline static int stbi__zeof(stbi__zbuf *z)
-{
-   return (z->zbuffer >= z->zbuffer_end);
-}
-
-stbi_inline static stbi_uc stbi__zget8(stbi__zbuf *z)
-{
-   return stbi__zeof(z) ? 0 : *z->zbuffer++;
-}
-
-static void stbi__fill_bits(stbi__zbuf *z)
-{
-   do {
-      if (z->code_buffer >= (1U << z->num_bits)) {
-        z->zbuffer = z->zbuffer_end;  /* treat this as EOF so we fail. */
-        return;
-      }
-      z->code_buffer |= (unsigned int) stbi__zget8(z) << z->num_bits;
-      z->num_bits += 8;
-   } while (z->num_bits <= 24);
-}
-
-stbi_inline static unsigned int stbi__zreceive(stbi__zbuf *z, int n)
-{
-   unsigned int k;
-   if (z->num_bits < n) stbi__fill_bits(z);
-   k = z->code_buffer & ((1 << n) - 1);
-   z->code_buffer >>= n;
-   z->num_bits -= n;
-   return k;
-}
-
-static int stbi__zhuffman_decode_slowpath(stbi__zbuf *a, stbi__zhuffman *z)
-{
-   int b,s,k;
-   // not resolved by fast table, so compute it the slow way
-   // use jpeg approach, which requires MSbits at top
-   k = stbi__bit_reverse(a->code_buffer, 16);
-   for (s=STBI__ZFAST_BITS+1; ; ++s)
-      if (k < z->maxcode[s])
-         break;
-   if (s >= 16) return -1; // invalid code!
-   // code size is s, so:
-   b = (k >> (16-s)) - z->firstcode[s] + z->firstsymbol[s];
-   if (b >= STBI__ZNSYMS) return -1; // some data was corrupt somewhere!
-   if (z->size[b] != s) return -1;  // was originally an assert, but report failure instead.
-   a->code_buffer >>= s;
-   a->num_bits -= s;
-   return z->value[b];
-}
-
-stbi_inline static int stbi__zhuffman_decode(stbi__zbuf *a, stbi__zhuffman *z)
-{
-   int b,s;
-   if (a->num_bits < 16) {
-      if (stbi__zeof(a)) {
-         return -1;   /* report error for unexpected end of data. */
-      }
-      stbi__fill_bits(a);
-   }
-   b = z->fast[a->code_buffer & STBI__ZFAST_MASK];
-   if (b) {
-      s = b >> 9;
-      a->code_buffer >>= s;
-      a->num_bits -= s;
-      return b & 511;
-   }
-   return stbi__zhuffman_decode_slowpath(a, z);
-}
-
-static int stbi__zexpand(stbi__zbuf *z, char *zout, int n)  // need to make room for n bytes
-{
-   char *q;
-   unsigned int cur, limit, old_limit;
-   z->zout = zout;
-   if (!z->z_expandable) return stbi__err("output buffer limit","Corrupt PNG");
-   cur   = (unsigned int) (z->zout - z->zout_start);
-   limit = old_limit = (unsigned) (z->zout_end - z->zout_start);
-   if (UINT_MAX - cur < (unsigned) n) return stbi__err("outofmem", "Out of memory");
-   while (cur + n > limit) {
-      if(limit > UINT_MAX / 2) return stbi__err("outofmem", "Out of memory");
-      limit *= 2;
-   }
-   q = (char *) STBI_REALLOC_SIZED(z->zout_start, old_limit, limit);
-   STBI_NOTUSED(old_limit);
-   if (q == NULL) return stbi__err("outofmem", "Out of memory");
-   z->zout_start = q;
-   z->zout       = q + cur;
-   z->zout_end   = q + limit;
-   return 1;
-}
-
-static const int stbi__zlength_base[31] = {
-   3,4,5,6,7,8,9,10,11,13,
-   15,17,19,23,27,31,35,43,51,59,
-   67,83,99,115,131,163,195,227,258,0,0 };
-
-static const int stbi__zlength_extra[31]=
-{ 0,0,0,0,0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,0,0,0 };
-
-static const int stbi__zdist_base[32] = { 1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193,
-257,385,513,769,1025,1537,2049,3073,4097,6145,8193,12289,16385,24577,0,0};
-
-static const int stbi__zdist_extra[32] =
-{ 0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13};
-
-static int stbi__parse_huffman_block(stbi__zbuf *a)
-{
-   char *zout = a->zout;
-   for(;;) {
-      int z = stbi__zhuffman_decode(a, &a->z_length);
-      if (z < 256) {
-         if (z < 0) return stbi__err("bad huffman code","Corrupt PNG"); // error in huffman codes
-         if (zout >= a->zout_end) {
-            if (!stbi__zexpand(a, zout, 1)) return 0;
-            zout = a->zout;
-         }
-         *zout++ = (char) z;
-      } else {
-         stbi_uc *p;
-         int len,dist;
-         if (z == 256) {
-            a->zout = zout;
-            return 1;
-         }
-         if (z >= 286) return stbi__err("bad huffman code","Corrupt PNG"); // per DEFLATE, length codes 286 and 287 must not appear in compressed data
-         z -= 257;
-         len = stbi__zlength_base[z];
-         if (stbi__zlength_extra[z]) len += stbi__zreceive(a, stbi__zlength_extra[z]);
-         z = stbi__zhuffman_decode(a, &a->z_distance);
-         if (z < 0 || z >= 30) return stbi__err("bad huffman code","Corrupt PNG"); // per DEFLATE, distance codes 30 and 31 must not appear in compressed data
-         dist = stbi__zdist_base[z];
-         if (stbi__zdist_extra[z]) dist += stbi__zreceive(a, stbi__zdist_extra[z]);
-         if (zout - a->zout_start < dist) return stbi__err("bad dist","Corrupt PNG");
-         if (zout + len > a->zout_end) {
-            if (!stbi__zexpand(a, zout, len)) return 0;
-            zout = a->zout;
-         }
-         p = (stbi_uc *) (zout - dist);
-         if (dist == 1) { // run of one byte; common in images.
-            stbi_uc v = *p;
-            if (len) { do *zout++ = v; while (--len); }
-         } else {
-            if (len) { do *zout++ = *p++; while (--len); }
-         }
-      }
-   }
-}
-
-static int stbi__compute_huffman_codes(stbi__zbuf *a)
-{
-   static const stbi_uc length_dezigzag[19] = { 16,17,18,0,8,7,9,6,10,5,11,4,12,3,13,2,14,1,15 };
-   stbi__zhuffman z_codelength;
-   stbi_uc lencodes[286+32+137];//padding for maximum single op
-   stbi_uc codelength_sizes[19];
-   int i,n;
-
-   int hlit  = stbi__zreceive(a,5) + 257;
-   int hdist = stbi__zreceive(a,5) + 1;
-   int hclen = stbi__zreceive(a,4) + 4;
-   int ntot  = hlit + hdist;
-
-   memset(codelength_sizes, 0, sizeof(codelength_sizes));
-   for (i=0; i < hclen; ++i) {
-      int s = stbi__zreceive(a,3);
-      codelength_sizes[length_dezigzag[i]] = (stbi_uc) s;
-   }
-   if (!stbi__zbuild_huffman(&z_codelength, codelength_sizes, 19)) return 0;
-
-   n = 0;
-   while (n < ntot) {
-      int c = stbi__zhuffman_decode(a, &z_codelength);
-      if (c < 0 || c >= 19) return stbi__err("bad codelengths", "Corrupt PNG");
-      if (c < 16)
-         lencodes[n++] = (stbi_uc) c;
-      else {
-         stbi_uc fill = 0;
-         if (c == 16) {
-            c = stbi__zreceive(a,2)+3;
-            if (n == 0) return stbi__err("bad codelengths", "Corrupt PNG");
-            fill = lencodes[n-1];
-         } else if (c == 17) {
-            c = stbi__zreceive(a,3)+3;
-         } else if (c == 18) {
-            c = stbi__zreceive(a,7)+11;
-         } else {
-            return stbi__err("bad codelengths", "Corrupt PNG");
-         }
-         if (ntot - n < c) return stbi__err("bad codelengths", "Corrupt PNG");
-         memset(lencodes+n, fill, c);
-         n += c;
-      }
-   }
-   if (n != ntot) return stbi__err("bad codelengths","Corrupt PNG");
-   if (!stbi__zbuild_huffman(&a->z_length, lencodes, hlit)) return 0;
-   if (!stbi__zbuild_huffman(&a->z_distance, lencodes+hlit, hdist)) return 0;
-   return 1;
-}
-
-static int stbi__parse_uncompressed_block(stbi__zbuf *a)
-{
-   stbi_uc header[4];
-   int len,nlen,k;
-   if (a->num_bits & 7)
-      stbi__zreceive(a, a->num_bits & 7); // discard
-   // drain the bit-packed data into header
-   k = 0;
-   while (a->num_bits > 0) {
-      header[k++] = (stbi_uc) (a->code_buffer & 255); // suppress MSVC run-time check
-      a->code_buffer >>= 8;
-      a->num_bits -= 8;
-   }
-   if (a->num_bits < 0) return stbi__err("zlib corrupt","Corrupt PNG");
-   // now fill header the normal way
-   while (k < 4)
-      header[k++] = stbi__zget8(a);
-   len  = header[1] * 256 + header[0];
-   nlen = header[3] * 256 + header[2];
-   if (nlen != (len ^ 0xffff)) return stbi__err("zlib corrupt","Corrupt PNG");
-   if (a->zbuffer + len > a->zbuffer_end) return stbi__err("read past buffer","Corrupt PNG");
-   if (a->zout + len > a->zout_end)
-      if (!stbi__zexpand(a, a->zout, len)) return 0;
-   memcpy(a->zout, a->zbuffer, len);
-   a->zbuffer += len;
-   a->zout += len;
-   return 1;
-}
-
-static int stbi__parse_zlib_header(stbi__zbuf *a)
-{
-   int cmf   = stbi__zget8(a);
-   int cm    = cmf & 15;
-   /* int cinfo = cmf >> 4; */
-   int flg   = stbi__zget8(a);
-   if (stbi__zeof(a)) return stbi__err("bad zlib header","Corrupt PNG"); // zlib spec
-   if ((cmf*256+flg) % 31 != 0) return stbi__err("bad zlib header","Corrupt PNG"); // zlib spec
-   if (flg & 32) return stbi__err("no preset dict","Corrupt PNG"); // preset dictionary not allowed in png
-   if (cm != 8) return stbi__err("bad compression","Corrupt PNG"); // DEFLATE required for png
-   // window = 1 << (8 + cinfo)... but who cares, we fully buffer output
-   return 1;
-}
-
-static const stbi_uc stbi__zdefault_length[STBI__ZNSYMS] =
-{
-   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
-   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
-   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
-   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
-   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
-   9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
-   9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
-   9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
-   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8
-};
-static const stbi_uc stbi__zdefault_distance[32] =
-{
-   5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
-};
-/*
-Init algorithm:
-{
-   int i;   // use <= to match clearly with spec
-   for (i=0; i <= 143; ++i)     stbi__zdefault_length[i]   = 8;
-   for (   ; i <= 255; ++i)     stbi__zdefault_length[i]   = 9;
-   for (   ; i <= 279; ++i)     stbi__zdefault_length[i]   = 7;
-   for (   ; i <= 287; ++i)     stbi__zdefault_length[i]   = 8;
-
-   for (i=0; i <=  31; ++i)     stbi__zdefault_distance[i] = 5;
-}
-*/
-
-static int stbi__parse_zlib(stbi__zbuf *a, int parse_header)
-{
-   int final, type;
-   if (parse_header)
-      if (!stbi__parse_zlib_header(a)) return 0;
-   a->num_bits = 0;
-   a->code_buffer = 0;
-   do {
-      final = stbi__zreceive(a,1);
-      type = stbi__zreceive(a,2);
-      if (type == 0) {
-         if (!stbi__parse_uncompressed_block(a)) return 0;
-      } else if (type == 3) {
-         return 0;
-      } else {
-         if (type == 1) {
-            // use fixed code lengths
-            if (!stbi__zbuild_huffman(&a->z_length  , stbi__zdefault_length  , STBI__ZNSYMS)) return 0;
-            if (!stbi__zbuild_huffman(&a->z_distance, stbi__zdefault_distance,  32)) return 0;
-         } else {
-            if (!stbi__compute_huffman_codes(a)) return 0;
-         }
-         if (!stbi__parse_huffman_block(a)) return 0;
-      }
-   } while (!final);
-   return 1;
-}
-
-static int stbi__do_zlib(stbi__zbuf *a, char *obuf, int olen, int exp, int parse_header)
-{
-   a->zout_start = obuf;
-   a->zout       = obuf;
-   a->zout_end   = obuf + olen;
-   a->z_expandable = exp;
-
-   return stbi__parse_zlib(a, parse_header);
-}
-
-STBIDEF char *stbi_zlib_decode_malloc_guesssize(const char *buffer, int len, int initial_size, int *outlen)
-{
-   stbi__zbuf a;
-   char *p = (char *) stbi__malloc(initial_size);
-   if (p == NULL) return NULL;
-   a.zbuffer = (stbi_uc *) buffer;
-   a.zbuffer_end = (stbi_uc *) buffer + len;
-   if (stbi__do_zlib(&a, p, initial_size, 1, 1)) {
-      if (outlen) *outlen = (int) (a.zout - a.zout_start);
-      return a.zout_start;
-   } else {
-      STBI_FREE(a.zout_start);
-      return NULL;
-   }
-}
-
-STBIDEF char *stbi_zlib_decode_malloc(char const *buffer, int len, int *outlen)
-{
-   return stbi_zlib_decode_malloc_guesssize(buffer, len, 16384, outlen);
-}
-
-STBIDEF char *stbi_zlib_decode_malloc_guesssize_headerflag(const char *buffer, int len, int initial_size, int *outlen, int parse_header)
-{
-   stbi__zbuf a;
-   char *p = (char *) stbi__malloc(initial_size);
-   if (p == NULL) return NULL;
-   a.zbuffer = (stbi_uc *) buffer;
-   a.zbuffer_end = (stbi_uc *) buffer + len;
-   if (stbi__do_zlib(&a, p, initial_size, 1, parse_header)) {
-      if (outlen) *outlen = (int) (a.zout - a.zout_start);
-      return a.zout_start;
-   } else {
-      STBI_FREE(a.zout_start);
-      return NULL;
-   }
-}
-
-STBIDEF int stbi_zlib_decode_buffer(char *obuffer, int olen, char const *ibuffer, int ilen)
-{
-   stbi__zbuf a;
-   a.zbuffer = (stbi_uc *) ibuffer;
-   a.zbuffer_end = (stbi_uc *) ibuffer + ilen;
-   if (stbi__do_zlib(&a, obuffer, olen, 0, 1))
-      return (int) (a.zout - a.zout_start);
-   else
-      return -1;
-}
-
-STBIDEF char *stbi_zlib_decode_noheader_malloc(char const *buffer, int len, int *outlen)
-{
-   stbi__zbuf a;
-   char *p = (char *) stbi__malloc(16384);
-   if (p == NULL) return NULL;
-   a.zbuffer = (stbi_uc *) buffer;
-   a.zbuffer_end = (stbi_uc *) buffer+len;
-   if (stbi__do_zlib(&a, p, 16384, 1, 0)) {
-      if (outlen) *outlen = (int) (a.zout - a.zout_start);
-      return a.zout_start;
-   } else {
-      STBI_FREE(a.zout_start);
-      return NULL;
-   }
-}
-
-STBIDEF int stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const char *ibuffer, int ilen)
-{
-   stbi__zbuf a;
-   a.zbuffer = (stbi_uc *) ibuffer;
-   a.zbuffer_end = (stbi_uc *) ibuffer + ilen;
-   if (stbi__do_zlib(&a, obuffer, olen, 0, 0))
-      return (int) (a.zout - a.zout_start);
-   else
-      return -1;
-}
-#endif
-
-// public domain "baseline" PNG decoder   v0.10  Sean Barrett 2006-11-18
-//    simple implementation
-//      - only 8-bit samples
-//      - no CRC checking
-//      - allocates lots of intermediate memory
-//        - avoids problem of streaming data between subsystems
-//        - avoids explicit window management
-//    performance
-//      - uses stb_zlib, a PD zlib implementation with fast huffman decoding
-
-#ifndef STBI_NO_PNG
-typedef struct
-{
-   stbi__uint32 length;
-   stbi__uint32 type;
-} stbi__pngchunk;
-
-static stbi__pngchunk stbi__get_chunk_header(stbi__context *s)
-{
-   stbi__pngchunk c;
-   c.length = stbi__get32be(s);
-   c.type   = stbi__get32be(s);
-   return c;
-}
-
-static int stbi__check_png_header(stbi__context *s)
-{
-   static const stbi_uc png_sig[8] = { 137,80,78,71,13,10,26,10 };
-   int i;
-   for (i=0; i < 8; ++i)
-      if (stbi__get8(s) != png_sig[i]) return stbi__err("bad png sig","Not a PNG");
-   return 1;
-}
-
-typedef struct
-{
-   stbi__context *s;
-   stbi_uc *idata, *expanded, *out;
-   int depth;
-} stbi__png;
-
-
-enum {
-   STBI__F_none=0,
-   STBI__F_sub=1,
-   STBI__F_up=2,
-   STBI__F_avg=3,
-   STBI__F_paeth=4,
-   // synthetic filters used for first scanline to avoid needing a dummy row of 0s
-   STBI__F_avg_first,
-   STBI__F_paeth_first
-};
-
-static stbi_uc first_row_filter[5] =
-{
-   STBI__F_none,
-   STBI__F_sub,
-   STBI__F_none,
-   STBI__F_avg_first,
-   STBI__F_paeth_first
-};
-
-static int stbi__paeth(int a, int b, int c)
-{
-   int p = a + b - c;
-   int pa = abs(p-a);
-   int pb = abs(p-b);
-   int pc = abs(p-c);
-   if (pa <= pb && pa <= pc) return a;
-   if (pb <= pc) return b;
-   return c;
-}
-
-static const stbi_uc stbi__depth_scale_table[9] = { 0, 0xff, 0x55, 0, 0x11, 0,0,0, 0x01 };
-
-// create the png data from post-deflated data
-static int stbi__create_png_image_raw(stbi__png *a, stbi_uc *raw, stbi__uint32 raw_len, int out_n, stbi__uint32 x, stbi__uint32 y, int depth, int color)
-{
-   int bytes = (depth == 16? 2 : 1);
-   stbi__context *s = a->s;
-   stbi__uint32 i,j,stride = x*out_n*bytes;
-   stbi__uint32 img_len, img_width_bytes;
-   int k;
-   int img_n = s->img_n; // copy it into a local for later
-
-   int output_bytes = out_n*bytes;
-   int filter_bytes = img_n*bytes;
-   int width = x;
-
-   STBI_ASSERT(out_n == s->img_n || out_n == s->img_n+1);
-   a->out = (stbi_uc *) stbi__malloc_mad3(x, y, output_bytes, 0); // extra bytes to write off the end into
-   if (!a->out) return stbi__err("outofmem", "Out of memory");
-
-   if (!stbi__mad3sizes_valid(img_n, x, depth, 7)) return stbi__err("too large", "Corrupt PNG");
-   img_width_bytes = (((img_n * x * depth) + 7) >> 3);
-   img_len = (img_width_bytes + 1) * y;
-
-   // we used to check for exact match between raw_len and img_len on non-interlaced PNGs,
-   // but issue #276 reported a PNG in the wild that had extra data at the end (all zeros),
-   // so just check for raw_len < img_len always.
-   if (raw_len < img_len) return stbi__err("not enough pixels","Corrupt PNG");
-
-   for (j=0; j < y; ++j) {
-      stbi_uc *cur = a->out + stride*j;
-      stbi_uc *prior;
-      int filter = *raw++;
-
-      if (filter > 4)
-         return stbi__err("invalid filter","Corrupt PNG");
-
-      if (depth < 8) {
-         if (img_width_bytes > x) return stbi__err("invalid width","Corrupt PNG");
-         cur += x*out_n - img_width_bytes; // store output to the rightmost img_len bytes, so we can decode in place
-         filter_bytes = 1;
-         width = img_width_bytes;
-      }
-      prior = cur - stride; // bugfix: need to compute this after 'cur +=' computation above
-
-      // if first row, use special filter that doesn't sample previous row
-      if (j == 0) filter = first_row_filter[filter];
-
-      // handle first byte explicitly
-      for (k=0; k < filter_bytes; ++k) {
-         switch (filter) {
-            case STBI__F_none       : cur[k] = raw[k]; break;
-            case STBI__F_sub        : cur[k] = raw[k]; break;
-            case STBI__F_up         : cur[k] = STBI__BYTECAST(raw[k] + prior[k]); break;
-            case STBI__F_avg        : cur[k] = STBI__BYTECAST(raw[k] + (prior[k]>>1)); break;
-            case STBI__F_paeth      : cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(0,prior[k],0)); break;
-            case STBI__F_avg_first  : cur[k] = raw[k]; break;
-            case STBI__F_paeth_first: cur[k] = raw[k]; break;
-         }
-      }
-
-      if (depth == 8) {
-         if (img_n != out_n)
-            cur[img_n] = 255; // first pixel
-         raw += img_n;
-         cur += out_n;
-         prior += out_n;
-      } else if (depth == 16) {
-         if (img_n != out_n) {
-            cur[filter_bytes]   = 255; // first pixel top byte
-            cur[filter_bytes+1] = 255; // first pixel bottom byte
-         }
-         raw += filter_bytes;
-         cur += output_bytes;
-         prior += output_bytes;
-      } else {
-         raw += 1;
-         cur += 1;
-         prior += 1;
-      }
-
-      // this is a little gross, so that we don't switch per-pixel or per-component
-      if (depth < 8 || img_n == out_n) {
-         int nk = (width - 1)*filter_bytes;
-         #define STBI__CASE(f) \
-             case f:     \
-                for (k=0; k < nk; ++k)
-         switch (filter) {
-            // "none" filter turns into a memcpy here; make that explicit.
-            case STBI__F_none:         memcpy(cur, raw, nk); break;
-            STBI__CASE(STBI__F_sub)          { cur[k] = STBI__BYTECAST(raw[k] + cur[k-filter_bytes]); } break;
-            STBI__CASE(STBI__F_up)           { cur[k] = STBI__BYTECAST(raw[k] + prior[k]); } break;
-            STBI__CASE(STBI__F_avg)          { cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k-filter_bytes])>>1)); } break;
-            STBI__CASE(STBI__F_paeth)        { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k-filter_bytes],prior[k],prior[k-filter_bytes])); } break;
-            STBI__CASE(STBI__F_avg_first)    { cur[k] = STBI__BYTECAST(raw[k] + (cur[k-filter_bytes] >> 1)); } break;
-            STBI__CASE(STBI__F_paeth_first)  { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k-filter_bytes],0,0)); } break;
-         }
-         #undef STBI__CASE
-         raw += nk;
-      } else {
-         STBI_ASSERT(img_n+1 == out_n);
-         #define STBI__CASE(f) \
-             case f:     \
-                for (i=x-1; i >= 1; --i, cur[filter_bytes]=255,raw+=filter_bytes,cur+=output_bytes,prior+=output_bytes) \
-                   for (k=0; k < filter_bytes; ++k)
-         switch (filter) {
-            STBI__CASE(STBI__F_none)         { cur[k] = raw[k]; } break;
-            STBI__CASE(STBI__F_sub)          { cur[k] = STBI__BYTECAST(raw[k] + cur[k- output_bytes]); } break;
-            STBI__CASE(STBI__F_up)           { cur[k] = STBI__BYTECAST(raw[k] + prior[k]); } break;
-            STBI__CASE(STBI__F_avg)          { cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k- output_bytes])>>1)); } break;
-            STBI__CASE(STBI__F_paeth)        { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k- output_bytes],prior[k],prior[k- output_bytes])); } break;
-            STBI__CASE(STBI__F_avg_first)    { cur[k] = STBI__BYTECAST(raw[k] + (cur[k- output_bytes] >> 1)); } break;
-            STBI__CASE(STBI__F_paeth_first)  { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k- output_bytes],0,0)); } break;
-         }
-         #undef STBI__CASE
-
-         // the loop above sets the high byte of the pixels' alpha, but for
-         // 16 bit png files we also need the low byte set. we'll do that here.
-         if (depth == 16) {
-            cur = a->out + stride*j; // start at the beginning of the row again
-            for (i=0; i < x; ++i,cur+=output_bytes) {
-               cur[filter_bytes+1] = 255;
-            }
-         }
-      }
-   }
-
-   // we make a separate pass to expand bits to pixels; for performance,
-   // this could run two scanlines behind the above code, so it won't
-   // intefere with filtering but will still be in the cache.
-   if (depth < 8) {
-      for (j=0; j < y; ++j) {
-         stbi_uc *cur = a->out + stride*j;
-         stbi_uc *in  = a->out + stride*j + x*out_n - img_width_bytes;
-         // unpack 1/2/4-bit into a 8-bit buffer. allows us to keep the common 8-bit path optimal at minimal cost for 1/2/4-bit
-         // png guarante byte alignment, if width is not multiple of 8/4/2 we'll decode dummy trailing data that will be skipped in the later loop
-         stbi_uc scale = (color == 0) ? stbi__depth_scale_table[depth] : 1; // scale grayscale values to 0..255 range
-
-         // note that the final byte might overshoot and write more data than desired.
-         // we can allocate enough data that this never writes out of memory, but it
-         // could also overwrite the next scanline. can it overwrite non-empty data
-         // on the next scanline? yes, consider 1-pixel-wide scanlines with 1-bit-per-pixel.
-         // so we need to explicitly clamp the final ones
-
-         if (depth == 4) {
-            for (k=x*img_n; k >= 2; k-=2, ++in) {
-               *cur++ = scale * ((*in >> 4)       );
-               *cur++ = scale * ((*in     ) & 0x0f);
-            }
-            if (k > 0) *cur++ = scale * ((*in >> 4)       );
-         } else if (depth == 2) {
-            for (k=x*img_n; k >= 4; k-=4, ++in) {
-               *cur++ = scale * ((*in >> 6)       );
-               *cur++ = scale * ((*in >> 4) & 0x03);
-               *cur++ = scale * ((*in >> 2) & 0x03);
-               *cur++ = scale * ((*in     ) & 0x03);
-            }
-            if (k > 0) *cur++ = scale * ((*in >> 6)       );
-            if (k > 1) *cur++ = scale * ((*in >> 4) & 0x03);
-            if (k > 2) *cur++ = scale * ((*in >> 2) & 0x03);
-         } else if (depth == 1) {
-            for (k=x*img_n; k >= 8; k-=8, ++in) {
-               *cur++ = scale * ((*in >> 7)       );
-               *cur++ = scale * ((*in >> 6) & 0x01);
-               *cur++ = scale * ((*in >> 5) & 0x01);
-               *cur++ = scale * ((*in >> 4) & 0x01);
-               *cur++ = scale * ((*in >> 3) & 0x01);
-               *cur++ = scale * ((*in >> 2) & 0x01);
-               *cur++ = scale * ((*in >> 1) & 0x01);
-               *cur++ = scale * ((*in     ) & 0x01);
-            }
-            if (k > 0) *cur++ = scale * ((*in >> 7)       );
-            if (k > 1) *cur++ = scale * ((*in >> 6) & 0x01);
-            if (k > 2) *cur++ = scale * ((*in >> 5) & 0x01);
-            if (k > 3) *cur++ = scale * ((*in >> 4) & 0x01);
-            if (k > 4) *cur++ = scale * ((*in >> 3) & 0x01);
-            if (k > 5) *cur++ = scale * ((*in >> 2) & 0x01);
-            if (k > 6) *cur++ = scale * ((*in >> 1) & 0x01);
-         }
-         if (img_n != out_n) {
-            int q;
-            // insert alpha = 255
-            cur = a->out + stride*j;
-            if (img_n == 1) {
-               for (q=x-1; q >= 0; --q) {
-                  cur[q*2+1] = 255;
-                  cur[q*2+0] = cur[q];
-               }
-            } else {
-               STBI_ASSERT(img_n == 3);
-               for (q=x-1; q >= 0; --q) {
-                  cur[q*4+3] = 255;
-                  cur[q*4+2] = cur[q*3+2];
-                  cur[q*4+1] = cur[q*3+1];
-                  cur[q*4+0] = cur[q*3+0];
-               }
-            }
-         }
-      }
-   } else if (depth == 16) {
-      // force the image data from big-endian to platform-native.
-      // this is done in a separate pass due to the decoding relying
-      // on the data being untouched, but could probably be done
-      // per-line during decode if care is taken.
-      stbi_uc *cur = a->out;
-      stbi__uint16 *cur16 = (stbi__uint16*)cur;
-
-      for(i=0; i < x*y*out_n; ++i,cur16++,cur+=2) {
-         *cur16 = (cur[0] << 8) | cur[1];
-      }
-   }
-
-   return 1;
-}
-
-static int stbi__create_png_image(stbi__png *a, stbi_uc *image_data, stbi__uint32 image_data_len, int out_n, int depth, int color, int interlaced)
-{
-   int bytes = (depth == 16 ? 2 : 1);
-   int out_bytes = out_n * bytes;
-   stbi_uc *final;
-   int p;
-   if (!interlaced)
-      return stbi__create_png_image_raw(a, image_data, image_data_len, out_n, a->s->img_x, a->s->img_y, depth, color);
-
-   // de-interlacing
-   final = (stbi_uc *) stbi__malloc_mad3(a->s->img_x, a->s->img_y, out_bytes, 0);
-   if (!final) return stbi__err("outofmem", "Out of memory");
-   for (p=0; p < 7; ++p) {
-      int xorig[] = { 0,4,0,2,0,1,0 };
-      int yorig[] = { 0,0,4,0,2,0,1 };
-      int xspc[]  = { 8,8,4,4,2,2,1 };
-      int yspc[]  = { 8,8,8,4,4,2,2 };
-      int i,j,x,y;
-      // pass1_x[4] = 0, pass1_x[5] = 1, pass1_x[12] = 1
-      x = (a->s->img_x - xorig[p] + xspc[p]-1) / xspc[p];
-      y = (a->s->img_y - yorig[p] + yspc[p]-1) / yspc[p];
-      if (x && y) {
-         stbi__uint32 img_len = ((((a->s->img_n * x * depth) + 7) >> 3) + 1) * y;
-         if (!stbi__create_png_image_raw(a, image_data, image_data_len, out_n, x, y, depth, color)) {
-            STBI_FREE(final);
-            return 0;
-         }
-         for (j=0; j < y; ++j) {
-            for (i=0; i < x; ++i) {
-               int out_y = j*yspc[p]+yorig[p];
-               int out_x = i*xspc[p]+xorig[p];
-               memcpy(final + out_y*a->s->img_x*out_bytes + out_x*out_bytes,
-                      a->out + (j*x+i)*out_bytes, out_bytes);
-            }
-         }
-         STBI_FREE(a->out);
-         image_data += img_len;
-         image_data_len -= img_len;
-      }
-   }
-   a->out = final;
-
-   return 1;
-}
-
-static int stbi__compute_transparency(stbi__png *z, stbi_uc tc[3], int out_n)
-{
-   stbi__context *s = z->s;
-   stbi__uint32 i, pixel_count = s->img_x * s->img_y;
-   stbi_uc *p = z->out;
-
-   // compute color-based transparency, assuming we've
-   // already got 255 as the alpha value in the output
-   STBI_ASSERT(out_n == 2 || out_n == 4);
-
-   if (out_n == 2) {
-      for (i=0; i < pixel_count; ++i) {
-         p[1] = (p[0] == tc[0] ? 0 : 255);
-         p += 2;
-      }
-   } else {
-      for (i=0; i < pixel_count; ++i) {
-         if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2])
-            p[3] = 0;
-         p += 4;
-      }
-   }
-   return 1;
-}
-
-static int stbi__compute_transparency16(stbi__png *z, stbi__uint16 tc[3], int out_n)
-{
-   stbi__context *s = z->s;
-   stbi__uint32 i, pixel_count = s->img_x * s->img_y;
-   stbi__uint16 *p = (stbi__uint16*) z->out;
-
-   // compute color-based transparency, assuming we've
-   // already got 65535 as the alpha value in the output
-   STBI_ASSERT(out_n == 2 || out_n == 4);
-
-   if (out_n == 2) {
-      for (i = 0; i < pixel_count; ++i) {
-         p[1] = (p[0] == tc[0] ? 0 : 65535);
-         p += 2;
-      }
-   } else {
-      for (i = 0; i < pixel_count; ++i) {
-         if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2])
-            p[3] = 0;
-         p += 4;
-      }
-   }
-   return 1;
-}
-
-static int stbi__expand_png_palette(stbi__png *a, stbi_uc *palette, int len, int pal_img_n)
-{
-   stbi__uint32 i, pixel_count = a->s->img_x * a->s->img_y;
-   stbi_uc *p, *temp_out, *orig = a->out;
-
-   p = (stbi_uc *) stbi__malloc_mad2(pixel_count, pal_img_n, 0);
-   if (p == NULL) return stbi__err("outofmem", "Out of memory");
-
-   // between here and free(out) below, exitting would leak
-   temp_out = p;
-
-   if (pal_img_n == 3) {
-      for (i=0; i < pixel_count; ++i) {
-         int n = orig[i]*4;
-         p[0] = palette[n  ];
-         p[1] = palette[n+1];
-         p[2] = palette[n+2];
-         p += 3;
-      }
-   } else {
-      for (i=0; i < pixel_count; ++i) {
-         int n = orig[i]*4;
-         p[0] = palette[n  ];
-         p[1] = palette[n+1];
-         p[2] = palette[n+2];
-         p[3] = palette[n+3];
-         p += 4;
-      }
-   }
-   STBI_FREE(a->out);
-   a->out = temp_out;
-
-   STBI_NOTUSED(len);
-
-   return 1;
-}
-
-static int stbi__unpremultiply_on_load_global = 0;
-static int stbi__de_iphone_flag_global = 0;
-
-STBIDEF void stbi_set_unpremultiply_on_load(int flag_true_if_should_unpremultiply)
-{
-   stbi__unpremultiply_on_load_global = flag_true_if_should_unpremultiply;
-}
-
-STBIDEF void stbi_convert_iphone_png_to_rgb(int flag_true_if_should_convert)
-{
-   stbi__de_iphone_flag_global = flag_true_if_should_convert;
-}
-
-#ifndef STBI_THREAD_LOCAL
-#define stbi__unpremultiply_on_load  stbi__unpremultiply_on_load_global
-#define stbi__de_iphone_flag  stbi__de_iphone_flag_global
-#else
-static STBI_THREAD_LOCAL int stbi__unpremultiply_on_load_local, stbi__unpremultiply_on_load_set;
-static STBI_THREAD_LOCAL int stbi__de_iphone_flag_local, stbi__de_iphone_flag_set;
-
-STBIDEF void stbi_set_unpremultiply_on_load_thread(int flag_true_if_should_unpremultiply)
-{
-   stbi__unpremultiply_on_load_local = flag_true_if_should_unpremultiply;
-   stbi__unpremultiply_on_load_set = 1;
-}
-
-STBIDEF void stbi_convert_iphone_png_to_rgb_thread(int flag_true_if_should_convert)
-{
-   stbi__de_iphone_flag_local = flag_true_if_should_convert;
-   stbi__de_iphone_flag_set = 1;
-}
-
-#define stbi__unpremultiply_on_load  (stbi__unpremultiply_on_load_set           \
-                                       ? stbi__unpremultiply_on_load_local      \
-                                       : stbi__unpremultiply_on_load_global)
-#define stbi__de_iphone_flag  (stbi__de_iphone_flag_set                         \
-                                ? stbi__de_iphone_flag_local                    \
-                                : stbi__de_iphone_flag_global)
-#endif // STBI_THREAD_LOCAL
-
-static void stbi__de_iphone(stbi__png *z)
-{
-   stbi__context *s = z->s;
-   stbi__uint32 i, pixel_count = s->img_x * s->img_y;
-   stbi_uc *p = z->out;
-
-   if (s->img_out_n == 3) {  // convert bgr to rgb
-      for (i=0; i < pixel_count; ++i) {
-         stbi_uc t = p[0];
-         p[0] = p[2];
-         p[2] = t;
-         p += 3;
-      }
-   } else {
-      STBI_ASSERT(s->img_out_n == 4);
-      if (stbi__unpremultiply_on_load) {
-         // convert bgr to rgb and unpremultiply
-         for (i=0; i < pixel_count; ++i) {
-            stbi_uc a = p[3];
-            stbi_uc t = p[0];
-            if (a) {
-               stbi_uc half = a / 2;
-               p[0] = (p[2] * 255 + half) / a;
-               p[1] = (p[1] * 255 + half) / a;
-               p[2] = ( t   * 255 + half) / a;
-            } else {
-               p[0] = p[2];
-               p[2] = t;
-            }
-            p += 4;
-         }
-      } else {
-         // convert bgr to rgb
-         for (i=0; i < pixel_count; ++i) {
-            stbi_uc t = p[0];
-            p[0] = p[2];
-            p[2] = t;
-            p += 4;
-         }
-      }
-   }
-}
-
-#define STBI__PNG_TYPE(a,b,c,d)  (((unsigned) (a) << 24) + ((unsigned) (b) << 16) + ((unsigned) (c) << 8) + (unsigned) (d))
-
-static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
-{
-   stbi_uc palette[1024], pal_img_n=0;
-   stbi_uc has_trans=0, tc[3]={0};
-   stbi__uint16 tc16[3];
-   stbi__uint32 ioff=0, idata_limit=0, i, pal_len=0;
-   int first=1,k,interlace=0, color=0, is_iphone=0;
-   stbi__context *s = z->s;
-
-   z->expanded = NULL;
-   z->idata = NULL;
-   z->out = NULL;
-
-   if (!stbi__check_png_header(s)) return 0;
-
-   if (scan == STBI__SCAN_type) return 1;
-
-   for (;;) {
-      stbi__pngchunk c = stbi__get_chunk_header(s);
-      switch (c.type) {
-         case STBI__PNG_TYPE('C','g','B','I'):
-            is_iphone = 1;
-            stbi__skip(s, c.length);
-            break;
-         case STBI__PNG_TYPE('I','H','D','R'): {
-            int comp,filter;
-            if (!first) return stbi__err("multiple IHDR","Corrupt PNG");
-            first = 0;
-            if (c.length != 13) return stbi__err("bad IHDR len","Corrupt PNG");
-            s->img_x = stbi__get32be(s);
-            s->img_y = stbi__get32be(s);
-            if (s->img_y > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)");
-            if (s->img_x > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)");
-            z->depth = stbi__get8(s);  if (z->depth != 1 && z->depth != 2 && z->depth != 4 && z->depth != 8 && z->depth != 16)  return stbi__err("1/2/4/8/16-bit only","PNG not supported: 1/2/4/8/16-bit only");
-            color = stbi__get8(s);  if (color > 6)         return stbi__err("bad ctype","Corrupt PNG");
-            if (color == 3 && z->depth == 16)                  return stbi__err("bad ctype","Corrupt PNG");
-            if (color == 3) pal_img_n = 3; else if (color & 1) return stbi__err("bad ctype","Corrupt PNG");
-            comp  = stbi__get8(s);  if (comp) return stbi__err("bad comp method","Corrupt PNG");
-            filter= stbi__get8(s);  if (filter) return stbi__err("bad filter method","Corrupt PNG");
-            interlace = stbi__get8(s); if (interlace>1) return stbi__err("bad interlace method","Corrupt PNG");
-            if (!s->img_x || !s->img_y) return stbi__err("0-pixel image","Corrupt PNG");
-            if (!pal_img_n) {
-               s->img_n = (color & 2 ? 3 : 1) + (color & 4 ? 1 : 0);
-               if ((1 << 30) / s->img_x / s->img_n < s->img_y) return stbi__err("too large", "Image too large to decode");
-            } else {
-               // if paletted, then pal_n is our final components, and
-               // img_n is # components to decompress/filter.
-               s->img_n = 1;
-               if ((1 << 30) / s->img_x / 4 < s->img_y) return stbi__err("too large","Corrupt PNG");
-            }
-            // even with SCAN_header, have to scan to see if we have a tRNS
-            break;
-         }
-
-         case STBI__PNG_TYPE('P','L','T','E'):  {
-            if (first) return stbi__err("first not IHDR", "Corrupt PNG");
-            if (c.length > 256*3) return stbi__err("invalid PLTE","Corrupt PNG");
-            pal_len = c.length / 3;
-            if (pal_len * 3 != c.length) return stbi__err("invalid PLTE","Corrupt PNG");
-            for (i=0; i < pal_len; ++i) {
-               palette[i*4+0] = stbi__get8(s);
-               palette[i*4+1] = stbi__get8(s);
-               palette[i*4+2] = stbi__get8(s);
-               palette[i*4+3] = 255;
-            }
-            break;
-         }
-
-         case STBI__PNG_TYPE('t','R','N','S'): {
-            if (first) return stbi__err("first not IHDR", "Corrupt PNG");
-            if (z->idata) return stbi__err("tRNS after IDAT","Corrupt PNG");
-            if (pal_img_n) {
-               if (scan == STBI__SCAN_header) { s->img_n = 4; return 1; }
-               if (pal_len == 0) return stbi__err("tRNS before PLTE","Corrupt PNG");
-               if (c.length > pal_len) return stbi__err("bad tRNS len","Corrupt PNG");
-               pal_img_n = 4;
-               for (i=0; i < c.length; ++i)
-                  palette[i*4+3] = stbi__get8(s);
-            } else {
-               if (!(s->img_n & 1)) return stbi__err("tRNS with alpha","Corrupt PNG");
-               if (c.length != (stbi__uint32) s->img_n*2) return stbi__err("bad tRNS len","Corrupt PNG");
-               has_trans = 1;
-               // non-paletted with tRNS = constant alpha. if header-scanning, we can stop now.
-               if (scan == STBI__SCAN_header) { ++s->img_n; return 1; }
-               if (z->depth == 16) {
-                  for (k = 0; k < s->img_n; ++k) tc16[k] = (stbi__uint16)stbi__get16be(s); // copy the values as-is
-               } else {
-                  for (k = 0; k < s->img_n; ++k) tc[k] = (stbi_uc)(stbi__get16be(s) & 255) * stbi__depth_scale_table[z->depth]; // non 8-bit images will be larger
-               }
-            }
-            break;
-         }
-
-         case STBI__PNG_TYPE('I','D','A','T'): {
-            if (first) return stbi__err("first not IHDR", "Corrupt PNG");
-            if (pal_img_n && !pal_len) return stbi__err("no PLTE","Corrupt PNG");
-            if (scan == STBI__SCAN_header) {
-               // header scan definitely stops at first IDAT
-               if (pal_img_n)
-                  s->img_n = pal_img_n;
-               return 1;
-            }
-            if (c.length > (1u << 30)) return stbi__err("IDAT size limit", "IDAT section larger than 2^30 bytes");
-            if ((int)(ioff + c.length) < (int)ioff) return 0;
-            if (ioff + c.length > idata_limit) {
-               stbi__uint32 idata_limit_old = idata_limit;
-               stbi_uc *p;
-               if (idata_limit == 0) idata_limit = c.length > 4096 ? c.length : 4096;
-               while (ioff + c.length > idata_limit)
-                  idata_limit *= 2;
-               STBI_NOTUSED(idata_limit_old);
-               p = (stbi_uc *) STBI_REALLOC_SIZED(z->idata, idata_limit_old, idata_limit); if (p == NULL) return stbi__err("outofmem", "Out of memory");
-               z->idata = p;
-            }
-            if (!stbi__getn(s, z->idata+ioff,c.length)) return stbi__err("outofdata","Corrupt PNG");
-            ioff += c.length;
-            break;
-         }
-
-         case STBI__PNG_TYPE('I','E','N','D'): {
-            stbi__uint32 raw_len, bpl;
-            if (first) return stbi__err("first not IHDR", "Corrupt PNG");
-            if (scan != STBI__SCAN_load) return 1;
-            if (z->idata == NULL) return stbi__err("no IDAT","Corrupt PNG");
-            // initial guess for decoded data size to avoid unnecessary reallocs
-            bpl = (s->img_x * z->depth + 7) / 8; // bytes per line, per component
-            raw_len = bpl * s->img_y * s->img_n /* pixels */ + s->img_y /* filter mode per row */;
-            z->expanded = (stbi_uc *) stbi_zlib_decode_malloc_guesssize_headerflag((char *) z->idata, ioff, raw_len, (int *) &raw_len, !is_iphone);
-            if (z->expanded == NULL) return 0; // zlib should set error
-            STBI_FREE(z->idata); z->idata = NULL;
-            if ((req_comp == s->img_n+1 && req_comp != 3 && !pal_img_n) || has_trans)
-               s->img_out_n = s->img_n+1;
-            else
-               s->img_out_n = s->img_n;
-            if (!stbi__create_png_image(z, z->expanded, raw_len, s->img_out_n, z->depth, color, interlace)) return 0;
-            if (has_trans) {
-               if (z->depth == 16) {
-                  if (!stbi__compute_transparency16(z, tc16, s->img_out_n)) return 0;
-               } else {
-                  if (!stbi__compute_transparency(z, tc, s->img_out_n)) return 0;
-               }
-            }
-            if (is_iphone && stbi__de_iphone_flag && s->img_out_n > 2)
-               stbi__de_iphone(z);
-            if (pal_img_n) {
-               // pal_img_n == 3 or 4
-               s->img_n = pal_img_n; // record the actual colors we had
-               s->img_out_n = pal_img_n;
-               if (req_comp >= 3) s->img_out_n = req_comp;
-               if (!stbi__expand_png_palette(z, palette, pal_len, s->img_out_n))
-                  return 0;
-            } else if (has_trans) {
-               // non-paletted image with tRNS -> source image has (constant) alpha
-               ++s->img_n;
-            }
-            STBI_FREE(z->expanded); z->expanded = NULL;
-            // end of PNG chunk, read and skip CRC
-            stbi__get32be(s);
-            return 1;
-         }
-
-         default:
-            // if critical, fail
-            if (first) return stbi__err("first not IHDR", "Corrupt PNG");
-            if ((c.type & (1 << 29)) == 0) {
-               #ifndef STBI_NO_FAILURE_STRINGS
-               // not threadsafe
-               static char invalid_chunk[] = "XXXX PNG chunk not known";
-               invalid_chunk[0] = STBI__BYTECAST(c.type >> 24);
-               invalid_chunk[1] = STBI__BYTECAST(c.type >> 16);
-               invalid_chunk[2] = STBI__BYTECAST(c.type >>  8);
-               invalid_chunk[3] = STBI__BYTECAST(c.type >>  0);
-               #endif
-               return stbi__err(invalid_chunk, "PNG not supported: unknown PNG chunk type");
-            }
-            stbi__skip(s, c.length);
-            break;
-      }
-      // end of PNG chunk, read and skip CRC
-      stbi__get32be(s);
-   }
-}
-
-static void *stbi__do_png(stbi__png *p, int *x, int *y, int *n, int req_comp, stbi__result_info *ri)
-{
-   void *result=NULL;
-   if (req_comp < 0 || req_comp > 4) return stbi__errpuc("bad req_comp", "Internal error");
-   if (stbi__parse_png_file(p, STBI__SCAN_load, req_comp)) {
-      if (p->depth <= 8)
-         ri->bits_per_channel = 8;
-      else if (p->depth == 16)
-         ri->bits_per_channel = 16;
-      else
-         return stbi__errpuc("bad bits_per_channel", "PNG not supported: unsupported color depth");
-      result = p->out;
-      p->out = NULL;
-      if (req_comp && req_comp != p->s->img_out_n) {
-         if (ri->bits_per_channel == 8)
-            result = stbi__convert_format((unsigned char *) result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y);
-         else
-            result = stbi__convert_format16((stbi__uint16 *) result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y);
-         p->s->img_out_n = req_comp;
-         if (result == NULL) return result;
-      }
-      *x = p->s->img_x;
-      *y = p->s->img_y;
-      if (n) *n = p->s->img_n;
-   }
-   STBI_FREE(p->out);      p->out      = NULL;
-   STBI_FREE(p->expanded); p->expanded = NULL;
-   STBI_FREE(p->idata);    p->idata    = NULL;
-
-   return result;
-}
-
-static void *stbi__png_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
-{
-   stbi__png p;
-   p.s = s;
-   return stbi__do_png(&p, x,y,comp,req_comp, ri);
-}
-
-static int stbi__png_test(stbi__context *s)
-{
-   int r;
-   r = stbi__check_png_header(s);
-   stbi__rewind(s);
-   return r;
-}
-
-static int stbi__png_info_raw(stbi__png *p, int *x, int *y, int *comp)
-{
-   if (!stbi__parse_png_file(p, STBI__SCAN_header, 0)) {
-      stbi__rewind( p->s );
-      return 0;
-   }
-   if (x) *x = p->s->img_x;
-   if (y) *y = p->s->img_y;
-   if (comp) *comp = p->s->img_n;
-   return 1;
-}
-
-static int stbi__png_info(stbi__context *s, int *x, int *y, int *comp)
-{
-   stbi__png p;
-   p.s = s;
-   return stbi__png_info_raw(&p, x, y, comp);
-}
-
-static int stbi__png_is16(stbi__context *s)
-{
-   stbi__png p;
-   p.s = s;
-   if (!stbi__png_info_raw(&p, NULL, NULL, NULL))
-	   return 0;
-   if (p.depth != 16) {
-      stbi__rewind(p.s);
-      return 0;
-   }
-   return 1;
-}
-#endif
-
-// Microsoft/Windows BMP image
-
-#ifndef STBI_NO_BMP
-static int stbi__bmp_test_raw(stbi__context *s)
-{
-   int r;
-   int sz;
-   if (stbi__get8(s) != 'B') return 0;
-   if (stbi__get8(s) != 'M') return 0;
-   stbi__get32le(s); // discard filesize
-   stbi__get16le(s); // discard reserved
-   stbi__get16le(s); // discard reserved
-   stbi__get32le(s); // discard data offset
-   sz = stbi__get32le(s);
-   r = (sz == 12 || sz == 40 || sz == 56 || sz == 108 || sz == 124);
-   return r;
-}
-
-static int stbi__bmp_test(stbi__context *s)
-{
-   int r = stbi__bmp_test_raw(s);
-   stbi__rewind(s);
-   return r;
-}
-
-
-// returns 0..31 for the highest set bit
-static int stbi__high_bit(unsigned int z)
-{
-   int n=0;
-   if (z == 0) return -1;
-   if (z >= 0x10000) { n += 16; z >>= 16; }
-   if (z >= 0x00100) { n +=  8; z >>=  8; }
-   if (z >= 0x00010) { n +=  4; z >>=  4; }
-   if (z >= 0x00004) { n +=  2; z >>=  2; }
-   if (z >= 0x00002) { n +=  1;/* >>=  1;*/ }
-   return n;
-}
-
-static int stbi__bitcount(unsigned int a)
-{
-   a = (a & 0x55555555) + ((a >>  1) & 0x55555555); // max 2
-   a = (a & 0x33333333) + ((a >>  2) & 0x33333333); // max 4
-   a = (a + (a >> 4)) & 0x0f0f0f0f; // max 8 per 4, now 8 bits
-   a = (a + (a >> 8)); // max 16 per 8 bits
-   a = (a + (a >> 16)); // max 32 per 8 bits
-   return a & 0xff;
-}
-
-// extract an arbitrarily-aligned N-bit value (N=bits)
-// from v, and then make it 8-bits long and fractionally
-// extend it to full full range.
-static int stbi__shiftsigned(unsigned int v, int shift, int bits)
-{
-   static unsigned int mul_table[9] = {
-      0,
-      0xff/*0b11111111*/, 0x55/*0b01010101*/, 0x49/*0b01001001*/, 0x11/*0b00010001*/,
-      0x21/*0b00100001*/, 0x41/*0b01000001*/, 0x81/*0b10000001*/, 0x01/*0b00000001*/,
-   };
-   static unsigned int shift_table[9] = {
-      0, 0,0,1,0,2,4,6,0,
-   };
-   if (shift < 0)
-      v <<= -shift;
-   else
-      v >>= shift;
-   STBI_ASSERT(v < 256);
-   v >>= (8-bits);
-   STBI_ASSERT(bits >= 0 && bits <= 8);
-   return (int) ((unsigned) v * mul_table[bits]) >> shift_table[bits];
-}
-
-typedef struct
-{
-   int bpp, offset, hsz;
-   unsigned int mr,mg,mb,ma, all_a;
-   int extra_read;
-} stbi__bmp_data;
-
-static int stbi__bmp_set_mask_defaults(stbi__bmp_data *info, int compress)
-{
-   // BI_BITFIELDS specifies masks explicitly, don't override
-   if (compress == 3)
-      return 1;
-
-   if (compress == 0) {
-      if (info->bpp == 16) {
-         info->mr = 31u << 10;
-         info->mg = 31u <<  5;
-         info->mb = 31u <<  0;
-      } else if (info->bpp == 32) {
-         info->mr = 0xffu << 16;
-         info->mg = 0xffu <<  8;
-         info->mb = 0xffu <<  0;
-         info->ma = 0xffu << 24;
-         info->all_a = 0; // if all_a is 0 at end, then we loaded alpha channel but it was all 0
-      } else {
-         // otherwise, use defaults, which is all-0
-         info->mr = info->mg = info->mb = info->ma = 0;
-      }
-      return 1;
-   }
-   return 0; // error
-}
-
-static void *stbi__bmp_parse_header(stbi__context *s, stbi__bmp_data *info)
-{
-   int hsz;
-   if (stbi__get8(s) != 'B' || stbi__get8(s) != 'M') return stbi__errpuc("not BMP", "Corrupt BMP");
-   stbi__get32le(s); // discard filesize
-   stbi__get16le(s); // discard reserved
-   stbi__get16le(s); // discard reserved
-   info->offset = stbi__get32le(s);
-   info->hsz = hsz = stbi__get32le(s);
-   info->mr = info->mg = info->mb = info->ma = 0;
-   info->extra_read = 14;
-
-   if (info->offset < 0) return stbi__errpuc("bad BMP", "bad BMP");
-
-   if (hsz != 12 && hsz != 40 && hsz != 56 && hsz != 108 && hsz != 124) return stbi__errpuc("unknown BMP", "BMP type not supported: unknown");
-   if (hsz == 12) {
-      s->img_x = stbi__get16le(s);
-      s->img_y = stbi__get16le(s);
-   } else {
-      s->img_x = stbi__get32le(s);
-      s->img_y = stbi__get32le(s);
-   }
-   if (stbi__get16le(s) != 1) return stbi__errpuc("bad BMP", "bad BMP");
-   info->bpp = stbi__get16le(s);
-   if (hsz != 12) {
-      int compress = stbi__get32le(s);
-      if (compress == 1 || compress == 2) return stbi__errpuc("BMP RLE", "BMP type not supported: RLE");
-      if (compress >= 4) return stbi__errpuc("BMP JPEG/PNG", "BMP type not supported: unsupported compression"); // this includes PNG/JPEG modes
-      if (compress == 3 && info->bpp != 16 && info->bpp != 32) return stbi__errpuc("bad BMP", "bad BMP"); // bitfields requires 16 or 32 bits/pixel
-      stbi__get32le(s); // discard sizeof
-      stbi__get32le(s); // discard hres
-      stbi__get32le(s); // discard vres
-      stbi__get32le(s); // discard colorsused
-      stbi__get32le(s); // discard max important
-      if (hsz == 40 || hsz == 56) {
-         if (hsz == 56) {
-            stbi__get32le(s);
-            stbi__get32le(s);
-            stbi__get32le(s);
-            stbi__get32le(s);
-         }
-         if (info->bpp == 16 || info->bpp == 32) {
-            if (compress == 0) {
-               stbi__bmp_set_mask_defaults(info, compress);
-            } else if (compress == 3) {
-               info->mr = stbi__get32le(s);
-               info->mg = stbi__get32le(s);
-               info->mb = stbi__get32le(s);
-               info->extra_read += 12;
-               // not documented, but generated by photoshop and handled by mspaint
-               if (info->mr == info->mg && info->mg == info->mb) {
-                  // ?!?!?
-                  return stbi__errpuc("bad BMP", "bad BMP");
-               }
-            } else
-               return stbi__errpuc("bad BMP", "bad BMP");
-         }
-      } else {
-         // V4/V5 header
-         int i;
-         if (hsz != 108 && hsz != 124)
-            return stbi__errpuc("bad BMP", "bad BMP");
-         info->mr = stbi__get32le(s);
-         info->mg = stbi__get32le(s);
-         info->mb = stbi__get32le(s);
-         info->ma = stbi__get32le(s);
-         if (compress != 3) // override mr/mg/mb unless in BI_BITFIELDS mode, as per docs
-            stbi__bmp_set_mask_defaults(info, compress);
-         stbi__get32le(s); // discard color space
-         for (i=0; i < 12; ++i)
-            stbi__get32le(s); // discard color space parameters
-         if (hsz == 124) {
-            stbi__get32le(s); // discard rendering intent
-            stbi__get32le(s); // discard offset of profile data
-            stbi__get32le(s); // discard size of profile data
-            stbi__get32le(s); // discard reserved
-         }
-      }
-   }
-   return (void *) 1;
-}
-
-
-static void *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
-{
-   stbi_uc *out;
-   unsigned int mr=0,mg=0,mb=0,ma=0, all_a;
-   stbi_uc pal[256][4];
-   int psize=0,i,j,width;
-   int flip_vertically, pad, target;
-   stbi__bmp_data info;
-   STBI_NOTUSED(ri);
-
-   info.all_a = 255;
-   if (stbi__bmp_parse_header(s, &info) == NULL)
-      return NULL; // error code already set
-
-   flip_vertically = ((int) s->img_y) > 0;
-   s->img_y = abs((int) s->img_y);
-
-   if (s->img_y > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
-   if (s->img_x > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
-
-   mr = info.mr;
-   mg = info.mg;
-   mb = info.mb;
-   ma = info.ma;
-   all_a = info.all_a;
-
-   if (info.hsz == 12) {
-      if (info.bpp < 24)
-         psize = (info.offset - info.extra_read - 24) / 3;
-   } else {
-      if (info.bpp < 16)
-         psize = (info.offset - info.extra_read - info.hsz) >> 2;
-   }
-   if (psize == 0) {
-      // accept some number of extra bytes after the header, but if the offset points either to before
-      // the header ends or implies a large amount of extra data, reject the file as malformed
-      int bytes_read_so_far = s->callback_already_read + (int)(s->img_buffer - s->img_buffer_original);
-      int header_limit = 1024; // max we actually read is below 256 bytes currently.
-      int extra_data_limit = 256*4; // what ordinarily goes here is a palette; 256 entries*4 bytes is its max size.
-      if (bytes_read_so_far <= 0 || bytes_read_so_far > header_limit) {
-         return stbi__errpuc("bad header", "Corrupt BMP");
-      }
-      // we established that bytes_read_so_far is positive and sensible.
-      // the first half of this test rejects offsets that are either too small positives, or
-      // negative, and guarantees that info.offset >= bytes_read_so_far > 0. this in turn
-      // ensures the number computed in the second half of the test can't overflow.
-      if (info.offset < bytes_read_so_far || info.offset - bytes_read_so_far > extra_data_limit) {
-         return stbi__errpuc("bad offset", "Corrupt BMP");
-      } else {
-         stbi__skip(s, info.offset - bytes_read_so_far);
-      }
-   }
-
-   if (info.bpp == 24 && ma == 0xff000000)
-      s->img_n = 3;
-   else
-      s->img_n = ma ? 4 : 3;
-   if (req_comp && req_comp >= 3) // we can directly decode 3 or 4
-      target = req_comp;
-   else
-      target = s->img_n; // if they want monochrome, we'll post-convert
-
-   // sanity-check size
-   if (!stbi__mad3sizes_valid(target, s->img_x, s->img_y, 0))
-      return stbi__errpuc("too large", "Corrupt BMP");
-
-   out = (stbi_uc *) stbi__malloc_mad3(target, s->img_x, s->img_y, 0);
-   if (!out) return stbi__errpuc("outofmem", "Out of memory");
-   if (info.bpp < 16) {
-      int z=0;
-      if (psize == 0 || psize > 256) { STBI_FREE(out); return stbi__errpuc("invalid", "Corrupt BMP"); }
-      for (i=0; i < psize; ++i) {
-         pal[i][2] = stbi__get8(s);
-         pal[i][1] = stbi__get8(s);
-         pal[i][0] = stbi__get8(s);
-         if (info.hsz != 12) stbi__get8(s);
-         pal[i][3] = 255;
-      }
-      stbi__skip(s, info.offset - info.extra_read - info.hsz - psize * (info.hsz == 12 ? 3 : 4));
-      if (info.bpp == 1) width = (s->img_x + 7) >> 3;
-      else if (info.bpp == 4) width = (s->img_x + 1) >> 1;
-      else if (info.bpp == 8) width = s->img_x;
-      else { STBI_FREE(out); return stbi__errpuc("bad bpp", "Corrupt BMP"); }
-      pad = (-width)&3;
-      if (info.bpp == 1) {
-         for (j=0; j < (int) s->img_y; ++j) {
-            int bit_offset = 7, v = stbi__get8(s);
-            for (i=0; i < (int) s->img_x; ++i) {
-               int color = (v>>bit_offset)&0x1;
-               out[z++] = pal[color][0];
-               out[z++] = pal[color][1];
-               out[z++] = pal[color][2];
-               if (target == 4) out[z++] = 255;
-               if (i+1 == (int) s->img_x) break;
-               if((--bit_offset) < 0) {
-                  bit_offset = 7;
-                  v = stbi__get8(s);
-               }
-            }
-            stbi__skip(s, pad);
-         }
-      } else {
-         for (j=0; j < (int) s->img_y; ++j) {
-            for (i=0; i < (int) s->img_x; i += 2) {
-               int v=stbi__get8(s),v2=0;
-               if (info.bpp == 4) {
-                  v2 = v & 15;
-                  v >>= 4;
-               }
-               out[z++] = pal[v][0];
-               out[z++] = pal[v][1];
-               out[z++] = pal[v][2];
-               if (target == 4) out[z++] = 255;
-               if (i+1 == (int) s->img_x) break;
-               v = (info.bpp == 8) ? stbi__get8(s) : v2;
-               out[z++] = pal[v][0];
-               out[z++] = pal[v][1];
-               out[z++] = pal[v][2];
-               if (target == 4) out[z++] = 255;
-            }
-            stbi__skip(s, pad);
-         }
-      }
-   } else {
-      int rshift=0,gshift=0,bshift=0,ashift=0,rcount=0,gcount=0,bcount=0,acount=0;
-      int z = 0;
-      int easy=0;
-      stbi__skip(s, info.offset - info.extra_read - info.hsz);
-      if (info.bpp == 24) width = 3 * s->img_x;
-      else if (info.bpp == 16) width = 2*s->img_x;
-      else /* bpp = 32 and pad = 0 */ width=0;
-      pad = (-width) & 3;
-      if (info.bpp == 24) {
-         easy = 1;
-      } else if (info.bpp == 32) {
-         if (mb == 0xff && mg == 0xff00 && mr == 0x00ff0000 && ma == 0xff000000)
-            easy = 2;
-      }
-      if (!easy) {
-         if (!mr || !mg || !mb) { STBI_FREE(out); return stbi__errpuc("bad masks", "Corrupt BMP"); }
-         // right shift amt to put high bit in position #7
-         rshift = stbi__high_bit(mr)-7; rcount = stbi__bitcount(mr);
-         gshift = stbi__high_bit(mg)-7; gcount = stbi__bitcount(mg);
-         bshift = stbi__high_bit(mb)-7; bcount = stbi__bitcount(mb);
-         ashift = stbi__high_bit(ma)-7; acount = stbi__bitcount(ma);
-         if (rcount > 8 || gcount > 8 || bcount > 8 || acount > 8) { STBI_FREE(out); return stbi__errpuc("bad masks", "Corrupt BMP"); }
-      }
-      for (j=0; j < (int) s->img_y; ++j) {
-         if (easy) {
-            for (i=0; i < (int) s->img_x; ++i) {
-               unsigned char a;
-               out[z+2] = stbi__get8(s);
-               out[z+1] = stbi__get8(s);
-               out[z+0] = stbi__get8(s);
-               z += 3;
-               a = (easy == 2 ? stbi__get8(s) : 255);
-               all_a |= a;
-               if (target == 4) out[z++] = a;
-            }
-         } else {
-            int bpp = info.bpp;
-            for (i=0; i < (int) s->img_x; ++i) {
-               stbi__uint32 v = (bpp == 16 ? (stbi__uint32) stbi__get16le(s) : stbi__get32le(s));
-               unsigned int a;
-               out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mr, rshift, rcount));
-               out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mg, gshift, gcount));
-               out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mb, bshift, bcount));
-               a = (ma ? stbi__shiftsigned(v & ma, ashift, acount) : 255);
-               all_a |= a;
-               if (target == 4) out[z++] = STBI__BYTECAST(a);
-            }
-         }
-         stbi__skip(s, pad);
-      }
-   }
-
-   // if alpha channel is all 0s, replace with all 255s
-   if (target == 4 && all_a == 0)
-      for (i=4*s->img_x*s->img_y-1; i >= 0; i -= 4)
-         out[i] = 255;
-
-   if (flip_vertically) {
-      stbi_uc t;
-      for (j=0; j < (int) s->img_y>>1; ++j) {
-         stbi_uc *p1 = out +      j     *s->img_x*target;
-         stbi_uc *p2 = out + (s->img_y-1-j)*s->img_x*target;
-         for (i=0; i < (int) s->img_x*target; ++i) {
-            t = p1[i]; p1[i] = p2[i]; p2[i] = t;
-         }
-      }
-   }
-
-   if (req_comp && req_comp != target) {
-      out = stbi__convert_format(out, target, req_comp, s->img_x, s->img_y);
-      if (out == NULL) return out; // stbi__convert_format frees input on failure
-   }
-
-   *x = s->img_x;
-   *y = s->img_y;
-   if (comp) *comp = s->img_n;
-   return out;
-}
-#endif
-
-// Targa Truevision - TGA
-// by Jonathan Dummer
-#ifndef STBI_NO_TGA
-// returns STBI_rgb or whatever, 0 on error
-static int stbi__tga_get_comp(int bits_per_pixel, int is_grey, int* is_rgb16)
-{
-   // only RGB or RGBA (incl. 16bit) or grey allowed
-   if (is_rgb16) *is_rgb16 = 0;
-   switch(bits_per_pixel) {
-      case 8:  return STBI_grey;
-      case 16: if(is_grey) return STBI_grey_alpha;
-               // fallthrough
-      case 15: if(is_rgb16) *is_rgb16 = 1;
-               return STBI_rgb;
-      case 24: // fallthrough
-      case 32: return bits_per_pixel/8;
-      default: return 0;
-   }
-}
-
-static int stbi__tga_info(stbi__context *s, int *x, int *y, int *comp)
-{
-    int tga_w, tga_h, tga_comp, tga_image_type, tga_bits_per_pixel, tga_colormap_bpp;
-    int sz, tga_colormap_type;
-    stbi__get8(s);                   // discard Offset
-    tga_colormap_type = stbi__get8(s); // colormap type
-    if( tga_colormap_type > 1 ) {
-        stbi__rewind(s);
-        return 0;      // only RGB or indexed allowed
-    }
-    tga_image_type = stbi__get8(s); // image type
-    if ( tga_colormap_type == 1 ) { // colormapped (paletted) image
-        if (tga_image_type != 1 && tga_image_type != 9) {
-            stbi__rewind(s);
-            return 0;
-        }
-        stbi__skip(s,4);       // skip index of first colormap entry and number of entries
-        sz = stbi__get8(s);    //   check bits per palette color entry
-        if ( (sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32) ) {
-            stbi__rewind(s);
-            return 0;
-        }
-        stbi__skip(s,4);       // skip image x and y origin
-        tga_colormap_bpp = sz;
-    } else { // "normal" image w/o colormap - only RGB or grey allowed, +/- RLE
-        if ( (tga_image_type != 2) && (tga_image_type != 3) && (tga_image_type != 10) && (tga_image_type != 11) ) {
-            stbi__rewind(s);
-            return 0; // only RGB or grey allowed, +/- RLE
-        }
-        stbi__skip(s,9); // skip colormap specification and image x/y origin
-        tga_colormap_bpp = 0;
-    }
-    tga_w = stbi__get16le(s);
-    if( tga_w < 1 ) {
-        stbi__rewind(s);
-        return 0;   // test width
-    }
-    tga_h = stbi__get16le(s);
-    if( tga_h < 1 ) {
-        stbi__rewind(s);
-        return 0;   // test height
-    }
-    tga_bits_per_pixel = stbi__get8(s); // bits per pixel
-    stbi__get8(s); // ignore alpha bits
-    if (tga_colormap_bpp != 0) {
-        if((tga_bits_per_pixel != 8) && (tga_bits_per_pixel != 16)) {
-            // when using a colormap, tga_bits_per_pixel is the size of the indexes
-            // I don't think anything but 8 or 16bit indexes makes sense
-            stbi__rewind(s);
-            return 0;
-        }
-        tga_comp = stbi__tga_get_comp(tga_colormap_bpp, 0, NULL);
-    } else {
-        tga_comp = stbi__tga_get_comp(tga_bits_per_pixel, (tga_image_type == 3) || (tga_image_type == 11), NULL);
-    }
-    if(!tga_comp) {
-      stbi__rewind(s);
-      return 0;
-    }
-    if (x) *x = tga_w;
-    if (y) *y = tga_h;
-    if (comp) *comp = tga_comp;
-    return 1;                   // seems to have passed everything
-}
-
-static int stbi__tga_test(stbi__context *s)
-{
-   int res = 0;
-   int sz, tga_color_type;
-   stbi__get8(s);      //   discard Offset
-   tga_color_type = stbi__get8(s);   //   color type
-   if ( tga_color_type > 1 ) goto errorEnd;   //   only RGB or indexed allowed
-   sz = stbi__get8(s);   //   image type
-   if ( tga_color_type == 1 ) { // colormapped (paletted) image
-      if (sz != 1 && sz != 9) goto errorEnd; // colortype 1 demands image type 1 or 9
-      stbi__skip(s,4);       // skip index of first colormap entry and number of entries
-      sz = stbi__get8(s);    //   check bits per palette color entry
-      if ( (sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32) ) goto errorEnd;
-      stbi__skip(s,4);       // skip image x and y origin
-   } else { // "normal" image w/o colormap
-      if ( (sz != 2) && (sz != 3) && (sz != 10) && (sz != 11) ) goto errorEnd; // only RGB or grey allowed, +/- RLE
-      stbi__skip(s,9); // skip colormap specification and image x/y origin
-   }
-   if ( stbi__get16le(s) < 1 ) goto errorEnd;      //   test width
-   if ( stbi__get16le(s) < 1 ) goto errorEnd;      //   test height
-   sz = stbi__get8(s);   //   bits per pixel
-   if ( (tga_color_type == 1) && (sz != 8) && (sz != 16) ) goto errorEnd; // for colormapped images, bpp is size of an index
-   if ( (sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32) ) goto errorEnd;
-
-   res = 1; // if we got this far, everything's good and we can return 1 instead of 0
-
-errorEnd:
-   stbi__rewind(s);
-   return res;
-}
-
-// read 16bit value and convert to 24bit RGB
-static void stbi__tga_read_rgb16(stbi__context *s, stbi_uc* out)
-{
-   stbi__uint16 px = (stbi__uint16)stbi__get16le(s);
-   stbi__uint16 fiveBitMask = 31;
-   // we have 3 channels with 5bits each
-   int r = (px >> 10) & fiveBitMask;
-   int g = (px >> 5) & fiveBitMask;
-   int b = px & fiveBitMask;
-   // Note that this saves the data in RGB(A) order, so it doesn't need to be swapped later
-   out[0] = (stbi_uc)((r * 255)/31);
-   out[1] = (stbi_uc)((g * 255)/31);
-   out[2] = (stbi_uc)((b * 255)/31);
-
-   // some people claim that the most significant bit might be used for alpha
-   // (possibly if an alpha-bit is set in the "image descriptor byte")
-   // but that only made 16bit test images completely translucent..
-   // so let's treat all 15 and 16bit TGAs as RGB with no alpha.
-}
-
-static void *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
-{
-   //   read in the TGA header stuff
-   int tga_offset = stbi__get8(s);
-   int tga_indexed = stbi__get8(s);
-   int tga_image_type = stbi__get8(s);
-   int tga_is_RLE = 0;
-   int tga_palette_start = stbi__get16le(s);
-   int tga_palette_len = stbi__get16le(s);
-   int tga_palette_bits = stbi__get8(s);
-   int tga_x_origin = stbi__get16le(s);
-   int tga_y_origin = stbi__get16le(s);
-   int tga_width = stbi__get16le(s);
-   int tga_height = stbi__get16le(s);
-   int tga_bits_per_pixel = stbi__get8(s);
-   int tga_comp, tga_rgb16=0;
-   int tga_inverted = stbi__get8(s);
-   // int tga_alpha_bits = tga_inverted & 15; // the 4 lowest bits - unused (useless?)
-   //   image data
-   unsigned char *tga_data;
-   unsigned char *tga_palette = NULL;
-   int i, j;
-   unsigned char raw_data[4] = {0};
-   int RLE_count = 0;
-   int RLE_repeating = 0;
-   int read_next_pixel = 1;
-   STBI_NOTUSED(ri);
-   STBI_NOTUSED(tga_x_origin); // @TODO
-   STBI_NOTUSED(tga_y_origin); // @TODO
-
-   if (tga_height > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
-   if (tga_width > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
-
-   //   do a tiny bit of precessing
-   if ( tga_image_type >= 8 )
-   {
-      tga_image_type -= 8;
-      tga_is_RLE = 1;
-   }
-   tga_inverted = 1 - ((tga_inverted >> 5) & 1);
-
-   //   If I'm paletted, then I'll use the number of bits from the palette
-   if ( tga_indexed ) tga_comp = stbi__tga_get_comp(tga_palette_bits, 0, &tga_rgb16);
-   else tga_comp = stbi__tga_get_comp(tga_bits_per_pixel, (tga_image_type == 3), &tga_rgb16);
-
-   if(!tga_comp) // shouldn't really happen, stbi__tga_test() should have ensured basic consistency
-      return stbi__errpuc("bad format", "Can't find out TGA pixelformat");
-
-   //   tga info
-   *x = tga_width;
-   *y = tga_height;
-   if (comp) *comp = tga_comp;
-
-   if (!stbi__mad3sizes_valid(tga_width, tga_height, tga_comp, 0))
-      return stbi__errpuc("too large", "Corrupt TGA");
-
-   tga_data = (unsigned char*)stbi__malloc_mad3(tga_width, tga_height, tga_comp, 0);
-   if (!tga_data) return stbi__errpuc("outofmem", "Out of memory");
-
-   // skip to the data's starting position (offset usually = 0)
-   stbi__skip(s, tga_offset );
-
-   if ( !tga_indexed && !tga_is_RLE && !tga_rgb16 ) {
-      for (i=0; i < tga_height; ++i) {
-         int row = tga_inverted ? tga_height -i - 1 : i;
-         stbi_uc *tga_row = tga_data + row*tga_width*tga_comp;
-         stbi__getn(s, tga_row, tga_width * tga_comp);
-      }
-   } else  {
-      //   do I need to load a palette?
-      if ( tga_indexed)
-      {
-         if (tga_palette_len == 0) {  /* you have to have at least one entry! */
-            STBI_FREE(tga_data);
-            return stbi__errpuc("bad palette", "Corrupt TGA");
-         }
-
-         //   any data to skip? (offset usually = 0)
-         stbi__skip(s, tga_palette_start );
-         //   load the palette
-         tga_palette = (unsigned char*)stbi__malloc_mad2(tga_palette_len, tga_comp, 0);
-         if (!tga_palette) {
-            STBI_FREE(tga_data);
-            return stbi__errpuc("outofmem", "Out of memory");
-         }
-         if (tga_rgb16) {
-            stbi_uc *pal_entry = tga_palette;
-            STBI_ASSERT(tga_comp == STBI_rgb);
-            for (i=0; i < tga_palette_len; ++i) {
-               stbi__tga_read_rgb16(s, pal_entry);
-               pal_entry += tga_comp;
-            }
-         } else if (!stbi__getn(s, tga_palette, tga_palette_len * tga_comp)) {
-               STBI_FREE(tga_data);
-               STBI_FREE(tga_palette);
-               return stbi__errpuc("bad palette", "Corrupt TGA");
-         }
-      }
-      //   load the data
-      for (i=0; i < tga_width * tga_height; ++i)
-      {
-         //   if I'm in RLE mode, do I need to get a RLE stbi__pngchunk?
-         if ( tga_is_RLE )
-         {
-            if ( RLE_count == 0 )
-            {
-               //   yep, get the next byte as a RLE command
-               int RLE_cmd = stbi__get8(s);
-               RLE_count = 1 + (RLE_cmd & 127);
-               RLE_repeating = RLE_cmd >> 7;
-               read_next_pixel = 1;
-            } else if ( !RLE_repeating )
-            {
-               read_next_pixel = 1;
-            }
-         } else
-         {
-            read_next_pixel = 1;
-         }
-         //   OK, if I need to read a pixel, do it now
-         if ( read_next_pixel )
-         {
-            //   load however much data we did have
-            if ( tga_indexed )
-            {
-               // read in index, then perform the lookup
-               int pal_idx = (tga_bits_per_pixel == 8) ? stbi__get8(s) : stbi__get16le(s);
-               if ( pal_idx >= tga_palette_len ) {
-                  // invalid index
-                  pal_idx = 0;
-               }
-               pal_idx *= tga_comp;
-               for (j = 0; j < tga_comp; ++j) {
-                  raw_data[j] = tga_palette[pal_idx+j];
-               }
-            } else if(tga_rgb16) {
-               STBI_ASSERT(tga_comp == STBI_rgb);
-               stbi__tga_read_rgb16(s, raw_data);
-            } else {
-               //   read in the data raw
-               for (j = 0; j < tga_comp; ++j) {
-                  raw_data[j] = stbi__get8(s);
-               }
-            }
-            //   clear the reading flag for the next pixel
-            read_next_pixel = 0;
-         } // end of reading a pixel
-
-         // copy data
-         for (j = 0; j < tga_comp; ++j)
-           tga_data[i*tga_comp+j] = raw_data[j];
-
-         //   in case we're in RLE mode, keep counting down
-         --RLE_count;
-      }
-      //   do I need to invert the image?
-      if ( tga_inverted )
-      {
-         for (j = 0; j*2 < tga_height; ++j)
-         {
-            int index1 = j * tga_width * tga_comp;
-            int index2 = (tga_height - 1 - j) * tga_width * tga_comp;
-            for (i = tga_width * tga_comp; i > 0; --i)
-            {
-               unsigned char temp = tga_data[index1];
-               tga_data[index1] = tga_data[index2];
-               tga_data[index2] = temp;
-               ++index1;
-               ++index2;
-            }
-         }
-      }
-      //   clear my palette, if I had one
-      if ( tga_palette != NULL )
-      {
-         STBI_FREE( tga_palette );
-      }
-   }
-
-   // swap RGB - if the source data was RGB16, it already is in the right order
-   if (tga_comp >= 3 && !tga_rgb16)
-   {
-      unsigned char* tga_pixel = tga_data;
-      for (i=0; i < tga_width * tga_height; ++i)
-      {
-         unsigned char temp = tga_pixel[0];
-         tga_pixel[0] = tga_pixel[2];
-         tga_pixel[2] = temp;
-         tga_pixel += tga_comp;
-      }
-   }
-
-   // convert to target component count
-   if (req_comp && req_comp != tga_comp)
-      tga_data = stbi__convert_format(tga_data, tga_comp, req_comp, tga_width, tga_height);
-
-   //   the things I do to get rid of an error message, and yet keep
-   //   Microsoft's C compilers happy... [8^(
-   tga_palette_start = tga_palette_len = tga_palette_bits =
-         tga_x_origin = tga_y_origin = 0;
-   STBI_NOTUSED(tga_palette_start);
-   //   OK, done
-   return tga_data;
-}
-#endif
-
-// *************************************************************************************************
-// Photoshop PSD loader -- PD by Thatcher Ulrich, integration by Nicolas Schulz, tweaked by STB
-
-#ifndef STBI_NO_PSD
-static int stbi__psd_test(stbi__context *s)
-{
-   int r = (stbi__get32be(s) == 0x38425053);
-   stbi__rewind(s);
-   return r;
-}
-
-static int stbi__psd_decode_rle(stbi__context *s, stbi_uc *p, int pixelCount)
-{
-   int count, nleft, len;
-
-   count = 0;
-   while ((nleft = pixelCount - count) > 0) {
-      len = stbi__get8(s);
-      if (len == 128) {
-         // No-op.
-      } else if (len < 128) {
-         // Copy next len+1 bytes literally.
-         len++;
-         if (len > nleft) return 0; // corrupt data
-         count += len;
-         while (len) {
-            *p = stbi__get8(s);
-            p += 4;
-            len--;
-         }
-      } else if (len > 128) {
-         stbi_uc   val;
-         // Next -len+1 bytes in the dest are replicated from next source byte.
-         // (Interpret len as a negative 8-bit int.)
-         len = 257 - len;
-         if (len > nleft) return 0; // corrupt data
-         val = stbi__get8(s);
-         count += len;
-         while (len) {
-            *p = val;
-            p += 4;
-            len--;
-         }
-      }
-   }
-
-   return 1;
-}
-
-static void *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri, int bpc)
-{
-   int pixelCount;
-   int channelCount, compression;
-   int channel, i;
-   int bitdepth;
-   int w,h;
-   stbi_uc *out;
-   STBI_NOTUSED(ri);
-
-   // Check identifier
-   if (stbi__get32be(s) != 0x38425053)   // "8BPS"
-      return stbi__errpuc("not PSD", "Corrupt PSD image");
-
-   // Check file type version.
-   if (stbi__get16be(s) != 1)
-      return stbi__errpuc("wrong version", "Unsupported version of PSD image");
-
-   // Skip 6 reserved bytes.
-   stbi__skip(s, 6 );
-
-   // Read the number of channels (R, G, B, A, etc).
-   channelCount = stbi__get16be(s);
-   if (channelCount < 0 || channelCount > 16)
-      return stbi__errpuc("wrong channel count", "Unsupported number of channels in PSD image");
-
-   // Read the rows and columns of the image.
-   h = stbi__get32be(s);
-   w = stbi__get32be(s);
-
-   if (h > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
-   if (w > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
-
-   // Make sure the depth is 8 bits.
-   bitdepth = stbi__get16be(s);
-   if (bitdepth != 8 && bitdepth != 16)
-      return stbi__errpuc("unsupported bit depth", "PSD bit depth is not 8 or 16 bit");
-
-   // Make sure the color mode is RGB.
-   // Valid options are:
-   //   0: Bitmap
-   //   1: Grayscale
-   //   2: Indexed color
-   //   3: RGB color
-   //   4: CMYK color
-   //   7: Multichannel
-   //   8: Duotone
-   //   9: Lab color
-   if (stbi__get16be(s) != 3)
-      return stbi__errpuc("wrong color format", "PSD is not in RGB color format");
-
-   // Skip the Mode Data.  (It's the palette for indexed color; other info for other modes.)
-   stbi__skip(s,stbi__get32be(s) );
-
-   // Skip the image resources.  (resolution, pen tool paths, etc)
-   stbi__skip(s, stbi__get32be(s) );
-
-   // Skip the reserved data.
-   stbi__skip(s, stbi__get32be(s) );
-
-   // Find out if the data is compressed.
-   // Known values:
-   //   0: no compression
-   //   1: RLE compressed
-   compression = stbi__get16be(s);
-   if (compression > 1)
-      return stbi__errpuc("bad compression", "PSD has an unknown compression format");
-
-   // Check size
-   if (!stbi__mad3sizes_valid(4, w, h, 0))
-      return stbi__errpuc("too large", "Corrupt PSD");
-
-   // Create the destination image.
-
-   if (!compression && bitdepth == 16 && bpc == 16) {
-      out = (stbi_uc *) stbi__malloc_mad3(8, w, h, 0);
-      ri->bits_per_channel = 16;
-   } else
-      out = (stbi_uc *) stbi__malloc(4 * w*h);
-
-   if (!out) return stbi__errpuc("outofmem", "Out of memory");
-   pixelCount = w*h;
-
-   // Initialize the data to zero.
-   //memset( out, 0, pixelCount * 4 );
-
-   // Finally, the image data.
-   if (compression) {
-      // RLE as used by .PSD and .TIFF
-      // Loop until you get the number of unpacked bytes you are expecting:
-      //     Read the next source byte into n.
-      //     If n is between 0 and 127 inclusive, copy the next n+1 bytes literally.
-      //     Else if n is between -127 and -1 inclusive, copy the next byte -n+1 times.
-      //     Else if n is 128, noop.
-      // Endloop
-
-      // The RLE-compressed data is preceded by a 2-byte data count for each row in the data,
-      // which we're going to just skip.
-      stbi__skip(s, h * channelCount * 2 );
-
-      // Read the RLE data by channel.
-      for (channel = 0; channel < 4; channel++) {
-         stbi_uc *p;
-
-         p = out+channel;
-         if (channel >= channelCount) {
-            // Fill this channel with default data.
-            for (i = 0; i < pixelCount; i++, p += 4)
-               *p = (channel == 3 ? 255 : 0);
-         } else {
-            // Read the RLE data.
-            if (!stbi__psd_decode_rle(s, p, pixelCount)) {
-               STBI_FREE(out);
-               return stbi__errpuc("corrupt", "bad RLE data");
-            }
-         }
-      }
-
-   } else {
-      // We're at the raw image data.  It's each channel in order (Red, Green, Blue, Alpha, ...)
-      // where each channel consists of an 8-bit (or 16-bit) value for each pixel in the image.
-
-      // Read the data by channel.
-      for (channel = 0; channel < 4; channel++) {
-         if (channel >= channelCount) {
-            // Fill this channel with default data.
-            if (bitdepth == 16 && bpc == 16) {
-               stbi__uint16 *q = ((stbi__uint16 *) out) + channel;
-               stbi__uint16 val = channel == 3 ? 65535 : 0;
-               for (i = 0; i < pixelCount; i++, q += 4)
-                  *q = val;
-            } else {
-               stbi_uc *p = out+channel;
-               stbi_uc val = channel == 3 ? 255 : 0;
-               for (i = 0; i < pixelCount; i++, p += 4)
-                  *p = val;
-            }
-         } else {
-            if (ri->bits_per_channel == 16) {    // output bpc
-               stbi__uint16 *q = ((stbi__uint16 *) out) + channel;
-               for (i = 0; i < pixelCount; i++, q += 4)
-                  *q = (stbi__uint16) stbi__get16be(s);
-            } else {
-               stbi_uc *p = out+channel;
-               if (bitdepth == 16) {  // input bpc
-                  for (i = 0; i < pixelCount; i++, p += 4)
-                     *p = (stbi_uc) (stbi__get16be(s) >> 8);
-               } else {
-                  for (i = 0; i < pixelCount; i++, p += 4)
-                     *p = stbi__get8(s);
-               }
-            }
-         }
-      }
-   }
-
-   // remove weird white matte from PSD
-   if (channelCount >= 4) {
-      if (ri->bits_per_channel == 16) {
-         for (i=0; i < w*h; ++i) {
-            stbi__uint16 *pixel = (stbi__uint16 *) out + 4*i;
-            if (pixel[3] != 0 && pixel[3] != 65535) {
-               float a = pixel[3] / 65535.0f;
-               float ra = 1.0f / a;
-               float inv_a = 65535.0f * (1 - ra);
-               pixel[0] = (stbi__uint16) (pixel[0]*ra + inv_a);
-               pixel[1] = (stbi__uint16) (pixel[1]*ra + inv_a);
-               pixel[2] = (stbi__uint16) (pixel[2]*ra + inv_a);
-            }
-         }
-      } else {
-         for (i=0; i < w*h; ++i) {
-            unsigned char *pixel = out + 4*i;
-            if (pixel[3] != 0 && pixel[3] != 255) {
-               float a = pixel[3] / 255.0f;
-               float ra = 1.0f / a;
-               float inv_a = 255.0f * (1 - ra);
-               pixel[0] = (unsigned char) (pixel[0]*ra + inv_a);
-               pixel[1] = (unsigned char) (pixel[1]*ra + inv_a);
-               pixel[2] = (unsigned char) (pixel[2]*ra + inv_a);
-            }
-         }
-      }
-   }
-
-   // convert to desired output format
-   if (req_comp && req_comp != 4) {
-      if (ri->bits_per_channel == 16)
-         out = (stbi_uc *) stbi__convert_format16((stbi__uint16 *) out, 4, req_comp, w, h);
-      else
-         out = stbi__convert_format(out, 4, req_comp, w, h);
-      if (out == NULL) return out; // stbi__convert_format frees input on failure
-   }
-
-   if (comp) *comp = 4;
-   *y = h;
-   *x = w;
-
-   return out;
-}
-#endif
-
-// *************************************************************************************************
-// Softimage PIC loader
-// by Tom Seddon
-//
-// See http://softimage.wiki.softimage.com/index.php/INFO:_PIC_file_format
-// See http://ozviz.wasp.uwa.edu.au/~pbourke/dataformats/softimagepic/
-
-#ifndef STBI_NO_PIC
-static int stbi__pic_is4(stbi__context *s,const char *str)
-{
-   int i;
-   for (i=0; i<4; ++i)
-      if (stbi__get8(s) != (stbi_uc)str[i])
-         return 0;
-
-   return 1;
-}
-
-static int stbi__pic_test_core(stbi__context *s)
-{
-   int i;
-
-   if (!stbi__pic_is4(s,"\x53\x80\xF6\x34"))
-      return 0;
-
-   for(i=0;i<84;++i)
-      stbi__get8(s);
-
-   if (!stbi__pic_is4(s,"PICT"))
-      return 0;
-
-   return 1;
-}
-
-typedef struct
-{
-   stbi_uc size,type,channel;
-} stbi__pic_packet;
-
-static stbi_uc *stbi__readval(stbi__context *s, int channel, stbi_uc *dest)
-{
-   int mask=0x80, i;
-
-   for (i=0; i<4; ++i, mask>>=1) {
-      if (channel & mask) {
-         if (stbi__at_eof(s)) return stbi__errpuc("bad file","PIC file too short");
-         dest[i]=stbi__get8(s);
-      }
-   }
-
-   return dest;
-}
-
-static void stbi__copyval(int channel,stbi_uc *dest,const stbi_uc *src)
-{
-   int mask=0x80,i;
-
-   for (i=0;i<4; ++i, mask>>=1)
-      if (channel&mask)
-         dest[i]=src[i];
-}
-
-static stbi_uc *stbi__pic_load_core(stbi__context *s,int width,int height,int *comp, stbi_uc *result)
-{
-   int act_comp=0,num_packets=0,y,chained;
-   stbi__pic_packet packets[10];
-
-   // this will (should...) cater for even some bizarre stuff like having data
-    // for the same channel in multiple packets.
-   do {
-      stbi__pic_packet *packet;
-
-      if (num_packets==sizeof(packets)/sizeof(packets[0]))
-         return stbi__errpuc("bad format","too many packets");
-
-      packet = &packets[num_packets++];
-
-      chained = stbi__get8(s);
-      packet->size    = stbi__get8(s);
-      packet->type    = stbi__get8(s);
-      packet->channel = stbi__get8(s);
-
-      act_comp |= packet->channel;
-
-      if (stbi__at_eof(s))          return stbi__errpuc("bad file","file too short (reading packets)");
-      if (packet->size != 8)  return stbi__errpuc("bad format","packet isn't 8bpp");
-   } while (chained);
-
-   *comp = (act_comp & 0x10 ? 4 : 3); // has alpha channel?
-
-   for(y=0; y<height; ++y) {
-      int packet_idx;
-
-      for(packet_idx=0; packet_idx < num_packets; ++packet_idx) {
-         stbi__pic_packet *packet = &packets[packet_idx];
-         stbi_uc *dest = result+y*width*4;
-
-         switch (packet->type) {
-            default:
-               return stbi__errpuc("bad format","packet has bad compression type");
-
-            case 0: {//uncompressed
-               int x;
-
-               for(x=0;x<width;++x, dest+=4)
-                  if (!stbi__readval(s,packet->channel,dest))
-                     return 0;
-               break;
-            }
-
-            case 1://Pure RLE
-               {
-                  int left=width, i;
-
-                  while (left>0) {
-                     stbi_uc count,value[4];
-
-                     count=stbi__get8(s);
-                     if (stbi__at_eof(s))   return stbi__errpuc("bad file","file too short (pure read count)");
-
-                     if (count > left)
-                        count = (stbi_uc) left;
-
-                     if (!stbi__readval(s,packet->channel,value))  return 0;
-
-                     for(i=0; i<count; ++i,dest+=4)
-                        stbi__copyval(packet->channel,dest,value);
-                     left -= count;
-                  }
-               }
-               break;
-
-            case 2: {//Mixed RLE
-               int left=width;
-               while (left>0) {
-                  int count = stbi__get8(s), i;
-                  if (stbi__at_eof(s))  return stbi__errpuc("bad file","file too short (mixed read count)");
-
-                  if (count >= 128) { // Repeated
-                     stbi_uc value[4];
-
-                     if (count==128)
-                        count = stbi__get16be(s);
-                     else
-                        count -= 127;
-                     if (count > left)
-                        return stbi__errpuc("bad file","scanline overrun");
-
-                     if (!stbi__readval(s,packet->channel,value))
-                        return 0;
-
-                     for(i=0;i<count;++i, dest += 4)
-                        stbi__copyval(packet->channel,dest,value);
-                  } else { // Raw
-                     ++count;
-                     if (count>left) return stbi__errpuc("bad file","scanline overrun");
-
-                     for(i=0;i<count;++i, dest+=4)
-                        if (!stbi__readval(s,packet->channel,dest))
-                           return 0;
-                  }
-                  left-=count;
-               }
-               break;
-            }
-         }
-      }
-   }
-
-   return result;
-}
-
-static void *stbi__pic_load(stbi__context *s,int *px,int *py,int *comp,int req_comp, stbi__result_info *ri)
-{
-   stbi_uc *result;
-   int i, x,y, internal_comp;
-   STBI_NOTUSED(ri);
-
-   if (!comp) comp = &internal_comp;
-
-   for (i=0; i<92; ++i)
-      stbi__get8(s);
-
-   x = stbi__get16be(s);
-   y = stbi__get16be(s);
-
-   if (y > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
-   if (x > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
-
-   if (stbi__at_eof(s))  return stbi__errpuc("bad file","file too short (pic header)");
-   if (!stbi__mad3sizes_valid(x, y, 4, 0)) return stbi__errpuc("too large", "PIC image too large to decode");
-
-   stbi__get32be(s); //skip `ratio'
-   stbi__get16be(s); //skip `fields'
-   stbi__get16be(s); //skip `pad'
-
-   // intermediate buffer is RGBA
-   result = (stbi_uc *) stbi__malloc_mad3(x, y, 4, 0);
-   if (!result) return stbi__errpuc("outofmem", "Out of memory");
-   memset(result, 0xff, x*y*4);
-
-   if (!stbi__pic_load_core(s,x,y,comp, result)) {
-      STBI_FREE(result);
-      result=0;
-   }
-   *px = x;
-   *py = y;
-   if (req_comp == 0) req_comp = *comp;
-   result=stbi__convert_format(result,4,req_comp,x,y);
-
-   return result;
-}
-
-static int stbi__pic_test(stbi__context *s)
-{
-   int r = stbi__pic_test_core(s);
-   stbi__rewind(s);
-   return r;
-}
-#endif
-
-// *************************************************************************************************
-// GIF loader -- public domain by Jean-Marc Lienher -- simplified/shrunk by stb
-
-#ifndef STBI_NO_GIF
-typedef struct
-{
-   stbi__int16 prefix;
-   stbi_uc first;
-   stbi_uc suffix;
-} stbi__gif_lzw;
-
-typedef struct
-{
-   int w,h;
-   stbi_uc *out;                 // output buffer (always 4 components)
-   stbi_uc *background;          // The current "background" as far as a gif is concerned
-   stbi_uc *history;
-   int flags, bgindex, ratio, transparent, eflags;
-   stbi_uc  pal[256][4];
-   stbi_uc lpal[256][4];
-   stbi__gif_lzw codes[8192];
-   stbi_uc *color_table;
-   int parse, step;
-   int lflags;
-   int start_x, start_y;
-   int max_x, max_y;
-   int cur_x, cur_y;
-   int line_size;
-   int delay;
-} stbi__gif;
-
-static int stbi__gif_test_raw(stbi__context *s)
-{
-   int sz;
-   if (stbi__get8(s) != 'G' || stbi__get8(s) != 'I' || stbi__get8(s) != 'F' || stbi__get8(s) != '8') return 0;
-   sz = stbi__get8(s);
-   if (sz != '9' && sz != '7') return 0;
-   if (stbi__get8(s) != 'a') return 0;
-   return 1;
-}
-
-static int stbi__gif_test(stbi__context *s)
-{
-   int r = stbi__gif_test_raw(s);
-   stbi__rewind(s);
-   return r;
-}
-
-static void stbi__gif_parse_colortable(stbi__context *s, stbi_uc pal[256][4], int num_entries, int transp)
-{
-   int i;
-   for (i=0; i < num_entries; ++i) {
-      pal[i][2] = stbi__get8(s);
-      pal[i][1] = stbi__get8(s);
-      pal[i][0] = stbi__get8(s);
-      pal[i][3] = transp == i ? 0 : 255;
-   }
-}
-
-static int stbi__gif_header(stbi__context *s, stbi__gif *g, int *comp, int is_info)
-{
-   stbi_uc version;
-   if (stbi__get8(s) != 'G' || stbi__get8(s) != 'I' || stbi__get8(s) != 'F' || stbi__get8(s) != '8')
-      return stbi__err("not GIF", "Corrupt GIF");
-
-   version = stbi__get8(s);
-   if (version != '7' && version != '9')    return stbi__err("not GIF", "Corrupt GIF");
-   if (stbi__get8(s) != 'a')                return stbi__err("not GIF", "Corrupt GIF");
-
-   stbi__g_failure_reason = "";
-   g->w = stbi__get16le(s);
-   g->h = stbi__get16le(s);
-   g->flags = stbi__get8(s);
-   g->bgindex = stbi__get8(s);
-   g->ratio = stbi__get8(s);
-   g->transparent = -1;
-
-   if (g->w > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)");
-   if (g->h > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)");
-
-   if (comp != 0) *comp = 4;  // can't actually tell whether it's 3 or 4 until we parse the comments
-
-   if (is_info) return 1;
-
-   if (g->flags & 0x80)
-      stbi__gif_parse_colortable(s,g->pal, 2 << (g->flags & 7), -1);
-
-   return 1;
-}
-
-static int stbi__gif_info_raw(stbi__context *s, int *x, int *y, int *comp)
-{
-   stbi__gif* g = (stbi__gif*) stbi__malloc(sizeof(stbi__gif));
-   if (!g) return stbi__err("outofmem", "Out of memory");
-   if (!stbi__gif_header(s, g, comp, 1)) {
-      STBI_FREE(g);
-      stbi__rewind( s );
-      return 0;
-   }
-   if (x) *x = g->w;
-   if (y) *y = g->h;
-   STBI_FREE(g);
-   return 1;
-}
-
-static void stbi__out_gif_code(stbi__gif *g, stbi__uint16 code)
-{
-   stbi_uc *p, *c;
-   int idx;
-
-   // recurse to decode the prefixes, since the linked-list is backwards,
-   // and working backwards through an interleaved image would be nasty
-   if (g->codes[code].prefix >= 0)
-      stbi__out_gif_code(g, g->codes[code].prefix);
-
-   if (g->cur_y >= g->max_y) return;
-
-   idx = g->cur_x + g->cur_y;
-   p = &g->out[idx];
-   g->history[idx / 4] = 1;
-
-   c = &g->color_table[g->codes[code].suffix * 4];
-   if (c[3] > 128) { // don't render transparent pixels;
-      p[0] = c[2];
-      p[1] = c[1];
-      p[2] = c[0];
-      p[3] = c[3];
-   }
-   g->cur_x += 4;
-
-   if (g->cur_x >= g->max_x) {
-      g->cur_x = g->start_x;
-      g->cur_y += g->step;
-
-      while (g->cur_y >= g->max_y && g->parse > 0) {
-         g->step = (1 << g->parse) * g->line_size;
-         g->cur_y = g->start_y + (g->step >> 1);
-         --g->parse;
-      }
-   }
-}
-
-static stbi_uc *stbi__process_gif_raster(stbi__context *s, stbi__gif *g)
-{
-   stbi_uc lzw_cs;
-   stbi__int32 len, init_code;
-   stbi__uint32 first;
-   stbi__int32 codesize, codemask, avail, oldcode, bits, valid_bits, clear;
-   stbi__gif_lzw *p;
-
-   lzw_cs = stbi__get8(s);
-   if (lzw_cs > 12) return NULL;
-   clear = 1 << lzw_cs;
-   first = 1;
-   codesize = lzw_cs + 1;
-   codemask = (1 << codesize) - 1;
-   bits = 0;
-   valid_bits = 0;
-   for (init_code = 0; init_code < clear; init_code++) {
-      g->codes[init_code].prefix = -1;
-      g->codes[init_code].first = (stbi_uc) init_code;
-      g->codes[init_code].suffix = (stbi_uc) init_code;
-   }
-
-   // support no starting clear code
-   avail = clear+2;
-   oldcode = -1;
-
-   len = 0;
-   for(;;) {
-      if (valid_bits < codesize) {
-         if (len == 0) {
-            len = stbi__get8(s); // start new block
-            if (len == 0)
-               return g->out;
-         }
-         --len;
-         bits |= (stbi__int32) stbi__get8(s) << valid_bits;
-         valid_bits += 8;
-      } else {
-         stbi__int32 code = bits & codemask;
-         bits >>= codesize;
-         valid_bits -= codesize;
-         // @OPTIMIZE: is there some way we can accelerate the non-clear path?
-         if (code == clear) {  // clear code
-            codesize = lzw_cs + 1;
-            codemask = (1 << codesize) - 1;
-            avail = clear + 2;
-            oldcode = -1;
-            first = 0;
-         } else if (code == clear + 1) { // end of stream code
-            stbi__skip(s, len);
-            while ((len = stbi__get8(s)) > 0)
-               stbi__skip(s,len);
-            return g->out;
-         } else if (code <= avail) {
-            if (first) {
-               return stbi__errpuc("no clear code", "Corrupt GIF");
-            }
-
-            if (oldcode >= 0) {
-               p = &g->codes[avail++];
-               if (avail > 8192) {
-                  return stbi__errpuc("too many codes", "Corrupt GIF");
-               }
-
-               p->prefix = (stbi__int16) oldcode;
-               p->first = g->codes[oldcode].first;
-               p->suffix = (code == avail) ? p->first : g->codes[code].first;
-            } else if (code == avail)
-               return stbi__errpuc("illegal code in raster", "Corrupt GIF");
-
-            stbi__out_gif_code(g, (stbi__uint16) code);
-
-            if ((avail & codemask) == 0 && avail <= 0x0FFF) {
-               codesize++;
-               codemask = (1 << codesize) - 1;
-            }
-
-            oldcode = code;
-         } else {
-            return stbi__errpuc("illegal code in raster", "Corrupt GIF");
-         }
-      }
-   }
-}
-
-// this function is designed to support animated gifs, although stb_image doesn't support it
-// two back is the image from two frames ago, used for a very specific disposal format
-static stbi_uc *stbi__gif_load_next(stbi__context *s, stbi__gif *g, int *comp, int req_comp, stbi_uc *two_back)
-{
-   int dispose;
-   int first_frame;
-   int pi;
-   int pcount;
-   STBI_NOTUSED(req_comp);
-
-   // on first frame, any non-written pixels get the background colour (non-transparent)
-   first_frame = 0;
-   if (g->out == 0) {
-      if (!stbi__gif_header(s, g, comp,0)) return 0; // stbi__g_failure_reason set by stbi__gif_header
-      if (!stbi__mad3sizes_valid(4, g->w, g->h, 0))
-         return stbi__errpuc("too large", "GIF image is too large");
-      pcount = g->w * g->h;
-      g->out = (stbi_uc *) stbi__malloc(4 * pcount);
-      g->background = (stbi_uc *) stbi__malloc(4 * pcount);
-      g->history = (stbi_uc *) stbi__malloc(pcount);
-      if (!g->out || !g->background || !g->history)
-         return stbi__errpuc("outofmem", "Out of memory");
-
-      // image is treated as "transparent" at the start - ie, nothing overwrites the current background;
-      // background colour is only used for pixels that are not rendered first frame, after that "background"
-      // color refers to the color that was there the previous frame.
-      memset(g->out, 0x00, 4 * pcount);
-      memset(g->background, 0x00, 4 * pcount); // state of the background (starts transparent)
-      memset(g->history, 0x00, pcount);        // pixels that were affected previous frame
-      first_frame = 1;
-   } else {
-      // second frame - how do we dispose of the previous one?
-      dispose = (g->eflags & 0x1C) >> 2;
-      pcount = g->w * g->h;
-
-      if ((dispose == 3) && (two_back == 0)) {
-         dispose = 2; // if I don't have an image to revert back to, default to the old background
-      }
-
-      if (dispose == 3) { // use previous graphic
-         for (pi = 0; pi < pcount; ++pi) {
-            if (g->history[pi]) {
-               memcpy( &g->out[pi * 4], &two_back[pi * 4], 4 );
-            }
-         }
-      } else if (dispose == 2) {
-         // restore what was changed last frame to background before that frame;
-         for (pi = 0; pi < pcount; ++pi) {
-            if (g->history[pi]) {
-               memcpy( &g->out[pi * 4], &g->background[pi * 4], 4 );
-            }
-         }
-      } else {
-         // This is a non-disposal case eithe way, so just
-         // leave the pixels as is, and they will become the new background
-         // 1: do not dispose
-         // 0:  not specified.
-      }
-
-      // background is what out is after the undoing of the previou frame;
-      memcpy( g->background, g->out, 4 * g->w * g->h );
-   }
-
-   // clear my history;
-   memset( g->history, 0x00, g->w * g->h );        // pixels that were affected previous frame
-
-   for (;;) {
-      int tag = stbi__get8(s);
-      switch (tag) {
-         case 0x2C: /* Image Descriptor */
-         {
-            stbi__int32 x, y, w, h;
-            stbi_uc *o;
-
-            x = stbi__get16le(s);
-            y = stbi__get16le(s);
-            w = stbi__get16le(s);
-            h = stbi__get16le(s);
-            if (((x + w) > (g->w)) || ((y + h) > (g->h)))
-               return stbi__errpuc("bad Image Descriptor", "Corrupt GIF");
-
-            g->line_size = g->w * 4;
-            g->start_x = x * 4;
-            g->start_y = y * g->line_size;
-            g->max_x   = g->start_x + w * 4;
-            g->max_y   = g->start_y + h * g->line_size;
-            g->cur_x   = g->start_x;
-            g->cur_y   = g->start_y;
-
-            // if the width of the specified rectangle is 0, that means
-            // we may not see *any* pixels or the image is malformed;
-            // to make sure this is caught, move the current y down to
-            // max_y (which is what out_gif_code checks).
-            if (w == 0)
-               g->cur_y = g->max_y;
-
-            g->lflags = stbi__get8(s);
-
-            if (g->lflags & 0x40) {
-               g->step = 8 * g->line_size; // first interlaced spacing
-               g->parse = 3;
-            } else {
-               g->step = g->line_size;
-               g->parse = 0;
-            }
-
-            if (g->lflags & 0x80) {
-               stbi__gif_parse_colortable(s,g->lpal, 2 << (g->lflags & 7), g->eflags & 0x01 ? g->transparent : -1);
-               g->color_table = (stbi_uc *) g->lpal;
-            } else if (g->flags & 0x80) {
-               g->color_table = (stbi_uc *) g->pal;
-            } else
-               return stbi__errpuc("missing color table", "Corrupt GIF");
-
-            o = stbi__process_gif_raster(s, g);
-            if (!o) return NULL;
-
-            // if this was the first frame,
-            pcount = g->w * g->h;
-            if (first_frame && (g->bgindex > 0)) {
-               // if first frame, any pixel not drawn to gets the background color
-               for (pi = 0; pi < pcount; ++pi) {
-                  if (g->history[pi] == 0) {
-                     g->pal[g->bgindex][3] = 255; // just in case it was made transparent, undo that; It will be reset next frame if need be;
-                     memcpy( &g->out[pi * 4], &g->pal[g->bgindex], 4 );
-                  }
-               }
-            }
-
-            return o;
-         }
-
-         case 0x21: // Comment Extension.
-         {
-            int len;
-            int ext = stbi__get8(s);
-            if (ext == 0xF9) { // Graphic Control Extension.
-               len = stbi__get8(s);
-               if (len == 4) {
-                  g->eflags = stbi__get8(s);
-                  g->delay = 10 * stbi__get16le(s); // delay - 1/100th of a second, saving as 1/1000ths.
-
-                  // unset old transparent
-                  if (g->transparent >= 0) {
-                     g->pal[g->transparent][3] = 255;
-                  }
-                  if (g->eflags & 0x01) {
-                     g->transparent = stbi__get8(s);
-                     if (g->transparent >= 0) {
-                        g->pal[g->transparent][3] = 0;
-                     }
-                  } else {
-                     // don't need transparent
-                     stbi__skip(s, 1);
-                     g->transparent = -1;
-                  }
-               } else {
-                  stbi__skip(s, len);
-                  break;
-               }
-            }
-            while ((len = stbi__get8(s)) != 0) {
-               stbi__skip(s, len);
-            }
-            break;
-         }
-
-         case 0x3B: // gif stream termination code
-            return (stbi_uc *) s; // using '1' causes warning on some compilers
-
-         default:
-            return stbi__errpuc("unknown code", "Corrupt GIF");
-      }
-   }
-}
-
-static void *stbi__load_gif_main_outofmem(stbi__gif *g, stbi_uc *out, int **delays)
-{
-   STBI_FREE(g->out);
-   STBI_FREE(g->history);
-   STBI_FREE(g->background);
-
-   if (out) STBI_FREE(out);
-   if (delays && *delays) STBI_FREE(*delays);
-   return stbi__errpuc("outofmem", "Out of memory");
-}
-
-static void *stbi__load_gif_main(stbi__context *s, int **delays, int *x, int *y, int *z, int *comp, int req_comp)
-{
-   if (stbi__gif_test(s)) {
-      int layers = 0;
-      stbi_uc *u = 0;
-      stbi_uc *out = 0;
-      stbi_uc *two_back = 0;
-      stbi__gif g;
-      int stride;
-      int out_size = 0;
-      int delays_size = 0;
-
-      STBI_NOTUSED(out_size);
-      STBI_NOTUSED(delays_size);
-
-      memset(&g, 0, sizeof(g));
-      if (delays) {
-         *delays = 0;
-      }
-
-      do {
-         u = stbi__gif_load_next(s, &g, comp, req_comp, two_back);
-         if (u == (stbi_uc *) s) u = 0;  // end of animated gif marker
-
-         if (u) {
-            *x = g.w;
-            *y = g.h;
-            ++layers;
-            stride = g.w * g.h * 4;
-
-            if (out) {
-               void *tmp = (stbi_uc*) STBI_REALLOC_SIZED( out, out_size, layers * stride );
-               if (!tmp)
-                  return stbi__load_gif_main_outofmem(&g, out, delays);
-               else {
-                   out = (stbi_uc*) tmp;
-                   out_size = layers * stride;
-               }
-
-               if (delays) {
-                  int *new_delays = (int*) STBI_REALLOC_SIZED( *delays, delays_size, sizeof(int) * layers );
-                  if (!new_delays)
-                     return stbi__load_gif_main_outofmem(&g, out, delays);
-                  *delays = new_delays;
-                  delays_size = layers * sizeof(int);
-               }
-            } else {
-               out = (stbi_uc*)stbi__malloc( layers * stride );
-               if (!out)
-                  return stbi__load_gif_main_outofmem(&g, out, delays);
-               out_size = layers * stride;
-               if (delays) {
-                  *delays = (int*) stbi__malloc( layers * sizeof(int) );
-                  if (!*delays)
-                     return stbi__load_gif_main_outofmem(&g, out, delays);
-                  delays_size = layers * sizeof(int);
-               }
-            }
-            memcpy( out + ((layers - 1) * stride), u, stride );
-            if (layers >= 2) {
-               two_back = out - 2 * stride;
-            }
-
-            if (delays) {
-               (*delays)[layers - 1U] = g.delay;
-            }
-         }
-      } while (u != 0);
-
-      // free temp buffer;
-      STBI_FREE(g.out);
-      STBI_FREE(g.history);
-      STBI_FREE(g.background);
-
-      // do the final conversion after loading everything;
-      if (req_comp && req_comp != 4)
-         out = stbi__convert_format(out, 4, req_comp, layers * g.w, g.h);
-
-      *z = layers;
-      return out;
-   } else {
-      return stbi__errpuc("not GIF", "Image was not as a gif type.");
-   }
-}
-
-static void *stbi__gif_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
-{
-   stbi_uc *u = 0;
-   stbi__gif g;
-   memset(&g, 0, sizeof(g));
-   STBI_NOTUSED(ri);
-
-   u = stbi__gif_load_next(s, &g, comp, req_comp, 0);
-   if (u == (stbi_uc *) s) u = 0;  // end of animated gif marker
-   if (u) {
-      *x = g.w;
-      *y = g.h;
-
-      // moved conversion to after successful load so that the same
-      // can be done for multiple frames.
-      if (req_comp && req_comp != 4)
-         u = stbi__convert_format(u, 4, req_comp, g.w, g.h);
-   } else if (g.out) {
-      // if there was an error and we allocated an image buffer, free it!
-      STBI_FREE(g.out);
-   }
-
-   // free buffers needed for multiple frame loading;
-   STBI_FREE(g.history);
-   STBI_FREE(g.background);
-
-   return u;
-}
-
-static int stbi__gif_info(stbi__context *s, int *x, int *y, int *comp)
-{
-   return stbi__gif_info_raw(s,x,y,comp);
-}
-#endif
-
-// *************************************************************************************************
-// Radiance RGBE HDR loader
-// originally by Nicolas Schulz
-#ifndef STBI_NO_HDR
-static int stbi__hdr_test_core(stbi__context *s, const char *signature)
-{
-   int i;
-   for (i=0; signature[i]; ++i)
-      if (stbi__get8(s) != signature[i])
-          return 0;
-   stbi__rewind(s);
-   return 1;
-}
-
-static int stbi__hdr_test(stbi__context* s)
-{
-   int r = stbi__hdr_test_core(s, "#?RADIANCE\n");
-   stbi__rewind(s);
-   if(!r) {
-       r = stbi__hdr_test_core(s, "#?RGBE\n");
-       stbi__rewind(s);
-   }
-   return r;
-}
-
-#define STBI__HDR_BUFLEN  1024
-static char *stbi__hdr_gettoken(stbi__context *z, char *buffer)
-{
-   int len=0;
-   char c = '\0';
-
-   c = (char) stbi__get8(z);
-
-   while (!stbi__at_eof(z) && c != '\n') {
-      buffer[len++] = c;
-      if (len == STBI__HDR_BUFLEN-1) {
-         // flush to end of line
-         while (!stbi__at_eof(z) && stbi__get8(z) != '\n')
-            ;
-         break;
-      }
-      c = (char) stbi__get8(z);
-   }
-
-   buffer[len] = 0;
-   return buffer;
-}
-
-static void stbi__hdr_convert(float *output, stbi_uc *input, int req_comp)
-{
-   if ( input[3] != 0 ) {
-      float f1;
-      // Exponent
-      f1 = (float) ldexp(1.0f, input[3] - (int)(128 + 8));
-      if (req_comp <= 2)
-         output[0] = (input[0] + input[1] + input[2]) * f1 / 3;
-      else {
-         output[0] = input[0] * f1;
-         output[1] = input[1] * f1;
-         output[2] = input[2] * f1;
-      }
-      if (req_comp == 2) output[1] = 1;
-      if (req_comp == 4) output[3] = 1;
-   } else {
-      switch (req_comp) {
-         case 4: output[3] = 1; /* fallthrough */
-         case 3: output[0] = output[1] = output[2] = 0;
-                 break;
-         case 2: output[1] = 1; /* fallthrough */
-         case 1: output[0] = 0;
-                 break;
-      }
-   }
-}
-
-static float *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
-{
-   char buffer[STBI__HDR_BUFLEN];
-   char *token;
-   int valid = 0;
-   int width, height;
-   stbi_uc *scanline;
-   float *hdr_data;
-   int len;
-   unsigned char count, value;
-   int i, j, k, c1,c2, z;
-   const char *headerToken;
-   STBI_NOTUSED(ri);
-
-   // Check identifier
-   headerToken = stbi__hdr_gettoken(s,buffer);
-   if (strcmp(headerToken, "#?RADIANCE") != 0 && strcmp(headerToken, "#?RGBE") != 0)
-      return stbi__errpf("not HDR", "Corrupt HDR image");
-
-   // Parse header
-   for(;;) {
-      token = stbi__hdr_gettoken(s,buffer);
-      if (token[0] == 0) break;
-      if (strcmp(token, "FORMAT=32-bit_rle_rgbe") == 0) valid = 1;
-   }
-
-   if (!valid)    return stbi__errpf("unsupported format", "Unsupported HDR format");
-
-   // Parse width and height
-   // can't use sscanf() if we're not using stdio!
-   token = stbi__hdr_gettoken(s,buffer);
-   if (strncmp(token, "-Y ", 3))  return stbi__errpf("unsupported data layout", "Unsupported HDR format");
-   token += 3;
-   height = (int) strtol(token, &token, 10);
-   while (*token == ' ') ++token;
-   if (strncmp(token, "+X ", 3))  return stbi__errpf("unsupported data layout", "Unsupported HDR format");
-   token += 3;
-   width = (int) strtol(token, NULL, 10);
-
-   if (height > STBI_MAX_DIMENSIONS) return stbi__errpf("too large","Very large image (corrupt?)");
-   if (width > STBI_MAX_DIMENSIONS) return stbi__errpf("too large","Very large image (corrupt?)");
-
-   *x = width;
-   *y = height;
-
-   if (comp) *comp = 3;
-   if (req_comp == 0) req_comp = 3;
-
-   if (!stbi__mad4sizes_valid(width, height, req_comp, sizeof(float), 0))
-      return stbi__errpf("too large", "HDR image is too large");
-
-   // Read data
-   hdr_data = (float *) stbi__malloc_mad4(width, height, req_comp, sizeof(float), 0);
-   if (!hdr_data)
-      return stbi__errpf("outofmem", "Out of memory");
-
-   // Load image data
-   // image data is stored as some number of sca
-   if ( width < 8 || width >= 32768) {
-      // Read flat data
-      for (j=0; j < height; ++j) {
-         for (i=0; i < width; ++i) {
-            stbi_uc rgbe[4];
-           main_decode_loop:
-            stbi__getn(s, rgbe, 4);
-            stbi__hdr_convert(hdr_data + j * width * req_comp + i * req_comp, rgbe, req_comp);
-         }
-      }
-   } else {
-      // Read RLE-encoded data
-      scanline = NULL;
-
-      for (j = 0; j < height; ++j) {
-         c1 = stbi__get8(s);
-         c2 = stbi__get8(s);
-         len = stbi__get8(s);
-         if (c1 != 2 || c2 != 2 || (len & 0x80)) {
-            // not run-length encoded, so we have to actually use THIS data as a decoded
-            // pixel (note this can't be a valid pixel--one of RGB must be >= 128)
-            stbi_uc rgbe[4];
-            rgbe[0] = (stbi_uc) c1;
-            rgbe[1] = (stbi_uc) c2;
-            rgbe[2] = (stbi_uc) len;
-            rgbe[3] = (stbi_uc) stbi__get8(s);
-            stbi__hdr_convert(hdr_data, rgbe, req_comp);
-            i = 1;
-            j = 0;
-            STBI_FREE(scanline);
-            goto main_decode_loop; // yes, this makes no sense
-         }
-         len <<= 8;
-         len |= stbi__get8(s);
-         if (len != width) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("invalid decoded scanline length", "corrupt HDR"); }
-         if (scanline == NULL) {
-            scanline = (stbi_uc *) stbi__malloc_mad2(width, 4, 0);
-            if (!scanline) {
-               STBI_FREE(hdr_data);
-               return stbi__errpf("outofmem", "Out of memory");
-            }
-         }
-
-         for (k = 0; k < 4; ++k) {
-            int nleft;
-            i = 0;
-            while ((nleft = width - i) > 0) {
-               count = stbi__get8(s);
-               if (count > 128) {
-                  // Run
-                  value = stbi__get8(s);
-                  count -= 128;
-                  if ((count == 0) || (count > nleft)) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); }
-                  for (z = 0; z < count; ++z)
-                     scanline[i++ * 4 + k] = value;
-               } else {
-                  // Dump
-                  if ((count == 0) || (count > nleft)) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); }
-                  for (z = 0; z < count; ++z)
-                     scanline[i++ * 4 + k] = stbi__get8(s);
-               }
-            }
-         }
-         for (i=0; i < width; ++i)
-            stbi__hdr_convert(hdr_data+(j*width + i)*req_comp, scanline + i*4, req_comp);
-      }
-      if (scanline)
-         STBI_FREE(scanline);
-   }
-
-   return hdr_data;
-}
-
-static int stbi__hdr_info(stbi__context *s, int *x, int *y, int *comp)
-{
-   char buffer[STBI__HDR_BUFLEN];
-   char *token;
-   int valid = 0;
-   int dummy;
-
-   if (!x) x = &dummy;
-   if (!y) y = &dummy;
-   if (!comp) comp = &dummy;
-
-   if (stbi__hdr_test(s) == 0) {
-       stbi__rewind( s );
-       return 0;
-   }
-
-   for(;;) {
-      token = stbi__hdr_gettoken(s,buffer);
-      if (token[0] == 0) break;
-      if (strcmp(token, "FORMAT=32-bit_rle_rgbe") == 0) valid = 1;
-   }
-
-   if (!valid) {
-       stbi__rewind( s );
-       return 0;
-   }
-   token = stbi__hdr_gettoken(s,buffer);
-   if (strncmp(token, "-Y ", 3)) {
-       stbi__rewind( s );
-       return 0;
-   }
-   token += 3;
-   *y = (int) strtol(token, &token, 10);
-   while (*token == ' ') ++token;
-   if (strncmp(token, "+X ", 3)) {
-       stbi__rewind( s );
-       return 0;
-   }
-   token += 3;
-   *x = (int) strtol(token, NULL, 10);
-   *comp = 3;
-   return 1;
-}
-#endif // STBI_NO_HDR
-
-#ifndef STBI_NO_BMP
-static int stbi__bmp_info(stbi__context *s, int *x, int *y, int *comp)
-{
-   void *p;
-   stbi__bmp_data info;
-
-   info.all_a = 255;
-   p = stbi__bmp_parse_header(s, &info);
-   if (p == NULL) {
-      stbi__rewind( s );
-      return 0;
-   }
-   if (x) *x = s->img_x;
-   if (y) *y = s->img_y;
-   if (comp) {
-      if (info.bpp == 24 && info.ma == 0xff000000)
-         *comp = 3;
-      else
-         *comp = info.ma ? 4 : 3;
-   }
-   return 1;
-}
-#endif
-
-#ifndef STBI_NO_PSD
-static int stbi__psd_info(stbi__context *s, int *x, int *y, int *comp)
-{
-   int channelCount, dummy, depth;
-   if (!x) x = &dummy;
-   if (!y) y = &dummy;
-   if (!comp) comp = &dummy;
-   if (stbi__get32be(s) != 0x38425053) {
-       stbi__rewind( s );
-       return 0;
-   }
-   if (stbi__get16be(s) != 1) {
-       stbi__rewind( s );
-       return 0;
-   }
-   stbi__skip(s, 6);
-   channelCount = stbi__get16be(s);
-   if (channelCount < 0 || channelCount > 16) {
-       stbi__rewind( s );
-       return 0;
-   }
-   *y = stbi__get32be(s);
-   *x = stbi__get32be(s);
-   depth = stbi__get16be(s);
-   if (depth != 8 && depth != 16) {
-       stbi__rewind( s );
-       return 0;
-   }
-   if (stbi__get16be(s) != 3) {
-       stbi__rewind( s );
-       return 0;
-   }
-   *comp = 4;
-   return 1;
-}
-
-static int stbi__psd_is16(stbi__context *s)
-{
-   int channelCount, depth;
-   if (stbi__get32be(s) != 0x38425053) {
-       stbi__rewind( s );
-       return 0;
-   }
-   if (stbi__get16be(s) != 1) {
-       stbi__rewind( s );
-       return 0;
-   }
-   stbi__skip(s, 6);
-   channelCount = stbi__get16be(s);
-   if (channelCount < 0 || channelCount > 16) {
-       stbi__rewind( s );
-       return 0;
-   }
-   STBI_NOTUSED(stbi__get32be(s));
-   STBI_NOTUSED(stbi__get32be(s));
-   depth = stbi__get16be(s);
-   if (depth != 16) {
-       stbi__rewind( s );
-       return 0;
-   }
-   return 1;
-}
-#endif
-
-#ifndef STBI_NO_PIC
-static int stbi__pic_info(stbi__context *s, int *x, int *y, int *comp)
-{
-   int act_comp=0,num_packets=0,chained,dummy;
-   stbi__pic_packet packets[10];
-
-   if (!x) x = &dummy;
-   if (!y) y = &dummy;
-   if (!comp) comp = &dummy;
-
-   if (!stbi__pic_is4(s,"\x53\x80\xF6\x34")) {
-      stbi__rewind(s);
-      return 0;
-   }
-
-   stbi__skip(s, 88);
-
-   *x = stbi__get16be(s);
-   *y = stbi__get16be(s);
-   if (stbi__at_eof(s)) {
-      stbi__rewind( s);
-      return 0;
-   }
-   if ( (*x) != 0 && (1 << 28) / (*x) < (*y)) {
-      stbi__rewind( s );
-      return 0;
-   }
-
-   stbi__skip(s, 8);
-
-   do {
-      stbi__pic_packet *packet;
-
-      if (num_packets==sizeof(packets)/sizeof(packets[0]))
-         return 0;
-
-      packet = &packets[num_packets++];
-      chained = stbi__get8(s);
-      packet->size    = stbi__get8(s);
-      packet->type    = stbi__get8(s);
-      packet->channel = stbi__get8(s);
-      act_comp |= packet->channel;
-
-      if (stbi__at_eof(s)) {
-          stbi__rewind( s );
-          return 0;
-      }
-      if (packet->size != 8) {
-          stbi__rewind( s );
-          return 0;
-      }
-   } while (chained);
-
-   *comp = (act_comp & 0x10 ? 4 : 3);
-
-   return 1;
-}
-#endif
-
-// *************************************************************************************************
-// Portable Gray Map and Portable Pixel Map loader
-// by Ken Miller
-//
-// PGM: http://netpbm.sourceforge.net/doc/pgm.html
-// PPM: http://netpbm.sourceforge.net/doc/ppm.html
-//
-// Known limitations:
-//    Does not support comments in the header section
-//    Does not support ASCII image data (formats P2 and P3)
-
-#ifndef STBI_NO_PNM
-
-static int      stbi__pnm_test(stbi__context *s)
-{
-   char p, t;
-   p = (char) stbi__get8(s);
-   t = (char) stbi__get8(s);
-   if (p != 'P' || (t != '5' && t != '6')) {
-       stbi__rewind( s );
-       return 0;
-   }
-   return 1;
-}
-
-static void *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
-{
-   stbi_uc *out;
-   STBI_NOTUSED(ri);
-
-   ri->bits_per_channel = stbi__pnm_info(s, (int *)&s->img_x, (int *)&s->img_y, (int *)&s->img_n);
-   if (ri->bits_per_channel == 0)
-      return 0;
-
-   if (s->img_y > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
-   if (s->img_x > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
-
-   *x = s->img_x;
-   *y = s->img_y;
-   if (comp) *comp = s->img_n;
-
-   if (!stbi__mad4sizes_valid(s->img_n, s->img_x, s->img_y, ri->bits_per_channel / 8, 0))
-      return stbi__errpuc("too large", "PNM too large");
-
-   out = (stbi_uc *) stbi__malloc_mad4(s->img_n, s->img_x, s->img_y, ri->bits_per_channel / 8, 0);
-   if (!out) return stbi__errpuc("outofmem", "Out of memory");
-   if (!stbi__getn(s, out, s->img_n * s->img_x * s->img_y * (ri->bits_per_channel / 8))) {
-      STBI_FREE(out);
-      return stbi__errpuc("bad PNM", "PNM file truncated");
-   }
-
-   if (req_comp && req_comp != s->img_n) {
-      if (ri->bits_per_channel == 16) {
-         out = (stbi_uc *) stbi__convert_format16((stbi__uint16 *) out, s->img_n, req_comp, s->img_x, s->img_y);
-      } else {
-         out = stbi__convert_format(out, s->img_n, req_comp, s->img_x, s->img_y);
-      }
-      if (out == NULL) return out; // stbi__convert_format frees input on failure
-   }
-   return out;
-}
-
-static int      stbi__pnm_isspace(char c)
-{
-   return c == ' ' || c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == '\r';
-}
-
-static void     stbi__pnm_skip_whitespace(stbi__context *s, char *c)
-{
-   for (;;) {
-      while (!stbi__at_eof(s) && stbi__pnm_isspace(*c))
-         *c = (char) stbi__get8(s);
-
-      if (stbi__at_eof(s) || *c != '#')
-         break;
-
-      while (!stbi__at_eof(s) && *c != '\n' && *c != '\r' )
-         *c = (char) stbi__get8(s);
-   }
-}
-
-static int      stbi__pnm_isdigit(char c)
-{
-   return c >= '0' && c <= '9';
-}
-
-static int      stbi__pnm_getinteger(stbi__context *s, char *c)
-{
-   int value = 0;
-
-   while (!stbi__at_eof(s) && stbi__pnm_isdigit(*c)) {
-      value = value*10 + (*c - '0');
-      *c = (char) stbi__get8(s);
-      if((value > 214748364) || (value == 214748364 && *c > '7'))
-          return stbi__err("integer parse overflow", "Parsing an integer in the PPM header overflowed a 32-bit int");
-   }
-
-   return value;
-}
-
-static int      stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp)
-{
-   int maxv, dummy;
-   char c, p, t;
-
-   if (!x) x = &dummy;
-   if (!y) y = &dummy;
-   if (!comp) comp = &dummy;
-
-   stbi__rewind(s);
-
-   // Get identifier
-   p = (char) stbi__get8(s);
-   t = (char) stbi__get8(s);
-   if (p != 'P' || (t != '5' && t != '6')) {
-       stbi__rewind(s);
-       return 0;
-   }
-
-   *comp = (t == '6') ? 3 : 1;  // '5' is 1-component .pgm; '6' is 3-component .ppm
-
-   c = (char) stbi__get8(s);
-   stbi__pnm_skip_whitespace(s, &c);
-
-   *x = stbi__pnm_getinteger(s, &c); // read width
-   if(*x == 0)
-       return stbi__err("invalid width", "PPM image header had zero or overflowing width");
-   stbi__pnm_skip_whitespace(s, &c);
-
-   *y = stbi__pnm_getinteger(s, &c); // read height
-   if (*y == 0)
-       return stbi__err("invalid width", "PPM image header had zero or overflowing width");
-   stbi__pnm_skip_whitespace(s, &c);
-
-   maxv = stbi__pnm_getinteger(s, &c);  // read max value
-   if (maxv > 65535)
-      return stbi__err("max value > 65535", "PPM image supports only 8-bit and 16-bit images");
-   else if (maxv > 255)
-      return 16;
-   else
-      return 8;
-}
-
-static int stbi__pnm_is16(stbi__context *s)
-{
-   if (stbi__pnm_info(s, NULL, NULL, NULL) == 16)
-	   return 1;
-   return 0;
-}
-#endif
-
-static int stbi__info_main(stbi__context *s, int *x, int *y, int *comp)
-{
-   #ifndef STBI_NO_JPEG
-   if (stbi__jpeg_info(s, x, y, comp)) return 1;
-   #endif
-
-   #ifndef STBI_NO_PNG
-   if (stbi__png_info(s, x, y, comp))  return 1;
-   #endif
-
-   #ifndef STBI_NO_GIF
-   if (stbi__gif_info(s, x, y, comp))  return 1;
-   #endif
-
-   #ifndef STBI_NO_BMP
-   if (stbi__bmp_info(s, x, y, comp))  return 1;
-   #endif
-
-   #ifndef STBI_NO_PSD
-   if (stbi__psd_info(s, x, y, comp))  return 1;
-   #endif
-
-   #ifndef STBI_NO_PIC
-   if (stbi__pic_info(s, x, y, comp))  return 1;
-   #endif
-
-   #ifndef STBI_NO_PNM
-   if (stbi__pnm_info(s, x, y, comp))  return 1;
-   #endif
-
-   #ifndef STBI_NO_HDR
-   if (stbi__hdr_info(s, x, y, comp))  return 1;
-   #endif
-
-   // test tga last because it's a crappy test!
-   #ifndef STBI_NO_TGA
-   if (stbi__tga_info(s, x, y, comp))
-       return 1;
-   #endif
-   return stbi__err("unknown image type", "Image not of any known type, or corrupt");
-}
-
-static int stbi__is_16_main(stbi__context *s)
-{
-   #ifndef STBI_NO_PNG
-   if (stbi__png_is16(s))  return 1;
-   #endif
-
-   #ifndef STBI_NO_PSD
-   if (stbi__psd_is16(s))  return 1;
-   #endif
-
-   #ifndef STBI_NO_PNM
-   if (stbi__pnm_is16(s))  return 1;
-   #endif
-   return 0;
-}
-
-#ifndef STBI_NO_STDIO
-STBIDEF int stbi_info(char const *filename, int *x, int *y, int *comp)
-{
-    FILE *f = stbi__fopen(filename, "rb");
-    int result;
-    if (!f) return stbi__err("can't fopen", "Unable to open file");
-    result = stbi_info_from_file(f, x, y, comp);
-    fclose(f);
-    return result;
-}
-
-STBIDEF int stbi_info_from_file(FILE *f, int *x, int *y, int *comp)
-{
-   int r;
-   stbi__context s;
-   long pos = ftell(f);
-   stbi__start_file(&s, f);
-   r = stbi__info_main(&s,x,y,comp);
-   fseek(f,pos,SEEK_SET);
-   return r;
-}
-
-STBIDEF int stbi_is_16_bit(char const *filename)
-{
-    FILE *f = stbi__fopen(filename, "rb");
-    int result;
-    if (!f) return stbi__err("can't fopen", "Unable to open file");
-    result = stbi_is_16_bit_from_file(f);
-    fclose(f);
-    return result;
-}
-
-STBIDEF int stbi_is_16_bit_from_file(FILE *f)
-{
-   int r;
-   stbi__context s;
-   long pos = ftell(f);
-   stbi__start_file(&s, f);
-   r = stbi__is_16_main(&s);
-   fseek(f,pos,SEEK_SET);
-   return r;
-}
-#endif // !STBI_NO_STDIO
-
-STBIDEF int stbi_info_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp)
-{
-   stbi__context s;
-   stbi__start_mem(&s,buffer,len);
-   return stbi__info_main(&s,x,y,comp);
-}
-
-STBIDEF int stbi_info_from_callbacks(stbi_io_callbacks const *c, void *user, int *x, int *y, int *comp)
-{
-   stbi__context s;
-   stbi__start_callbacks(&s, (stbi_io_callbacks *) c, user);
-   return stbi__info_main(&s,x,y,comp);
-}
-
-STBIDEF int stbi_is_16_bit_from_memory(stbi_uc const *buffer, int len)
-{
-   stbi__context s;
-   stbi__start_mem(&s,buffer,len);
-   return stbi__is_16_main(&s);
-}
-
-STBIDEF int stbi_is_16_bit_from_callbacks(stbi_io_callbacks const *c, void *user)
-{
-   stbi__context s;
-   stbi__start_callbacks(&s, (stbi_io_callbacks *) c, user);
-   return stbi__is_16_main(&s);
-}
-
-#endif // STB_IMAGE_IMPLEMENTATION
-
-/*
-   revision history:
-      2.20  (2019-02-07) support utf8 filenames in Windows; fix warnings and platform ifdefs
-      2.19  (2018-02-11) fix warning
-      2.18  (2018-01-30) fix warnings
-      2.17  (2018-01-29) change sbti__shiftsigned to avoid clang -O2 bug
-                         1-bit BMP
-                         *_is_16_bit api
-                         avoid warnings
-      2.16  (2017-07-23) all functions have 16-bit variants;
-                         STBI_NO_STDIO works again;
-                         compilation fixes;
-                         fix rounding in unpremultiply;
-                         optimize vertical flip;
-                         disable raw_len validation;
-                         documentation fixes
-      2.15  (2017-03-18) fix png-1,2,4 bug; now all Imagenet JPGs decode;
-                         warning fixes; disable run-time SSE detection on gcc;
-                         uniform handling of optional "return" values;
-                         thread-safe initialization of zlib tables
-      2.14  (2017-03-03) remove deprecated STBI_JPEG_OLD; fixes for Imagenet JPGs
-      2.13  (2016-11-29) add 16-bit API, only supported for PNG right now
-      2.12  (2016-04-02) fix typo in 2.11 PSD fix that caused crashes
-      2.11  (2016-04-02) allocate large structures on the stack
-                         remove white matting for transparent PSD
-                         fix reported channel count for PNG & BMP
-                         re-enable SSE2 in non-gcc 64-bit
-                         support RGB-formatted JPEG
-                         read 16-bit PNGs (only as 8-bit)
-      2.10  (2016-01-22) avoid warning introduced in 2.09 by STBI_REALLOC_SIZED
-      2.09  (2016-01-16) allow comments in PNM files
-                         16-bit-per-pixel TGA (not bit-per-component)
-                         info() for TGA could break due to .hdr handling
-                         info() for BMP to shares code instead of sloppy parse
-                         can use STBI_REALLOC_SIZED if allocator doesn't support realloc
-                         code cleanup
-      2.08  (2015-09-13) fix to 2.07 cleanup, reading RGB PSD as RGBA
-      2.07  (2015-09-13) fix compiler warnings
-                         partial animated GIF support
-                         limited 16-bpc PSD support
-                         #ifdef unused functions
-                         bug with < 92 byte PIC,PNM,HDR,TGA
-      2.06  (2015-04-19) fix bug where PSD returns wrong '*comp' value
-      2.05  (2015-04-19) fix bug in progressive JPEG handling, fix warning
-      2.04  (2015-04-15) try to re-enable SIMD on MinGW 64-bit
-      2.03  (2015-04-12) extra corruption checking (mmozeiko)
-                         stbi_set_flip_vertically_on_load (nguillemot)
-                         fix NEON support; fix mingw support
-      2.02  (2015-01-19) fix incorrect assert, fix warning
-      2.01  (2015-01-17) fix various warnings; suppress SIMD on gcc 32-bit without -msse2
-      2.00b (2014-12-25) fix STBI_MALLOC in progressive JPEG
-      2.00  (2014-12-25) optimize JPG, including x86 SSE2 & NEON SIMD (ryg)
-                         progressive JPEG (stb)
-                         PGM/PPM support (Ken Miller)
-                         STBI_MALLOC,STBI_REALLOC,STBI_FREE
-                         GIF bugfix -- seemingly never worked
-                         STBI_NO_*, STBI_ONLY_*
-      1.48  (2014-12-14) fix incorrectly-named assert()
-      1.47  (2014-12-14) 1/2/4-bit PNG support, both direct and paletted (Omar Cornut & stb)
-                         optimize PNG (ryg)
-                         fix bug in interlaced PNG with user-specified channel count (stb)
-      1.46  (2014-08-26)
-              fix broken tRNS chunk (colorkey-style transparency) in non-paletted PNG
-      1.45  (2014-08-16)
-              fix MSVC-ARM internal compiler error by wrapping malloc
-      1.44  (2014-08-07)
-              various warning fixes from Ronny Chevalier
-      1.43  (2014-07-15)
-              fix MSVC-only compiler problem in code changed in 1.42
-      1.42  (2014-07-09)
-              don't define _CRT_SECURE_NO_WARNINGS (affects user code)
-              fixes to stbi__cleanup_jpeg path
-              added STBI_ASSERT to avoid requiring assert.h
-      1.41  (2014-06-25)
-              fix search&replace from 1.36 that messed up comments/error messages
-      1.40  (2014-06-22)
-              fix gcc struct-initialization warning
-      1.39  (2014-06-15)
-              fix to TGA optimization when req_comp != number of components in TGA;
-              fix to GIF loading because BMP wasn't rewinding (whoops, no GIFs in my test suite)
-              add support for BMP version 5 (more ignored fields)
-      1.38  (2014-06-06)
-              suppress MSVC warnings on integer casts truncating values
-              fix accidental rename of 'skip' field of I/O
-      1.37  (2014-06-04)
-              remove duplicate typedef
-      1.36  (2014-06-03)
-              convert to header file single-file library
-              if de-iphone isn't set, load iphone images color-swapped instead of returning NULL
-      1.35  (2014-05-27)
-              various warnings
-              fix broken STBI_SIMD path
-              fix bug where stbi_load_from_file no longer left file pointer in correct place
-              fix broken non-easy path for 32-bit BMP (possibly never used)
-              TGA optimization by Arseny Kapoulkine
-      1.34  (unknown)
-              use STBI_NOTUSED in stbi__resample_row_generic(), fix one more leak in tga failure case
-      1.33  (2011-07-14)
-              make stbi_is_hdr work in STBI_NO_HDR (as specified), minor compiler-friendly improvements
-      1.32  (2011-07-13)
-              support for "info" function for all supported filetypes (SpartanJ)
-      1.31  (2011-06-20)
-              a few more leak fixes, bug in PNG handling (SpartanJ)
-      1.30  (2011-06-11)
-              added ability to load files via callbacks to accomidate custom input streams (Ben Wenger)
-              removed deprecated format-specific test/load functions
-              removed support for installable file formats (stbi_loader) -- would have been broken for IO callbacks anyway
-              error cases in bmp and tga give messages and don't leak (Raymond Barbiero, grisha)
-              fix inefficiency in decoding 32-bit BMP (David Woo)
-      1.29  (2010-08-16)
-              various warning fixes from Aurelien Pocheville
-      1.28  (2010-08-01)
-              fix bug in GIF palette transparency (SpartanJ)
-      1.27  (2010-08-01)
-              cast-to-stbi_uc to fix warnings
-      1.26  (2010-07-24)
-              fix bug in file buffering for PNG reported by SpartanJ
-      1.25  (2010-07-17)
-              refix trans_data warning (Won Chun)
-      1.24  (2010-07-12)
-              perf improvements reading from files on platforms with lock-heavy fgetc()
-              minor perf improvements for jpeg
-              deprecated type-specific functions so we'll get feedback if they're needed
-              attempt to fix trans_data warning (Won Chun)
-      1.23    fixed bug in iPhone support
-      1.22  (2010-07-10)
-              removed image *writing* support
-              stbi_info support from Jetro Lauha
-              GIF support from Jean-Marc Lienher
-              iPhone PNG-extensions from James Brown
-              warning-fixes from Nicolas Schulz and Janez Zemva (i.stbi__err. Janez (U+017D)emva)
-      1.21    fix use of 'stbi_uc' in header (reported by jon blow)
-      1.20    added support for Softimage PIC, by Tom Seddon
-      1.19    bug in interlaced PNG corruption check (found by ryg)
-      1.18  (2008-08-02)
-              fix a threading bug (local mutable static)
-      1.17    support interlaced PNG
-      1.16    major bugfix - stbi__convert_format converted one too many pixels
-      1.15    initialize some fields for thread safety
-      1.14    fix threadsafe conversion bug
-              header-file-only version (#define STBI_HEADER_FILE_ONLY before including)
-      1.13    threadsafe
-      1.12    const qualifiers in the API
-      1.11    Support installable IDCT, colorspace conversion routines
-      1.10    Fixes for 64-bit (don't use "unsigned long")
-              optimized upsampling by Fabian "ryg" Giesen
-      1.09    Fix format-conversion for PSD code (bad global variables!)
-      1.08    Thatcher Ulrich's PSD code integrated by Nicolas Schulz
-      1.07    attempt to fix C++ warning/errors again
-      1.06    attempt to fix C++ warning/errors again
-      1.05    fix TGA loading to return correct *comp and use good luminance calc
-      1.04    default float alpha is 1, not 255; use 'void *' for stbi_image_free
-      1.03    bugfixes to STBI_NO_STDIO, STBI_NO_HDR
-      1.02    support for (subset of) HDR files, float interface for preferred access to them
-      1.01    fix bug: possible bug in handling right-side up bmps... not sure
-              fix bug: the stbi__bmp_load() and stbi__tga_load() functions didn't work at all
-      1.00    interface to zlib that skips zlib header
-      0.99    correct handling of alpha in palette
-      0.98    TGA loader by lonesock; dynamically add loaders (untested)
-      0.97    jpeg errors on too large a file; also catch another malloc failure
-      0.96    fix detection of invalid v value - particleman@mollyrocket forum
-      0.95    during header scan, seek to markers in case of padding
-      0.94    STBI_NO_STDIO to disable stdio usage; rename all #defines the same
-      0.93    handle jpegtran output; verbose errors
-      0.92    read 4,8,16,24,32-bit BMP files of several formats
-      0.91    output 24-bit Windows 3.0 BMP files
-      0.90    fix a few more warnings; bump version number to approach 1.0
-      0.61    bugfixes due to Marc LeBlanc, Christopher Lloyd
-      0.60    fix compiling as c++
-      0.59    fix warnings: merge Dave Moore's -Wall fixes
-      0.58    fix bug: zlib uncompressed mode len/nlen was wrong endian
-      0.57    fix bug: jpg last huffman symbol before marker was >9 bits but less than 16 available
-      0.56    fix bug: zlib uncompressed mode len vs. nlen
-      0.55    fix bug: restart_interval not initialized to 0
-      0.54    allow NULL for 'int *comp'
-      0.53    fix bug in png 3->4; speedup png decoding
-      0.52    png handles req_comp=3,4 directly; minor cleanup; jpeg comments
-      0.51    obey req_comp requests, 1-component jpegs return as 1-component,
-              on 'test' only check type, not whether we support this variant
-      0.50  (2006-11-19)
-              first released version
-*/
-
-
-/*
-------------------------------------------------------------------------------
-This software is available under 2 licenses -- choose whichever you prefer.
-------------------------------------------------------------------------------
-ALTERNATIVE A - MIT License
-Copyright (c) 2017 Sean Barrett
-Permission is hereby granted, free of charge, to any person obtaining a copy of
-this software and associated documentation files (the "Software"), to deal in
-the Software without restriction, including without limitation the rights to
-use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
-of the Software, and to permit persons to whom the Software is furnished to do
-so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-------------------------------------------------------------------------------
-ALTERNATIVE B - Public Domain (www.unlicense.org)
-This is free and unencumbered software released into the public domain.
-Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
-software, either in source code form or as a compiled binary, for any purpose,
-commercial or non-commercial, and by any means.
-In jurisdictions that recognize copyright laws, the author or authors of this
-software dedicate any and all copyright interest in the software to the public
-domain. We make this dedication for the benefit of the public at large and to
-the detriment of our heirs and successors. We intend this dedication to be an
-overt act of relinquishment in perpetuity of all present and future rights to
-this software under copyright law.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-------------------------------------------------------------------------------
-*/
diff --git a/ggml/examples/stb_image_write.h b/ggml/examples/stb_image_write.h
deleted file mode 100644
index e4b32ed..0000000
--- a/ggml/examples/stb_image_write.h
+++ /dev/null
@@ -1,1724 +0,0 @@
-/* stb_image_write - v1.16 - public domain - http://nothings.org/stb
-   writes out PNG/BMP/TGA/JPEG/HDR images to C stdio - Sean Barrett 2010-2015
-                                     no warranty implied; use at your own risk
-
-   Before #including,
-
-       #define STB_IMAGE_WRITE_IMPLEMENTATION
-
-   in the file that you want to have the implementation.
-
-   Will probably not work correctly with strict-aliasing optimizations.
-
-ABOUT:
-
-   This header file is a library for writing images to C stdio or a callback.
-
-   The PNG output is not optimal; it is 20-50% larger than the file
-   written by a decent optimizing implementation; though providing a custom
-   zlib compress function (see STBIW_ZLIB_COMPRESS) can mitigate that.
-   This library is designed for source code compactness and simplicity,
-   not optimal image file size or run-time performance.
-
-BUILDING:
-
-   You can #define STBIW_ASSERT(x) before the #include to avoid using assert.h.
-   You can #define STBIW_MALLOC(), STBIW_REALLOC(), and STBIW_FREE() to replace
-   malloc,realloc,free.
-   You can #define STBIW_MEMMOVE() to replace memmove()
-   You can #define STBIW_ZLIB_COMPRESS to use a custom zlib-style compress function
-   for PNG compression (instead of the builtin one), it must have the following signature:
-   unsigned char * my_compress(unsigned char *data, int data_len, int *out_len, int quality);
-   The returned data will be freed with STBIW_FREE() (free() by default),
-   so it must be heap allocated with STBIW_MALLOC() (malloc() by default),
-
-UNICODE:
-
-   If compiling for Windows and you wish to use Unicode filenames, compile
-   with
-       #define STBIW_WINDOWS_UTF8
-   and pass utf8-encoded filenames. Call stbiw_convert_wchar_to_utf8 to convert
-   Windows wchar_t filenames to utf8.
-
-USAGE:
-
-   There are five functions, one for each image file format:
-
-     int stbi_write_png(char const *filename, int w, int h, int comp, const void *data, int stride_in_bytes);
-     int stbi_write_bmp(char const *filename, int w, int h, int comp, const void *data);
-     int stbi_write_tga(char const *filename, int w, int h, int comp, const void *data);
-     int stbi_write_jpg(char const *filename, int w, int h, int comp, const void *data, int quality);
-     int stbi_write_hdr(char const *filename, int w, int h, int comp, const float *data);
-
-     void stbi_flip_vertically_on_write(int flag); // flag is non-zero to flip data vertically
-
-   There are also five equivalent functions that use an arbitrary write function. You are
-   expected to open/close your file-equivalent before and after calling these:
-
-     int stbi_write_png_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data, int stride_in_bytes);
-     int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data);
-     int stbi_write_tga_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data);
-     int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const float *data);
-     int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data, int quality);
-
-   where the callback is:
-      void stbi_write_func(void *context, void *data, int size);
-
-   You can configure it with these global variables:
-      int stbi_write_tga_with_rle;             // defaults to true; set to 0 to disable RLE
-      int stbi_write_png_compression_level;    // defaults to 8; set to higher for more compression
-      int stbi_write_force_png_filter;         // defaults to -1; set to 0..5 to force a filter mode
-
-
-   You can define STBI_WRITE_NO_STDIO to disable the file variant of these
-   functions, so the library will not use stdio.h at all. However, this will
-   also disable HDR writing, because it requires stdio for formatted output.
-
-   Each function returns 0 on failure and non-0 on success.
-
-   The functions create an image file defined by the parameters. The image
-   is a rectangle of pixels stored from left-to-right, top-to-bottom.
-   Each pixel contains 'comp' channels of data stored interleaved with 8-bits
-   per channel, in the following order: 1=Y, 2=YA, 3=RGB, 4=RGBA. (Y is
-   monochrome color.) The rectangle is 'w' pixels wide and 'h' pixels tall.
-   The *data pointer points to the first byte of the top-left-most pixel.
-   For PNG, "stride_in_bytes" is the distance in bytes from the first byte of
-   a row of pixels to the first byte of the next row of pixels.
-
-   PNG creates output files with the same number of components as the input.
-   The BMP format expands Y to RGB in the file format and does not
-   output alpha.
-
-   PNG supports writing rectangles of data even when the bytes storing rows of
-   data are not consecutive in memory (e.g. sub-rectangles of a larger image),
-   by supplying the stride between the beginning of adjacent rows. The other
-   formats do not. (Thus you cannot write a native-format BMP through the BMP
-   writer, both because it is in BGR order and because it may have padding
-   at the end of the line.)
-
-   PNG allows you to set the deflate compression level by setting the global
-   variable 'stbi_write_png_compression_level' (it defaults to 8).
-
-   HDR expects linear float data. Since the format is always 32-bit rgb(e)
-   data, alpha (if provided) is discarded, and for monochrome data it is
-   replicated across all three channels.
-
-   TGA supports RLE or non-RLE compressed data. To use non-RLE-compressed
-   data, set the global variable 'stbi_write_tga_with_rle' to 0.
-
-   JPEG does ignore alpha channels in input data; quality is between 1 and 100.
-   Higher quality looks better but results in a bigger image.
-   JPEG baseline (no JPEG progressive).
-
-CREDITS:
-
-
-   Sean Barrett           -    PNG/BMP/TGA
-   Baldur Karlsson        -    HDR
-   Jean-Sebastien Guay    -    TGA monochrome
-   Tim Kelsey             -    misc enhancements
-   Alan Hickman           -    TGA RLE
-   Emmanuel Julien        -    initial file IO callback implementation
-   Jon Olick              -    original jo_jpeg.cpp code
-   Daniel Gibson          -    integrate JPEG, allow external zlib
-   Aarni Koskela          -    allow choosing PNG filter
-
-   bugfixes:
-      github:Chribba
-      Guillaume Chereau
-      github:jry2
-      github:romigrou
-      Sergio Gonzalez
-      Jonas Karlsson
-      Filip Wasil
-      Thatcher Ulrich
-      github:poppolopoppo
-      Patrick Boettcher
-      github:xeekworx
-      Cap Petschulat
-      Simon Rodriguez
-      Ivan Tikhonov
-      github:ignotion
-      Adam Schackart
-      Andrew Kensler
-
-LICENSE
-
-  See end of file for license information.
-
-*/
-
-#ifndef INCLUDE_STB_IMAGE_WRITE_H
-#define INCLUDE_STB_IMAGE_WRITE_H
-
-#include <stdlib.h>
-
-// if STB_IMAGE_WRITE_STATIC causes problems, try defining STBIWDEF to 'inline' or 'static inline'
-#ifndef STBIWDEF
-#ifdef STB_IMAGE_WRITE_STATIC
-#define STBIWDEF  static
-#else
-#ifdef __cplusplus
-#define STBIWDEF  extern "C"
-#else
-#define STBIWDEF  extern
-#endif
-#endif
-#endif
-
-#ifndef STB_IMAGE_WRITE_STATIC  // C++ forbids static forward declarations
-STBIWDEF int stbi_write_tga_with_rle;
-STBIWDEF int stbi_write_png_compression_level;
-STBIWDEF int stbi_write_force_png_filter;
-#endif
-
-#ifndef STBI_WRITE_NO_STDIO
-STBIWDEF int stbi_write_png(char const *filename, int w, int h, int comp, const void  *data, int stride_in_bytes);
-STBIWDEF int stbi_write_bmp(char const *filename, int w, int h, int comp, const void  *data);
-STBIWDEF int stbi_write_tga(char const *filename, int w, int h, int comp, const void  *data);
-STBIWDEF int stbi_write_hdr(char const *filename, int w, int h, int comp, const float *data);
-STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const void  *data, int quality);
-
-#ifdef STBIW_WINDOWS_UTF8
-STBIWDEF int stbiw_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input);
-#endif
-#endif
-
-typedef void stbi_write_func(void *context, void *data, int size);
-
-STBIWDEF int stbi_write_png_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data, int stride_in_bytes);
-STBIWDEF int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data);
-STBIWDEF int stbi_write_tga_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data);
-STBIWDEF int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const float *data);
-STBIWDEF int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void  *data, int quality);
-
-STBIWDEF void stbi_flip_vertically_on_write(int flip_boolean);
-
-#endif//INCLUDE_STB_IMAGE_WRITE_H
-
-#ifdef STB_IMAGE_WRITE_IMPLEMENTATION
-
-#ifdef _WIN32
-   #ifndef _CRT_SECURE_NO_WARNINGS
-   #define _CRT_SECURE_NO_WARNINGS
-   #endif
-   #ifndef _CRT_NONSTDC_NO_DEPRECATE
-   #define _CRT_NONSTDC_NO_DEPRECATE
-   #endif
-#endif
-
-#ifndef STBI_WRITE_NO_STDIO
-#include <stdio.h>
-#endif // STBI_WRITE_NO_STDIO
-
-#include <stdarg.h>
-#include <stdlib.h>
-#include <string.h>
-#include <math.h>
-
-#if defined(STBIW_MALLOC) && defined(STBIW_FREE) && (defined(STBIW_REALLOC) || defined(STBIW_REALLOC_SIZED))
-// ok
-#elif !defined(STBIW_MALLOC) && !defined(STBIW_FREE) && !defined(STBIW_REALLOC) && !defined(STBIW_REALLOC_SIZED)
-// ok
-#else
-#error "Must define all or none of STBIW_MALLOC, STBIW_FREE, and STBIW_REALLOC (or STBIW_REALLOC_SIZED)."
-#endif
-
-#ifndef STBIW_MALLOC
-#define STBIW_MALLOC(sz)        malloc(sz)
-#define STBIW_REALLOC(p,newsz)  realloc(p,newsz)
-#define STBIW_FREE(p)           free(p)
-#endif
-
-#ifndef STBIW_REALLOC_SIZED
-#define STBIW_REALLOC_SIZED(p,oldsz,newsz) STBIW_REALLOC(p,newsz)
-#endif
-
-
-#ifndef STBIW_MEMMOVE
-#define STBIW_MEMMOVE(a,b,sz) memmove(a,b,sz)
-#endif
-
-
-#ifndef STBIW_ASSERT
-#include <assert.h>
-#define STBIW_ASSERT(x) assert(x)
-#endif
-
-#define STBIW_UCHAR(x) (unsigned char) ((x) & 0xff)
-
-#ifdef STB_IMAGE_WRITE_STATIC
-static int stbi_write_png_compression_level = 8;
-static int stbi_write_tga_with_rle = 1;
-static int stbi_write_force_png_filter = -1;
-#else
-int stbi_write_png_compression_level = 8;
-int stbi_write_tga_with_rle = 1;
-int stbi_write_force_png_filter = -1;
-#endif
-
-static int stbi__flip_vertically_on_write = 0;
-
-STBIWDEF void stbi_flip_vertically_on_write(int flag)
-{
-   stbi__flip_vertically_on_write = flag;
-}
-
-typedef struct
-{
-   stbi_write_func *func;
-   void *context;
-   unsigned char buffer[64];
-   int buf_used;
-} stbi__write_context;
-
-// initialize a callback-based context
-static void stbi__start_write_callbacks(stbi__write_context *s, stbi_write_func *c, void *context)
-{
-   s->func    = c;
-   s->context = context;
-}
-
-#ifndef STBI_WRITE_NO_STDIO
-
-static void stbi__stdio_write(void *context, void *data, int size)
-{
-   fwrite(data,1,size,(FILE*) context);
-}
-
-#if defined(_WIN32) && defined(STBIW_WINDOWS_UTF8)
-#ifdef __cplusplus
-#define STBIW_EXTERN extern "C"
-#else
-#define STBIW_EXTERN extern
-#endif
-STBIW_EXTERN __declspec(dllimport) int __stdcall MultiByteToWideChar(unsigned int cp, unsigned long flags, const char *str, int cbmb, wchar_t *widestr, int cchwide);
-STBIW_EXTERN __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int cp, unsigned long flags, const wchar_t *widestr, int cchwide, char *str, int cbmb, const char *defchar, int *used_default);
-
-STBIWDEF int stbiw_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input)
-{
-   return WideCharToMultiByte(65001 /* UTF8 */, 0, input, -1, buffer, (int) bufferlen, NULL, NULL);
-}
-#endif
-
-static FILE *stbiw__fopen(char const *filename, char const *mode)
-{
-   FILE *f;
-#if defined(_WIN32) && defined(STBIW_WINDOWS_UTF8)
-   wchar_t wMode[64];
-   wchar_t wFilename[1024];
-   if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, filename, -1, wFilename, sizeof(wFilename)/sizeof(*wFilename)))
-      return 0;
-
-   if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, mode, -1, wMode, sizeof(wMode)/sizeof(*wMode)))
-      return 0;
-
-#if defined(_MSC_VER) && _MSC_VER >= 1400
-   if (0 != _wfopen_s(&f, wFilename, wMode))
-      f = 0;
-#else
-   f = _wfopen(wFilename, wMode);
-#endif
-
-#elif defined(_MSC_VER) && _MSC_VER >= 1400
-   if (0 != fopen_s(&f, filename, mode))
-      f=0;
-#else
-   f = fopen(filename, mode);
-#endif
-   return f;
-}
-
-static int stbi__start_write_file(stbi__write_context *s, const char *filename)
-{
-   FILE *f = stbiw__fopen(filename, "wb");
-   stbi__start_write_callbacks(s, stbi__stdio_write, (void *) f);
-   return f != NULL;
-}
-
-static void stbi__end_write_file(stbi__write_context *s)
-{
-   fclose((FILE *)s->context);
-}
-
-#endif // !STBI_WRITE_NO_STDIO
-
-typedef unsigned int stbiw_uint32;
-typedef int stb_image_write_test[sizeof(stbiw_uint32)==4 ? 1 : -1];
-
-static void stbiw__writefv(stbi__write_context *s, const char *fmt, va_list v)
-{
-   while (*fmt) {
-      switch (*fmt++) {
-         case ' ': break;
-         case '1': { unsigned char x = STBIW_UCHAR(va_arg(v, int));
-                     s->func(s->context,&x,1);
-                     break; }
-         case '2': { int x = va_arg(v,int);
-                     unsigned char b[2];
-                     b[0] = STBIW_UCHAR(x);
-                     b[1] = STBIW_UCHAR(x>>8);
-                     s->func(s->context,b,2);
-                     break; }
-         case '4': { stbiw_uint32 x = va_arg(v,int);
-                     unsigned char b[4];
-                     b[0]=STBIW_UCHAR(x);
-                     b[1]=STBIW_UCHAR(x>>8);
-                     b[2]=STBIW_UCHAR(x>>16);
-                     b[3]=STBIW_UCHAR(x>>24);
-                     s->func(s->context,b,4);
-                     break; }
-         default:
-            STBIW_ASSERT(0);
-            return;
-      }
-   }
-}
-
-static void stbiw__writef(stbi__write_context *s, const char *fmt, ...)
-{
-   va_list v;
-   va_start(v, fmt);
-   stbiw__writefv(s, fmt, v);
-   va_end(v);
-}
-
-static void stbiw__write_flush(stbi__write_context *s)
-{
-   if (s->buf_used) {
-      s->func(s->context, &s->buffer, s->buf_used);
-      s->buf_used = 0;
-   }
-}
-
-static void stbiw__putc(stbi__write_context *s, unsigned char c)
-{
-   s->func(s->context, &c, 1);
-}
-
-static void stbiw__write1(stbi__write_context *s, unsigned char a)
-{
-   if ((size_t)s->buf_used + 1 > sizeof(s->buffer))
-      stbiw__write_flush(s);
-   s->buffer[s->buf_used++] = a;
-}
-
-static void stbiw__write3(stbi__write_context *s, unsigned char a, unsigned char b, unsigned char c)
-{
-   int n;
-   if ((size_t)s->buf_used + 3 > sizeof(s->buffer))
-      stbiw__write_flush(s);
-   n = s->buf_used;
-   s->buf_used = n+3;
-   s->buffer[n+0] = a;
-   s->buffer[n+1] = b;
-   s->buffer[n+2] = c;
-}
-
-static void stbiw__write_pixel(stbi__write_context *s, int rgb_dir, int comp, int write_alpha, int expand_mono, unsigned char *d)
-{
-   unsigned char bg[3] = { 255, 0, 255}, px[3];
-   int k;
-
-   if (write_alpha < 0)
-      stbiw__write1(s, d[comp - 1]);
-
-   switch (comp) {
-      case 2: // 2 pixels = mono + alpha, alpha is written separately, so same as 1-channel case
-      case 1:
-         if (expand_mono)
-            stbiw__write3(s, d[0], d[0], d[0]); // monochrome bmp
-         else
-            stbiw__write1(s, d[0]);  // monochrome TGA
-         break;
-      case 4:
-         if (!write_alpha) {
-            // composite against pink background
-            for (k = 0; k < 3; ++k)
-               px[k] = bg[k] + ((d[k] - bg[k]) * d[3]) / 255;
-            stbiw__write3(s, px[1 - rgb_dir], px[1], px[1 + rgb_dir]);
-            break;
-         }
-         /* FALLTHROUGH */
-      case 3:
-         stbiw__write3(s, d[1 - rgb_dir], d[1], d[1 + rgb_dir]);
-         break;
-   }
-   if (write_alpha > 0)
-      stbiw__write1(s, d[comp - 1]);
-}
-
-static void stbiw__write_pixels(stbi__write_context *s, int rgb_dir, int vdir, int x, int y, int comp, void *data, int write_alpha, int scanline_pad, int expand_mono)
-{
-   stbiw_uint32 zero = 0;
-   int i,j, j_end;
-
-   if (y <= 0)
-      return;
-
-   if (stbi__flip_vertically_on_write)
-      vdir *= -1;
-
-   if (vdir < 0) {
-      j_end = -1; j = y-1;
-   } else {
-      j_end =  y; j = 0;
-   }
-
-   for (; j != j_end; j += vdir) {
-      for (i=0; i < x; ++i) {
-         unsigned char *d = (unsigned char *) data + (j*x+i)*comp;
-         stbiw__write_pixel(s, rgb_dir, comp, write_alpha, expand_mono, d);
-      }
-      stbiw__write_flush(s);
-      s->func(s->context, &zero, scanline_pad);
-   }
-}
-
-static int stbiw__outfile(stbi__write_context *s, int rgb_dir, int vdir, int x, int y, int comp, int expand_mono, void *data, int alpha, int pad, const char *fmt, ...)
-{
-   if (y < 0 || x < 0) {
-      return 0;
-   } else {
-      va_list v;
-      va_start(v, fmt);
-      stbiw__writefv(s, fmt, v);
-      va_end(v);
-      stbiw__write_pixels(s,rgb_dir,vdir,x,y,comp,data,alpha,pad, expand_mono);
-      return 1;
-   }
-}
-
-static int stbi_write_bmp_core(stbi__write_context *s, int x, int y, int comp, const void *data)
-{
-   if (comp != 4) {
-      // write RGB bitmap
-      int pad = (-x*3) & 3;
-      return stbiw__outfile(s,-1,-1,x,y,comp,1,(void *) data,0,pad,
-              "11 4 22 4" "4 44 22 444444",
-              'B', 'M', 14+40+(x*3+pad)*y, 0,0, 14+40,  // file header
-               40, x,y, 1,24, 0,0,0,0,0,0);             // bitmap header
-   } else {
-      // RGBA bitmaps need a v4 header
-      // use BI_BITFIELDS mode with 32bpp and alpha mask
-      // (straight BI_RGB with alpha mask doesn't work in most readers)
-      return stbiw__outfile(s,-1,-1,x,y,comp,1,(void *)data,1,0,
-         "11 4 22 4" "4 44 22 444444 4444 4 444 444 444 444",
-         'B', 'M', 14+108+x*y*4, 0, 0, 14+108, // file header
-         108, x,y, 1,32, 3,0,0,0,0,0, 0xff0000,0xff00,0xff,0xff000000u, 0, 0,0,0, 0,0,0, 0,0,0, 0,0,0); // bitmap V4 header
-   }
-}
-
-STBIWDEF int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data)
-{
-   stbi__write_context s = { 0 };
-   stbi__start_write_callbacks(&s, func, context);
-   return stbi_write_bmp_core(&s, x, y, comp, data);
-}
-
-#ifndef STBI_WRITE_NO_STDIO
-STBIWDEF int stbi_write_bmp(char const *filename, int x, int y, int comp, const void *data)
-{
-   stbi__write_context s = { 0 };
-   if (stbi__start_write_file(&s,filename)) {
-      int r = stbi_write_bmp_core(&s, x, y, comp, data);
-      stbi__end_write_file(&s);
-      return r;
-   } else
-      return 0;
-}
-#endif //!STBI_WRITE_NO_STDIO
-
-static int stbi_write_tga_core(stbi__write_context *s, int x, int y, int comp, void *data)
-{
-   int has_alpha = (comp == 2 || comp == 4);
-   int colorbytes = has_alpha ? comp-1 : comp;
-   int format = colorbytes < 2 ? 3 : 2; // 3 color channels (RGB/RGBA) = 2, 1 color channel (Y/YA) = 3
-
-   if (y < 0 || x < 0)
-      return 0;
-
-   if (!stbi_write_tga_with_rle) {
-      return stbiw__outfile(s, -1, -1, x, y, comp, 0, (void *) data, has_alpha, 0,
-         "111 221 2222 11", 0, 0, format, 0, 0, 0, 0, 0, x, y, (colorbytes + has_alpha) * 8, has_alpha * 8);
-   } else {
-      int i,j,k;
-      int jend, jdir;
-
-      stbiw__writef(s, "111 221 2222 11", 0,0,format+8, 0,0,0, 0,0,x,y, (colorbytes + has_alpha) * 8, has_alpha * 8);
-
-      if (stbi__flip_vertically_on_write) {
-         j = 0;
-         jend = y;
-         jdir = 1;
-      } else {
-         j = y-1;
-         jend = -1;
-         jdir = -1;
-      }
-      for (; j != jend; j += jdir) {
-         unsigned char *row = (unsigned char *) data + j * x * comp;
-         int len;
-
-         for (i = 0; i < x; i += len) {
-            unsigned char *begin = row + i * comp;
-            int diff = 1;
-            len = 1;
-
-            if (i < x - 1) {
-               ++len;
-               diff = memcmp(begin, row + (i + 1) * comp, comp);
-               if (diff) {
-                  const unsigned char *prev = begin;
-                  for (k = i + 2; k < x && len < 128; ++k) {
-                     if (memcmp(prev, row + k * comp, comp)) {
-                        prev += comp;
-                        ++len;
-                     } else {
-                        --len;
-                        break;
-                     }
-                  }
-               } else {
-                  for (k = i + 2; k < x && len < 128; ++k) {
-                     if (!memcmp(begin, row + k * comp, comp)) {
-                        ++len;
-                     } else {
-                        break;
-                     }
-                  }
-               }
-            }
-
-            if (diff) {
-               unsigned char header = STBIW_UCHAR(len - 1);
-               stbiw__write1(s, header);
-               for (k = 0; k < len; ++k) {
-                  stbiw__write_pixel(s, -1, comp, has_alpha, 0, begin + k * comp);
-               }
-            } else {
-               unsigned char header = STBIW_UCHAR(len - 129);
-               stbiw__write1(s, header);
-               stbiw__write_pixel(s, -1, comp, has_alpha, 0, begin);
-            }
-         }
-      }
-      stbiw__write_flush(s);
-   }
-   return 1;
-}
-
-STBIWDEF int stbi_write_tga_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data)
-{
-   stbi__write_context s = { 0 };
-   stbi__start_write_callbacks(&s, func, context);
-   return stbi_write_tga_core(&s, x, y, comp, (void *) data);
-}
-
-#ifndef STBI_WRITE_NO_STDIO
-STBIWDEF int stbi_write_tga(char const *filename, int x, int y, int comp, const void *data)
-{
-   stbi__write_context s = { 0 };
-   if (stbi__start_write_file(&s,filename)) {
-      int r = stbi_write_tga_core(&s, x, y, comp, (void *) data);
-      stbi__end_write_file(&s);
-      return r;
-   } else
-      return 0;
-}
-#endif
-
-// *************************************************************************************************
-// Radiance RGBE HDR writer
-// by Baldur Karlsson
-
-#define stbiw__max(a, b)  ((a) > (b) ? (a) : (b))
-
-#ifndef STBI_WRITE_NO_STDIO
-
-static void stbiw__linear_to_rgbe(unsigned char *rgbe, float *linear)
-{
-   int exponent;
-   float maxcomp = stbiw__max(linear[0], stbiw__max(linear[1], linear[2]));
-
-   if (maxcomp < 1e-32f) {
-      rgbe[0] = rgbe[1] = rgbe[2] = rgbe[3] = 0;
-   } else {
-      float normalize = (float) frexp(maxcomp, &exponent) * 256.0f/maxcomp;
-
-      rgbe[0] = (unsigned char)(linear[0] * normalize);
-      rgbe[1] = (unsigned char)(linear[1] * normalize);
-      rgbe[2] = (unsigned char)(linear[2] * normalize);
-      rgbe[3] = (unsigned char)(exponent + 128);
-   }
-}
-
-static void stbiw__write_run_data(stbi__write_context *s, int length, unsigned char databyte)
-{
-   unsigned char lengthbyte = STBIW_UCHAR(length+128);
-   STBIW_ASSERT(length+128 <= 255);
-   s->func(s->context, &lengthbyte, 1);
-   s->func(s->context, &databyte, 1);
-}
-
-static void stbiw__write_dump_data(stbi__write_context *s, int length, unsigned char *data)
-{
-   unsigned char lengthbyte = STBIW_UCHAR(length);
-   STBIW_ASSERT(length <= 128); // inconsistent with spec but consistent with official code
-   s->func(s->context, &lengthbyte, 1);
-   s->func(s->context, data, length);
-}
-
-static void stbiw__write_hdr_scanline(stbi__write_context *s, int width, int ncomp, unsigned char *scratch, float *scanline)
-{
-   unsigned char scanlineheader[4] = { 2, 2, 0, 0 };
-   unsigned char rgbe[4];
-   float linear[3];
-   int x;
-
-   scanlineheader[2] = (width&0xff00)>>8;
-   scanlineheader[3] = (width&0x00ff);
-
-   /* skip RLE for images too small or large */
-   if (width < 8 || width >= 32768) {
-      for (x=0; x < width; x++) {
-         switch (ncomp) {
-            case 4: /* fallthrough */
-            case 3: linear[2] = scanline[x*ncomp + 2];
-                    linear[1] = scanline[x*ncomp + 1];
-                    linear[0] = scanline[x*ncomp + 0];
-                    break;
-            default:
-                    linear[0] = linear[1] = linear[2] = scanline[x*ncomp + 0];
-                    break;
-         }
-         stbiw__linear_to_rgbe(rgbe, linear);
-         s->func(s->context, rgbe, 4);
-      }
-   } else {
-      int c,r;
-      /* encode into scratch buffer */
-      for (x=0; x < width; x++) {
-         switch(ncomp) {
-            case 4: /* fallthrough */
-            case 3: linear[2] = scanline[x*ncomp + 2];
-                    linear[1] = scanline[x*ncomp + 1];
-                    linear[0] = scanline[x*ncomp + 0];
-                    break;
-            default:
-                    linear[0] = linear[1] = linear[2] = scanline[x*ncomp + 0];
-                    break;
-         }
-         stbiw__linear_to_rgbe(rgbe, linear);
-         scratch[x + width*0] = rgbe[0];
-         scratch[x + width*1] = rgbe[1];
-         scratch[x + width*2] = rgbe[2];
-         scratch[x + width*3] = rgbe[3];
-      }
-
-      s->func(s->context, scanlineheader, 4);
-
-      /* RLE each component separately */
-      for (c=0; c < 4; c++) {
-         unsigned char *comp = &scratch[width*c];
-
-         x = 0;
-         while (x < width) {
-            // find first run
-            r = x;
-            while (r+2 < width) {
-               if (comp[r] == comp[r+1] && comp[r] == comp[r+2])
-                  break;
-               ++r;
-            }
-            if (r+2 >= width)
-               r = width;
-            // dump up to first run
-            while (x < r) {
-               int len = r-x;
-               if (len > 128) len = 128;
-               stbiw__write_dump_data(s, len, &comp[x]);
-               x += len;
-            }
-            // if there's a run, output it
-            if (r+2 < width) { // same test as what we break out of in search loop, so only true if we break'd
-               // find next byte after run
-               while (r < width && comp[r] == comp[x])
-                  ++r;
-               // output run up to r
-               while (x < r) {
-                  int len = r-x;
-                  if (len > 127) len = 127;
-                  stbiw__write_run_data(s, len, comp[x]);
-                  x += len;
-               }
-            }
-         }
-      }
-   }
-}
-
-static int stbi_write_hdr_core(stbi__write_context *s, int x, int y, int comp, float *data)
-{
-   if (y <= 0 || x <= 0 || data == NULL)
-      return 0;
-   else {
-      // Each component is stored separately. Allocate scratch space for full output scanline.
-      unsigned char *scratch = (unsigned char *) STBIW_MALLOC(x*4);
-      int i, len;
-      char buffer[128];
-      char header[] = "#?RADIANCE\n# Written by stb_image_write.h\nFORMAT=32-bit_rle_rgbe\n";
-      s->func(s->context, header, sizeof(header)-1);
-
-#ifdef __STDC_LIB_EXT1__
-      len = sprintf_s(buffer, sizeof(buffer), "EXPOSURE=          1.0000000000000\n\n-Y %d +X %d\n", y, x);
-#else
-      len = sprintf(buffer, "EXPOSURE=          1.0000000000000\n\n-Y %d +X %d\n", y, x);
-#endif
-      s->func(s->context, buffer, len);
-
-      for(i=0; i < y; i++)
-         stbiw__write_hdr_scanline(s, x, comp, scratch, data + comp*x*(stbi__flip_vertically_on_write ? y-1-i : i));
-      STBIW_FREE(scratch);
-      return 1;
-   }
-}
-
-STBIWDEF int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const float *data)
-{
-   stbi__write_context s = { 0 };
-   stbi__start_write_callbacks(&s, func, context);
-   return stbi_write_hdr_core(&s, x, y, comp, (float *) data);
-}
-
-STBIWDEF int stbi_write_hdr(char const *filename, int x, int y, int comp, const float *data)
-{
-   stbi__write_context s = { 0 };
-   if (stbi__start_write_file(&s,filename)) {
-      int r = stbi_write_hdr_core(&s, x, y, comp, (float *) data);
-      stbi__end_write_file(&s);
-      return r;
-   } else
-      return 0;
-}
-#endif // STBI_WRITE_NO_STDIO
-
-
-//////////////////////////////////////////////////////////////////////////////
-//
-// PNG writer
-//
-
-#ifndef STBIW_ZLIB_COMPRESS
-// stretchy buffer; stbiw__sbpush() == vector<>::push_back() -- stbiw__sbcount() == vector<>::size()
-#define stbiw__sbraw(a) ((int *) (void *) (a) - 2)
-#define stbiw__sbm(a)   stbiw__sbraw(a)[0]
-#define stbiw__sbn(a)   stbiw__sbraw(a)[1]
-
-#define stbiw__sbneedgrow(a,n)  ((a)==0 || stbiw__sbn(a)+n >= stbiw__sbm(a))
-#define stbiw__sbmaybegrow(a,n) (stbiw__sbneedgrow(a,(n)) ? stbiw__sbgrow(a,n) : 0)
-#define stbiw__sbgrow(a,n)  stbiw__sbgrowf((void **) &(a), (n), sizeof(*(a)))
-
-#define stbiw__sbpush(a, v)      (stbiw__sbmaybegrow(a,1), (a)[stbiw__sbn(a)++] = (v))
-#define stbiw__sbcount(a)        ((a) ? stbiw__sbn(a) : 0)
-#define stbiw__sbfree(a)         ((a) ? STBIW_FREE(stbiw__sbraw(a)),0 : 0)
-
-static void *stbiw__sbgrowf(void **arr, int increment, int itemsize)
-{
-   int m = *arr ? 2*stbiw__sbm(*arr)+increment : increment+1;
-   void *p = STBIW_REALLOC_SIZED(*arr ? stbiw__sbraw(*arr) : 0, *arr ? (stbiw__sbm(*arr)*itemsize + sizeof(int)*2) : 0, itemsize * m + sizeof(int)*2);
-   STBIW_ASSERT(p);
-   if (p) {
-      if (!*arr) ((int *) p)[1] = 0;
-      *arr = (void *) ((int *) p + 2);
-      stbiw__sbm(*arr) = m;
-   }
-   return *arr;
-}
-
-static unsigned char *stbiw__zlib_flushf(unsigned char *data, unsigned int *bitbuffer, int *bitcount)
-{
-   while (*bitcount >= 8) {
-      stbiw__sbpush(data, STBIW_UCHAR(*bitbuffer));
-      *bitbuffer >>= 8;
-      *bitcount -= 8;
-   }
-   return data;
-}
-
-static int stbiw__zlib_bitrev(int code, int codebits)
-{
-   int res=0;
-   while (codebits--) {
-      res = (res << 1) | (code & 1);
-      code >>= 1;
-   }
-   return res;
-}
-
-static unsigned int stbiw__zlib_countm(unsigned char *a, unsigned char *b, int limit)
-{
-   int i;
-   for (i=0; i < limit && i < 258; ++i)
-      if (a[i] != b[i]) break;
-   return i;
-}
-
-static unsigned int stbiw__zhash(unsigned char *data)
-{
-   stbiw_uint32 hash = data[0] + (data[1] << 8) + (data[2] << 16);
-   hash ^= hash << 3;
-   hash += hash >> 5;
-   hash ^= hash << 4;
-   hash += hash >> 17;
-   hash ^= hash << 25;
-   hash += hash >> 6;
-   return hash;
-}
-
-#define stbiw__zlib_flush() (out = stbiw__zlib_flushf(out, &bitbuf, &bitcount))
-#define stbiw__zlib_add(code,codebits) \
-      (bitbuf |= (code) << bitcount, bitcount += (codebits), stbiw__zlib_flush())
-#define stbiw__zlib_huffa(b,c)  stbiw__zlib_add(stbiw__zlib_bitrev(b,c),c)
-// default huffman tables
-#define stbiw__zlib_huff1(n)  stbiw__zlib_huffa(0x30 + (n), 8)
-#define stbiw__zlib_huff2(n)  stbiw__zlib_huffa(0x190 + (n)-144, 9)
-#define stbiw__zlib_huff3(n)  stbiw__zlib_huffa(0 + (n)-256,7)
-#define stbiw__zlib_huff4(n)  stbiw__zlib_huffa(0xc0 + (n)-280,8)
-#define stbiw__zlib_huff(n)  ((n) <= 143 ? stbiw__zlib_huff1(n) : (n) <= 255 ? stbiw__zlib_huff2(n) : (n) <= 279 ? stbiw__zlib_huff3(n) : stbiw__zlib_huff4(n))
-#define stbiw__zlib_huffb(n) ((n) <= 143 ? stbiw__zlib_huff1(n) : stbiw__zlib_huff2(n))
-
-#define stbiw__ZHASH   16384
-
-#endif // STBIW_ZLIB_COMPRESS
-
-STBIWDEF unsigned char * stbi_zlib_compress(unsigned char *data, int data_len, int *out_len, int quality)
-{
-#ifdef STBIW_ZLIB_COMPRESS
-   // user provided a zlib compress implementation, use that
-   return STBIW_ZLIB_COMPRESS(data, data_len, out_len, quality);
-#else // use builtin
-   static unsigned short lengthc[] = { 3,4,5,6,7,8,9,10,11,13,15,17,19,23,27,31,35,43,51,59,67,83,99,115,131,163,195,227,258, 259 };
-   static unsigned char  lengtheb[]= { 0,0,0,0,0,0,0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4,  4,  5,  5,  5,  5,  0 };
-   static unsigned short distc[]   = { 1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193,257,385,513,769,1025,1537,2049,3073,4097,6145,8193,12289,16385,24577, 32768 };
-   static unsigned char  disteb[]  = { 0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13 };
-   unsigned int bitbuf=0;
-   int i,j, bitcount=0;
-   unsigned char *out = NULL;
-   unsigned char ***hash_table = (unsigned char***) STBIW_MALLOC(stbiw__ZHASH * sizeof(unsigned char**));
-   if (hash_table == NULL)
-      return NULL;
-   if (quality < 5) quality = 5;
-
-   stbiw__sbpush(out, 0x78);   // DEFLATE 32K window
-   stbiw__sbpush(out, 0x5e);   // FLEVEL = 1
-   stbiw__zlib_add(1,1);  // BFINAL = 1
-   stbiw__zlib_add(1,2);  // BTYPE = 1 -- fixed huffman
-
-   for (i=0; i < stbiw__ZHASH; ++i)
-      hash_table[i] = NULL;
-
-   i=0;
-   while (i < data_len-3) {
-      // hash next 3 bytes of data to be compressed
-      int h = stbiw__zhash(data+i)&(stbiw__ZHASH-1), best=3;
-      unsigned char *bestloc = 0;
-      unsigned char **hlist = hash_table[h];
-      int n = stbiw__sbcount(hlist);
-      for (j=0; j < n; ++j) {
-         if (hlist[j]-data > i-32768) { // if entry lies within window
-            int d = stbiw__zlib_countm(hlist[j], data+i, data_len-i);
-            if (d >= best) { best=d; bestloc=hlist[j]; }
-         }
-      }
-      // when hash table entry is too long, delete half the entries
-      if (hash_table[h] && stbiw__sbn(hash_table[h]) == 2*quality) {
-         STBIW_MEMMOVE(hash_table[h], hash_table[h]+quality, sizeof(hash_table[h][0])*quality);
-         stbiw__sbn(hash_table[h]) = quality;
-      }
-      stbiw__sbpush(hash_table[h],data+i);
-
-      if (bestloc) {
-         // "lazy matching" - check match at *next* byte, and if it's better, do cur byte as literal
-         h = stbiw__zhash(data+i+1)&(stbiw__ZHASH-1);
-         hlist = hash_table[h];
-         n = stbiw__sbcount(hlist);
-         for (j=0; j < n; ++j) {
-            if (hlist[j]-data > i-32767) {
-               int e = stbiw__zlib_countm(hlist[j], data+i+1, data_len-i-1);
-               if (e > best) { // if next match is better, bail on current match
-                  bestloc = NULL;
-                  break;
-               }
-            }
-         }
-      }
-
-      if (bestloc) {
-         int d = (int) (data+i - bestloc); // distance back
-         STBIW_ASSERT(d <= 32767 && best <= 258);
-         for (j=0; best > lengthc[j+1]-1; ++j);
-         stbiw__zlib_huff(j+257);
-         if (lengtheb[j]) stbiw__zlib_add(best - lengthc[j], lengtheb[j]);
-         for (j=0; d > distc[j+1]-1; ++j);
-         stbiw__zlib_add(stbiw__zlib_bitrev(j,5),5);
-         if (disteb[j]) stbiw__zlib_add(d - distc[j], disteb[j]);
-         i += best;
-      } else {
-         stbiw__zlib_huffb(data[i]);
-         ++i;
-      }
-   }
-   // write out final bytes
-   for (;i < data_len; ++i)
-      stbiw__zlib_huffb(data[i]);
-   stbiw__zlib_huff(256); // end of block
-   // pad with 0 bits to byte boundary
-   while (bitcount)
-      stbiw__zlib_add(0,1);
-
-   for (i=0; i < stbiw__ZHASH; ++i)
-      (void) stbiw__sbfree(hash_table[i]);
-   STBIW_FREE(hash_table);
-
-   // store uncompressed instead if compression was worse
-   if (stbiw__sbn(out) > data_len + 2 + ((data_len+32766)/32767)*5) {
-      stbiw__sbn(out) = 2;  // truncate to DEFLATE 32K window and FLEVEL = 1
-      for (j = 0; j < data_len;) {
-         int blocklen = data_len - j;
-         if (blocklen > 32767) blocklen = 32767;
-         stbiw__sbpush(out, data_len - j == blocklen); // BFINAL = ?, BTYPE = 0 -- no compression
-         stbiw__sbpush(out, STBIW_UCHAR(blocklen)); // LEN
-         stbiw__sbpush(out, STBIW_UCHAR(blocklen >> 8));
-         stbiw__sbpush(out, STBIW_UCHAR(~blocklen)); // NLEN
-         stbiw__sbpush(out, STBIW_UCHAR(~blocklen >> 8));
-         memcpy(out+stbiw__sbn(out), data+j, blocklen);
-         stbiw__sbn(out) += blocklen;
-         j += blocklen;
-      }
-   }
-
-   {
-      // compute adler32 on input
-      unsigned int s1=1, s2=0;
-      int blocklen = (int) (data_len % 5552);
-      j=0;
-      while (j < data_len) {
-         for (i=0; i < blocklen; ++i) { s1 += data[j+i]; s2 += s1; }
-         s1 %= 65521; s2 %= 65521;
-         j += blocklen;
-         blocklen = 5552;
-      }
-      stbiw__sbpush(out, STBIW_UCHAR(s2 >> 8));
-      stbiw__sbpush(out, STBIW_UCHAR(s2));
-      stbiw__sbpush(out, STBIW_UCHAR(s1 >> 8));
-      stbiw__sbpush(out, STBIW_UCHAR(s1));
-   }
-   *out_len = stbiw__sbn(out);
-   // make returned pointer freeable
-   STBIW_MEMMOVE(stbiw__sbraw(out), out, *out_len);
-   return (unsigned char *) stbiw__sbraw(out);
-#endif // STBIW_ZLIB_COMPRESS
-}
-
-static unsigned int stbiw__crc32(unsigned char *buffer, int len)
-{
-#ifdef STBIW_CRC32
-    return STBIW_CRC32(buffer, len);
-#else
-   static unsigned int crc_table[256] =
-   {
-      0x00000000, 0x77073096, 0xEE0E612C, 0x990951BA, 0x076DC419, 0x706AF48F, 0xE963A535, 0x9E6495A3,
-      0x0eDB8832, 0x79DCB8A4, 0xE0D5E91E, 0x97D2D988, 0x09B64C2B, 0x7EB17CBD, 0xE7B82D07, 0x90BF1D91,
-      0x1DB71064, 0x6AB020F2, 0xF3B97148, 0x84BE41DE, 0x1ADAD47D, 0x6DDDE4EB, 0xF4D4B551, 0x83D385C7,
-      0x136C9856, 0x646BA8C0, 0xFD62F97A, 0x8A65C9EC, 0x14015C4F, 0x63066CD9, 0xFA0F3D63, 0x8D080DF5,
-      0x3B6E20C8, 0x4C69105E, 0xD56041E4, 0xA2677172, 0x3C03E4D1, 0x4B04D447, 0xD20D85FD, 0xA50AB56B,
-      0x35B5A8FA, 0x42B2986C, 0xDBBBC9D6, 0xACBCF940, 0x32D86CE3, 0x45DF5C75, 0xDCD60DCF, 0xABD13D59,
-      0x26D930AC, 0x51DE003A, 0xC8D75180, 0xBFD06116, 0x21B4F4B5, 0x56B3C423, 0xCFBA9599, 0xB8BDA50F,
-      0x2802B89E, 0x5F058808, 0xC60CD9B2, 0xB10BE924, 0x2F6F7C87, 0x58684C11, 0xC1611DAB, 0xB6662D3D,
-      0x76DC4190, 0x01DB7106, 0x98D220BC, 0xEFD5102A, 0x71B18589, 0x06B6B51F, 0x9FBFE4A5, 0xE8B8D433,
-      0x7807C9A2, 0x0F00F934, 0x9609A88E, 0xE10E9818, 0x7F6A0DBB, 0x086D3D2D, 0x91646C97, 0xE6635C01,
-      0x6B6B51F4, 0x1C6C6162, 0x856530D8, 0xF262004E, 0x6C0695ED, 0x1B01A57B, 0x8208F4C1, 0xF50FC457,
-      0x65B0D9C6, 0x12B7E950, 0x8BBEB8EA, 0xFCB9887C, 0x62DD1DDF, 0x15DA2D49, 0x8CD37CF3, 0xFBD44C65,
-      0x4DB26158, 0x3AB551CE, 0xA3BC0074, 0xD4BB30E2, 0x4ADFA541, 0x3DD895D7, 0xA4D1C46D, 0xD3D6F4FB,
-      0x4369E96A, 0x346ED9FC, 0xAD678846, 0xDA60B8D0, 0x44042D73, 0x33031DE5, 0xAA0A4C5F, 0xDD0D7CC9,
-      0x5005713C, 0x270241AA, 0xBE0B1010, 0xC90C2086, 0x5768B525, 0x206F85B3, 0xB966D409, 0xCE61E49F,
-      0x5EDEF90E, 0x29D9C998, 0xB0D09822, 0xC7D7A8B4, 0x59B33D17, 0x2EB40D81, 0xB7BD5C3B, 0xC0BA6CAD,
-      0xEDB88320, 0x9ABFB3B6, 0x03B6E20C, 0x74B1D29A, 0xEAD54739, 0x9DD277AF, 0x04DB2615, 0x73DC1683,
-      0xE3630B12, 0x94643B84, 0x0D6D6A3E, 0x7A6A5AA8, 0xE40ECF0B, 0x9309FF9D, 0x0A00AE27, 0x7D079EB1,
-      0xF00F9344, 0x8708A3D2, 0x1E01F268, 0x6906C2FE, 0xF762575D, 0x806567CB, 0x196C3671, 0x6E6B06E7,
-      0xFED41B76, 0x89D32BE0, 0x10DA7A5A, 0x67DD4ACC, 0xF9B9DF6F, 0x8EBEEFF9, 0x17B7BE43, 0x60B08ED5,
-      0xD6D6A3E8, 0xA1D1937E, 0x38D8C2C4, 0x4FDFF252, 0xD1BB67F1, 0xA6BC5767, 0x3FB506DD, 0x48B2364B,
-      0xD80D2BDA, 0xAF0A1B4C, 0x36034AF6, 0x41047A60, 0xDF60EFC3, 0xA867DF55, 0x316E8EEF, 0x4669BE79,
-      0xCB61B38C, 0xBC66831A, 0x256FD2A0, 0x5268E236, 0xCC0C7795, 0xBB0B4703, 0x220216B9, 0x5505262F,
-      0xC5BA3BBE, 0xB2BD0B28, 0x2BB45A92, 0x5CB36A04, 0xC2D7FFA7, 0xB5D0CF31, 0x2CD99E8B, 0x5BDEAE1D,
-      0x9B64C2B0, 0xEC63F226, 0x756AA39C, 0x026D930A, 0x9C0906A9, 0xEB0E363F, 0x72076785, 0x05005713,
-      0x95BF4A82, 0xE2B87A14, 0x7BB12BAE, 0x0CB61B38, 0x92D28E9B, 0xE5D5BE0D, 0x7CDCEFB7, 0x0BDBDF21,
-      0x86D3D2D4, 0xF1D4E242, 0x68DDB3F8, 0x1FDA836E, 0x81BE16CD, 0xF6B9265B, 0x6FB077E1, 0x18B74777,
-      0x88085AE6, 0xFF0F6A70, 0x66063BCA, 0x11010B5C, 0x8F659EFF, 0xF862AE69, 0x616BFFD3, 0x166CCF45,
-      0xA00AE278, 0xD70DD2EE, 0x4E048354, 0x3903B3C2, 0xA7672661, 0xD06016F7, 0x4969474D, 0x3E6E77DB,
-      0xAED16A4A, 0xD9D65ADC, 0x40DF0B66, 0x37D83BF0, 0xA9BCAE53, 0xDEBB9EC5, 0x47B2CF7F, 0x30B5FFE9,
-      0xBDBDF21C, 0xCABAC28A, 0x53B39330, 0x24B4A3A6, 0xBAD03605, 0xCDD70693, 0x54DE5729, 0x23D967BF,
-      0xB3667A2E, 0xC4614AB8, 0x5D681B02, 0x2A6F2B94, 0xB40BBE37, 0xC30C8EA1, 0x5A05DF1B, 0x2D02EF8D
-   };
-
-   unsigned int crc = ~0u;
-   int i;
-   for (i=0; i < len; ++i)
-      crc = (crc >> 8) ^ crc_table[buffer[i] ^ (crc & 0xff)];
-   return ~crc;
-#endif
-}
-
-#define stbiw__wpng4(o,a,b,c,d) ((o)[0]=STBIW_UCHAR(a),(o)[1]=STBIW_UCHAR(b),(o)[2]=STBIW_UCHAR(c),(o)[3]=STBIW_UCHAR(d),(o)+=4)
-#define stbiw__wp32(data,v) stbiw__wpng4(data, (v)>>24,(v)>>16,(v)>>8,(v));
-#define stbiw__wptag(data,s) stbiw__wpng4(data, s[0],s[1],s[2],s[3])
-
-static void stbiw__wpcrc(unsigned char **data, int len)
-{
-   unsigned int crc = stbiw__crc32(*data - len - 4, len+4);
-   stbiw__wp32(*data, crc);
-}
-
-static unsigned char stbiw__paeth(int a, int b, int c)
-{
-   int p = a + b - c, pa = abs(p-a), pb = abs(p-b), pc = abs(p-c);
-   if (pa <= pb && pa <= pc) return STBIW_UCHAR(a);
-   if (pb <= pc) return STBIW_UCHAR(b);
-   return STBIW_UCHAR(c);
-}
-
-// @OPTIMIZE: provide an option that always forces left-predict or paeth predict
-static void stbiw__encode_png_line(unsigned char *pixels, int stride_bytes, int width, int height, int y, int n, int filter_type, signed char *line_buffer)
-{
-   static int mapping[] = { 0,1,2,3,4 };
-   static int firstmap[] = { 0,1,0,5,6 };
-   int *mymap = (y != 0) ? mapping : firstmap;
-   int i;
-   int type = mymap[filter_type];
-   unsigned char *z = pixels + stride_bytes * (stbi__flip_vertically_on_write ? height-1-y : y);
-   int signed_stride = stbi__flip_vertically_on_write ? -stride_bytes : stride_bytes;
-
-   if (type==0) {
-      memcpy(line_buffer, z, width*n);
-      return;
-   }
-
-   // first loop isn't optimized since it's just one pixel
-   for (i = 0; i < n; ++i) {
-      switch (type) {
-         case 1: line_buffer[i] = z[i]; break;
-         case 2: line_buffer[i] = z[i] - z[i-signed_stride]; break;
-         case 3: line_buffer[i] = z[i] - (z[i-signed_stride]>>1); break;
-         case 4: line_buffer[i] = (signed char) (z[i] - stbiw__paeth(0,z[i-signed_stride],0)); break;
-         case 5: line_buffer[i] = z[i]; break;
-         case 6: line_buffer[i] = z[i]; break;
-      }
-   }
-   switch (type) {
-      case 1: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - z[i-n]; break;
-      case 2: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - z[i-signed_stride]; break;
-      case 3: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - ((z[i-n] + z[i-signed_stride])>>1); break;
-      case 4: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - stbiw__paeth(z[i-n], z[i-signed_stride], z[i-signed_stride-n]); break;
-      case 5: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - (z[i-n]>>1); break;
-      case 6: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - stbiw__paeth(z[i-n], 0,0); break;
-   }
-}
-
-STBIWDEF unsigned char *stbi_write_png_to_mem(const unsigned char *pixels, int stride_bytes, int x, int y, int n, int *out_len)
-{
-   int force_filter = stbi_write_force_png_filter;
-   int ctype[5] = { -1, 0, 4, 2, 6 };
-   unsigned char sig[8] = { 137,80,78,71,13,10,26,10 };
-   unsigned char *out,*o, *filt, *zlib;
-   signed char *line_buffer;
-   int j,zlen;
-
-   if (stride_bytes == 0)
-      stride_bytes = x * n;
-
-   if (force_filter >= 5) {
-      force_filter = -1;
-   }
-
-   filt = (unsigned char *) STBIW_MALLOC((x*n+1) * y); if (!filt) return 0;
-   line_buffer = (signed char *) STBIW_MALLOC(x * n); if (!line_buffer) { STBIW_FREE(filt); return 0; }
-   for (j=0; j < y; ++j) {
-      int filter_type;
-      if (force_filter > -1) {
-         filter_type = force_filter;
-         stbiw__encode_png_line((unsigned char*)(pixels), stride_bytes, x, y, j, n, force_filter, line_buffer);
-      } else { // Estimate the best filter by running through all of them:
-         int best_filter = 0, best_filter_val = 0x7fffffff, est, i;
-         for (filter_type = 0; filter_type < 5; filter_type++) {
-            stbiw__encode_png_line((unsigned char*)(pixels), stride_bytes, x, y, j, n, filter_type, line_buffer);
-
-            // Estimate the entropy of the line using this filter; the less, the better.
-            est = 0;
-            for (i = 0; i < x*n; ++i) {
-               est += abs((signed char) line_buffer[i]);
-            }
-            if (est < best_filter_val) {
-               best_filter_val = est;
-               best_filter = filter_type;
-            }
-         }
-         if (filter_type != best_filter) {  // If the last iteration already got us the best filter, don't redo it
-            stbiw__encode_png_line((unsigned char*)(pixels), stride_bytes, x, y, j, n, best_filter, line_buffer);
-            filter_type = best_filter;
-         }
-      }
-      // when we get here, filter_type contains the filter type, and line_buffer contains the data
-      filt[j*(x*n+1)] = (unsigned char) filter_type;
-      STBIW_MEMMOVE(filt+j*(x*n+1)+1, line_buffer, x*n);
-   }
-   STBIW_FREE(line_buffer);
-   zlib = stbi_zlib_compress(filt, y*( x*n+1), &zlen, stbi_write_png_compression_level);
-   STBIW_FREE(filt);
-   if (!zlib) return 0;
-
-   // each tag requires 12 bytes of overhead
-   out = (unsigned char *) STBIW_MALLOC(8 + 12+13 + 12+zlen + 12);
-   if (!out) return 0;
-   *out_len = 8 + 12+13 + 12+zlen + 12;
-
-   o=out;
-   STBIW_MEMMOVE(o,sig,8); o+= 8;
-   stbiw__wp32(o, 13); // header length
-   stbiw__wptag(o, "IHDR");
-   stbiw__wp32(o, x);
-   stbiw__wp32(o, y);
-   *o++ = 8;
-   *o++ = STBIW_UCHAR(ctype[n]);
-   *o++ = 0;
-   *o++ = 0;
-   *o++ = 0;
-   stbiw__wpcrc(&o,13);
-
-   stbiw__wp32(o, zlen);
-   stbiw__wptag(o, "IDAT");
-   STBIW_MEMMOVE(o, zlib, zlen);
-   o += zlen;
-   STBIW_FREE(zlib);
-   stbiw__wpcrc(&o, zlen);
-
-   stbiw__wp32(o,0);
-   stbiw__wptag(o, "IEND");
-   stbiw__wpcrc(&o,0);
-
-   STBIW_ASSERT(o == out + *out_len);
-
-   return out;
-}
-
-#ifndef STBI_WRITE_NO_STDIO
-STBIWDEF int stbi_write_png(char const *filename, int x, int y, int comp, const void *data, int stride_bytes)
-{
-   FILE *f;
-   int len;
-   unsigned char *png = stbi_write_png_to_mem((const unsigned char *) data, stride_bytes, x, y, comp, &len);
-   if (png == NULL) return 0;
-
-   f = stbiw__fopen(filename, "wb");
-   if (!f) { STBIW_FREE(png); return 0; }
-   fwrite(png, 1, len, f);
-   fclose(f);
-   STBIW_FREE(png);
-   return 1;
-}
-#endif
-
-STBIWDEF int stbi_write_png_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data, int stride_bytes)
-{
-   int len;
-   unsigned char *png = stbi_write_png_to_mem((const unsigned char *) data, stride_bytes, x, y, comp, &len);
-   if (png == NULL) return 0;
-   func(context, png, len);
-   STBIW_FREE(png);
-   return 1;
-}
-
-
-/* ***************************************************************************
- *
- * JPEG writer
- *
- * This is based on Jon Olick's jo_jpeg.cpp:
- * public domain Simple, Minimalistic JPEG writer - http://www.jonolick.com/code.html
- */
-
-static const unsigned char stbiw__jpg_ZigZag[] = { 0,1,5,6,14,15,27,28,2,4,7,13,16,26,29,42,3,8,12,17,25,30,41,43,9,11,18,
-      24,31,40,44,53,10,19,23,32,39,45,52,54,20,22,33,38,46,51,55,60,21,34,37,47,50,56,59,61,35,36,48,49,57,58,62,63 };
-
-static void stbiw__jpg_writeBits(stbi__write_context *s, int *bitBufP, int *bitCntP, const unsigned short *bs) {
-   int bitBuf = *bitBufP, bitCnt = *bitCntP;
-   bitCnt += bs[1];
-   bitBuf |= bs[0] << (24 - bitCnt);
-   while(bitCnt >= 8) {
-      unsigned char c = (bitBuf >> 16) & 255;
-      stbiw__putc(s, c);
-      if(c == 255) {
-         stbiw__putc(s, 0);
-      }
-      bitBuf <<= 8;
-      bitCnt -= 8;
-   }
-   *bitBufP = bitBuf;
-   *bitCntP = bitCnt;
-}
-
-static void stbiw__jpg_DCT(float *d0p, float *d1p, float *d2p, float *d3p, float *d4p, float *d5p, float *d6p, float *d7p) {
-   float d0 = *d0p, d1 = *d1p, d2 = *d2p, d3 = *d3p, d4 = *d4p, d5 = *d5p, d6 = *d6p, d7 = *d7p;
-   float z1, z2, z3, z4, z5, z11, z13;
-
-   float tmp0 = d0 + d7;
-   float tmp7 = d0 - d7;
-   float tmp1 = d1 + d6;
-   float tmp6 = d1 - d6;
-   float tmp2 = d2 + d5;
-   float tmp5 = d2 - d5;
-   float tmp3 = d3 + d4;
-   float tmp4 = d3 - d4;
-
-   // Even part
-   float tmp10 = tmp0 + tmp3;   // phase 2
-   float tmp13 = tmp0 - tmp3;
-   float tmp11 = tmp1 + tmp2;
-   float tmp12 = tmp1 - tmp2;
-
-   d0 = tmp10 + tmp11;       // phase 3
-   d4 = tmp10 - tmp11;
-
-   z1 = (tmp12 + tmp13) * 0.707106781f; // c4
-   d2 = tmp13 + z1;       // phase 5
-   d6 = tmp13 - z1;
-
-   // Odd part
-   tmp10 = tmp4 + tmp5;       // phase 2
-   tmp11 = tmp5 + tmp6;
-   tmp12 = tmp6 + tmp7;
-
-   // The rotator is modified from fig 4-8 to avoid extra negations.
-   z5 = (tmp10 - tmp12) * 0.382683433f; // c6
-   z2 = tmp10 * 0.541196100f + z5; // c2-c6
-   z4 = tmp12 * 1.306562965f + z5; // c2+c6
-   z3 = tmp11 * 0.707106781f; // c4
-
-   z11 = tmp7 + z3;      // phase 5
-   z13 = tmp7 - z3;
-
-   *d5p = z13 + z2;         // phase 6
-   *d3p = z13 - z2;
-   *d1p = z11 + z4;
-   *d7p = z11 - z4;
-
-   *d0p = d0;  *d2p = d2;  *d4p = d4;  *d6p = d6;
-}
-
-static void stbiw__jpg_calcBits(int val, unsigned short bits[2]) {
-   int tmp1 = val < 0 ? -val : val;
-   val = val < 0 ? val-1 : val;
-   bits[1] = 1;
-   while(tmp1 >>= 1) {
-      ++bits[1];
-   }
-   bits[0] = val & ((1<<bits[1])-1);
-}
-
-static int stbiw__jpg_processDU(stbi__write_context *s, int *bitBuf, int *bitCnt, float *CDU, int du_stride, float *fdtbl, int DC, const unsigned short HTDC[256][2], const unsigned short HTAC[256][2]) {
-   const unsigned short EOB[2] = { HTAC[0x00][0], HTAC[0x00][1] };
-   const unsigned short M16zeroes[2] = { HTAC[0xF0][0], HTAC[0xF0][1] };
-   int dataOff, i, j, n, diff, end0pos, x, y;
-   int DU[64];
-
-   // DCT rows
-   for(dataOff=0, n=du_stride*8; dataOff<n; dataOff+=du_stride) {
-      stbiw__jpg_DCT(&CDU[dataOff], &CDU[dataOff+1], &CDU[dataOff+2], &CDU[dataOff+3], &CDU[dataOff+4], &CDU[dataOff+5], &CDU[dataOff+6], &CDU[dataOff+7]);
-   }
-   // DCT columns
-   for(dataOff=0; dataOff<8; ++dataOff) {
-      stbiw__jpg_DCT(&CDU[dataOff], &CDU[dataOff+du_stride], &CDU[dataOff+du_stride*2], &CDU[dataOff+du_stride*3], &CDU[dataOff+du_stride*4],
-                     &CDU[dataOff+du_stride*5], &CDU[dataOff+du_stride*6], &CDU[dataOff+du_stride*7]);
-   }
-   // Quantize/descale/zigzag the coefficients
-   for(y = 0, j=0; y < 8; ++y) {
-      for(x = 0; x < 8; ++x,++j) {
-         float v;
-         i = y*du_stride+x;
-         v = CDU[i]*fdtbl[j];
-         // DU[stbiw__jpg_ZigZag[j]] = (int)(v < 0 ? ceilf(v - 0.5f) : floorf(v + 0.5f));
-         // ceilf() and floorf() are C99, not C89, but I /think/ they're not needed here anyway?
-         DU[stbiw__jpg_ZigZag[j]] = (int)(v < 0 ? v - 0.5f : v + 0.5f);
-      }
-   }
-
-   // Encode DC
-   diff = DU[0] - DC;
-   if (diff == 0) {
-      stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTDC[0]);
-   } else {
-      unsigned short bits[2];
-      stbiw__jpg_calcBits(diff, bits);
-      stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTDC[bits[1]]);
-      stbiw__jpg_writeBits(s, bitBuf, bitCnt, bits);
-   }
-   // Encode ACs
-   end0pos = 63;
-   for(; (end0pos>0)&&(DU[end0pos]==0); --end0pos) {
-   }
-   // end0pos = first element in reverse order !=0
-   if(end0pos == 0) {
-      stbiw__jpg_writeBits(s, bitBuf, bitCnt, EOB);
-      return DU[0];
-   }
-   for(i = 1; i <= end0pos; ++i) {
-      int startpos = i;
-      int nrzeroes;
-      unsigned short bits[2];
-      for (; DU[i]==0 && i<=end0pos; ++i) {
-      }
-      nrzeroes = i-startpos;
-      if ( nrzeroes >= 16 ) {
-         int lng = nrzeroes>>4;
-         int nrmarker;
-         for (nrmarker=1; nrmarker <= lng; ++nrmarker)
-            stbiw__jpg_writeBits(s, bitBuf, bitCnt, M16zeroes);
-         nrzeroes &= 15;
-      }
-      stbiw__jpg_calcBits(DU[i], bits);
-      stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTAC[(nrzeroes<<4)+bits[1]]);
-      stbiw__jpg_writeBits(s, bitBuf, bitCnt, bits);
-   }
-   if(end0pos != 63) {
-      stbiw__jpg_writeBits(s, bitBuf, bitCnt, EOB);
-   }
-   return DU[0];
-}
-
-static int stbi_write_jpg_core(stbi__write_context *s, int width, int height, int comp, const void* data, int quality) {
-   // Constants that don't pollute global namespace
-   static const unsigned char std_dc_luminance_nrcodes[] = {0,0,1,5,1,1,1,1,1,1,0,0,0,0,0,0,0};
-   static const unsigned char std_dc_luminance_values[] = {0,1,2,3,4,5,6,7,8,9,10,11};
-   static const unsigned char std_ac_luminance_nrcodes[] = {0,0,2,1,3,3,2,4,3,5,5,4,4,0,0,1,0x7d};
-   static const unsigned char std_ac_luminance_values[] = {
-      0x01,0x02,0x03,0x00,0x04,0x11,0x05,0x12,0x21,0x31,0x41,0x06,0x13,0x51,0x61,0x07,0x22,0x71,0x14,0x32,0x81,0x91,0xa1,0x08,
-      0x23,0x42,0xb1,0xc1,0x15,0x52,0xd1,0xf0,0x24,0x33,0x62,0x72,0x82,0x09,0x0a,0x16,0x17,0x18,0x19,0x1a,0x25,0x26,0x27,0x28,
-      0x29,0x2a,0x34,0x35,0x36,0x37,0x38,0x39,0x3a,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4a,0x53,0x54,0x55,0x56,0x57,0x58,0x59,
-      0x5a,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x83,0x84,0x85,0x86,0x87,0x88,0x89,
-      0x8a,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xb2,0xb3,0xb4,0xb5,0xb6,
-      0xb7,0xb8,0xb9,0xba,0xc2,0xc3,0xc4,0xc5,0xc6,0xc7,0xc8,0xc9,0xca,0xd2,0xd3,0xd4,0xd5,0xd6,0xd7,0xd8,0xd9,0xda,0xe1,0xe2,
-      0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xf1,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,0xf9,0xfa
-   };
-   static const unsigned char std_dc_chrominance_nrcodes[] = {0,0,3,1,1,1,1,1,1,1,1,1,0,0,0,0,0};
-   static const unsigned char std_dc_chrominance_values[] = {0,1,2,3,4,5,6,7,8,9,10,11};
-   static const unsigned char std_ac_chrominance_nrcodes[] = {0,0,2,1,2,4,4,3,4,7,5,4,4,0,1,2,0x77};
-   static const unsigned char std_ac_chrominance_values[] = {
-      0x00,0x01,0x02,0x03,0x11,0x04,0x05,0x21,0x31,0x06,0x12,0x41,0x51,0x07,0x61,0x71,0x13,0x22,0x32,0x81,0x08,0x14,0x42,0x91,
-      0xa1,0xb1,0xc1,0x09,0x23,0x33,0x52,0xf0,0x15,0x62,0x72,0xd1,0x0a,0x16,0x24,0x34,0xe1,0x25,0xf1,0x17,0x18,0x19,0x1a,0x26,
-      0x27,0x28,0x29,0x2a,0x35,0x36,0x37,0x38,0x39,0x3a,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4a,0x53,0x54,0x55,0x56,0x57,0x58,
-      0x59,0x5a,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x82,0x83,0x84,0x85,0x86,0x87,
-      0x88,0x89,0x8a,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xb2,0xb3,0xb4,
-      0xb5,0xb6,0xb7,0xb8,0xb9,0xba,0xc2,0xc3,0xc4,0xc5,0xc6,0xc7,0xc8,0xc9,0xca,0xd2,0xd3,0xd4,0xd5,0xd6,0xd7,0xd8,0xd9,0xda,
-      0xe2,0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,0xf9,0xfa
-   };
-   // Huffman tables
-   static const unsigned short YDC_HT[256][2] = { {0,2},{2,3},{3,3},{4,3},{5,3},{6,3},{14,4},{30,5},{62,6},{126,7},{254,8},{510,9}};
-   static const unsigned short UVDC_HT[256][2] = { {0,2},{1,2},{2,2},{6,3},{14,4},{30,5},{62,6},{126,7},{254,8},{510,9},{1022,10},{2046,11}};
-   static const unsigned short YAC_HT[256][2] = {
-      {10,4},{0,2},{1,2},{4,3},{11,4},{26,5},{120,7},{248,8},{1014,10},{65410,16},{65411,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {12,4},{27,5},{121,7},{502,9},{2038,11},{65412,16},{65413,16},{65414,16},{65415,16},{65416,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {28,5},{249,8},{1015,10},{4084,12},{65417,16},{65418,16},{65419,16},{65420,16},{65421,16},{65422,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {58,6},{503,9},{4085,12},{65423,16},{65424,16},{65425,16},{65426,16},{65427,16},{65428,16},{65429,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {59,6},{1016,10},{65430,16},{65431,16},{65432,16},{65433,16},{65434,16},{65435,16},{65436,16},{65437,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {122,7},{2039,11},{65438,16},{65439,16},{65440,16},{65441,16},{65442,16},{65443,16},{65444,16},{65445,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {123,7},{4086,12},{65446,16},{65447,16},{65448,16},{65449,16},{65450,16},{65451,16},{65452,16},{65453,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {250,8},{4087,12},{65454,16},{65455,16},{65456,16},{65457,16},{65458,16},{65459,16},{65460,16},{65461,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {504,9},{32704,15},{65462,16},{65463,16},{65464,16},{65465,16},{65466,16},{65467,16},{65468,16},{65469,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {505,9},{65470,16},{65471,16},{65472,16},{65473,16},{65474,16},{65475,16},{65476,16},{65477,16},{65478,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {506,9},{65479,16},{65480,16},{65481,16},{65482,16},{65483,16},{65484,16},{65485,16},{65486,16},{65487,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {1017,10},{65488,16},{65489,16},{65490,16},{65491,16},{65492,16},{65493,16},{65494,16},{65495,16},{65496,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {1018,10},{65497,16},{65498,16},{65499,16},{65500,16},{65501,16},{65502,16},{65503,16},{65504,16},{65505,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {2040,11},{65506,16},{65507,16},{65508,16},{65509,16},{65510,16},{65511,16},{65512,16},{65513,16},{65514,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {65515,16},{65516,16},{65517,16},{65518,16},{65519,16},{65520,16},{65521,16},{65522,16},{65523,16},{65524,16},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {2041,11},{65525,16},{65526,16},{65527,16},{65528,16},{65529,16},{65530,16},{65531,16},{65532,16},{65533,16},{65534,16},{0,0},{0,0},{0,0},{0,0},{0,0}
-   };
-   static const unsigned short UVAC_HT[256][2] = {
-      {0,2},{1,2},{4,3},{10,4},{24,5},{25,5},{56,6},{120,7},{500,9},{1014,10},{4084,12},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {11,4},{57,6},{246,8},{501,9},{2038,11},{4085,12},{65416,16},{65417,16},{65418,16},{65419,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {26,5},{247,8},{1015,10},{4086,12},{32706,15},{65420,16},{65421,16},{65422,16},{65423,16},{65424,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {27,5},{248,8},{1016,10},{4087,12},{65425,16},{65426,16},{65427,16},{65428,16},{65429,16},{65430,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {58,6},{502,9},{65431,16},{65432,16},{65433,16},{65434,16},{65435,16},{65436,16},{65437,16},{65438,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {59,6},{1017,10},{65439,16},{65440,16},{65441,16},{65442,16},{65443,16},{65444,16},{65445,16},{65446,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {121,7},{2039,11},{65447,16},{65448,16},{65449,16},{65450,16},{65451,16},{65452,16},{65453,16},{65454,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {122,7},{2040,11},{65455,16},{65456,16},{65457,16},{65458,16},{65459,16},{65460,16},{65461,16},{65462,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {249,8},{65463,16},{65464,16},{65465,16},{65466,16},{65467,16},{65468,16},{65469,16},{65470,16},{65471,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {503,9},{65472,16},{65473,16},{65474,16},{65475,16},{65476,16},{65477,16},{65478,16},{65479,16},{65480,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {504,9},{65481,16},{65482,16},{65483,16},{65484,16},{65485,16},{65486,16},{65487,16},{65488,16},{65489,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {505,9},{65490,16},{65491,16},{65492,16},{65493,16},{65494,16},{65495,16},{65496,16},{65497,16},{65498,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {506,9},{65499,16},{65500,16},{65501,16},{65502,16},{65503,16},{65504,16},{65505,16},{65506,16},{65507,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {2041,11},{65508,16},{65509,16},{65510,16},{65511,16},{65512,16},{65513,16},{65514,16},{65515,16},{65516,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {16352,14},{65517,16},{65518,16},{65519,16},{65520,16},{65521,16},{65522,16},{65523,16},{65524,16},{65525,16},{0,0},{0,0},{0,0},{0,0},{0,0},
-      {1018,10},{32707,15},{65526,16},{65527,16},{65528,16},{65529,16},{65530,16},{65531,16},{65532,16},{65533,16},{65534,16},{0,0},{0,0},{0,0},{0,0},{0,0}
-   };
-   static const int YQT[] = {16,11,10,16,24,40,51,61,12,12,14,19,26,58,60,55,14,13,16,24,40,57,69,56,14,17,22,29,51,87,80,62,18,22,
-                             37,56,68,109,103,77,24,35,55,64,81,104,113,92,49,64,78,87,103,121,120,101,72,92,95,98,112,100,103,99};
-   static const int UVQT[] = {17,18,24,47,99,99,99,99,18,21,26,66,99,99,99,99,24,26,56,99,99,99,99,99,47,66,99,99,99,99,99,99,
-                              99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99};
-   static const float aasf[] = { 1.0f * 2.828427125f, 1.387039845f * 2.828427125f, 1.306562965f * 2.828427125f, 1.175875602f * 2.828427125f,
-                                 1.0f * 2.828427125f, 0.785694958f * 2.828427125f, 0.541196100f * 2.828427125f, 0.275899379f * 2.828427125f };
-
-   int row, col, i, k, subsample;
-   float fdtbl_Y[64], fdtbl_UV[64];
-   unsigned char YTable[64], UVTable[64];
-
-   if(!data || !width || !height || comp > 4 || comp < 1) {
-      return 0;
-   }
-
-   quality = quality ? quality : 90;
-   subsample = quality <= 90 ? 1 : 0;
-   quality = quality < 1 ? 1 : quality > 100 ? 100 : quality;
-   quality = quality < 50 ? 5000 / quality : 200 - quality * 2;
-
-   for(i = 0; i < 64; ++i) {
-      int uvti, yti = (YQT[i]*quality+50)/100;
-      YTable[stbiw__jpg_ZigZag[i]] = (unsigned char) (yti < 1 ? 1 : yti > 255 ? 255 : yti);
-      uvti = (UVQT[i]*quality+50)/100;
-      UVTable[stbiw__jpg_ZigZag[i]] = (unsigned char) (uvti < 1 ? 1 : uvti > 255 ? 255 : uvti);
-   }
-
-   for(row = 0, k = 0; row < 8; ++row) {
-      for(col = 0; col < 8; ++col, ++k) {
-         fdtbl_Y[k]  = 1 / (YTable [stbiw__jpg_ZigZag[k]] * aasf[row] * aasf[col]);
-         fdtbl_UV[k] = 1 / (UVTable[stbiw__jpg_ZigZag[k]] * aasf[row] * aasf[col]);
-      }
-   }
-
-   // Write Headers
-   {
-      static const unsigned char head0[] = { 0xFF,0xD8,0xFF,0xE0,0,0x10,'J','F','I','F',0,1,1,0,0,1,0,1,0,0,0xFF,0xDB,0,0x84,0 };
-      static const unsigned char head2[] = { 0xFF,0xDA,0,0xC,3,1,0,2,0x11,3,0x11,0,0x3F,0 };
-      const unsigned char head1[] = { 0xFF,0xC0,0,0x11,8,(unsigned char)(height>>8),STBIW_UCHAR(height),(unsigned char)(width>>8),STBIW_UCHAR(width),
-                                      3,1,(unsigned char)(subsample?0x22:0x11),0,2,0x11,1,3,0x11,1,0xFF,0xC4,0x01,0xA2,0 };
-      s->func(s->context, (void*)head0, sizeof(head0));
-      s->func(s->context, (void*)YTable, sizeof(YTable));
-      stbiw__putc(s, 1);
-      s->func(s->context, UVTable, sizeof(UVTable));
-      s->func(s->context, (void*)head1, sizeof(head1));
-      s->func(s->context, (void*)(std_dc_luminance_nrcodes+1), sizeof(std_dc_luminance_nrcodes)-1);
-      s->func(s->context, (void*)std_dc_luminance_values, sizeof(std_dc_luminance_values));
-      stbiw__putc(s, 0x10); // HTYACinfo
-      s->func(s->context, (void*)(std_ac_luminance_nrcodes+1), sizeof(std_ac_luminance_nrcodes)-1);
-      s->func(s->context, (void*)std_ac_luminance_values, sizeof(std_ac_luminance_values));
-      stbiw__putc(s, 1); // HTUDCinfo
-      s->func(s->context, (void*)(std_dc_chrominance_nrcodes+1), sizeof(std_dc_chrominance_nrcodes)-1);
-      s->func(s->context, (void*)std_dc_chrominance_values, sizeof(std_dc_chrominance_values));
-      stbiw__putc(s, 0x11); // HTUACinfo
-      s->func(s->context, (void*)(std_ac_chrominance_nrcodes+1), sizeof(std_ac_chrominance_nrcodes)-1);
-      s->func(s->context, (void*)std_ac_chrominance_values, sizeof(std_ac_chrominance_values));
-      s->func(s->context, (void*)head2, sizeof(head2));
-   }
-
-   // Encode 8x8 macroblocks
-   {
-      static const unsigned short fillBits[] = {0x7F, 7};
-      int DCY=0, DCU=0, DCV=0;
-      int bitBuf=0, bitCnt=0;
-      // comp == 2 is grey+alpha (alpha is ignored)
-      int ofsG = comp > 2 ? 1 : 0, ofsB = comp > 2 ? 2 : 0;
-      const unsigned char *dataR = (const unsigned char *)data;
-      const unsigned char *dataG = dataR + ofsG;
-      const unsigned char *dataB = dataR + ofsB;
-      int x, y, pos;
-      if(subsample) {
-         for(y = 0; y < height; y += 16) {
-            for(x = 0; x < width; x += 16) {
-               float Y[256], U[256], V[256];
-               for(row = y, pos = 0; row < y+16; ++row) {
-                  // row >= height => use last input row
-                  int clamped_row = (row < height) ? row : height - 1;
-                  int base_p = (stbi__flip_vertically_on_write ? (height-1-clamped_row) : clamped_row)*width*comp;
-                  for(col = x; col < x+16; ++col, ++pos) {
-                     // if col >= width => use pixel from last input column
-                     int p = base_p + ((col < width) ? col : (width-1))*comp;
-                     float r = dataR[p], g = dataG[p], b = dataB[p];
-                     Y[pos]= +0.29900f*r + 0.58700f*g + 0.11400f*b - 128;
-                     U[pos]= -0.16874f*r - 0.33126f*g + 0.50000f*b;
-                     V[pos]= +0.50000f*r - 0.41869f*g - 0.08131f*b;
-                  }
-               }
-               DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y+0,   16, fdtbl_Y, DCY, YDC_HT, YAC_HT);
-               DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y+8,   16, fdtbl_Y, DCY, YDC_HT, YAC_HT);
-               DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y+128, 16, fdtbl_Y, DCY, YDC_HT, YAC_HT);
-               DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y+136, 16, fdtbl_Y, DCY, YDC_HT, YAC_HT);
-
-               // subsample U,V
-               {
-                  float subU[64], subV[64];
-                  int yy, xx;
-                  for(yy = 0, pos = 0; yy < 8; ++yy) {
-                     for(xx = 0; xx < 8; ++xx, ++pos) {
-                        int j = yy*32+xx*2;
-                        subU[pos] = (U[j+0] + U[j+1] + U[j+16] + U[j+17]) * 0.25f;
-                        subV[pos] = (V[j+0] + V[j+1] + V[j+16] + V[j+17]) * 0.25f;
-                     }
-                  }
-                  DCU = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, subU, 8, fdtbl_UV, DCU, UVDC_HT, UVAC_HT);
-                  DCV = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, subV, 8, fdtbl_UV, DCV, UVDC_HT, UVAC_HT);
-               }
-            }
-         }
-      } else {
-         for(y = 0; y < height; y += 8) {
-            for(x = 0; x < width; x += 8) {
-               float Y[64], U[64], V[64];
-               for(row = y, pos = 0; row < y+8; ++row) {
-                  // row >= height => use last input row
-                  int clamped_row = (row < height) ? row : height - 1;
-                  int base_p = (stbi__flip_vertically_on_write ? (height-1-clamped_row) : clamped_row)*width*comp;
-                  for(col = x; col < x+8; ++col, ++pos) {
-                     // if col >= width => use pixel from last input column
-                     int p = base_p + ((col < width) ? col : (width-1))*comp;
-                     float r = dataR[p], g = dataG[p], b = dataB[p];
-                     Y[pos]= +0.29900f*r + 0.58700f*g + 0.11400f*b - 128;
-                     U[pos]= -0.16874f*r - 0.33126f*g + 0.50000f*b;
-                     V[pos]= +0.50000f*r - 0.41869f*g - 0.08131f*b;
-                  }
-               }
-
-               DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y, 8, fdtbl_Y,  DCY, YDC_HT, YAC_HT);
-               DCU = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, U, 8, fdtbl_UV, DCU, UVDC_HT, UVAC_HT);
-               DCV = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, V, 8, fdtbl_UV, DCV, UVDC_HT, UVAC_HT);
-            }
-         }
-      }
-
-      // Do the bit alignment of the EOI marker
-      stbiw__jpg_writeBits(s, &bitBuf, &bitCnt, fillBits);
-   }
-
-   // EOI
-   stbiw__putc(s, 0xFF);
-   stbiw__putc(s, 0xD9);
-
-   return 1;
-}
-
-STBIWDEF int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data, int quality)
-{
-   stbi__write_context s = { 0 };
-   stbi__start_write_callbacks(&s, func, context);
-   return stbi_write_jpg_core(&s, x, y, comp, (void *) data, quality);
-}
-
-
-#ifndef STBI_WRITE_NO_STDIO
-STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const void *data, int quality)
-{
-   stbi__write_context s = { 0 };
-   if (stbi__start_write_file(&s,filename)) {
-      int r = stbi_write_jpg_core(&s, x, y, comp, data, quality);
-      stbi__end_write_file(&s);
-      return r;
-   } else
-      return 0;
-}
-#endif
-
-#endif // STB_IMAGE_WRITE_IMPLEMENTATION
-
-/* Revision history
-      1.16  (2021-07-11)
-             make Deflate code emit uncompressed blocks when it would otherwise expand
-             support writing BMPs with alpha channel
-      1.15  (2020-07-13) unknown
-      1.14  (2020-02-02) updated JPEG writer to downsample chroma channels
-      1.13
-      1.12
-      1.11  (2019-08-11)
-
-      1.10  (2019-02-07)
-             support utf8 filenames in Windows; fix warnings and platform ifdefs
-      1.09  (2018-02-11)
-             fix typo in zlib quality API, improve STB_I_W_STATIC in C++
-      1.08  (2018-01-29)
-             add stbi__flip_vertically_on_write, external zlib, zlib quality, choose PNG filter
-      1.07  (2017-07-24)
-             doc fix
-      1.06 (2017-07-23)
-             writing JPEG (using Jon Olick's code)
-      1.05   ???
-      1.04 (2017-03-03)
-             monochrome BMP expansion
-      1.03   ???
-      1.02 (2016-04-02)
-             avoid allocating large structures on the stack
-      1.01 (2016-01-16)
-             STBIW_REALLOC_SIZED: support allocators with no realloc support
-             avoid race-condition in crc initialization
-             minor compile issues
-      1.00 (2015-09-14)
-             installable file IO function
-      0.99 (2015-09-13)
-             warning fixes; TGA rle support
-      0.98 (2015-04-08)
-             added STBIW_MALLOC, STBIW_ASSERT etc
-      0.97 (2015-01-18)
-             fixed HDR asserts, rewrote HDR rle logic
-      0.96 (2015-01-17)
-             add HDR output
-             fix monochrome BMP
-      0.95 (2014-08-17)
-             add monochrome TGA output
-      0.94 (2014-05-31)
-             rename private functions to avoid conflicts with stb_image.h
-      0.93 (2014-05-27)
-             warning fixes
-      0.92 (2010-08-01)
-             casts to unsigned char to fix warnings
-      0.91 (2010-07-17)
-             first public release
-      0.90   first internal release
-*/
-
-/*
-------------------------------------------------------------------------------
-This software is available under 2 licenses -- choose whichever you prefer.
-------------------------------------------------------------------------------
-ALTERNATIVE A - MIT License
-Copyright (c) 2017 Sean Barrett
-Permission is hereby granted, free of charge, to any person obtaining a copy of
-this software and associated documentation files (the "Software"), to deal in
-the Software without restriction, including without limitation the rights to
-use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
-of the Software, and to permit persons to whom the Software is furnished to do
-so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-------------------------------------------------------------------------------
-ALTERNATIVE B - Public Domain (www.unlicense.org)
-This is free and unencumbered software released into the public domain.
-Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
-software, either in source code form or as a compiled binary, for any purpose,
-commercial or non-commercial, and by any means.
-In jurisdictions that recognize copyright laws, the author or authors of this
-software dedicate any and all copyright interest in the software to the public
-domain. We make this dedication for the benefit of the public at large and to
-the detriment of our heirs and successors. We intend this dedication to be an
-overt act of relinquishment in perpetuity of all present and future rights to
-this software under copyright law.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-------------------------------------------------------------------------------
-*/
diff --git a/ggml/examples/whisper/CMakeLists.txt b/ggml/examples/whisper/CMakeLists.txt
deleted file mode 100644
index 63f8cd4..0000000
--- a/ggml/examples/whisper/CMakeLists.txt
+++ /dev/null
@@ -1,23 +0,0 @@
-#
-# whisper
-
-add_library(whisper-cpp STATIC
-    whisper.cpp
-    )
-
-target_link_libraries(whisper-cpp PRIVATE
-    ggml
-    )
-
-set(TEST_TARGET whisper)
-add_executable(${TEST_TARGET} main.cpp)
-target_link_libraries(${TEST_TARGET} PRIVATE whisper-cpp common)
-target_include_directories(${TEST_TARGET} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/..)
-target_include_directories(${TEST_TARGET} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../include/ggml)
-
-#
-# whisper-quantize
-
-set(TEST_TARGET whisper-quantize)
-add_executable(${TEST_TARGET} quantize.cpp)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)
diff --git a/ggml/examples/whisper/README.md b/ggml/examples/whisper/README.md
deleted file mode 100644
index a2e9727..0000000
--- a/ggml/examples/whisper/README.md
+++ /dev/null
@@ -1,29 +0,0 @@
-# whisper
-
-Port of [OpenAI's Whisper](https://github.com/openai/whisper) ASR model in C/C++ using
-[ggml](https://github.com/ggerganov/ggml)
-
-## More info
-
-Checkout https://github.com/ggerganov/whisper.cpp
-
-## Memory usage
-
-| Model  | Disk   | Mem     |
-| ---    | ---    | ---     |
-| tiny   |  75 MB | ~280 MB |
-| base   | 142 MB | ~430 MB |
-| small  | 466 MB | ~1.0 GB |
-| medium | 1.5 GB | ~2.6 GB |
-| large  | 2.9 GB | ~4.7 GB |
-
-## ggml format
-
-The original models are converted to a custom binary format. This allows to pack everything needed into a single file:
-
-- model parameters
-- mel filters
-- vocabulary
-- weights
-
-For more details, see the conversion script [convert-pt-to-ggml.py](convert-pt-to-ggml.py)
diff --git a/ggml/examples/whisper/convert-pt-to-ggml.py b/ggml/examples/whisper/convert-pt-to-ggml.py
deleted file mode 100644
index 9aa134b..0000000
--- a/ggml/examples/whisper/convert-pt-to-ggml.py
+++ /dev/null
@@ -1,342 +0,0 @@
-# Convert Whisper transformer model from PyTorch to ggml format
-#
-# Usage: python convert-pt-to-ggml.py ~/.cache/whisper/medium.pt ~/path/to/repo/whisper/ ./models/whisper-medium
-#
-# You need to clone the original repo in ~/path/to/repo/whisper/
-#
-#  git clone https://github.com/openai/whisper ~/path/to/repo/whisper/
-#
-# It is used to various assets needed by the algorithm:
-#
-#  - tokenizer
-#  - mel filters
-#
-# Also, you need to have the original models in ~/.cache/whisper/
-# See the original repo for more details.
-#
-# This script loads the specified model and whisper assets and saves them in ggml format.
-# The output is a single binary file containing the following information:
-#
-#  - hparams
-#  - mel filters
-#  - tokenizer vocab
-#  - model variables
-#
-# For each variable, write the following:
-#
-#  - Number of dimensions (int)
-#  - Name length (int)
-#  - Dimensions (int[n_dims])
-#  - Name (char[name_length])
-#  - Data (float[n_dims])
-#
-
-import io
-import os
-import sys
-import struct
-import json
-import code
-import torch
-import numpy as np
-import base64
-from pathlib import Path
-#from transformers import GPTJForCausalLM
-#from transformers import GPT2TokenizerFast
-
-# ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L10-L110
-#LANGUAGES = {
-#    "en": "english",
-#    "zh": "chinese",
-#    "de": "german",
-#    "es": "spanish",
-#    "ru": "russian",
-#    "ko": "korean",
-#    "fr": "french",
-#    "ja": "japanese",
-#    "pt": "portuguese",
-#    "tr": "turkish",
-#    "pl": "polish",
-#    "ca": "catalan",
-#    "nl": "dutch",
-#    "ar": "arabic",
-#    "sv": "swedish",
-#    "it": "italian",
-#    "id": "indonesian",
-#    "hi": "hindi",
-#    "fi": "finnish",
-#    "vi": "vietnamese",
-#    "iw": "hebrew",
-#    "uk": "ukrainian",
-#    "el": "greek",
-#    "ms": "malay",
-#    "cs": "czech",
-#    "ro": "romanian",
-#    "da": "danish",
-#    "hu": "hungarian",
-#    "ta": "tamil",
-#    "no": "norwegian",
-#    "th": "thai",
-#    "ur": "urdu",
-#    "hr": "croatian",
-#    "bg": "bulgarian",
-#    "lt": "lithuanian",
-#    "la": "latin",
-#    "mi": "maori",
-#    "ml": "malayalam",
-#    "cy": "welsh",
-#    "sk": "slovak",
-#    "te": "telugu",
-#    "fa": "persian",
-#    "lv": "latvian",
-#    "bn": "bengali",
-#    "sr": "serbian",
-#    "az": "azerbaijani",
-#    "sl": "slovenian",
-#    "kn": "kannada",
-#    "et": "estonian",
-#    "mk": "macedonian",
-#    "br": "breton",
-#    "eu": "basque",
-#    "is": "icelandic",
-#    "hy": "armenian",
-#    "ne": "nepali",
-#    "mn": "mongolian",
-#    "bs": "bosnian",
-#    "kk": "kazakh",
-#    "sq": "albanian",
-#    "sw": "swahili",
-#    "gl": "galician",
-#    "mr": "marathi",
-#    "pa": "punjabi",
-#    "si": "sinhala",
-#    "km": "khmer",
-#    "sn": "shona",
-#    "yo": "yoruba",
-#    "so": "somali",
-#    "af": "afrikaans",
-#    "oc": "occitan",
-#    "ka": "georgian",
-#    "be": "belarusian",
-#    "tg": "tajik",
-#    "sd": "sindhi",
-#    "gu": "gujarati",
-#    "am": "amharic",
-#    "yi": "yiddish",
-#    "lo": "lao",
-#    "uz": "uzbek",
-#    "fo": "faroese",
-#    "ht": "haitian creole",
-#    "ps": "pashto",
-#    "tk": "turkmen",
-#    "nn": "nynorsk",
-#    "mt": "maltese",
-#    "sa": "sanskrit",
-#    "lb": "luxembourgish",
-#    "my": "myanmar",
-#    "bo": "tibetan",
-#    "tl": "tagalog",
-#    "mg": "malagasy",
-#    "as": "assamese",
-#    "tt": "tatar",
-#    "haw": "hawaiian",
-#    "ln": "lingala",
-#    "ha": "hausa",
-#    "ba": "bashkir",
-#    "jw": "javanese",
-#    "su": "sundanese",
-#}
-
-## ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L273-L292
-#def build_tokenizer(path_to_whisper_repo: str, name: str = "gpt2"):
-#    os.environ["TOKENIZERS_PARALLELISM"] = "false"
-#    path = os.path.join(path_to_whisper_repo, "whisper/assets", name)
-#    tokenizer = GPT2TokenizerFast.from_pretrained(path)
-#
-#    specials = [
-#        "<|startoftranscript|>",
-#        *[f"<|{lang}|>" for lang in LANGUAGES.keys()],
-#        "<|translate|>",
-#        "<|transcribe|>",
-#        "<|startoflm|>",
-#        "<|startofprev|>",
-#        "<|nocaptions|>",
-#        "<|notimestamps|>",
-#    ]
-#
-#    tokenizer.add_special_tokens(dict(additional_special_tokens=specials))
-#    return tokenizer
-
-# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
-def bytes_to_unicode():
-    """
-    Returns list of utf-8 byte and a corresponding list of unicode strings.
-    The reversible bpe codes work on unicode strings.
-    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
-    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
-    This is a signficant percentage of your normal, say, 32K bpe vocab.
-    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
-    And avoids mapping to whitespace/control characters the bpe code barfs on.
-    """
-    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
-    cs = bs[:]
-    n = 0
-    for b in range(2**8):
-        if b not in bs:
-            bs.append(b)
-            cs.append(2**8+n)
-            n += 1
-    cs = [chr(n) for n in cs]
-    return dict(zip(bs, cs))
-
-
-if len(sys.argv) < 4:
-    print("Usage: convert-pt-to-ggml.py model.pt path-to-whisper-repo dir-output [use-f32]\n")
-    sys.exit(1)
-
-fname_inp   = Path(sys.argv[1])
-dir_whisper = Path(sys.argv[2])
-dir_out     = Path(sys.argv[3])
-
-# try to load PyTorch binary data
-try:
-    model_bytes = open(fname_inp, "rb").read()
-    with io.BytesIO(model_bytes) as fp:
-        checkpoint = torch.load(fp, map_location="cpu")
-except Exception:
-    print("Error: failed to load PyTorch model file:" , fname_inp)
-    sys.exit(1)
-
-hparams = checkpoint["dims"]
-print("hparams:", hparams)
-
-list_vars = checkpoint["model_state_dict"]
-
-#print(list_vars['encoder.positional_embedding'])
-#print(list_vars['encoder.conv1.weight'])
-#print(list_vars['encoder.conv1.weight'].shape)
-
-# load mel filters
-n_mels = hparams["n_mels"]
-with np.load(dir_whisper / "whisper" / "assets" / "mel_filters.npz") as f:
-    filters = torch.from_numpy(f[f"mel_{n_mels}"])
-    #print (filters)
-
-#code.interact(local=locals())
-
-# load tokenizer
-# for backwards compatibility, also check for older hf_transformers format tokenizer files
-# old format: dir_whisper/whisper/assets/[multilingual/gpt2]/vocab.json
-# new format: dir_whisper/whisper/assets/[multilingual/gpt2].tiktoken
-multilingual = hparams["n_vocab"] == 51865
-tokenizer = dir_whisper / "whisper" / "assets" / (multilingual and "multilingual.tiktoken" or "gpt2.tiktoken")
-tokenizer_type = "tiktoken"
-if not tokenizer.is_file():
-    tokenizer = dir_whisper / "whisper" / "assets" / (multilingual and "multilingual" or "gpt2") / "vocab.json"
-    tokenizer_type = "hf_transformers"
-    if not tokenizer.is_file():
-        print("Error: failed to find either tiktoken or hf_transformers tokenizer file:", tokenizer)
-        sys.exit(1)
-
-byte_encoder = bytes_to_unicode()
-byte_decoder = {v:k for k, v in byte_encoder.items()}
-
-if tokenizer_type == "tiktoken":
-    with open(tokenizer, "rb") as f:
-        contents = f.read()
-        tokens = {base64.b64decode(token): int(rank) for token, rank in (line.split() for line in contents.splitlines() if line)}
-elif tokenizer_type == "hf_transformers":
-    with open(tokenizer, "r", encoding="utf8") as f:
-        _tokens_raw = json.load(f)
-        if '<|endoftext|>' in _tokens_raw:
-            # ensures exact same model as tokenizer_type == tiktoken
-            # details: https://github.com/ggerganov/whisper.cpp/pull/725
-            del _tokens_raw['<|endoftext|>']
-        tokens = {bytes([byte_decoder[c] for c in token]): int(idx) for token, idx in _tokens_raw.items()}
-
-# output in the same directory as the model
-fname_out = dir_out / "ggml-model.bin"
-
-# use 16-bit or 32-bit floats
-use_f16 = True
-if len(sys.argv) > 4:
-    use_f16 = False
-    fname_out = dir_out / "ggml-model-f32.bin"
-
-fout = fname_out.open("wb")
-
-fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
-fout.write(struct.pack("i", hparams["n_vocab"]))
-fout.write(struct.pack("i", hparams["n_audio_ctx"]))
-fout.write(struct.pack("i", hparams["n_audio_state"]))
-fout.write(struct.pack("i", hparams["n_audio_head"]))
-fout.write(struct.pack("i", hparams["n_audio_layer"]))
-fout.write(struct.pack("i", hparams["n_text_ctx"]))
-fout.write(struct.pack("i", hparams["n_text_state"]))
-fout.write(struct.pack("i", hparams["n_text_head"]))
-fout.write(struct.pack("i", hparams["n_text_layer"]))
-fout.write(struct.pack("i", hparams["n_mels"]))
-fout.write(struct.pack("i", use_f16))
-
-# write mel filters
-fout.write(struct.pack("i", filters.shape[0]))
-fout.write(struct.pack("i", filters.shape[1]))
-for i in range(filters.shape[0]):
-    for j in range(filters.shape[1]):
-        fout.write(struct.pack("f", filters[i][j]))
-
-# write tokenizer
-fout.write(struct.pack("i", len(tokens)))
-
-for key in tokens:
-    fout.write(struct.pack("i", len(key)))
-    fout.write(key)
-
-for name in list_vars.keys():
-    data = list_vars[name].squeeze().numpy()
-    print("Processing variable: " , name ,  " with shape: ", data.shape)
-
-    # reshape conv bias from [n] to [n, 1]
-    if name in ["encoder.conv1.bias", "encoder.conv2.bias"]:
-        data = data.reshape(data.shape[0], 1)
-        print(f"  Reshaped variable: {name} to shape: ", data.shape)
-
-    n_dims = len(data.shape)
-
-    # looks like the whisper models are in f16 by default
-    # so we need to convert the small tensors to f32 until we fully support f16 in ggml
-    # ftype == 0 -> float32, ftype == 1 -> float16
-    ftype = 1
-    if use_f16:
-        if n_dims < 2 or \
-                name == "encoder.conv1.bias"   or \
-                name == "encoder.conv2.bias"   or \
-                name == "encoder.positional_embedding" or \
-                name == "decoder.positional_embedding":
-            print("  Converting to float32")
-            data = data.astype(np.float32)
-            ftype = 0
-    else:
-        data = data.astype(np.float32)
-        ftype = 0
-
-    #if name.startswith("encoder"):
-    #    if name.endswith("mlp.0.weight") or \
-    #       name.endswith("mlp.2.weight"):
-    #        print("  Transposing")
-    #        data = data.transpose()
-
-    # header
-    str_ = name.encode('utf-8')
-    fout.write(struct.pack("iii", n_dims, len(str_), ftype))
-    for i in range(n_dims):
-        fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
-    fout.write(str_)
-
-    # data
-    data.tofile(fout)
-
-fout.close()
-
-print("Done. Output file: " , fname_out)
-print("")
diff --git a/ggml/examples/whisper/main.cpp b/ggml/examples/whisper/main.cpp
deleted file mode 100644
index c92e9e0..0000000
--- a/ggml/examples/whisper/main.cpp
+++ /dev/null
@@ -1,1089 +0,0 @@
-#include "common.h"
-
-#include "whisper.h"
-
-#include <cmath>
-#include <fstream>
-#include <cstdio>
-#include <string>
-#include <thread>
-#include <vector>
-#include <cstring>
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-// Terminal color map. 10 colors grouped in ranges [0.0, 0.1, ..., 0.9]
-// Lowest is red, middle is yellow, highest is green.
-const std::vector<std::string> k_colors = {
-    "\033[38;5;196m", "\033[38;5;202m", "\033[38;5;208m", "\033[38;5;214m", "\033[38;5;220m",
-    "\033[38;5;226m", "\033[38;5;190m", "\033[38;5;154m", "\033[38;5;118m", "\033[38;5;82m",
-};
-
-//  500 -> 00:05.000
-// 6000 -> 01:00.000
-std::string to_timestamp(int64_t t, bool comma = false) {
-    int64_t msec = t * 10;
-    int64_t hr = msec / (1000 * 60 * 60);
-    msec = msec - hr * (1000 * 60 * 60);
-    int64_t min = msec / (1000 * 60);
-    msec = msec - min * (1000 * 60);
-    int64_t sec = msec / 1000;
-    msec = msec - sec * 1000;
-
-    char buf[32];
-    snprintf(buf, sizeof(buf), "%02d:%02d:%02d%s%03d", (int) hr, (int) min, (int) sec, comma ? "," : ".", (int) msec);
-
-    return std::string(buf);
-}
-
-int timestamp_to_sample(int64_t t, int n_samples) {
-    return std::max(0, std::min((int) n_samples - 1, (int) ((t*WHISPER_SAMPLE_RATE)/100)));
-}
-
-// helper function to replace substrings
-void replace_all(std::string & s, const std::string & search, const std::string & replace) {
-    for (size_t pos = 0; ; pos += replace.length()) {
-        pos = s.find(search, pos);
-        if (pos == std::string::npos) break;
-        s.erase(pos, search.length());
-        s.insert(pos, replace);
-    }
-}
-
-// command-line parameters
-struct whisper_params {
-    int32_t n_threads    = std::min(4, (int32_t) std::thread::hardware_concurrency());
-    int32_t n_processors =  1;
-    int32_t offset_t_ms  =  0;
-    int32_t offset_n     =  0;
-    int32_t duration_ms  =  0;
-    int32_t progress_step =  5;
-    int32_t max_context  = -1;
-    int32_t max_len      =  0;
-    int32_t best_of      = whisper_full_default_params(WHISPER_SAMPLING_GREEDY).greedy.best_of;
-    int32_t beam_size    = whisper_full_default_params(WHISPER_SAMPLING_BEAM_SEARCH).beam_search.beam_size;
-
-    float word_thold    =  0.01f;
-    float entropy_thold =  2.40f;
-    float logprob_thold = -1.00f;
-
-    bool speed_up        = false;
-    bool debug_mode      = false;
-    bool translate       = false;
-    bool detect_language = false;
-    bool diarize         = false;
-    bool tinydiarize     = false;
-    bool split_on_word   = false;
-    bool no_fallback     = false;
-    bool output_txt      = false;
-    bool output_vtt      = false;
-    bool output_srt      = false;
-    bool output_wts      = false;
-    bool output_csv      = false;
-    bool output_jsn      = false;
-    bool output_jsn_full = false;
-    bool output_lrc      = false;
-    bool no_prints       = false;
-    bool print_special   = false;
-    bool print_colors    = false;
-    bool print_progress  = false;
-    bool no_timestamps   = false;
-    bool log_score       = false;
-    bool use_gpu         = true;
-
-    std::string language  = "en";
-    std::string prompt;
-    std::string font_path = "/System/Library/Fonts/Supplemental/Courier New Bold.ttf";
-    std::string model     = "models/ggml-base.en.bin";
-
-    // [TDRZ] speaker turn string
-    std::string tdrz_speaker_turn = " [SPEAKER_TURN]"; // TODO: set from command line
-
-    std::string openvino_encode_device = "CPU";
-
-    std::vector<std::string> fname_inp = {};
-    std::vector<std::string> fname_out = {};
-};
-
-void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
-
-bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
-    for (int i = 1; i < argc; i++) {
-        std::string arg = argv[i];
-
-        if (arg == "-"){
-            params.fname_inp.push_back(arg);
-            continue;
-        }
-
-        if (arg[0] != '-') {
-            params.fname_inp.push_back(arg);
-            continue;
-        }
-
-        if (arg == "-h" || arg == "--help") {
-            whisper_print_usage(argc, argv, params);
-            exit(0);
-        }
-        else if (arg == "-t"    || arg == "--threads")         { params.n_threads       = std::stoi(argv[++i]); }
-        else if (arg == "-p"    || arg == "--processors")      { params.n_processors    = std::stoi(argv[++i]); }
-        else if (arg == "-ot"   || arg == "--offset-t")        { params.offset_t_ms     = std::stoi(argv[++i]); }
-        else if (arg == "-on"   || arg == "--offset-n")        { params.offset_n        = std::stoi(argv[++i]); }
-        else if (arg == "-d"    || arg == "--duration")        { params.duration_ms     = std::stoi(argv[++i]); }
-        else if (arg == "-mc"   || arg == "--max-context")     { params.max_context     = std::stoi(argv[++i]); }
-        else if (arg == "-ml"   || arg == "--max-len")         { params.max_len         = std::stoi(argv[++i]); }
-        else if (arg == "-bo"   || arg == "--best-of")         { params.best_of         = std::stoi(argv[++i]); }
-        else if (arg == "-bs"   || arg == "--beam-size")       { params.beam_size       = std::stoi(argv[++i]); }
-        else if (arg == "-wt"   || arg == "--word-thold")      { params.word_thold      = std::stof(argv[++i]); }
-        else if (arg == "-et"   || arg == "--entropy-thold")   { params.entropy_thold   = std::stof(argv[++i]); }
-        else if (arg == "-lpt"  || arg == "--logprob-thold")   { params.logprob_thold   = std::stof(argv[++i]); }
-        // else if (arg == "-su"   || arg == "--speed-up")        { params.speed_up        = true; }
-        else if (arg == "-debug"|| arg == "--debug-mode")      { params.debug_mode      = true; }
-        else if (arg == "-tr"   || arg == "--translate")       { params.translate       = true; }
-        else if (arg == "-di"   || arg == "--diarize")         { params.diarize         = true; }
-        else if (arg == "-tdrz" || arg == "--tinydiarize")     { params.tinydiarize     = true; }
-        else if (arg == "-sow"  || arg == "--split-on-word")   { params.split_on_word   = true; }
-        else if (arg == "-nf"   || arg == "--no-fallback")     { params.no_fallback     = true; }
-        else if (arg == "-otxt" || arg == "--output-txt")      { params.output_txt      = true; }
-        else if (arg == "-ovtt" || arg == "--output-vtt")      { params.output_vtt      = true; }
-        else if (arg == "-osrt" || arg == "--output-srt")      { params.output_srt      = true; }
-        else if (arg == "-owts" || arg == "--output-words")    { params.output_wts      = true; }
-        else if (arg == "-olrc" || arg == "--output-lrc")      { params.output_lrc      = true; }
-        else if (arg == "-fp"   || arg == "--font-path")       { params.font_path       = argv[++i]; }
-        else if (arg == "-ocsv" || arg == "--output-csv")      { params.output_csv      = true; }
-        else if (arg == "-oj"   || arg == "--output-json")     { params.output_jsn      = true; }
-        else if (arg == "-ojf"  || arg == "--output-json-full"){ params.output_jsn_full = params.output_jsn = true; }
-        else if (arg == "-of"   || arg == "--output-file")     { params.fname_out.emplace_back(argv[++i]); }
-        else if (arg == "-np"   || arg == "--no-prints")       { params.no_prints       = true; }
-        else if (arg == "-ps"   || arg == "--print-special")   { params.print_special   = true; }
-        else if (arg == "-pc"   || arg == "--print-colors")    { params.print_colors    = true; }
-        else if (arg == "-pp"   || arg == "--print-progress")  { params.print_progress  = true; }
-        else if (arg == "-nt"   || arg == "--no-timestamps")   { params.no_timestamps   = true; }
-        else if (arg == "-l"    || arg == "--language")        { params.language        = argv[++i]; }
-        else if (arg == "-dl"   || arg == "--detect-language") { params.detect_language = true; }
-        else if (                  arg == "--prompt")          { params.prompt          = argv[++i]; }
-        else if (arg == "-m"    || arg == "--model")           { params.model           = argv[++i]; }
-        else if (arg == "-f"    || arg == "--file")            { params.fname_inp.emplace_back(argv[++i]); }
-        else if (arg == "-oved" || arg == "--ov-e-device")     { params.openvino_encode_device = argv[++i]; }
-        else if (arg == "-ls"   || arg == "--log-score")       { params.log_score       = true; }
-        else if (arg == "-ng"   || arg == "--no-gpu")          { params.use_gpu         = false; }
-        else {
-            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
-            whisper_print_usage(argc, argv, params);
-            exit(0);
-        }
-    }
-
-    return true;
-}
-
-void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
-    fprintf(stderr, "\n");
-    fprintf(stderr, "usage: %s [options] file0.wav file1.wav ...\n", argv[0]);
-    fprintf(stderr, "\n");
-    fprintf(stderr, "options:\n");
-    fprintf(stderr, "  -h,        --help              [default] show this help message and exit\n");
-    fprintf(stderr, "  -t N,      --threads N         [%-7d] number of threads to use during computation\n",    params.n_threads);
-    fprintf(stderr, "  -p N,      --processors N      [%-7d] number of processors to use during computation\n", params.n_processors);
-    fprintf(stderr, "  -ot N,     --offset-t N        [%-7d] time offset in milliseconds\n",                    params.offset_t_ms);
-    fprintf(stderr, "  -on N,     --offset-n N        [%-7d] segment index offset\n",                           params.offset_n);
-    fprintf(stderr, "  -d  N,     --duration N        [%-7d] duration of audio to process in milliseconds\n",   params.duration_ms);
-    fprintf(stderr, "  -mc N,     --max-context N     [%-7d] maximum number of text context tokens to store\n", params.max_context);
-    fprintf(stderr, "  -ml N,     --max-len N         [%-7d] maximum segment length in characters\n",           params.max_len);
-    fprintf(stderr, "  -sow,      --split-on-word     [%-7s] split on word rather than on token\n",             params.split_on_word ? "true" : "false");
-    fprintf(stderr, "  -bo N,     --best-of N         [%-7d] number of best candidates to keep\n",              params.best_of);
-    fprintf(stderr, "  -bs N,     --beam-size N       [%-7d] beam size for beam search\n",                      params.beam_size);
-    fprintf(stderr, "  -wt N,     --word-thold N      [%-7.2f] word timestamp probability threshold\n",         params.word_thold);
-    fprintf(stderr, "  -et N,     --entropy-thold N   [%-7.2f] entropy threshold for decoder fail\n",           params.entropy_thold);
-    fprintf(stderr, "  -lpt N,    --logprob-thold N   [%-7.2f] log probability threshold for decoder fail\n",   params.logprob_thold);
-    // fprintf(stderr, "  -su,       --speed-up          [%-7s] speed up audio by x2 (reduced accuracy)\n",        params.speed_up ? "true" : "false");
-    fprintf(stderr, "  -debug,    --debug-mode        [%-7s] enable debug mode (eg. dump log_mel)\n",           params.debug_mode ? "true" : "false");
-    fprintf(stderr, "  -tr,       --translate         [%-7s] translate from source language to english\n",      params.translate ? "true" : "false");
-    fprintf(stderr, "  -di,       --diarize           [%-7s] stereo audio diarization\n",                       params.diarize ? "true" : "false");
-    fprintf(stderr, "  -tdrz,     --tinydiarize       [%-7s] enable tinydiarize (requires a tdrz model)\n",     params.tinydiarize ? "true" : "false");
-    fprintf(stderr, "  -nf,       --no-fallback       [%-7s] do not use temperature fallback while decoding\n", params.no_fallback ? "true" : "false");
-    fprintf(stderr, "  -otxt,     --output-txt        [%-7s] output result in a text file\n",                   params.output_txt ? "true" : "false");
-    fprintf(stderr, "  -ovtt,     --output-vtt        [%-7s] output result in a vtt file\n",                    params.output_vtt ? "true" : "false");
-    fprintf(stderr, "  -osrt,     --output-srt        [%-7s] output result in a srt file\n",                    params.output_srt ? "true" : "false");
-    fprintf(stderr, "  -olrc,     --output-lrc        [%-7s] output result in a lrc file\n",                    params.output_lrc ? "true" : "false");
-    fprintf(stderr, "  -owts,     --output-words      [%-7s] output script for generating karaoke video\n",     params.output_wts ? "true" : "false");
-    fprintf(stderr, "  -fp,       --font-path         [%-7s] path to a monospace font for karaoke video\n",     params.font_path.c_str());
-    fprintf(stderr, "  -ocsv,     --output-csv        [%-7s] output result in a CSV file\n",                    params.output_csv ? "true" : "false");
-    fprintf(stderr, "  -oj,       --output-json       [%-7s] output result in a JSON file\n",                   params.output_jsn ? "true" : "false");
-    fprintf(stderr, "  -ojf,      --output-json-full  [%-7s] include more information in the JSON file\n",      params.output_jsn_full ? "true" : "false");
-    fprintf(stderr, "  -of FNAME, --output-file FNAME [%-7s] output file path (without file extension)\n",      "");
-    fprintf(stderr, "  -np,       --no-prints         [%-7s] do not print anything other than the results\n",   params.no_prints ? "true" : "false");
-    fprintf(stderr, "  -ps,       --print-special     [%-7s] print special tokens\n",                           params.print_special ? "true" : "false");
-    fprintf(stderr, "  -pc,       --print-colors      [%-7s] print colors\n",                                   params.print_colors ? "true" : "false");
-    fprintf(stderr, "  -pp,       --print-progress    [%-7s] print progress\n",                                 params.print_progress ? "true" : "false");
-    fprintf(stderr, "  -nt,       --no-timestamps     [%-7s] do not print timestamps\n",                        params.no_timestamps ? "true" : "false");
-    fprintf(stderr, "  -l LANG,   --language LANG     [%-7s] spoken language ('auto' for auto-detect)\n",       params.language.c_str());
-    fprintf(stderr, "  -dl,       --detect-language   [%-7s] exit after automatically detecting language\n",    params.detect_language ? "true" : "false");
-    fprintf(stderr, "             --prompt PROMPT     [%-7s] initial prompt\n",                                 params.prompt.c_str());
-    fprintf(stderr, "  -m FNAME,  --model FNAME       [%-7s] model path\n",                                     params.model.c_str());
-    fprintf(stderr, "  -f FNAME,  --file FNAME        [%-7s] input WAV file path\n",                            "");
-    fprintf(stderr, "  -oved D,   --ov-e-device DNAME [%-7s] the OpenVINO device used for encode inference\n",  params.openvino_encode_device.c_str());
-    fprintf(stderr, "  -ls,       --log-score         [%-7s] log best decoder scores of tokens\n",              params.log_score?"true":"false");
-    fprintf(stderr, "  -ng,       --no-gpu            [%-7s] disable GPU\n",                                    params.use_gpu ? "false" : "true");
-    fprintf(stderr, "\n");
-}
-
-struct whisper_print_user_data {
-    const whisper_params * params;
-
-    const std::vector<std::vector<float>> * pcmf32s;
-    int progress_prev;
-};
-
-std::string estimate_diarization_speaker(std::vector<std::vector<float>> pcmf32s, int64_t t0, int64_t t1, bool id_only = false) {
-    std::string speaker = "";
-    const int64_t n_samples = pcmf32s[0].size();
-
-    const int64_t is0 = timestamp_to_sample(t0, n_samples);
-    const int64_t is1 = timestamp_to_sample(t1, n_samples);
-
-    double energy0 = 0.0f;
-    double energy1 = 0.0f;
-
-    for (int64_t j = is0; j < is1; j++) {
-        energy0 += fabs(pcmf32s[0][j]);
-        energy1 += fabs(pcmf32s[1][j]);
-    }
-
-    if (energy0 > 1.1*energy1) {
-        speaker = "0";
-    } else if (energy1 > 1.1*energy0) {
-        speaker = "1";
-    } else {
-        speaker = "?";
-    }
-
-    //printf("is0 = %lld, is1 = %lld, energy0 = %f, energy1 = %f, speaker = %s\n", is0, is1, energy0, energy1, speaker.c_str());
-
-    if (!id_only) {
-        speaker.insert(0, "(speaker ");
-        speaker.append(")");
-    }
-
-    return speaker;
-}
-void whisper_print_progress_callback(struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, int progress, void * user_data) {
-    int progress_step = ((whisper_print_user_data *) user_data)->params->progress_step;
-    int * progress_prev  = &(((whisper_print_user_data *) user_data)->progress_prev);
-    if (progress >= *progress_prev + progress_step) {
-        *progress_prev += progress_step;
-        fprintf(stderr, "%s: progress = %3d%%\n", __func__, progress);
-    }
-}
-
-void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper_state * /*state*/, int n_new, void * user_data) {
-    const auto & params  = *((whisper_print_user_data *) user_data)->params;
-    const auto & pcmf32s = *((whisper_print_user_data *) user_data)->pcmf32s;
-
-    const int n_segments = whisper_full_n_segments(ctx);
-
-    std::string speaker = "";
-
-    int64_t t0 = 0;
-    int64_t t1 = 0;
-
-    // print the last n_new segments
-    const int s0 = n_segments - n_new;
-
-    if (s0 == 0) {
-        printf("\n");
-    }
-
-    for (int i = s0; i < n_segments; i++) {
-        if (!params.no_timestamps || params.diarize) {
-            t0 = whisper_full_get_segment_t0(ctx, i);
-            t1 = whisper_full_get_segment_t1(ctx, i);
-        }
-
-        if (!params.no_timestamps) {
-            printf("[%s --> %s]  ", to_timestamp(t0).c_str(), to_timestamp(t1).c_str());
-        }
-
-        if (params.diarize && pcmf32s.size() == 2) {
-            speaker = estimate_diarization_speaker(pcmf32s, t0, t1);
-        }
-
-        if (params.print_colors) {
-            for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) {
-                if (params.print_special == false) {
-                    const whisper_token id = whisper_full_get_token_id(ctx, i, j);
-                    if (id >= whisper_token_eot(ctx)) {
-                        continue;
-                    }
-                }
-
-                const char * text = whisper_full_get_token_text(ctx, i, j);
-                const float  p    = whisper_full_get_token_p   (ctx, i, j);
-
-                const int col = std::max(0, std::min((int) k_colors.size() - 1, (int) (std::pow(p, 3)*float(k_colors.size()))));
-
-                printf("%s%s%s%s", speaker.c_str(), k_colors[col].c_str(), text, "\033[0m");
-            }
-        } else {
-            const char * text = whisper_full_get_segment_text(ctx, i);
-
-            printf("%s%s", speaker.c_str(), text);
-        }
-
-        if (params.tinydiarize) {
-            if (whisper_full_get_segment_speaker_turn_next(ctx, i)) {
-                printf("%s", params.tdrz_speaker_turn.c_str());
-            }
-        }
-
-        // with timestamps or speakers: each segment on new line
-        if (!params.no_timestamps || params.diarize) {
-            printf("\n");
-        }
-
-        fflush(stdout);
-    }
-}
-
-bool output_txt(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
-    std::ofstream fout(fname);
-    if (!fout.is_open()) {
-        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
-        return false;
-    }
-
-    fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
-
-    const int n_segments = whisper_full_n_segments(ctx);
-    for (int i = 0; i < n_segments; ++i) {
-        const char * text = whisper_full_get_segment_text(ctx, i);
-        std::string speaker = "";
-
-        if (params.diarize && pcmf32s.size() == 2)
-        {
-            const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
-            const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
-            speaker = estimate_diarization_speaker(pcmf32s, t0, t1);
-        }
-
-        fout << speaker << text << "\n";
-    }
-
-    return true;
-}
-
-bool output_vtt(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
-    std::ofstream fout(fname);
-    if (!fout.is_open()) {
-        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
-        return false;
-    }
-
-    fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
-
-    fout << "WEBVTT\n\n";
-
-    const int n_segments = whisper_full_n_segments(ctx);
-    for (int i = 0; i < n_segments; ++i) {
-        const char * text = whisper_full_get_segment_text(ctx, i);
-        const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
-        const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
-        std::string speaker = "";
-
-        if (params.diarize && pcmf32s.size() == 2)
-        {
-            speaker = estimate_diarization_speaker(pcmf32s, t0, t1, true);
-            speaker.insert(0, "<v Speaker");
-            speaker.append(">");
-        }
-
-        fout << to_timestamp(t0) << " --> " << to_timestamp(t1) << "\n";
-        fout << speaker << text << "\n\n";
-    }
-
-    return true;
-}
-
-bool output_srt(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
-    std::ofstream fout(fname);
-    if (!fout.is_open()) {
-        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
-        return false;
-    }
-
-    fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
-
-    const int n_segments = whisper_full_n_segments(ctx);
-    for (int i = 0; i < n_segments; ++i) {
-        const char * text = whisper_full_get_segment_text(ctx, i);
-        const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
-        const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
-        std::string speaker = "";
-
-        if (params.diarize && pcmf32s.size() == 2)
-        {
-            speaker = estimate_diarization_speaker(pcmf32s, t0, t1);
-        }
-
-        fout << i + 1 + params.offset_n << "\n";
-        fout << to_timestamp(t0, true) << " --> " << to_timestamp(t1, true) << "\n";
-        fout << speaker << text << "\n\n";
-    }
-
-    return true;
-}
-
-char *escape_double_quotes_and_backslashes(const char *str) {
-    if (str == NULL) {
-        return NULL;
-    }
-
-    size_t escaped_length = strlen(str) + 1;
-
-    for (size_t i = 0; str[i] != '\0'; i++) {
-        if (str[i] == '"' || str[i] == '\\') {
-            escaped_length++;
-        }
-    }
-
-    char *escaped = (char *)calloc(escaped_length, 1); // pre-zeroed
-    if (escaped == NULL) {
-        return NULL;
-    }
-
-    size_t pos = 0;
-    for (size_t i = 0; str[i] != '\0'; i++) {
-        if (str[i] == '"' || str[i] == '\\') {
-            escaped[pos++] = '\\';
-        }
-        escaped[pos++] = str[i];
-    }
-
-    // no need to set zero due to calloc() being used prior
-
-    return escaped;
-}
-
-bool output_csv(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
-    std::ofstream fout(fname);
-    if (!fout.is_open()) {
-        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
-        return false;
-    }
-
-    fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
-
-    const int n_segments = whisper_full_n_segments(ctx);
-    fout << "start,end,";
-    if (params.diarize && pcmf32s.size() == 2)
-    {
-        fout << "speaker,";
-    }
-    fout << "text\n";
-
-    for (int i = 0; i < n_segments; ++i) {
-        const char * text = whisper_full_get_segment_text(ctx, i);
-        const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
-        const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
-        char * text_escaped = escape_double_quotes_and_backslashes(text);
-
-        //need to multiply times returned from whisper_full_get_segment_t{0,1}() by 10 to get milliseconds.
-        fout << 10 * t0 << "," << 10 * t1 << ",";
-        if (params.diarize && pcmf32s.size() == 2)
-        {
-            fout << estimate_diarization_speaker(pcmf32s, t0, t1, true) << ",";
-        }
-        fout << "\"" << text_escaped << "\"\n";
-    }
-
-    return true;
-}
-
-bool output_score(struct whisper_context * ctx, const char * fname, const whisper_params & /*params*/, std::vector<std::vector<float>> /*pcmf32s*/) {
-    std::ofstream fout(fname);
-    fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
-
-    const int n_segments = whisper_full_n_segments(ctx);
-    // fprintf(stderr,"segments: %d\n",n_segments);
-    for (int i = 0; i < n_segments; ++i) {
-        const int n_tokens = whisper_full_n_tokens(ctx, i);
-        // fprintf(stderr,"tokens: %d\n",n_tokens);
-        for (int j = 0; j < n_tokens; j++) {
-            auto token = whisper_full_get_token_text(ctx, i, j);
-            auto probability = whisper_full_get_token_p(ctx, i, j);
-            fout << token << '\t' << probability << std::endl;
-            // fprintf(stderr,"token: %s %f\n",token,probability);
-	    }
-    }
-    return true;
-}
-
-bool output_json(
-             struct whisper_context * ctx,
-                         const char * fname,
-               const whisper_params & params,
-    std::vector<std::vector<float>>   pcmf32s,
-                               bool   full) {
-    std::ofstream fout(fname);
-    int indent = 0;
-
-    auto doindent = [&]() {
-        for (int i = 0; i < indent; i++) fout << "\t";
-    };
-
-    auto start_arr = [&](const char *name) {
-        doindent();
-        fout << "\"" << name << "\": [\n";
-        indent++;
-    };
-
-    auto end_arr = [&](bool end) {
-        indent--;
-        doindent();
-        fout << (end ? "]\n" : "],\n");
-    };
-
-    auto start_obj = [&](const char *name) {
-        doindent();
-        if (name) {
-            fout << "\"" << name << "\": {\n";
-        } else {
-            fout << "{\n";
-        }
-        indent++;
-    };
-
-    auto end_obj = [&](bool end) {
-        indent--;
-        doindent();
-        fout << (end ? "}\n" : "},\n");
-    };
-
-    auto start_value = [&](const char *name) {
-        doindent();
-        fout << "\"" << name << "\": ";
-    };
-
-    auto value_s = [&](const char *name, const char *val, bool end) {
-        start_value(name);
-        char * val_escaped = escape_double_quotes_and_backslashes(val);
-        fout << "\"" << val_escaped << (end ? "\"\n" : "\",\n");
-        free(val_escaped);
-    };
-
-    auto end_value = [&](bool end) {
-        fout << (end ? "\n" : ",\n");
-    };
-
-    auto value_i = [&](const char *name, const int64_t val, bool end) {
-        start_value(name);
-        fout << val;
-        end_value(end);
-    };
-
-    auto value_f = [&](const char *name, const float val, bool end) {
-        start_value(name);
-        fout << val;
-        end_value(end);
-    };
-
-    auto value_b = [&](const char *name, const bool val, bool end) {
-        start_value(name);
-        fout << (val ? "true" : "false");
-        end_value(end);
-    };
-
-    auto times_o = [&](int64_t t0, int64_t t1, bool end) {
-        start_obj("timestamps");
-        value_s("from", to_timestamp(t0, true).c_str(), false);
-        value_s("to", to_timestamp(t1, true).c_str(), true);
-        end_obj(false);
-        start_obj("offsets");
-        value_i("from", t0 * 10, false);
-        value_i("to", t1 * 10, true);
-        end_obj(end);
-    };
-
-    if (!fout.is_open()) {
-        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
-        return false;
-    }
-
-    fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
-    start_obj(nullptr);
-        value_s("systeminfo", whisper_print_system_info(), false);
-        start_obj("model");
-            value_s("type", whisper_model_type_readable(ctx), false);
-            value_b("multilingual", whisper_is_multilingual(ctx), false);
-            value_i("vocab", whisper_model_n_vocab(ctx), false);
-            start_obj("audio");
-                value_i("ctx", whisper_model_n_audio_ctx(ctx), false);
-                value_i("state", whisper_model_n_audio_state(ctx), false);
-                value_i("head", whisper_model_n_audio_head(ctx), false);
-                value_i("layer", whisper_model_n_audio_layer(ctx), true);
-            end_obj(false);
-            start_obj("text");
-                value_i("ctx", whisper_model_n_text_ctx(ctx), false);
-                value_i("state", whisper_model_n_text_state(ctx), false);
-                value_i("head", whisper_model_n_text_head(ctx), false);
-                value_i("layer", whisper_model_n_text_layer(ctx), true);
-            end_obj(false);
-            value_i("mels", whisper_model_n_mels(ctx), false);
-            value_i("ftype", whisper_model_ftype(ctx), true);
-        end_obj(false);
-        start_obj("params");
-            value_s("model", params.model.c_str(), false);
-            value_s("language", params.language.c_str(), false);
-            value_b("translate", params.translate, true);
-        end_obj(false);
-        start_obj("result");
-            value_s("language", whisper_lang_str(whisper_full_lang_id(ctx)), true);
-        end_obj(false);
-        start_arr("transcription");
-
-            const int n_segments = whisper_full_n_segments(ctx);
-            for (int i = 0; i < n_segments; ++i) {
-                const char * text = whisper_full_get_segment_text(ctx, i);
-
-                const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
-                const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
-
-                start_obj(nullptr);
-                    times_o(t0, t1, false);
-                    value_s("text", text, !params.diarize && !params.tinydiarize && !full);
-
-                    if (full) {
-                        start_arr("tokens");
-                        const int n = whisper_full_n_tokens(ctx, i);
-                        for (int j = 0; j < n; ++j) {
-                            auto token = whisper_full_get_token_data(ctx, i, j);
-                            start_obj(nullptr);
-                                value_s("text", whisper_token_to_str(ctx, token.id), false);
-                                if(token.t0 > -1 && token.t1 > -1) {
-                                    // If we have per-token timestamps, write them out
-                                    times_o(token.t0, token.t1, false);
-                                }
-                                value_i("id", token.id, false);
-                                value_f("p", token.p, true);
-                            end_obj(j == (n - 1));
-                        }
-                        end_arr(!params.diarize && !params.tinydiarize);
-                    }
-
-                    if (params.diarize && pcmf32s.size() == 2) {
-                        value_s("speaker", estimate_diarization_speaker(pcmf32s, t0, t1, true).c_str(), true);
-                    }
-
-                    if (params.tinydiarize) {
-                        value_b("speaker_turn_next", whisper_full_get_segment_speaker_turn_next(ctx, i), true);
-                    }
-                end_obj(i == (n_segments - 1));
-            }
-
-        end_arr(true);
-    end_obj(true);
-    return true;
-}
-
-// karaoke video generation
-// outputs a bash script that uses ffmpeg to generate a video with the subtitles
-// TODO: font parameter adjustments
-bool output_wts(struct whisper_context * ctx, const char * fname, const char * fname_inp, const whisper_params & params, float t_sec, std::vector<std::vector<float>> pcmf32s) {
-    std::ofstream fout(fname);
-
-    fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
-
-    static const char * font = params.font_path.c_str();
-
-    std::ifstream fin(font);
-    if (!fin.is_open()) {
-        fprintf(stderr, "%s: font not found at '%s', please specify a monospace font with -fp\n", __func__, font);
-        return false;
-    }
-
-    fout << "#!/bin/bash" << "\n";
-    fout << "\n";
-
-    fout << "ffmpeg -i " << fname_inp << " -f lavfi -i color=size=1200x120:duration=" << t_sec << ":rate=25:color=black -vf \"";
-
-    for (int i = 0; i < whisper_full_n_segments(ctx); i++) {
-        const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
-        const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
-
-        const int n = whisper_full_n_tokens(ctx, i);
-
-        std::vector<whisper_token_data> tokens(n);
-        for (int j = 0; j < n; ++j) {
-            tokens[j] = whisper_full_get_token_data(ctx, i, j);
-        }
-
-        if (i > 0) {
-            fout << ",";
-        }
-
-        // background text
-        fout << "drawtext=fontfile='" << font << "':fontsize=24:fontcolor=gray:x=(w-text_w)/2:y=h/2:text='':enable='between(t," << t0/100.0 << "," << t0/100.0 << ")'";
-
-        bool is_first = true;
-        std::string speaker = "";
-
-        if (params.diarize && pcmf32s.size() == 2) {
-            speaker = estimate_diarization_speaker(pcmf32s, t0, t1);
-        }
-
-        for (int j = 0; j < n; ++j) {
-            const auto & token = tokens[j];
-
-            if (tokens[j].id >= whisper_token_eot(ctx)) {
-                continue;
-            }
-
-            std::string txt_bg = "";
-            std::string txt_fg = ""; // highlight token
-            std::string txt_ul = ""; // underline
-
-            if (params.diarize && pcmf32s.size() == 2) {
-                txt_bg = speaker;
-                txt_fg = speaker;
-                txt_ul = "\\ \\ \\ \\ \\ \\ \\ \\ \\ \\ \\ ";
-            }
-
-            txt_bg.append("> ");
-            txt_fg.append("> ");
-            txt_ul.append("\\ \\ ");
-
-            {
-                for (int k = 0; k < n; ++k) {
-                    const auto & token2 = tokens[k];
-
-                    if (tokens[k].id >= whisper_token_eot(ctx)) {
-                        continue;
-                    }
-
-                    const std::string txt = whisper_token_to_str(ctx, token2.id);
-
-                    txt_bg += txt;
-
-                    if (k == j) {
-                        for (int l = 0; l < (int) txt.size(); ++l) {
-                            txt_fg += txt[l];
-                            txt_ul += "_";
-                        }
-                        txt_fg += "|";
-                    } else {
-                        for (int l = 0; l < (int) txt.size(); ++l) {
-                            txt_fg += "\\ ";
-                            txt_ul += "\\ ";
-                        }
-                    }
-                }
-
-                ::replace_all(txt_bg, "'", "\u2019");
-                ::replace_all(txt_bg, "\"", "\\\"");
-                ::replace_all(txt_fg, "'", "\u2019");
-                ::replace_all(txt_fg, "\"", "\\\"");
-            }
-
-            if (is_first) {
-                // background text
-                fout << ",drawtext=fontfile='" << font << "':fontsize=24:fontcolor=gray:x=(w-text_w)/2:y=h/2:text='" << txt_bg << "':enable='between(t," << t0/100.0 << "," << t1/100.0 << ")'";
-                is_first = false;
-            }
-
-            // foreground text
-            fout << ",drawtext=fontfile='" << font << "':fontsize=24:fontcolor=lightgreen:x=(w-text_w)/2+8:y=h/2:text='" << txt_fg << "':enable='between(t," << token.t0/100.0 << "," << token.t1/100.0 << ")'";
-
-            // underline
-            fout << ",drawtext=fontfile='" << font << "':fontsize=24:fontcolor=lightgreen:x=(w-text_w)/2+8:y=h/2+16:text='" << txt_ul << "':enable='between(t," << token.t0/100.0 << "," << token.t1/100.0 << ")'";
-        }
-    }
-
-    fout << "\" -c:v libx264 -pix_fmt yuv420p -y " << fname_inp << ".mp4" << "\n";
-
-    fout << "\n\n";
-    fout << "echo \"Your video has been saved to " << fname_inp << ".mp4\"" << "\n";
-    fout << "\n";
-    fout << "echo \"  ffplay " << fname_inp << ".mp4\"\n";
-    fout << "\n";
-
-    fout.close();
-
-    fprintf(stderr, "%s: run 'source %s' to generate karaoke video\n", __func__, fname);
-
-    return true;
-}
-
-bool output_lrc(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
-    std::ofstream fout(fname);
-    if (!fout.is_open()) {
-        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
-        return false;
-    }
-
-    fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
-
-    fout << "[by:whisper.cpp]\n";
-
-    const int n_segments = whisper_full_n_segments(ctx);
-    for (int i = 0; i < n_segments; ++i) {
-        const char * text = whisper_full_get_segment_text(ctx, i);
-        const int64_t t = whisper_full_get_segment_t0(ctx, i);
-
-        int64_t msec = t * 10;
-        int64_t min = msec / (1000 * 60);
-        msec = msec - min * (1000 * 60);
-        int64_t sec = msec / 1000;
-        msec = msec - sec * 1000;
-
-        char buf[16];
-        snprintf(buf, sizeof(buf), "%02d:%02d.%02d", (int) min, (int) sec, (int) ( msec / 10));
-        std::string timestamp_lrc = std::string(buf);
-        std::string speaker = "";
-
-        if (params.diarize && pcmf32s.size() == 2)
-        {
-            const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
-            const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
-            speaker = estimate_diarization_speaker(pcmf32s, t0, t1);
-        }
-
-        fout <<  '[' << timestamp_lrc << ']' << speaker << text << "\n";
-    }
-
-    return true;
-}
-
-
-void cb_log_disable(enum ggml_log_level , const char * , void * ) { }
-
-int main(int argc, char ** argv) {
-    whisper_params params;
-
-    if (whisper_params_parse(argc, argv, params) == false) {
-        whisper_print_usage(argc, argv, params);
-        return 1;
-    }
-
-    if (params.fname_inp.empty()) {
-        fprintf(stderr, "error: no input files specified\n");
-        whisper_print_usage(argc, argv, params);
-        return 2;
-    }
-
-    if (params.language != "auto" && whisper_lang_id(params.language.c_str()) == -1) {
-        fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
-        whisper_print_usage(argc, argv, params);
-        exit(0);
-    }
-
-    if (params.diarize && params.tinydiarize) {
-        fprintf(stderr, "error: cannot use both --diarize and --tinydiarize\n");
-        whisper_print_usage(argc, argv, params);
-        exit(0);
-    }
-
-    if (params.no_prints) {
-        whisper_log_set(cb_log_disable, NULL);
-    }
-
-    // whisper init
-
-    struct whisper_context_params cparams;
-    cparams.use_gpu = params.use_gpu;
-
-    struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
-
-    if (ctx == nullptr) {
-        fprintf(stderr, "error: failed to initialize whisper context\n");
-        return 3;
-    }
-
-    // initialize openvino encoder. this has no effect on whisper.cpp builds that don't have OpenVINO configured
-    whisper_ctx_init_openvino_encoder(ctx, nullptr, params.openvino_encode_device.c_str(), nullptr);
-
-    for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
-        const auto fname_inp = params.fname_inp[f];
-		const auto fname_out = f < (int) params.fname_out.size() && !params.fname_out[f].empty() ? params.fname_out[f] : params.fname_inp[f];
-
-        std::vector<float> pcmf32;               // mono-channel F32 PCM
-        std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
-
-        if (!::read_wav(fname_inp, pcmf32, pcmf32s, params.diarize)) {
-            fprintf(stderr, "error: failed to read WAV file '%s'\n", fname_inp.c_str());
-            continue;
-        }
-
-        if (!whisper_is_multilingual(ctx)) {
-            if (params.language != "en" || params.translate) {
-                params.language = "en";
-                params.translate = false;
-                fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
-            }
-        }
-        if (params.detect_language) {
-            params.language = "auto";
-        }
-
-        if (!params.no_prints) {
-            // print system information
-            fprintf(stderr, "\n");
-            fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
-                    params.n_threads*params.n_processors, std::thread::hardware_concurrency(), whisper_print_system_info());
-
-            // print some info about the processing
-            fprintf(stderr, "\n");
-            fprintf(stderr, "%s: processing '%s' (%d samples, %.1f sec), %d threads, %d processors, %d beams + best of %d, lang = %s, task = %s, %stimestamps = %d ...\n",
-                    __func__, fname_inp.c_str(), int(pcmf32.size()), float(pcmf32.size())/WHISPER_SAMPLE_RATE,
-                    params.n_threads, params.n_processors, params.beam_size, params.best_of,
-                    params.language.c_str(),
-                    params.translate ? "translate" : "transcribe",
-                    params.tinydiarize ? "tdrz = 1, " : "",
-                    params.no_timestamps ? 0 : 1);
-
-            fprintf(stderr, "\n");
-        }
-
-        // run the inference
-        {
-            whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
-
-            wparams.strategy = params.beam_size > 1 ? WHISPER_SAMPLING_BEAM_SEARCH : WHISPER_SAMPLING_GREEDY;
-
-            wparams.print_realtime   = false;
-            wparams.print_progress   = params.print_progress;
-            wparams.print_timestamps = !params.no_timestamps;
-            wparams.print_special    = params.print_special;
-            wparams.translate        = params.translate;
-            wparams.language         = params.language.c_str();
-            wparams.detect_language  = params.detect_language;
-            wparams.n_threads        = params.n_threads;
-            wparams.n_max_text_ctx   = params.max_context >= 0 ? params.max_context : wparams.n_max_text_ctx;
-            wparams.offset_ms        = params.offset_t_ms;
-            wparams.duration_ms      = params.duration_ms;
-
-            wparams.token_timestamps = params.output_wts || params.output_jsn_full || params.max_len > 0;
-            wparams.thold_pt         = params.word_thold;
-            wparams.max_len          = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
-            wparams.split_on_word    = params.split_on_word;
-
-            wparams.speed_up         = params.speed_up;
-            wparams.debug_mode       = params.debug_mode;
-
-            wparams.tdrz_enable      = params.tinydiarize; // [TDRZ]
-
-            wparams.initial_prompt   = params.prompt.c_str();
-
-            wparams.greedy.best_of        = params.best_of;
-            wparams.beam_search.beam_size = params.beam_size;
-
-            wparams.temperature_inc  = params.no_fallback ? 0.0f : wparams.temperature_inc;
-            wparams.entropy_thold    = params.entropy_thold;
-            wparams.logprob_thold    = params.logprob_thold;
-
-            wparams.no_timestamps    = params.no_timestamps;
-
-            whisper_print_user_data user_data = { &params, &pcmf32s, 0 };
-
-            // this callback is called on each new segment
-            if (!wparams.print_realtime) {
-                wparams.new_segment_callback           = whisper_print_segment_callback;
-                wparams.new_segment_callback_user_data = &user_data;
-            }
-
-            if (wparams.print_progress) {
-                wparams.progress_callback           = whisper_print_progress_callback;
-                wparams.progress_callback_user_data = &user_data;
-            }
-
-            // examples for abort mechanism
-            // in examples below, we do not abort the processing, but we could if the flag is set to true
-
-            // the callback is called before every encoder run - if it returns false, the processing is aborted
-            {
-                static bool is_aborted = false; // NOTE: this should be atomic to avoid data race
-
-                wparams.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * user_data) {
-                    bool is_aborted = *(bool*)user_data;
-                    return !is_aborted;
-                };
-                wparams.encoder_begin_callback_user_data = &is_aborted;
-            }
-
-            // the callback is called before every computation - if it returns true, the computation is aborted
-            {
-                static bool is_aborted = false; // NOTE: this should be atomic to avoid data race
-
-                wparams.abort_callback = [](void * user_data) {
-                    bool is_aborted = *(bool*)user_data;
-                    return is_aborted;
-                };
-                wparams.abort_callback_user_data = &is_aborted;
-            }
-
-            if (whisper_full_parallel(ctx, wparams, pcmf32.data(), pcmf32.size(), params.n_processors) != 0) {
-                fprintf(stderr, "%s: failed to process audio\n", argv[0]);
-                return 10;
-            }
-        }
-
-        // output stuff
-        {
-            printf("\n");
-
-            // output to text file
-            if (params.output_txt) {
-                const auto fname_txt = fname_out + ".txt";
-                output_txt(ctx, fname_txt.c_str(), params, pcmf32s);
-            }
-
-            // output to VTT file
-            if (params.output_vtt) {
-                const auto fname_vtt = fname_out + ".vtt";
-                output_vtt(ctx, fname_vtt.c_str(), params, pcmf32s);
-            }
-
-            // output to SRT file
-            if (params.output_srt) {
-                const auto fname_srt = fname_out + ".srt";
-                output_srt(ctx, fname_srt.c_str(), params, pcmf32s);
-            }
-
-            // output to WTS file
-            if (params.output_wts) {
-                const auto fname_wts = fname_out + ".wts";
-                output_wts(ctx, fname_wts.c_str(), fname_inp.c_str(), params, float(pcmf32.size() + 1000)/WHISPER_SAMPLE_RATE, pcmf32s);
-            }
-
-            // output to CSV file
-            if (params.output_csv) {
-                const auto fname_csv = fname_out + ".csv";
-                output_csv(ctx, fname_csv.c_str(), params, pcmf32s);
-            }
-
-            // output to JSON file
-            if (params.output_jsn) {
-                const auto fname_jsn = fname_out + ".json";
-                output_json(ctx, fname_jsn.c_str(), params, pcmf32s, params.output_jsn_full);
-            }
-
-            // output to LRC file
-            if (params.output_lrc) {
-                const auto fname_lrc = fname_out + ".lrc";
-                output_lrc(ctx, fname_lrc.c_str(), params, pcmf32s);
-            }
-
-            // output to score file
-            if (params.log_score) {
-                const auto fname_score = fname_out + ".score.txt";
-                output_score(ctx, fname_score.c_str(), params, pcmf32s);
-            }
-        }
-    }
-
-    whisper_print_timings(ctx);
-    whisper_free(ctx);
-
-    return 0;
-}
diff --git a/ggml/examples/whisper/quantize.cpp b/ggml/examples/whisper/quantize.cpp
deleted file mode 100644
index b01d614..0000000
--- a/ggml/examples/whisper/quantize.cpp
+++ /dev/null
@@ -1,223 +0,0 @@
-#include "ggml.h"
-
-#include "common.h"
-#include "common-ggml.h"
-
-#include <cassert>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <fstream>
-#include <map>
-#include <string>
-#include <vector>
-#include <regex>
-
-// default hparams (Whisper tiny)
-struct whisper_hparams {
-    int32_t n_vocab       = 51864;
-    int32_t n_audio_ctx   = 1500;
-    int32_t n_audio_state = 384;
-    int32_t n_audio_head  = 6;
-    int32_t n_audio_layer = 4;
-    int32_t n_text_ctx    = 448;
-    int32_t n_text_state  = 384;
-    int32_t n_text_head   = 6;
-    int32_t n_text_layer  = 4;
-    int32_t n_mels        = 80;
-    int32_t ftype         = 1;
-};
-
-struct whisper_filters {
-    int32_t n_mel;
-    int32_t n_fft;
-
-    std::vector<float> data;
-};
-
-// quantize a model
-bool whisper_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) {
-    gpt_vocab vocab;
-
-    printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
-
-    auto finp = std::ifstream(fname_inp, std::ios::binary);
-    if (!finp) {
-        fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__, fname_inp.c_str());
-        return false;
-    }
-
-    auto fout = std::ofstream(fname_out, std::ios::binary);
-    if (!fout) {
-        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname_out.c_str());
-        return false;
-    }
-
-    // verify magic
-    {
-        uint32_t magic;
-        finp.read((char *) &magic, sizeof(magic));
-        if (magic != GGML_FILE_MAGIC) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str());
-            return false;
-        }
-
-        fout.write((char *) &magic, sizeof(magic));
-    }
-
-    whisper_hparams hparams;
-
-    // load hparams
-    {
-        finp.read((char *) &hparams.n_vocab,       sizeof(hparams.n_vocab));
-        finp.read((char *) &hparams.n_audio_ctx,   sizeof(hparams.n_audio_ctx));
-        finp.read((char *) &hparams.n_audio_state, sizeof(hparams.n_audio_state));
-        finp.read((char *) &hparams.n_audio_head,  sizeof(hparams.n_audio_head));
-        finp.read((char *) &hparams.n_audio_layer, sizeof(hparams.n_audio_layer));
-        finp.read((char *) &hparams.n_text_ctx,    sizeof(hparams.n_text_ctx));
-        finp.read((char *) &hparams.n_text_state,  sizeof(hparams.n_text_state));
-        finp.read((char *) &hparams.n_text_head,   sizeof(hparams.n_text_head));
-        finp.read((char *) &hparams.n_text_layer,  sizeof(hparams.n_text_layer));
-        finp.read((char *) &hparams.n_mels,        sizeof(hparams.n_mels));
-        finp.read((char *) &hparams.ftype,         sizeof(hparams.ftype));
-
-        const int32_t qntvr_src =    hparams.ftype / GGML_QNT_VERSION_FACTOR;
-        const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
-
-        fprintf(stderr, "%s: n_vocab       = %d\n", __func__, hparams.n_vocab);
-        fprintf(stderr, "%s: n_audio_ctx   = %d\n", __func__, hparams.n_audio_ctx);
-        fprintf(stderr, "%s: n_audio_state = %d\n", __func__, hparams.n_audio_state);
-        fprintf(stderr, "%s: n_audio_head  = %d\n", __func__, hparams.n_audio_head);
-        fprintf(stderr, "%s: n_audio_layer = %d\n", __func__, hparams.n_audio_layer);
-        fprintf(stderr, "%s: n_text_ctx    = %d\n", __func__, hparams.n_text_ctx);
-        fprintf(stderr, "%s: n_text_state  = %d\n", __func__, hparams.n_text_state);
-        fprintf(stderr, "%s: n_text_head   = %d\n", __func__, hparams.n_text_head);
-        fprintf(stderr, "%s: n_text_layer  = %d\n", __func__, hparams.n_text_layer);
-        fprintf(stderr, "%s: n_mels        = %d\n", __func__, hparams.n_mels);
-        fprintf(stderr, "%s: ftype (src)   = %d\n", __func__, hparams.ftype);
-        fprintf(stderr, "%s: qntvr (src)   = %d\n", __func__, qntvr_src);
-        fprintf(stderr, "%s: ftype (dst)   = %d\n", __func__, ftype_dst);
-        fprintf(stderr, "%s: qntvr (dst)   = %d\n", __func__, GGML_QNT_VERSION);
-
-        fout.write((const char *) &hparams.n_vocab,       sizeof(hparams.n_vocab));
-        fout.write((const char *) &hparams.n_audio_ctx,   sizeof(hparams.n_audio_ctx));
-        fout.write((const char *) &hparams.n_audio_state, sizeof(hparams.n_audio_state));
-        fout.write((const char *) &hparams.n_audio_head,  sizeof(hparams.n_audio_head));
-        fout.write((const char *) &hparams.n_audio_layer, sizeof(hparams.n_audio_layer));
-        fout.write((const char *) &hparams.n_text_ctx,    sizeof(hparams.n_text_ctx));
-        fout.write((const char *) &hparams.n_text_state,  sizeof(hparams.n_text_state));
-        fout.write((const char *) &hparams.n_text_head,   sizeof(hparams.n_text_head));
-        fout.write((const char *) &hparams.n_text_layer,  sizeof(hparams.n_text_layer));
-        fout.write((const char *) &hparams.n_mels,        sizeof(hparams.n_mels));
-        fout.write((const char *) &ftype_dst,             sizeof(hparams.ftype));
-    }
-
-    // load mel filters
-    {
-        whisper_filters filters;
-
-        finp.read ((char *) &filters.n_mel, sizeof(filters.n_mel));
-        fout.write((char *) &filters.n_mel, sizeof(filters.n_mel));
-        finp.read ((char *) &filters.n_fft, sizeof(filters.n_fft));
-        fout.write((char *) &filters.n_fft, sizeof(filters.n_fft));
-
-        filters.data.resize(filters.n_mel * filters.n_fft);
-        finp.read ((char *) filters.data.data(), filters.data.size() * sizeof(float));
-        fout.write((char *) filters.data.data(), filters.data.size() * sizeof(float));
-    }
-
-    // load vocab
-    {
-        int32_t n_vocab = 0;
-        finp.read ((char *) &n_vocab, sizeof(n_vocab));
-        fout.write((char *) &n_vocab, sizeof(n_vocab));
-
-        //if (n_vocab != hparams.n_vocab) {
-        //    fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
-        //            __func__, fname_inp.c_str(), n_vocab, hparams.n_vocab);
-        //    return false;
-        //}
-
-        char word[129];
-
-        for (int i = 0; i < n_vocab; i++) {
-            uint32_t len;
-            finp.read ((char *) &len, sizeof(len));
-            fout.write((char *) &len, sizeof(len));
-
-            word[len] = '\0';
-
-            finp.read ((char *) word, len);
-            fout.write((char *) word, len);
-
-            vocab.token_to_id[word] = i;
-            vocab.id_to_token[i] = word;
-        }
-    }
-
-    // regexes of tensor names to not be quantized
-    const std::vector<std::string> to_skip = {
-        //"encoder.*",
-        "encoder.conv1.bias",
-        "encoder.conv2.bias",
-        "encoder.positional_embedding",
-        "decoder.positional_embedding",
-    };
-
-    if (!ggml_common_quantize_0(finp, fout, ftype, { ".*" }, to_skip)) {
-        fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str());
-        return false;
-    }
-
-    finp.close();
-    fout.close();
-
-    return true;
-}
-
-int main(int argc, char ** argv) {
-    if (argc != 4) {
-        fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
-        ggml_print_ftypes(stderr);
-        return 1;
-    }
-
-    // needed to initialize f16 tables
-    {
-        struct ggml_init_params params = { 0, NULL, false };
-        struct ggml_context * ctx = ggml_init(params);
-        ggml_free(ctx);
-    }
-
-    const std::string fname_inp = argv[1];
-    const std::string fname_out = argv[2];
-
-    const ggml_ftype ftype = ggml_parse_ftype(argv[3]);
-
-    const int64_t t_main_start_us = ggml_time_us();
-
-    int64_t t_quantize_us = 0;
-
-    // load the model
-    {
-        const int64_t t_start_us = ggml_time_us();
-
-        if (!whisper_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
-            fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
-            return 1;
-        }
-
-        t_quantize_us = ggml_time_us() - t_start_us;
-    }
-
-    // report timing
-    {
-        const int64_t t_main_end_us = ggml_time_us();
-
-        printf("\n");
-        printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f);
-        printf("%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
-    }
-
-    return 0;
-}
diff --git a/ggml/examples/whisper/whisper.cpp b/ggml/examples/whisper/whisper.cpp
deleted file mode 100644
index ba867b0..0000000
--- a/ggml/examples/whisper/whisper.cpp
+++ /dev/null
@@ -1,6673 +0,0 @@
-#include "whisper.h"
-
-#ifdef WHISPER_USE_COREML
-#include "coreml/whisper-encoder.h"
-#endif
-
-#ifdef GGML_USE_METAL
-#include "ggml-metal.h"
-#endif
-
-#ifdef GGML_USE_CUBLAS
-#include "ggml-cuda.h"
-#endif
-
-#ifdef WHISPER_USE_OPENVINO
-#include "openvino/whisper-openvino-encoder.h"
-#endif
-
-#include "ggml.h"
-#include "ggml-alloc.h"
-#include "ggml-backend.h"
-
-#include <atomic>
-#include <algorithm>
-#include <cassert>
-#define _USE_MATH_DEFINES
-#include <cmath>
-#include <cstdio>
-#include <cstdarg>
-#include <cstring>
-#include <fstream>
-#include <map>
-#include <set>
-#include <string>
-#include <thread>
-#include <vector>
-#include <regex>
-#include <random>
-#include <functional>
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-#if defined(GGML_BIG_ENDIAN)
-#include <bit>
-
-template<typename T>
-static T byteswap(T value) {
-    return std::byteswap(value);
-}
-
-template<>
-float byteswap(float value) {
-    return std::bit_cast<float>(byteswap(std::bit_cast<std::uint32_t>(value)));
-}
-
-template<typename T>
-static void byteswap_tensor_data(ggml_tensor * tensor) {
-    T * datum = reinterpret_cast<T *>(tensor->data);
-    for (int i = 0; i < ggml_nelements(tensor); i++) {
-        datum[i] = byteswap(datum[i]);
-    }
-}
-
-static void byteswap_tensor(ggml_tensor * tensor) {
-    switch (tensor->type) {
-        case GGML_TYPE_I16: {
-            byteswap_tensor_data<int16_t>(tensor);
-            break;
-        }
-        case GGML_TYPE_F16: {
-            byteswap_tensor_data<ggml_fp16_t>(tensor);
-            break;
-        }
-        case GGML_TYPE_I32: {
-            byteswap_tensor_data<int32_t>(tensor);
-            break;
-        }
-        case GGML_TYPE_F32: {
-            byteswap_tensor_data<float>(tensor);
-            break;
-        }
-        default: { // GML_TYPE_I8
-            break;
-        }
-    }
-}
-
-#define BYTESWAP_VALUE(d) d = byteswap(d)
-#define BYTESWAP_FILTERS(f)            \
-    do {                              \
-        for (auto & datum : f.data) { \
-            datum = byteswap(datum);  \
-        }                             \
-    } while (0)
-#define BYTESWAP_TENSOR(t)       \
-    do {                         \
-        byteswap_tensor(t); \
-    } while (0)
-#else
-#define BYTESWAP_VALUE(d) do {} while (0)
-#define BYTESWAP_FILTERS(f) do {} while (0)
-#define BYTESWAP_TENSOR(t) do {} while (0)
-#endif
-
-#ifdef __GNUC__
-#ifdef __MINGW32__
-#define WHISPER_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
-#else
-#define WHISPER_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
-#endif
-#else
-#define WHISPER_ATTRIBUTE_FORMAT(...)
-#endif
-
-//
-// logging
-//
-
-WHISPER_ATTRIBUTE_FORMAT(2, 3)
-static void whisper_log_internal        (ggml_log_level level, const char * format, ...);
-static void whisper_log_callback_default(ggml_log_level level, const char * text, void * user_data);
-
-#define WHISPER_LOG_ERROR(...) whisper_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
-#define WHISPER_LOG_WARN(...)  whisper_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
-#define WHISPER_LOG_INFO(...)  whisper_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
-
-// define this to enable verbose trace logging - useful for debugging purposes
-//#define WHISPER_DEBUG
-
-#if defined(WHISPER_DEBUG)
-#define WHISPER_LOG_DEBUG(...) whisper_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
-#else
-#define WHISPER_LOG_DEBUG(...)
-#endif
-
-#define WHISPER_ASSERT(x) \
-    do { \
-        if (!(x)) { \
-            WHISPER_LOG_ERROR("WHISPER_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
-            abort(); \
-        } \
-    } while (0)
-
-//#define WHISPER_USE_FLASH_ATTN
-//#define WHISPER_USE_FLASH_FF
-#define WHISPER_MAX_DECODERS 8
-#define WHISPER_MAX_NODES 4096
-
-//
-// ggml helpers
-//
-
-static bool ggml_graph_compute_helper(
-          struct ggml_cgraph * graph,
-        std::vector<uint8_t> & buf,
-                         int   n_threads,
-      whisper_abort_callback   abort_callback,
-                        void * abort_callback_data) {
-    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
-
-    plan.abort_callback = abort_callback;
-    plan.abort_callback_data = abort_callback_data;
-
-    if (plan.work_size > 0) {
-        buf.resize(plan.work_size);
-        plan.work_data = buf.data();
-    }
-
-    return ggml_graph_compute(graph, &plan);
-}
-
-static bool ggml_graph_compute_helper(
-       struct ggml_backend * backend,
-        struct ggml_cgraph * graph,
-                       int   n_threads) {
-    if (ggml_backend_is_cpu(backend)) {
-        ggml_backend_cpu_set_n_threads(backend, n_threads);
-    }
-#ifdef GGML_USE_METAL
-    if (ggml_backend_is_metal(backend)) {
-        ggml_backend_metal_set_n_cb(backend, n_threads);
-    }
-#endif
-    return ggml_backend_graph_compute(backend, graph);
-}
-
-// faster matrix multiplications for tensors that do not have dimension 0 divisible by "pad"
-// the idea is to represent the original matrix multiplication:
-//
-//   Z = X @ Y
-//
-// with the sum of two matrix multiplications:
-//
-//   Z = (X_0 @ Y_0) + (X_1 @ Y_1)
-//
-// here X_0 and Y_0 are views of X and Y that have dimension 0 divisible by "pad"
-// and X_1 and Y_1 are the remaining views. X_1 and Y_1 end up being small matrices that can be processed with more
-// general-purpose kernels
-//
-static struct ggml_tensor * ggml_mul_mat_pad(struct ggml_context * ctx, struct ggml_tensor * x, struct ggml_tensor * y, int pad = 32) {
-    // use padding only if dimension 0 is at least 8 times larger than the padding
-    // else we won't get much benefit from the optimization
-    const int n_pad_req = 8;
-
-    if (x->ne[0] % pad == 0 || x->ne[0] / pad < n_pad_req) {
-        return ggml_mul_mat(ctx, x, y);
-    }
-
-    struct ggml_tensor * x_0 = ggml_view_3d(ctx, x, (x->ne[0]/pad)*pad, x->ne[1], x->ne[2], x->nb[1], x->nb[2], 0);
-    struct ggml_tensor * x_1 = ggml_view_3d(ctx, x,  x->ne[0]%pad,      x->ne[1], x->ne[2], x->nb[1], x->nb[2], x_0->ne[0]*x_0->nb[0]);
-
-    struct ggml_tensor * y_0 = ggml_view_3d(ctx, y, (y->ne[0]/pad)*pad, y->ne[1], y->ne[2], y->nb[1], y->nb[2], 0);
-    struct ggml_tensor * y_1 = ggml_view_3d(ctx, y,  y->ne[0]%pad,      y->ne[1], y->ne[2], y->nb[1], y->nb[2], y_0->ne[0]*y_0->nb[0]);
-
-    return ggml_add(ctx,
-            ggml_mul_mat(ctx, x_0, y_0),
-            ggml_mul_mat(ctx, x_1, y_1));
-}
-
-// TODO: check if other platforms can benefit from this optimization
-// TODO: CUDA is currently broken - seems ggml_mul_mat does not handle views correctly
-#if defined(GGML_USE_METAL)
-#define ggml_mul_mat ggml_mul_mat_pad
-#endif
-
-// available whisper models
-enum e_model {
-    MODEL_UNKNOWN,
-    MODEL_TINY,
-    MODEL_BASE,
-    MODEL_SMALL,
-    MODEL_MEDIUM,
-    MODEL_LARGE,
-};
-
-static const std::map<e_model, std::string> g_model_name = {
-    { MODEL_UNKNOWN,  "unknown"  },
-    { MODEL_TINY,     "tiny"     },
-    { MODEL_BASE,     "base"     },
-    { MODEL_SMALL,    "small"    },
-    { MODEL_MEDIUM,   "medium"   },
-    { MODEL_LARGE,    "large"    },
-};
-
-static const std::map<std::string, std::pair<int, std::string>> g_lang = {
-    { "en",  { 0,  "english",         } },
-    { "zh",  { 1,  "chinese",         } },
-    { "de",  { 2,  "german",          } },
-    { "es",  { 3,  "spanish",         } },
-    { "ru",  { 4,  "russian",         } },
-    { "ko",  { 5,  "korean",          } },
-    { "fr",  { 6,  "french",          } },
-    { "ja",  { 7,  "japanese",        } },
-    { "pt",  { 8,  "portuguese",      } },
-    { "tr",  { 9,  "turkish",         } },
-    { "pl",  { 10, "polish",          } },
-    { "ca",  { 11,  "catalan",        } },
-    { "nl",  { 12,  "dutch",          } },
-    { "ar",  { 13,  "arabic",         } },
-    { "sv",  { 14,  "swedish",        } },
-    { "it",  { 15,  "italian",        } },
-    { "id",  { 16,  "indonesian",     } },
-    { "hi",  { 17,  "hindi",          } },
-    { "fi",  { 18,  "finnish",        } },
-    { "vi",  { 19,  "vietnamese",     } },
-    { "he",  { 20,  "hebrew",         } },
-    { "uk",  { 21,  "ukrainian",      } },
-    { "el",  { 22,  "greek",          } },
-    { "ms",  { 23,  "malay",          } },
-    { "cs",  { 24,  "czech",          } },
-    { "ro",  { 25,  "romanian",       } },
-    { "da",  { 26,  "danish",         } },
-    { "hu",  { 27,  "hungarian",      } },
-    { "ta",  { 28,  "tamil",          } },
-    { "no",  { 29,  "norwegian",      } },
-    { "th",  { 30,  "thai",           } },
-    { "ur",  { 31,  "urdu",           } },
-    { "hr",  { 32,  "croatian",       } },
-    { "bg",  { 33,  "bulgarian",      } },
-    { "lt",  { 34,  "lithuanian",     } },
-    { "la",  { 35,  "latin",          } },
-    { "mi",  { 36,  "maori",          } },
-    { "ml",  { 37,  "malayalam",      } },
-    { "cy",  { 38,  "welsh",          } },
-    { "sk",  { 39,  "slovak",         } },
-    { "te",  { 40,  "telugu",         } },
-    { "fa",  { 41,  "persian",        } },
-    { "lv",  { 42,  "latvian",        } },
-    { "bn",  { 43,  "bengali",        } },
-    { "sr",  { 44,  "serbian",        } },
-    { "az",  { 45,  "azerbaijani",    } },
-    { "sl",  { 46,  "slovenian",      } },
-    { "kn",  { 47,  "kannada",        } },
-    { "et",  { 48,  "estonian",       } },
-    { "mk",  { 49,  "macedonian",     } },
-    { "br",  { 50,  "breton",         } },
-    { "eu",  { 51,  "basque",         } },
-    { "is",  { 52,  "icelandic",      } },
-    { "hy",  { 53,  "armenian",       } },
-    { "ne",  { 54,  "nepali",         } },
-    { "mn",  { 55,  "mongolian",      } },
-    { "bs",  { 56,  "bosnian",        } },
-    { "kk",  { 57,  "kazakh",         } },
-    { "sq",  { 58,  "albanian",       } },
-    { "sw",  { 59,  "swahili",        } },
-    { "gl",  { 60,  "galician",       } },
-    { "mr",  { 61,  "marathi",        } },
-    { "pa",  { 62,  "punjabi",        } },
-    { "si",  { 63,  "sinhala",        } },
-    { "km",  { 64,  "khmer",          } },
-    { "sn",  { 65,  "shona",          } },
-    { "yo",  { 66,  "yoruba",         } },
-    { "so",  { 67,  "somali",         } },
-    { "af",  { 68,  "afrikaans",      } },
-    { "oc",  { 69,  "occitan",        } },
-    { "ka",  { 70,  "georgian",       } },
-    { "be",  { 71,  "belarusian",     } },
-    { "tg",  { 72,  "tajik",          } },
-    { "sd",  { 73,  "sindhi",         } },
-    { "gu",  { 74,  "gujarati",       } },
-    { "am",  { 75,  "amharic",        } },
-    { "yi",  { 76,  "yiddish",        } },
-    { "lo",  { 77,  "lao",            } },
-    { "uz",  { 78,  "uzbek",          } },
-    { "fo",  { 79,  "faroese",        } },
-    { "ht",  { 80,  "haitian creole", } },
-    { "ps",  { 81,  "pashto",         } },
-    { "tk",  { 82,  "turkmen",        } },
-    { "nn",  { 83,  "nynorsk",        } },
-    { "mt",  { 84,  "maltese",        } },
-    { "sa",  { 85,  "sanskrit",       } },
-    { "lb",  { 86,  "luxembourgish",  } },
-    { "my",  { 87,  "myanmar",        } },
-    { "bo",  { 88,  "tibetan",        } },
-    { "tl",  { 89,  "tagalog",        } },
-    { "mg",  { 90,  "malagasy",       } },
-    { "as",  { 91,  "assamese",       } },
-    { "tt",  { 92,  "tatar",          } },
-    { "haw", { 93,  "hawaiian",       } },
-    { "ln",  { 94,  "lingala",        } },
-    { "ha",  { 95,  "hausa",          } },
-    { "ba",  { 96,  "bashkir",        } },
-    { "jw",  { 97,  "javanese",       } },
-    { "su",  { 98,  "sundanese",      } },
-    { "yue", { 99,  "cantonese",      } },
-};
-
-struct whisper_mel {
-    int n_len;
-    int n_len_org;
-    int n_mel;
-
-    std::vector<float> data;
-};
-
-struct whisper_filters {
-    int32_t n_mel;
-    int32_t n_fft;
-
-    std::vector<float> data;
-};
-
-struct whisper_vocab {
-    using id    = int32_t;
-    using token = std::string;
-
-    int n_vocab = 51864;
-
-    std::map<token, id> token_to_id;
-    std::map<id, token> id_to_token;
-
-    // reference: https://github.com/openai/whisper/blob/248b6cb124225dd263bb9bd32d060b6517e067f8/whisper/tokenizer.py#L334-L349
-    id token_eot        = 50256;
-    id token_sot        = 50257;
-    // task tokens (used only for multilingual models)
-    id token_translate  = 50357;
-    id token_transcribe = 50358;
-    // other special tokens
-    id token_solm       = 50359; // [TDRZ] used by tinydiarize models to indicate speaker turn
-    id token_prev       = 50360;
-    id token_nosp       = 50361;
-    id token_not        = 50362; // no timestamps
-    id token_beg        = 50363; // begin timestamps
-
-    bool is_multilingual() const {
-        return n_vocab >= 51865;
-    }
-
-    int num_languages() const {
-        return n_vocab - 51765 - (is_multilingual() ? 1 : 0);
-    }
-};
-
-struct whisper_segment {
-    int64_t t0;
-    int64_t t1;
-
-    std::string text;
-
-    std::vector<whisper_token_data> tokens;
-
-    bool speaker_turn_next;
-};
-
-struct whisper_batch {
-    int32_t n_tokens;
-
-    whisper_token  *  token;
-    whisper_pos    *  pos;
-    int32_t        *  n_seq_id;
-    whisper_seq_id ** seq_id;   // null terminated
-    int8_t         *  logits;
-};
-
-static struct whisper_batch whisper_batch_init(int32_t n_tokens, int32_t n_seq_max) {
-    whisper_batch batch = { 0, nullptr, nullptr, nullptr, nullptr, nullptr, };
-
-    batch.token    = (whisper_token *  ) malloc(sizeof(whisper_token)    * (n_tokens));
-    batch.pos      = (whisper_pos *)     malloc(sizeof(whisper_pos)      * (n_tokens));
-    batch.n_seq_id = (int32_t *)         malloc(sizeof(int32_t)          * (n_tokens));
-    batch.seq_id   = (whisper_seq_id **) malloc(sizeof(whisper_seq_id *) * (n_tokens + 1));
-    for (int i = 0; i < n_tokens; ++i) {
-        batch.seq_id[i] = (whisper_seq_id *) malloc(sizeof(whisper_seq_id)   * n_seq_max);
-    }
-    batch.seq_id[n_tokens] = nullptr;
-    batch.logits   = (int8_t *)          malloc(sizeof(int8_t)           * n_tokens);
-
-    return batch;
-}
-
-static void whisper_batch_free(struct whisper_batch batch) {
-    if (batch.token)    free(batch.token);
-    if (batch.pos)      free(batch.pos);
-    if (batch.n_seq_id) free(batch.n_seq_id);
-    if (batch.seq_id) {
-        for (int i = 0; batch.seq_id[i]; ++i) {
-            free(batch.seq_id[i]);
-        }
-        free(batch.seq_id);
-    }
-    if (batch.logits)   free(batch.logits);
-}
-
-static void whisper_batch_prep_legacy(whisper_batch & batch, const whisper_token * tokens, int n_tokens, int n_past, int seq_id) {
-    batch.n_tokens = n_tokens;
-    for (int i = 0; i < n_tokens; ++i) {
-        if (tokens) {
-            batch.token[i] = tokens[i];
-        }
-        batch.pos     [i]    = n_past + i;
-        batch.n_seq_id[i]    = 1;
-        batch.seq_id  [i][0] = seq_id;
-        batch.logits  [i]    = 0;
-    }
-    batch.logits[n_tokens - 1] = 1;
-}
-
-// replace std::pair by using customized pair struct (reason: std::pair is very slow)
-template<typename A, typename B>
-struct whisper_pair {
-    A first;
-    B second;
-
-    // Define a constructor that takes two arguments.
-    whisper_pair(const A& a, const B& b) : first(a), second(b) {}
-    // Define a constructor that takes no argument.
-    whisper_pair() : first(A()), second(B()) {}
-};
-
-// ggml_allocr wrapper for whisper usage
-struct whisper_allocr {
-    ggml_allocr * alloc = nullptr;
-
-    std::vector<uint8_t> meta;
-
-    ggml_backend_buffer_t buffer;
-};
-
-static size_t whisper_allocr_size(struct whisper_allocr & allocr) {
-    return allocr.meta.size() + ggml_allocr_max_size(allocr.alloc);
-}
-
-// measure the memory usage of a graph and prepare the allocr's internal data buffer
-static void whisper_allocr_graph_init(struct whisper_allocr & allocr, ggml_backend_t backend, std::function<struct ggml_cgraph *()> && get_graph) {
-    auto & alloc = allocr.alloc;
-    auto & meta  = allocr.meta;
-
-    alloc = ggml_allocr_new_measure_from_backend(backend);
-
-    meta.resize(ggml_tensor_overhead()*WHISPER_MAX_NODES + ggml_graph_overhead());
-
-    ggml_allocr_alloc_graph(alloc, get_graph());
-}
-
-static void whisper_allocr_graph_realloc(struct whisper_allocr & allocr, ggml_backend_t backend) {
-    if (allocr.alloc == nullptr) {
-        // this can be null if we use external encoder like CoreML or OpenVINO
-        return;
-    }
-
-    auto & alloc  = allocr.alloc;
-    auto & buffer = allocr.buffer;
-
-    size_t size = ggml_allocr_max_size(alloc);
-
-    ggml_allocr_free(alloc);
-
-    buffer = ggml_backend_alloc_buffer(backend, size);
-    alloc = ggml_allocr_new_from_buffer(buffer);
-}
-
-static void whisper_allocr_free(struct whisper_allocr & allocr) {
-    if (allocr.alloc) {
-        ggml_allocr_free(allocr.alloc);
-        ggml_backend_buffer_free(allocr.buffer);
-        allocr.alloc = nullptr;
-    }
-}
-
-// medium
-// hparams: {
-// 'n_mels': 80,
-// 'n_vocab': 51864,
-// 'n_audio_ctx': 1500,
-// 'n_audio_state': 1024,
-// 'n_audio_head': 16,
-// 'n_audio_layer': 24,
-// 'n_text_ctx': 448,
-// 'n_text_state': 1024,
-// 'n_text_head': 16,
-// 'n_text_layer': 24
-// }
-//
-// default hparams (Whisper tiny)
-struct whisper_hparams {
-    int32_t n_vocab       = 51864;
-    int32_t n_audio_ctx   = 1500;
-    int32_t n_audio_state = 384;
-    int32_t n_audio_head  = 6;
-    int32_t n_audio_layer = 4;
-    int32_t n_text_ctx    = 448;
-    int32_t n_text_state  = 384;
-    int32_t n_text_head   = 6;
-    int32_t n_text_layer  = 4;
-    int32_t n_mels        = 80;
-    int32_t ftype         = 1;
-    float   eps           = 1e-5f;
-};
-
-// audio encoding layer
-struct whisper_layer_encoder {
-    // encoder.blocks.*.attn_ln
-    struct ggml_tensor * attn_ln_0_w;
-    struct ggml_tensor * attn_ln_0_b;
-
-    // encoder.blocks.*.attn.out
-    struct ggml_tensor * attn_ln_1_w;
-    struct ggml_tensor * attn_ln_1_b;
-
-    // encoder.blocks.*.attn.query
-    struct ggml_tensor * attn_q_w;
-    struct ggml_tensor * attn_q_b;
-
-    // encoder.blocks.*.attn.key
-    struct ggml_tensor * attn_k_w;
-
-    // encoder.blocks.*.attn.value
-    struct ggml_tensor * attn_v_w;
-    struct ggml_tensor * attn_v_b;
-
-    // encoder.blocks.*.mlp_ln
-    struct ggml_tensor * mlp_ln_w;
-    struct ggml_tensor * mlp_ln_b;
-
-    // encoder.blocks.*.mlp.0
-    struct ggml_tensor * mlp_0_w;
-    struct ggml_tensor * mlp_0_b;
-
-    // encoder.blocks.*.mlp.2
-    struct ggml_tensor * mlp_1_w;
-    struct ggml_tensor * mlp_1_b;
-};
-
-// token decoding layer
-struct whisper_layer_decoder {
-    // decoder.blocks.*.attn_ln
-    struct ggml_tensor * attn_ln_0_w;
-    struct ggml_tensor * attn_ln_0_b;
-
-    // decoder.blocks.*.attn.out
-    struct ggml_tensor * attn_ln_1_w;
-    struct ggml_tensor * attn_ln_1_b;
-
-    // decoder.blocks.*.attn.query
-    struct ggml_tensor * attn_q_w;
-    struct ggml_tensor * attn_q_b;
-
-    // decoder.blocks.*.attn.key
-    struct ggml_tensor * attn_k_w;
-
-    // decoder.blocks.*.attn.value
-    struct ggml_tensor * attn_v_w;
-    struct ggml_tensor * attn_v_b;
-
-    // decoder.blocks.*.cross_attn_ln
-    struct ggml_tensor * cross_attn_ln_0_w;
-    struct ggml_tensor * cross_attn_ln_0_b;
-
-    // decoder.blocks.*.cross_attn.out
-    struct ggml_tensor * cross_attn_ln_1_w;
-    struct ggml_tensor * cross_attn_ln_1_b;
-
-    // decoder.blocks.*.cross_attn.query
-    struct ggml_tensor * cross_attn_q_w;
-    struct ggml_tensor * cross_attn_q_b;
-
-    // decoder.blocks.*.cross_attn.key
-    struct ggml_tensor * cross_attn_k_w;
-
-    // decoder.blocks.*.cross_attn.value
-    struct ggml_tensor * cross_attn_v_w;
-    struct ggml_tensor * cross_attn_v_b;
-
-    // decoder.blocks.*.mlp_ln
-    struct ggml_tensor * mlp_ln_w;
-    struct ggml_tensor * mlp_ln_b;
-
-    // decoder.blocks.*.mlp.0
-    struct ggml_tensor * mlp_0_w;
-    struct ggml_tensor * mlp_0_b;
-
-    // decoder.blocks.*.mlp.2
-    struct ggml_tensor * mlp_1_w;
-    struct ggml_tensor * mlp_1_b;
-};
-
-struct whisper_kv_cell {
-    whisper_pos pos = -1;
-
-    std::set<whisper_seq_id> seq_id;
-
-    bool has_seq_id(const whisper_seq_id & id) const {
-        return seq_id.find(id) != seq_id.end();
-    }
-};
-
-struct whisper_kv_cache {
-    uint32_t head = 0;
-    uint32_t size = 0;
-
-    // computed before each graph build
-    uint32_t n = 0;
-
-    std::vector<whisper_kv_cell> cells;
-
-    struct ggml_tensor * k;
-    struct ggml_tensor * v;
-
-    struct ggml_context * ctx;
-
-    ggml_backend_buffer_t buffer;
-};
-
-struct whisper_model {
-    e_model type = MODEL_UNKNOWN;
-
-    whisper_hparams hparams;
-    whisper_filters filters;
-
-    // encoder.positional_embedding
-    struct ggml_tensor * e_pe;
-
-    // encoder.conv1
-    struct ggml_tensor * e_conv_1_w;
-    struct ggml_tensor * e_conv_1_b;
-
-    // encoder.conv2
-    struct ggml_tensor * e_conv_2_w;
-    struct ggml_tensor * e_conv_2_b;
-
-    // encoder.ln_post
-    struct ggml_tensor * e_ln_w;
-    struct ggml_tensor * e_ln_b;
-
-    // decoder.positional_embedding
-    struct ggml_tensor * d_pe;
-
-    // decoder.token_embedding
-    struct ggml_tensor * d_te;
-
-    // decoder.ln
-    struct ggml_tensor * d_ln_w;
-    struct ggml_tensor * d_ln_b;
-
-    std::vector<whisper_layer_encoder> layers_encoder;
-    std::vector<whisper_layer_decoder> layers_decoder;
-
-    // ggml context that contains all the meta information about the model tensors
-    struct ggml_context * ctx;
-
-    // the model backend data is read-only and can be shared between processors
-    std::vector<struct ggml_backend_buffer *> buffers;
-
-    // tensors
-    int n_loaded;
-    std::map<std::string, struct ggml_tensor *> tensors;
-};
-
-struct whisper_partial_utf8 {
-    uint32_t value;    // bit value so far (unshifted)
-    int      n_remain; // num bytes remaining; -1 indicates invalid sequence
-};
-
-struct whisper_grammar {
-    /*const*/ std::vector<std::vector<whisper_grammar_element>> rules;
-    std::vector<std::vector<const whisper_grammar_element *>>   stacks;
-
-    // buffer for partially generated UTF-8 sequence from accepted tokens
-    whisper_partial_utf8 partial_utf8;
-};
-
-struct whisper_grammar_candidate {
-    whisper_token          id;
-    const uint32_t       * code_points;
-    whisper_partial_utf8   partial_utf8;
-};
-
-struct whisper_sequence {
-    std::vector<whisper_token_data> tokens;
-
-    // the accumulated transcription in the current iteration (used to truncate the tokens array)
-    int result_len;
-
-    double sum_logprobs_all; // the sum of the log probabilities of the tokens
-    double sum_logprobs;     // the sum of the log probabilities of the tokens (first result_len tokens)
-    double avg_logprobs;     // the average log probability of the tokens
-    double entropy;          // the entropy of the tokens
-    double score;            // likelihood rank score
-};
-
-// TAGS: WHISPER_DECODER_INIT
-struct whisper_decoder {
-    // the currently generated sequence of tokens
-    whisper_sequence sequence;
-
-    // grammar parse state of generated sequence of tokens
-    whisper_grammar  grammar;
-
-    int i_batch;    // the index of the token in the current batch
-    int seek_delta; // the window shift found so far based on the decoded timestamp tokens
-
-    bool failed;    // has the current segment failed to decode?
-    bool completed; // has the decoder completed the current segment?
-    bool has_ts;    // have we already sampled a non-beg timestamp token for the current segment?
-
-    // new token probs, logits and logprobs after the last whisper_decode (1-dimensional array: [n_vocab])
-    std::vector<float> probs;
-    std::vector<float> logits;
-    std::vector<float> logprobs;
-
-    // work container used to avoid memory allocations
-    std::vector<whisper_pair<double, whisper_vocab::id>> logits_id;
-
-    mutable std::mt19937 rng; // used for sampling at t > 0.0
-};
-
-struct whisper_state {
-    int64_t t_sample_us = 0;
-    int64_t t_encode_us = 0;
-    int64_t t_decode_us = 0;
-    int64_t t_batchd_us = 0;
-    int64_t t_prompt_us = 0;
-    int64_t t_mel_us = 0;
-
-    int32_t n_sample = 0; // number of tokens sampled
-    int32_t n_encode = 0; // number of encoder calls
-    int32_t n_decode = 0; // number of decoder calls with n_tokens == 1  (text-generation)
-    int32_t n_batchd = 0; // number of decoder calls with n_tokens <  16 (batch decoding)
-    int32_t n_prompt = 0; // number of decoder calls with n_tokens >  1  (prompt encoding)
-    int32_t n_fail_p = 0; // number of logprob threshold failures
-    int32_t n_fail_h = 0; // number of entropy threshold failures
-
-    // unified self-attention KV cache for all decoders
-    whisper_kv_cache kv_self;
-
-    // cross-attention KV cache for the decoders
-    // shared between all decoders
-    whisper_kv_cache kv_cross;
-
-    whisper_mel mel;
-
-    whisper_batch batch;
-
-    whisper_decoder decoders[WHISPER_MAX_DECODERS];
-
-    ggml_backend_t backend = nullptr;
-
-    // ggml-alloc:
-    // - stores meta info about the intermediate tensors into the `meta` buffers
-    // - stores the actual tensor data into the `data` buffers
-    whisper_allocr alloc_conv;
-    whisper_allocr alloc_encode;
-    whisper_allocr alloc_cross;
-    whisper_allocr alloc_decode;
-
-    // result of the encoder
-    struct ggml_tensor * embd_conv = nullptr;
-    struct ggml_tensor * embd_enc  = nullptr;
-
-    // helpers for GPU offloading
-    std::vector<float> inp_mel;
-    std::vector<float> inp_mask;
-
-    // decode output (2-dimensional array: [n_tokens][n_vocab])
-    std::vector<float> logits;
-
-    std::vector<whisper_segment> result_all;
-    std::vector<whisper_token>   prompt_past;
-
-    int lang_id = 0; // english by default
-
-    std::string path_model; // populated by whisper_init_from_file_with_params()
-
-#ifdef WHISPER_USE_COREML
-    whisper_coreml_context * ctx_coreml = nullptr;
-#endif
-
-#ifdef WHISPER_USE_OPENVINO
-    whisper_openvino_context * ctx_openvino = nullptr;
-#endif
-
-    // [EXPERIMENTAL] token-level timestamps data
-    int64_t t_beg  = 0;
-    int64_t t_last = 0;
-
-    whisper_token tid_last;
-
-    std::vector<float> energy; // PCM signal energy
-
-    // [EXPERIMENTAL] speed-up techniques
-    int32_t exp_n_audio_ctx = 0; // 0 - use default
-};
-
-struct whisper_context {
-    int64_t t_load_us  = 0;
-    int64_t t_start_us = 0;
-
-    ggml_type wtype = ggml_type::GGML_TYPE_F16; // weight type (FP32 / FP16 / QX)
-    ggml_type itype = ggml_type::GGML_TYPE_F16; // intermediate type (FP32 or FP16)
-
-    whisper_context_params params;
-
-    whisper_model model;
-    whisper_vocab vocab;
-
-    whisper_state * state = nullptr;
-
-    ggml_backend_t backend = nullptr;
-
-    std::string path_model; // populated by whisper_init_from_file_with_params()
-};
-
-struct whisper_global {
-    // We save the log callback globally
-    ggml_log_callback log_callback = whisper_log_callback_default;
-    void * log_callback_user_data = nullptr;
-};
-
-static whisper_global g_state;
-
-template<typename T>
-static void read_safe(whisper_model_loader * loader, T & dest) {
-    loader->read(loader->context, &dest, sizeof(T));
-    BYTESWAP_VALUE(dest);
-}
-
-static bool kv_cache_init(
-        const struct whisper_hparams & hparams,
-             struct whisper_kv_cache & cache,
-                      ggml_backend_t   backend,
-                           ggml_type   wtype,
-                                 int   n_ctx) {
-    const int64_t n_text_state = hparams.n_text_state;
-    const int64_t n_text_layer = hparams.n_text_layer;
-
-    const int64_t n_mem      = n_text_layer*n_ctx;
-    const int64_t n_elements = n_text_state*n_mem;
-
-    struct ggml_init_params params = {
-        /*.mem_size   =*/ 2*ggml_tensor_overhead(),
-        /*.mem_buffer =*/ nullptr,
-        /*.no_alloc   =*/ true,
-    };
-
-    cache.head = 0;
-    cache.size = n_ctx;
-
-    cache.cells.clear();
-    cache.cells.resize(n_ctx);
-
-    cache.ctx = ggml_init(params);
-
-    if (!cache.ctx) {
-        WHISPER_LOG_ERROR("%s: failed to allocate memory for kv cache\n", __func__);
-        return false;
-    }
-
-    cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
-    cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
-
-    const size_t mem_bytes = ggml_nbytes(cache.k) + ggml_nbytes(cache.v);
-
-    cache.buffer = ggml_backend_alloc_buffer(backend, mem_bytes);
-
-    // allocate the tensors into the backend buffer
-    {
-        ggml_allocr * alloc = ggml_allocr_new_from_buffer(cache.buffer);
-
-        ggml_allocr_alloc(alloc, cache.k);
-        ggml_allocr_alloc(alloc, cache.v);
-
-        ggml_allocr_free(alloc);
-    }
-
-    return true;
-}
-
-static void kv_cache_free(struct whisper_kv_cache & cache) {
-    if (cache.ctx) {
-        ggml_free(cache.ctx);
-        ggml_backend_buffer_free(cache.buffer);
-        cache.ctx = nullptr;
-    }
-}
-
-static bool whisper_kv_cache_find_slot(
-           struct whisper_kv_cache & cache,
-        const struct whisper_batch & batch) {
-    const uint32_t n_ctx    = cache.size;
-    const uint32_t n_tokens = batch.n_tokens;
-
-    if (n_tokens > n_ctx) {
-        WHISPER_LOG_ERROR("%s: n_tokens=%d > n_ctx=%d\n", __func__, n_tokens, n_ctx);
-        return false;
-    }
-
-    uint32_t n_tested = 0;
-
-    while (true) {
-        if (cache.head + n_tokens > n_ctx) {
-            n_tested += n_ctx - cache.head;
-            cache.head = 0;
-            continue;
-        }
-
-        bool found = true;
-        for (uint32_t i = 0; i < n_tokens; i++) {
-            if (cache.cells[cache.head + i].pos >= 0) {
-                found = false;
-                cache.head += i + 1;
-                n_tested   += i + 1;
-                break;
-            }
-        }
-
-        if (found) {
-            break;
-        }
-
-        if (n_tested >= n_ctx) {
-            //WHISPER_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
-            return false;
-        }
-    }
-
-    for (uint32_t i = 0; i < n_tokens; i++) {
-        cache.cells[cache.head + i].pos = batch.pos[i];
-
-        for (int32_t j = 0; j < batch.n_seq_id[i]; j++) {
-            cache.cells[cache.head + i].seq_id.insert(batch.seq_id[i][j]);
-        }
-    }
-
-    return true;
-}
-
-// find how many cells are currently in use
-static int32_t whisper_kv_cache_cell_max(const struct whisper_kv_cache & cache) {
-    for (uint32_t i = cache.size - 1; i > 0; --i) {
-        if (cache.cells[i].pos >= 0 && !cache.cells[i].seq_id.empty()) {
-            return i + 1;
-        }
-    }
-
-    return 1;
-}
-
-static void whisper_kv_cache_clear(struct whisper_kv_cache & cache) {
-    for (int32_t i = 0; i < (int32_t) cache.size; ++i) {
-        cache.cells[i].pos = -1;
-        cache.cells[i].seq_id.clear();
-    }
-    cache.head = 0;
-}
-
-static void whisper_kv_cache_seq_rm(
-        struct whisper_kv_cache & cache,
-                 whisper_seq_id   seq_id,
-                    whisper_pos   p0,
-                    whisper_pos   p1) {
-    uint32_t new_head = cache.size;
-
-    if (p0 < 0) p0 = 0;
-    if (p1 < 0) p1 = std::numeric_limits<whisper_pos>::max();
-
-    for (uint32_t i = 0; i < cache.size; ++i) {
-        if (cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
-            if (seq_id < 0) {
-                cache.cells[i].seq_id.clear();
-            } else if (cache.cells[i].has_seq_id(seq_id)) {
-                cache.cells[i].seq_id.erase(seq_id);
-            } else {
-                continue;
-            }
-            if (cache.cells[i].seq_id.empty()) {
-                cache.cells[i].pos = -1;
-                if (new_head == cache.size) new_head = i;
-            }
-        }
-    }
-
-    // If we freed up a slot, set head to it so searching can start there.
-    if (new_head != cache.size) cache.head = new_head;
-}
-
-static void whisper_kv_cache_seq_cp(
-        struct whisper_kv_cache & cache,
-                 whisper_seq_id   seq_id_src,
-                 whisper_seq_id   seq_id_dst,
-                    whisper_pos   p0,
-                    whisper_pos   p1) {
-    if (p0 < 0) p0 = 0;
-    if (p1 < 0) p1 = std::numeric_limits<whisper_pos>::max();
-
-    cache.head = 0;
-
-    for (uint32_t i = 0; i < cache.size; ++i) {
-        if (cache.cells[i].has_seq_id(seq_id_src) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
-            cache.cells[i].seq_id.insert(seq_id_dst);
-        }
-    }
-}
-
-static ggml_backend_t whisper_backend_init(const whisper_context_params & params) {
-    ggml_backend_t backend_gpu = NULL;
-
-    // initialize the backends
-#ifdef GGML_USE_CUBLAS
-    if (params.use_gpu && ggml_cublas_loaded()) {
-        WHISPER_LOG_INFO("%s: using CUDA backend\n", __func__);
-        backend_gpu = ggml_backend_cuda_init(0);
-        if (!backend_gpu) {
-            WHISPER_LOG_ERROR("%s: ggml_backend_cuda_init() failed\n", __func__);
-        }
-    }
-#endif
-
-#ifdef GGML_USE_METAL
-    if (params.use_gpu) {
-        WHISPER_LOG_INFO("%s: using Metal backend\n", __func__);
-        ggml_backend_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
-        backend_gpu = ggml_backend_metal_init();
-        if (!backend_gpu) {
-            WHISPER_LOG_ERROR("%s: ggml_backend_metal_init() failed\n", __func__);
-        } else if (!ggml_backend_metal_supports_family(backend_gpu, 7)) {
-            WHISPER_LOG_ERROR("%s: Metal GPU does not support family 7 - falling back to CPU\n", __func__);
-            ggml_backend_free(backend_gpu);
-            backend_gpu = NULL;
-        }
-    }
-#endif
-
-    if (backend_gpu) {
-        return backend_gpu;
-    }
-    return ggml_backend_cpu_init();
-}
-
-// load the model from a ggml file
-//
-// file format:
-//
-//   - hparams
-//   - pre-computed mel filters
-//   - vocab
-//   - weights
-//
-// see the convert-pt-to-ggml.py script for details
-//
-static bool whisper_model_load(struct whisper_model_loader * loader, whisper_context & wctx) {
-    WHISPER_LOG_INFO("%s: loading model\n", __func__);
-
-    const int64_t t_start_us = ggml_time_us();
-
-    wctx.t_start_us = t_start_us;
-
-    auto & model = wctx.model;
-    auto & vocab = wctx.vocab;
-
-    // verify magic
-    {
-        uint32_t magic;
-        read_safe(loader, magic);
-        if (magic != GGML_FILE_MAGIC) {
-            WHISPER_LOG_ERROR("%s: invalid model data (bad magic)\n", __func__);
-            return false;
-        }
-    }
-
-    //load hparams
-    {
-        auto & hparams = model.hparams;
-
-        read_safe(loader, hparams.n_vocab);
-        read_safe(loader, hparams.n_audio_ctx);
-        read_safe(loader, hparams.n_audio_state);
-        read_safe(loader, hparams.n_audio_head);
-        read_safe(loader, hparams.n_audio_layer);
-        read_safe(loader, hparams.n_text_ctx);
-        read_safe(loader, hparams.n_text_state);
-        read_safe(loader, hparams.n_text_head);
-        read_safe(loader, hparams.n_text_layer);
-        read_safe(loader, hparams.n_mels);
-        read_safe(loader, hparams.ftype);
-
-        assert(hparams.n_text_state == hparams.n_audio_state);
-
-        std::string mver = "";
-
-        if (hparams.n_audio_layer == 4) {
-            model.type = e_model::MODEL_TINY;
-        }
-
-        if (hparams.n_audio_layer == 6) {
-            model.type = e_model::MODEL_BASE;
-        }
-
-        if (hparams.n_audio_layer == 12) {
-            model.type = e_model::MODEL_SMALL;
-        }
-
-        if (hparams.n_audio_layer == 24) {
-            model.type = e_model::MODEL_MEDIUM;
-        }
-
-        if (hparams.n_audio_layer == 32) {
-            model.type = e_model::MODEL_LARGE;
-
-            if (hparams.n_vocab == 51866) {
-                mver = " v3";
-            }
-        }
-
-        const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
-
-        hparams.ftype %= GGML_QNT_VERSION_FACTOR;
-
-        // for the big tensors, we have the option to store the data in 16-bit floats or quantized
-        // in order to save memory and also to speed up the computation
-        wctx.wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
-        if (wctx.wtype == GGML_TYPE_COUNT) {
-            WHISPER_LOG_ERROR("%s: invalid model (bad ftype value %d)\n", __func__, model.hparams.ftype);
-            return false;
-        }
-
-        WHISPER_LOG_INFO("%s: n_vocab       = %d\n", __func__, hparams.n_vocab);
-        WHISPER_LOG_INFO("%s: n_audio_ctx   = %d\n", __func__, hparams.n_audio_ctx);
-        WHISPER_LOG_INFO("%s: n_audio_state = %d\n", __func__, hparams.n_audio_state);
-        WHISPER_LOG_INFO("%s: n_audio_head  = %d\n", __func__, hparams.n_audio_head);
-        WHISPER_LOG_INFO("%s: n_audio_layer = %d\n", __func__, hparams.n_audio_layer);
-        WHISPER_LOG_INFO("%s: n_text_ctx    = %d\n", __func__, hparams.n_text_ctx);
-        WHISPER_LOG_INFO("%s: n_text_state  = %d\n", __func__, hparams.n_text_state);
-        WHISPER_LOG_INFO("%s: n_text_head   = %d\n", __func__, hparams.n_text_head);
-        WHISPER_LOG_INFO("%s: n_text_layer  = %d\n", __func__, hparams.n_text_layer);
-        WHISPER_LOG_INFO("%s: n_mels        = %d\n", __func__, hparams.n_mels);
-        WHISPER_LOG_INFO("%s: ftype         = %d\n", __func__, model.hparams.ftype);
-        WHISPER_LOG_INFO("%s: qntvr         = %d\n", __func__, qntvr);
-        WHISPER_LOG_INFO("%s: type          = %d (%s%s)\n", __func__, model.type, g_model_name.at(model.type).c_str(), mver.c_str());
-    }
-
-    // load mel filters
-    {
-        auto & filters = wctx.model.filters;
-
-        read_safe(loader, filters.n_mel);
-        read_safe(loader, filters.n_fft);
-
-        filters.data.resize(filters.n_mel * filters.n_fft);
-        loader->read(loader->context, filters.data.data(), filters.data.size() * sizeof(float));
-        BYTESWAP_FILTERS(filters);
-    }
-
-    // load vocab
-    {
-        int32_t n_vocab = 0;
-        read_safe(loader, n_vocab);
-
-        //if (n_vocab != model.hparams.n_vocab) {
-        //    WHISPER_LOG_ERROR("%s: invalid model file '%s' (bad vocab size %d != %d)\n",
-        //            __func__, fname.c_str(), n_vocab, model.hparams.n_vocab);
-        //    return false;
-        //}
-
-        std::string word;
-        std::vector<char> tmp;
-
-        tmp.reserve(128);
-
-        for (int i = 0; i < n_vocab; i++) {
-            uint32_t len;
-            read_safe(loader, len);
-
-            if (len > 0) {
-                tmp.resize(len);
-                loader->read(loader->context, &tmp[0], tmp.size()); // read to buffer
-                word.assign(&tmp[0], tmp.size());
-            } else {
-                // seems like we have an empty-string token in multi-language models (i = 50256)
-                //WHISPER_LOG_WARN("%s: warning: empty-string token in vocab, i = %d\n", __func__, i);
-                word = "";
-            }
-
-            vocab.token_to_id[word] = i;
-            vocab.id_to_token[i] = word;
-
-            //printf("%s: vocab[%d] = '%s'\n", __func__, i, word.c_str());
-        }
-
-        vocab.n_vocab = model.hparams.n_vocab;
-        if (vocab.is_multilingual()) {
-            vocab.token_eot++;
-            vocab.token_sot++;
-
-            // account for variable number of language tokens
-            const int dt = vocab.num_languages() - 98;
-
-            vocab.token_translate  += dt;
-            vocab.token_transcribe += dt;
-            vocab.token_solm       += dt;
-            vocab.token_prev       += dt;
-            vocab.token_nosp       += dt;
-            vocab.token_not        += dt;
-            vocab.token_beg        += dt;
-        }
-
-        if (n_vocab < model.hparams.n_vocab) {
-            WHISPER_LOG_INFO("%s: adding %d extra tokens\n", __func__, model.hparams.n_vocab - n_vocab);
-            for (int i = n_vocab; i < model.hparams.n_vocab; i++) {
-                if (i > vocab.token_beg) {
-                    word = "[_TT_" + std::to_string(i - vocab.token_beg) + "]";
-                } else if (i == vocab.token_eot) {
-                    word = "[_EOT_]";
-                } else if (i == vocab.token_sot) {
-                    word = "[_SOT_]";
-                } else if (i == vocab.token_translate) {
-                    word = "[_TRANSLATE_]";
-                } else if (i == vocab.token_transcribe) {
-                    word = "[_TRANSCRIBE_]";
-                } else if (i == vocab.token_solm) {
-                    word = "[_SOLM_]";
-                } else if (i == vocab.token_prev) {
-                    word = "[_PREV_]";
-                } else if (i == vocab.token_nosp) {
-                    word = "[_NOSP_]";
-                } else if (i == vocab.token_not) {
-                    word = "[_NOT_]";
-                } else if (i == vocab.token_beg) {
-                    word = "[_BEG_]";
-                } else if (i > vocab.token_sot && i <= vocab.token_sot + vocab.num_languages()) {
-                    word = "[_LANG_" + std::string(whisper_lang_str(i - vocab.token_sot - 1)) + "]";
-                } else {
-                    word = "[_extra_token_" + std::to_string(i) + "]";
-                }
-                vocab.token_to_id[word] = i;
-                vocab.id_to_token[i] = word;
-            }
-        }
-
-        WHISPER_LOG_INFO("%s: n_langs       = %d\n", __func__, vocab.num_languages());
-    }
-
-    const ggml_type wtype = wctx.wtype;
-    const ggml_type vtype = wctx.wtype == GGML_TYPE_F32 ? GGML_TYPE_F32 : GGML_TYPE_F16; // conv type
-
-    // create the ggml context
-    {
-        const auto & hparams = model.hparams;
-
-        const int n_audio_layer = hparams.n_audio_layer;
-        const int n_text_layer  = hparams.n_text_layer;
-
-        const size_t n_tensors = 10 /* input */ + 15 + 15*n_audio_layer + 24*n_text_layer;
-
-        struct ggml_init_params params = {
-            /*.mem_size   =*/ n_tensors*ggml_tensor_overhead(),
-            /*.mem_buffer =*/ nullptr,
-            /*.no_alloc   =*/ true,
-        };
-
-        model.ctx = ggml_init(params);
-        if (!model.ctx) {
-            WHISPER_LOG_ERROR("%s: ggml_init() failed\n", __func__);
-            return false;
-        }
-    }
-
-    // prepare tensors for the weights
-    {
-        auto & ctx = model.ctx;
-
-        const auto & hparams = model.hparams;
-
-        const int n_vocab = hparams.n_vocab;
-
-        const int n_audio_ctx   = hparams.n_audio_ctx;
-        const int n_audio_state = hparams.n_audio_state;
-        const int n_audio_layer = hparams.n_audio_layer;
-
-        const int n_text_ctx   = hparams.n_text_ctx;
-        const int n_text_state = hparams.n_text_state;
-        const int n_text_layer = hparams.n_text_layer;
-
-        const int n_mels = hparams.n_mels;
-
-        model.layers_encoder.resize(n_audio_layer);
-        model.layers_decoder.resize(n_text_layer);
-
-        // encoder
-        {
-            model.e_pe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_audio_state, n_audio_ctx);
-
-            model.e_conv_1_w     = ggml_new_tensor_3d(ctx, vtype,         3, n_mels,     n_audio_state);
-            model.e_conv_1_b     = ggml_new_tensor_2d(ctx, GGML_TYPE_F32,         1,     n_audio_state);
-
-            model.e_conv_2_w     = ggml_new_tensor_3d(ctx, vtype,         3, n_audio_state, n_audio_state);
-            model.e_conv_2_b     = ggml_new_tensor_2d(ctx, GGML_TYPE_F32,                1, n_audio_state);
-
-            model.e_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
-            model.e_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
-
-            // map by name
-            model.tensors["encoder.positional_embedding"] = model.e_pe;
-
-            model.tensors["encoder.conv1.weight"]         = model.e_conv_1_w;
-            model.tensors["encoder.conv1.bias"]           = model.e_conv_1_b;
-
-            model.tensors["encoder.conv2.weight"]         = model.e_conv_2_w;
-            model.tensors["encoder.conv2.bias"]           = model.e_conv_2_b;
-
-            model.tensors["encoder.ln_post.weight"]       = model.e_ln_w;
-            model.tensors["encoder.ln_post.bias"]         = model.e_ln_b;
-
-            for (int i = 0; i < n_audio_layer; ++i) {
-                auto & layer = model.layers_encoder[i];
-
-                layer.mlp_ln_w    = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_audio_state);
-                layer.mlp_ln_b    = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_audio_state);
-
-                layer.mlp_0_w     = ggml_new_tensor_2d(ctx, wtype,           n_audio_state, 4*n_audio_state);
-                layer.mlp_0_b     = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_audio_state);
-
-                layer.mlp_1_w     = ggml_new_tensor_2d(ctx, wtype,         4*n_audio_state, n_audio_state);
-                layer.mlp_1_b     = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_audio_state);
-
-                layer.attn_ln_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_audio_state);
-                layer.attn_ln_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_audio_state);
-
-                layer.attn_q_w    = ggml_new_tensor_2d(ctx, wtype,           n_audio_state, n_audio_state);
-                layer.attn_q_b    = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_audio_state);
-
-                layer.attn_k_w    = ggml_new_tensor_2d(ctx, wtype,           n_audio_state, n_audio_state);
-
-                layer.attn_v_w    = ggml_new_tensor_2d(ctx, wtype,           n_audio_state, n_audio_state);
-                layer.attn_v_b    = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_audio_state);
-
-                layer.attn_ln_1_w = ggml_new_tensor_2d(ctx, wtype,           n_audio_state, n_audio_state);
-                layer.attn_ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_audio_state);
-
-                // map by name
-                model.tensors["encoder.blocks." + std::to_string(i) + ".mlp_ln.weight"]     = layer.mlp_ln_w;
-                model.tensors["encoder.blocks." + std::to_string(i) + ".mlp_ln.bias"]       = layer.mlp_ln_b;
-
-                model.tensors["encoder.blocks." + std::to_string(i) + ".mlp.0.weight"]      = layer.mlp_0_w;
-                model.tensors["encoder.blocks." + std::to_string(i) + ".mlp.0.bias"]        = layer.mlp_0_b;
-
-                model.tensors["encoder.blocks." + std::to_string(i) + ".mlp.2.weight"]      = layer.mlp_1_w;
-                model.tensors["encoder.blocks." + std::to_string(i) + ".mlp.2.bias"]        = layer.mlp_1_b;
-
-                model.tensors["encoder.blocks." + std::to_string(i) + ".attn_ln.weight"]    = layer.attn_ln_0_w;
-                model.tensors["encoder.blocks." + std::to_string(i) + ".attn_ln.bias"]      = layer.attn_ln_0_b;
-
-                model.tensors["encoder.blocks." + std::to_string(i) + ".attn.query.weight"] = layer.attn_q_w;
-                model.tensors["encoder.blocks." + std::to_string(i) + ".attn.query.bias"]   = layer.attn_q_b;
-
-                model.tensors["encoder.blocks." + std::to_string(i) + ".attn.key.weight"]   = layer.attn_k_w;
-
-                model.tensors["encoder.blocks." + std::to_string(i) + ".attn.value.weight"] = layer.attn_v_w;
-                model.tensors["encoder.blocks." + std::to_string(i) + ".attn.value.bias"]   = layer.attn_v_b;
-
-                model.tensors["encoder.blocks." + std::to_string(i) + ".attn.out.weight"]   = layer.attn_ln_1_w;
-                model.tensors["encoder.blocks." + std::to_string(i) + ".attn.out.bias"]     = layer.attn_ln_1_b;
-            }
-        }
-
-        // decoder
-        {
-            model.d_pe   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_text_state, n_text_ctx);
-
-            model.d_te   = ggml_new_tensor_2d(ctx, wtype,         n_text_state, n_vocab);
-
-            model.d_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
-            model.d_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
-
-            // map by name
-            model.tensors["decoder.positional_embedding"]   = model.d_pe;
-
-            model.tensors["decoder.token_embedding.weight"] = model.d_te;
-
-            model.tensors["decoder.ln.weight"]              = model.d_ln_w;
-            model.tensors["decoder.ln.bias"]                = model.d_ln_b;
-
-            for (int i = 0; i < n_text_layer; ++i) {
-                auto & layer = model.layers_decoder[i];
-
-                layer.mlp_ln_w          = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_text_state);
-                layer.mlp_ln_b          = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_text_state);
-
-                layer.mlp_0_w           = ggml_new_tensor_2d(ctx, wtype,           n_text_state, 4*n_text_state);
-                layer.mlp_0_b           = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_text_state);
-
-                layer.mlp_1_w           = ggml_new_tensor_2d(ctx, wtype,         4*n_text_state, n_text_state);
-                layer.mlp_1_b           = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_text_state);
-
-                layer.attn_ln_0_w       = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_text_state);
-                layer.attn_ln_0_b       = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_text_state);
-
-                layer.attn_q_w          = ggml_new_tensor_2d(ctx, wtype,           n_text_state, n_text_state);
-                layer.attn_q_b          = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_text_state);
-
-                layer.attn_k_w          = ggml_new_tensor_2d(ctx, wtype,           n_text_state, n_text_state);
-
-                layer.attn_v_w          = ggml_new_tensor_2d(ctx, wtype,           n_text_state, n_text_state);
-                layer.attn_v_b          = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_text_state);
-
-                layer.attn_ln_1_w       = ggml_new_tensor_2d(ctx, wtype,           n_text_state, n_text_state);
-                layer.attn_ln_1_b       = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_text_state);
-
-                layer.cross_attn_ln_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_text_state);
-                layer.cross_attn_ln_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_text_state);
-
-                layer.cross_attn_q_w    = ggml_new_tensor_2d(ctx, wtype,           n_text_state, n_text_state);
-                layer.cross_attn_q_b    = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_text_state);
-
-                layer.cross_attn_k_w    = ggml_new_tensor_2d(ctx, wtype,           n_text_state, n_text_state);
-
-                layer.cross_attn_v_w    = ggml_new_tensor_2d(ctx, wtype,           n_text_state, n_text_state);
-                layer.cross_attn_v_b    = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_text_state);
-
-                layer.cross_attn_ln_1_w = ggml_new_tensor_2d(ctx, wtype,           n_text_state, n_text_state);
-                layer.cross_attn_ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_text_state);
-
-                // map by name
-                model.tensors["decoder.blocks." + std::to_string(i) + ".mlp_ln.weight"]           = layer.mlp_ln_w;
-                model.tensors["decoder.blocks." + std::to_string(i) + ".mlp_ln.bias"]             = layer.mlp_ln_b;
-
-                model.tensors["decoder.blocks." + std::to_string(i) + ".mlp.0.weight"]            = layer.mlp_0_w;
-                model.tensors["decoder.blocks." + std::to_string(i) + ".mlp.0.bias"]              = layer.mlp_0_b;
-
-                model.tensors["decoder.blocks." + std::to_string(i) + ".mlp.2.weight"]            = layer.mlp_1_w;
-                model.tensors["decoder.blocks." + std::to_string(i) + ".mlp.2.bias"]              = layer.mlp_1_b;
-
-                model.tensors["decoder.blocks." + std::to_string(i) + ".attn_ln.weight"]          = layer.attn_ln_0_w;
-                model.tensors["decoder.blocks." + std::to_string(i) + ".attn_ln.bias"]            = layer.attn_ln_0_b;
-
-                model.tensors["decoder.blocks." + std::to_string(i) + ".attn.query.weight"]       = layer.attn_q_w;
-                model.tensors["decoder.blocks." + std::to_string(i) + ".attn.query.bias"]         = layer.attn_q_b;
-
-                model.tensors["decoder.blocks." + std::to_string(i) + ".attn.key.weight"]         = layer.attn_k_w;
-
-                model.tensors["decoder.blocks." + std::to_string(i) + ".attn.value.weight"]       = layer.attn_v_w;
-                model.tensors["decoder.blocks." + std::to_string(i) + ".attn.value.bias"]         = layer.attn_v_b;
-
-                model.tensors["decoder.blocks." + std::to_string(i) + ".attn.out.weight"]         = layer.attn_ln_1_w;
-                model.tensors["decoder.blocks." + std::to_string(i) + ".attn.out.bias"]           = layer.attn_ln_1_b;
-
-                model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn_ln.weight"]    = layer.cross_attn_ln_0_w;
-                model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn_ln.bias"]      = layer.cross_attn_ln_0_b;
-
-                model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.query.weight"] = layer.cross_attn_q_w;
-                model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.query.bias"]   = layer.cross_attn_q_b;
-
-                model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.key.weight"]   = layer.cross_attn_k_w;
-
-                model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.value.weight"] = layer.cross_attn_v_w;
-                model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.value.bias"]   = layer.cross_attn_v_b;
-
-                model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.out.weight"]   = layer.cross_attn_ln_1_w;
-                model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.out.bias"]     = layer.cross_attn_ln_1_b;
-            }
-        }
-    }
-
-    wctx.backend = whisper_backend_init(wctx.params);
-
-    // some devices have a limit on the maximum size of single memory buffer
-    // for example, iPhones are limited to 1GB per buffer
-    // to workaround this, we will allocate multiple buffers of smaller size and will split the tensors with the
-    // model weights between them
-    //
-    // the map_t2b maps tensor names to buffer indices
-    // as we iterate over the tensors, we will allocate new buffers when the current one is full
-    //
-    // finally, we create a separate allocator for each buffer and use it to allocate the tensors
-    // we keep the allocators alive until all the tensors are loaded
-
-    GGML_ASSERT(model.buffers.empty());
-
-    std::map<std::string, int> map_t2b;
-
-    {
-        size_t size_main = 0;
-        size_t size_cur  = 0;
-
-        static const size_t GB = 1024ull*1024ull*1024ull;
-
-        for (const auto & t : model.tensors) {
-            const size_t cur = ggml_nbytes(t.second) + ggml_tensor_overhead();
-
-            // adding the tensor to the current buffer will exceed the limit, so we need to allocate a new buffer
-            if (size_cur + cur > GB) {
-                GGML_ASSERT(size_cur > 0 && "A tensor is too large to fit in a single buffer");
-
-                model.buffers.emplace_back(ggml_backend_alloc_buffer(wctx.backend, size_cur));
-
-                size_cur = cur;
-            }
-
-            map_t2b[t.first] = model.buffers.size();
-
-            size_cur  += cur;
-            size_main += cur;
-        }
-
-        // allocate the last buffer if needed
-        if (size_cur > 0) {
-            model.buffers.emplace_back(ggml_backend_alloc_buffer(wctx.backend, size_cur));
-        }
-
-        GGML_ASSERT(model.buffers.size() > 0);
-
-        WHISPER_LOG_INFO("%s: %8s total size = %8.2f MB (%d buffers)\n", __func__, ggml_backend_name(wctx.backend), size_main / 1e6, (int) model.buffers.size());
-    }
-
-    std::vector<ggml_allocr *> allocs(model.buffers.size());
-    for (size_t i = 0; i < allocs.size(); ++i) {
-        allocs[i] = ggml_allocr_new_from_buffer(model.buffers[i]);
-    }
-
-    // allocate tensors in the backend buffers
-    {
-        for (const auto & t : model.tensors) {
-            ggml_allocr_alloc(allocs[map_t2b[t.first]], t.second);
-        }
-    }
-
-    // load weights
-    {
-        size_t total_size = 0;
-
-        model.n_loaded = 0;
-
-        std::vector<char> read_buf;
-
-        while (true) {
-            int32_t n_dims;
-            int32_t length;
-            int32_t ttype;
-
-            read_safe(loader, n_dims);
-            read_safe(loader, length);
-            read_safe(loader, ttype);
-
-            if (loader->eof(loader->context)) {
-                break;
-            }
-
-            int32_t nelements = 1;
-            int32_t ne[4] = { 1, 1, 1, 1 };
-            for (int i = 0; i < n_dims; ++i) {
-                read_safe(loader, ne[i]);
-                nelements *= ne[i];
-            }
-
-            std::string name;
-            std::vector<char> tmp(length); // create a buffer
-            loader->read(loader->context, &tmp[0], tmp.size()); // read to buffer
-            name.assign(&tmp[0], tmp.size());
-
-            if (model.tensors.find(name) == model.tensors.end()) {
-                WHISPER_LOG_ERROR("%s: unknown tensor '%s' in model file\n", __func__, name.data());
-                return false;
-            }
-
-            auto tensor = model.tensors[name.data()];
-
-            if (ggml_nelements(tensor) != nelements) {
-                WHISPER_LOG_ERROR("%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
-                WHISPER_LOG_ERROR("%s: shape: [%d, %d, %d], expected: [%d, %d, %d]\n",
-                        __func__, ne[0], ne[1], ne[2], (int) tensor->ne[0], (int) tensor->ne[1], (int) tensor->ne[2]);
-                return false;
-            }
-
-            if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1] || tensor->ne[2] != ne[2]) {
-                WHISPER_LOG_ERROR("%s: tensor '%s' has wrong shape in model file: got [%d, %d, %d], expected [%d, %d, %d]\n",
-                        __func__, name.data(), (int) tensor->ne[0], (int) tensor->ne[1], (int) tensor->ne[2], ne[0], ne[1], ne[2]);
-                return false;
-            }
-
-            const size_t bpe = ggml_type_size(ggml_type(ttype));
-
-            if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
-                WHISPER_LOG_ERROR("%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
-                        __func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
-                return false;
-            }
-
-            ggml_backend_t backend = wctx.backend;
-
-            //printf("%s: [%5.5s] %s\n", __func__, ggml_backend_name(backend), name.c_str());
-
-            if ((ggml_backend_is_cpu(backend)
-#ifdef GGML_USE_METAL
-                || ggml_backend_is_metal(backend)
-#endif
-                )) {
-                // for the CPU and Metal backend, we can read directly into the tensor
-                loader->read(loader->context, tensor->data, ggml_nbytes(tensor));
-                BYTESWAP_TENSOR(tensor);
-            } else {
-                // read into a temporary buffer first, then copy to device memory
-                read_buf.resize(ggml_nbytes(tensor));
-
-                loader->read(loader->context, read_buf.data(), read_buf.size());
-
-                ggml_backend_tensor_set(tensor, read_buf.data(), 0, ggml_nbytes(tensor));
-            }
-
-            //printf("%48s - [%5d, %5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ne[2], ggml_type_name((ggml_type) ttype), ggml_nbytes(tensor)/1e6);
-            total_size += ggml_nbytes(tensor);
-            model.n_loaded++;
-        }
-
-        WHISPER_LOG_INFO("%s: model size    = %7.2f MB\n", __func__, total_size/1e6);
-
-        if (model.n_loaded == 0) {
-            WHISPER_LOG_WARN("%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__);
-        } else if (model.n_loaded != (int) model.tensors.size()) {
-            WHISPER_LOG_ERROR("%s: ERROR not all tensors loaded from model file - expected %zu, got %d\n", __func__, model.tensors.size(), model.n_loaded);
-            return false;
-        }
-    }
-
-    for (auto & alloc : allocs) {
-        ggml_allocr_free(alloc);
-    }
-
-    wctx.t_load_us = ggml_time_us() - t_start_us;
-
-    return true;
-}
-
-static bool whisper_encode_external(const whisper_state & wstate) {
-    GGML_UNUSED(wstate);
-
-#ifndef WHISPER_USE_COREML
-    const bool use_coreml = false;
-#else
-    const bool use_coreml = wstate.ctx_coreml != nullptr;
-#endif
-
-#ifndef WHISPER_USE_OPENVINO
-    const bool use_openvino = false;
-#else
-    const bool use_openvino = wstate.ctx_openvino != nullptr;
-#endif
-
-    return use_coreml || use_openvino;
-}
-
-static struct ggml_cgraph * whisper_build_graph_conv(
-        whisper_context & wctx,
-          whisper_state & wstate,
-              const int   mel_offset) {
-    const auto & model   = wctx.model;
-    const auto & mel_inp = wstate.mel;
-    const auto & hparams = model.hparams;
-
-    const int n_ctx   = wstate.exp_n_audio_ctx > 0 ? wstate.exp_n_audio_ctx : hparams.n_audio_ctx;
-    const int n_state = hparams.n_audio_state; GGML_UNUSED(n_state);
-
-    const int n_mels = hparams.n_mels;
-
-    struct ggml_init_params params = {
-        /*.mem_size   =*/ wstate.alloc_conv.meta.size(),
-        /*.mem_buffer =*/ wstate.alloc_conv.meta.data(),
-        /*.no_alloc   =*/ true,
-    };
-
-    struct ggml_context * ctx0 = ggml_init(params);
-
-    ggml_cgraph * gf = ggml_new_graph(ctx0);
-
-    ggml_allocr * alloc = wstate.alloc_conv.alloc;
-
-    struct ggml_tensor * mel = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 2*n_ctx, n_mels);
-    ggml_allocr_alloc(alloc, mel);
-
-    assert(mel->type == GGML_TYPE_F32);
-    if (!ggml_allocr_is_measure(alloc)) {
-        assert(mel_inp.n_mel == n_mels);
-
-        wstate.inp_mel.resize(ggml_nelements(mel));
-
-        float * dst = wstate.inp_mel.data();
-        memset(dst, 0, ggml_nbytes(mel));
-
-        const int i0 = std::min(mel_offset,           mel_inp.n_len);
-        const int i1 = std::min(mel_offset + 2*n_ctx, mel_inp.n_len);
-
-        for (int j = 0; j < mel_inp.n_mel; ++j) {
-            for (int i = i0; i < i1; ++i) {
-                dst[j*2*n_ctx + (i - i0)] = mel_inp.data[j*mel_inp.n_len + i];
-            }
-        }
-
-        ggml_backend_tensor_set(mel, wstate.inp_mel.data(), 0, ggml_nelements(mel)*sizeof(float));
-    }
-
-    struct ggml_tensor * cur = nullptr;
-
-    if (!whisper_encode_external(wstate)) {
-        // convolution + gelu
-        {
-            cur = ggml_conv_1d_ph(ctx0, model.e_conv_1_w, mel, 1, 1);
-            cur = ggml_add(ctx0, cur, model.e_conv_1_b);
-
-            cur = ggml_gelu(ctx0, cur);
-
-            cur = ggml_conv_1d_ph(ctx0, model.e_conv_2_w, cur, 2, 1);
-            cur = ggml_add(ctx0, cur, model.e_conv_2_b);
-
-            cur = ggml_gelu(ctx0, cur);
-        }
-
-        ggml_set_name(cur, "embd_conv");
-        wstate.embd_conv = cur;
-    } else {
-#ifdef WHISPER_USE_COREML
-        cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx);
-        ggml_allocr_alloc(alloc, cur);
-
-        if (!ggml_allocr_is_measure(alloc)) {
-            whisper_coreml_encode(wstate.ctx_coreml, mel->ne[0], mel->ne[1], (float *) mel->data, (float *) cur->data);
-        }
-#endif
-#ifdef WHISPER_USE_OPENVINO
-        cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx);
-        ggml_allocr_alloc(alloc, cur);
-
-        if (!ggml_allocr_is_measure(alloc)) {
-            whisper_openvino_encode(wstate.ctx_openvino, mel, cur);
-        }
-#endif
-
-        ggml_set_name(cur, "embd_enc");
-        wstate.embd_enc = cur;
-    }
-
-    ggml_build_forward_expand(gf, cur);
-
-    ggml_free(ctx0);
-
-    return gf;
-}
-
-static struct ggml_cgraph * whisper_build_graph_encoder(
-        whisper_context & wctx,
-          whisper_state & wstate) {
-    const auto & model   = wctx.model;
-    const auto & hparams = model.hparams;
-
-    const int n_ctx   = wstate.exp_n_audio_ctx > 0 ? wstate.exp_n_audio_ctx : hparams.n_audio_ctx;
-    const int n_state = hparams.n_audio_state;
-    const int n_head  = hparams.n_audio_head;
-    const int n_layer = hparams.n_audio_layer;
-
-    struct ggml_init_params params = {
-        /*.mem_size   =*/ wstate.alloc_encode.meta.size(),
-        /*.mem_buffer =*/ wstate.alloc_encode.meta.data(),
-        /*.no_alloc   =*/ true,
-    };
-
-    struct ggml_context * ctx0 = ggml_init(params);
-
-    ggml_cgraph * gf = ggml_new_graph_custom(ctx0, WHISPER_MAX_NODES, false);
-
-    //ggml_allocr * alloc = wstate.alloc_encode.alloc;
-
-    //struct ggml_tensor * cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_ctx, n_state);
-    //ggml_allocr_alloc(alloc, cur);
-
-    //if (!ggml_allocr_is_measure(alloc)) {
-    //    ggml_backend_tensor_copy(wstate.embd_conv, cur);
-    //}
-    struct ggml_tensor * cur = ggml_view_tensor(ctx0, wstate.embd_conv);
-
-    const float KQscale = 1.0f/sqrtf(float(n_state)/n_head);
-
-    // ===================================================================
-    // NOTE: experimenting with partial evaluation of the encoder (ignore)
-    //static int iter = -1;
-    //const int n_iter = 1500/n_ctx;
-
-    //iter = (iter + 1) % n_iter;
-
-    //if (iter == 0) {
-    //    memset(model.memory_cross_k->data, 0, ggml_nbytes(model.memory_cross_k));
-    //    memset(model.memory_cross_v->data, 0, ggml_nbytes(model.memory_cross_v));
-    //}
-
-    static int iter = 0;
-
-    const size_t e_pe_stride = model.e_pe->ne[0]*ggml_element_size(model.e_pe);
-    const size_t e_pe_offset = model.e_pe->ne[0]*ggml_element_size(model.e_pe)*n_ctx*iter;
-
-    struct ggml_tensor * e_pe = ggml_view_2d(ctx0, model.e_pe, model.e_pe->ne[0], n_ctx, e_pe_stride, e_pe_offset);
-    cur = ggml_add(ctx0, e_pe, ggml_cont(ctx0, ggml_transpose(ctx0, cur)));
-
-    // ===================================================================
-
-    // original:
-    //cur = ggml_add(ctx0, model.e_pe, ggml_transpose(ctx0, cur));
-
-    struct ggml_tensor * inpL = cur;
-
-    for (int il = 0; il < n_layer; ++il) {
-        const auto & layer = model.layers_encoder[il];
-
-        // norm
-        {
-            cur = ggml_norm(ctx0, inpL, hparams.eps);
-
-            // cur = ln_0_w*cur + ln_0_b
-            cur = ggml_add(ctx0,
-                    ggml_mul(ctx0, cur, layer.attn_ln_0_w),
-                    layer.attn_ln_0_b);
-        }
-
-        // self-attention
-        {
-            struct ggml_tensor * Qcur = ggml_mul_mat(ctx0,
-                    layer.attn_q_w,
-                    cur);
-
-            Qcur = ggml_add(ctx0, Qcur, layer.attn_q_b);
-
-            //Qcur = ggml_scale(ctx0, Qcur, pow(float(n_state)/n_head, -0.25));
-
-            // note: no bias for Key
-            struct ggml_tensor * Kcur = ggml_mul_mat(ctx0,
-                    layer.attn_k_w,
-                    cur);
-
-            //Kcur = ggml_scale(ctx0, Kcur, pow(float(n_state)/n_head, -0.25));
-
-            struct ggml_tensor * Vcur = ggml_mul_mat(ctx0,
-                    layer.attn_v_w,
-                    cur);
-
-            Vcur = ggml_add(ctx0, Vcur, layer.attn_v_b);
-
-            // ------
-
-#ifdef WHISPER_USE_FLASH_ATTN
-            struct ggml_tensor * Q =
-                ggml_permute(ctx0,
-                        ggml_cpy(ctx0,
-                            Qcur,
-                            ggml_new_tensor_3d(ctx0, wctx.itype, n_state/n_head, n_head, n_ctx)),
-                        0, 2, 1, 3);
-
-            struct ggml_tensor * K =
-                ggml_permute(ctx0,
-                        ggml_cpy(ctx0,
-                            Kcur,
-                            ggml_new_tensor_3d(ctx0, wctx.itype, n_state/n_head, n_head, n_ctx)),
-                        0, 2, 1, 3);
-
-            struct ggml_tensor * V =
-                ggml_cpy(ctx0,
-                        ggml_permute(ctx0,
-                            ggml_reshape_3d(ctx0,
-                                Vcur,
-                                n_state/n_head, n_head, n_ctx),
-                            1, 2, 0, 3),
-                        ggml_new_tensor_3d(ctx0, wctx.itype, n_ctx, n_state/n_head, n_head));
-
-            struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, false);
-#else
-            struct ggml_tensor * Q =
-                ggml_permute(ctx0,
-                        ggml_cpy(ctx0,
-                            Qcur,
-                            ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_state/n_head, n_head, n_ctx)),
-                        0, 2, 1, 3);
-
-            struct ggml_tensor * K =
-                ggml_permute(ctx0,
-                        ggml_cpy(ctx0,
-                            Kcur,
-                            ggml_new_tensor_3d(ctx0, wctx.itype, n_state/n_head, n_head, n_ctx)),
-                        0, 2, 1, 3);
-
-            // K * Q
-            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
-
-            struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQscale);
-
-            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_scaled);
-
-            struct ggml_tensor * V =
-                ggml_cpy(ctx0,
-                        ggml_permute(ctx0,
-                            ggml_reshape_3d(ctx0,
-                                Vcur,
-                                n_state/n_head, n_head, n_ctx),
-                            1, 2, 0, 3),
-                        ggml_new_tensor_3d(ctx0, wctx.itype, n_ctx, n_state/n_head, n_head)
-                        );
-
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
-#endif
-            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
-
-            cur = ggml_cpy(ctx0,
-                    KQV_merged,
-                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx));
-        }
-
-        // projection
-        {
-            cur = ggml_mul_mat(ctx0,
-                    layer.attn_ln_1_w,
-                    cur);
-
-            cur = ggml_add(ctx0, cur, layer.attn_ln_1_b);
-        }
-
-        // add the input
-        cur = ggml_add(ctx0, cur, inpL);
-
-        struct ggml_tensor * inpFF = cur;
-
-        // feed-forward network
-        {
-            // norm
-            {
-                cur = ggml_norm(ctx0, inpFF, hparams.eps);
-
-                // cur = mlp_ln_w*cur + mlp_ln_b
-                cur = ggml_add(ctx0,
-                        ggml_mul(ctx0, cur, layer.mlp_ln_w),
-                        layer.mlp_ln_b);
-            }
-
-#ifdef WHISPER_USE_FLASH_FF
-            cur = ggml_flash_ff(ctx0,
-                    ggml_cpy(ctx0, cur, ggml_new_tensor_2d(ctx0, wstate.itype, n_state, n_ctx)),
-                    layer.mlp_0_w, layer.mlp_0_b, layer.mlp_1_w, layer.mlp_1_b);
-#else
-            // fully connected
-            cur = ggml_mul_mat(ctx0,
-                    layer.mlp_0_w,
-                    cur);
-
-            cur = ggml_add(ctx0, cur, layer.mlp_0_b);
-
-            // GELU activation
-            cur = ggml_gelu(ctx0, cur);
-
-            // projection
-            cur = ggml_mul_mat(ctx0,
-                    layer.mlp_1_w,
-                    cur);
-
-            cur = ggml_add(ctx0, cur, layer.mlp_1_b);
-#endif
-        }
-
-        inpL = ggml_add(ctx0, cur, inpFF);
-    }
-
-    cur = inpL;
-
-    // norm
-    {
-        cur = ggml_norm(ctx0, cur, hparams.eps);
-
-        // cur = ln_f_g*cur + ln_f_b
-        cur = ggml_add(ctx0,
-                ggml_mul(ctx0, cur, model.e_ln_w),
-                model.e_ln_b);
-    }
-
-    ggml_build_forward_expand(gf, cur);
-
-    wstate.embd_enc = cur;
-
-    //ggml_graph_print(gf);
-
-    ////////////////////////////////////////////////////////////////////////////
-
-    //printf("%s: used_mem = %f MB, %f MB, %f MB %f MB %f MB\n", __func__,
-    //        ggml_used_mem(ctx0)/1e6,
-    //        wstate.get_buf_max_mem(0)/1e6,
-    //        wstate.get_buf_max_mem(1)/1e6,
-    //        wstate.get_buf_max_mem(2)/1e6,
-    //        wstate.get_buf_max_mem(3)/1e6);
-
-    ggml_free(ctx0);
-
-    return gf;
-}
-
-// pre-compute cross-attention memory
-static struct ggml_cgraph * whisper_build_graph_cross(
-        whisper_context & wctx,
-          whisper_state & wstate) {
-    const auto & model   = wctx.model;
-    const auto & hparams = model.hparams;
-
-    const int n_ctx   = wstate.exp_n_audio_ctx > 0 ? wstate.exp_n_audio_ctx : hparams.n_audio_ctx;
-    const int n_state = hparams.n_audio_state;
-    const int n_head  = hparams.n_audio_head;
-
-    struct ggml_init_params params = {
-        /*.mem_size   =*/ wstate.alloc_cross.meta.size(),
-        /*.mem_buffer =*/ wstate.alloc_cross.meta.data(),
-        /*.no_alloc   =*/ true,
-    };
-
-    struct ggml_context * ctx0 = ggml_init(params);
-
-    ggml_cgraph * gf = ggml_new_graph(ctx0);
-
-    //ggml_allocr * alloc = wstate.alloc_cross.alloc;
-
-    //struct ggml_tensor * cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx);
-    //ggml_allocr_alloc(alloc, cur);
-
-    //if (!ggml_allocr_is_measure(alloc)) {
-    //    ggml_backend_tensor_copy(wstate.embd_enc, cur);
-    //}
-    struct ggml_tensor * cur = ggml_view_tensor(ctx0, wstate.embd_enc);
-
-    const float  Kscale = pow(float(n_state) / n_head, -0.25);
-
-    for (int il = 0; il < model.hparams.n_text_layer; ++il) {
-        auto & layer = model.layers_decoder[il];
-
-        struct ggml_tensor* Kcross = ggml_mul_mat(ctx0,
-                layer.cross_attn_k_w,
-                cur);
-
-        Kcross = ggml_scale(ctx0, Kcross, Kscale);
-
-        struct ggml_tensor* Vcross = ggml_mul_mat(ctx0,
-                layer.cross_attn_v_w,
-                cur);
-
-        Vcross = ggml_add(ctx0,
-                    Vcross,
-                    layer.cross_attn_v_b);
-
-        Vcross = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcross, n_state, n_ctx));
-
-        struct ggml_tensor * k = ggml_view_1d(ctx0, wstate.kv_cross.k,
-                n_state*n_ctx,
-                (ggml_element_size(wstate.kv_cross.k)*n_state)*(il*n_ctx));
-
-        struct ggml_tensor * v = ggml_view_2d(ctx0, wstate.kv_cross.v, n_ctx, n_state,
-                (   n_ctx)*ggml_element_size(wstate.kv_cross.v),
-                (il*n_ctx)*ggml_element_size(wstate.kv_cross.v)*n_state);
-
-        ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcross, k));
-        ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcross, v));
-    }
-
-    //ggml_graph_print(gf);
-
-    ggml_free(ctx0);
-
-    return gf;
-}
-
-// evaluate the encoder with the given state
-//
-// given audio recording (more specifically, its log mel spectrogram), runs forward pass of the encoder
-// part of the transformer model and returns the encoded features
-//
-//   - wctx:      the model
-//   - wstate:     the state of the encoder
-//   - n_threads:  number of threads to use
-//   - mel_offset: offset in the mel spectrogram (i.e. audio offset)
-//
-static bool whisper_encode_internal(
-        whisper_context & wctx,
-          whisper_state & wstate,
-              const int   mel_offset,
-              const int   n_threads,
- whisper_abort_callback   abort_callback,
-                   void * abort_callback_data) {
-    const int64_t t_start_us = ggml_time_us();
-
-    // conv
-    {
-        auto & alloc = wstate.alloc_conv.alloc;
-
-        ggml_allocr_reset(alloc);
-
-        ggml_cgraph * gf = whisper_build_graph_conv(wctx, wstate, mel_offset);
-
-        ggml_allocr_alloc_graph(alloc, gf);
-
-        if (!whisper_encode_external(wstate)) {
-            if (!ggml_graph_compute_helper(wstate.backend, gf, n_threads)) {
-                return false;
-            }
-        }
-    }
-
-    // encoder
-    if (!whisper_encode_external(wstate)) {
-        auto & alloc = wstate.alloc_encode.alloc;
-
-        ggml_allocr_reset(alloc);
-
-        ggml_cgraph * gf = whisper_build_graph_encoder(wctx, wstate);
-
-        ggml_allocr_alloc_graph(alloc, gf);
-
-        if (!ggml_graph_compute_helper(wstate.backend, gf, n_threads)) {
-            return false;
-        }
-    }
-
-    // cross
-    {
-        auto & alloc = wstate.alloc_cross.alloc;
-
-        ggml_allocr_reset(alloc);
-
-        ggml_cgraph * gf = whisper_build_graph_cross(wctx, wstate);
-
-        ggml_allocr_alloc_graph(alloc, gf);
-
-        if (!ggml_graph_compute_helper(wstate.backend, gf, n_threads)) {
-            return false;
-        }
-    }
-
-    wstate.t_encode_us += ggml_time_us() - t_start_us;
-    wstate.n_encode++;
-
-    return !(abort_callback && abort_callback(abort_callback_data));
-}
-
-static struct ggml_cgraph * whisper_build_graph_decoder(
-         whisper_context & wctx,
-         whisper_state   & wstate,
-     const whisper_batch & batch) {
-    const auto & model   = wctx.model;
-    const auto & hparams = model.hparams;
-
-    auto & kv_self = wstate.kv_self;
-
-    WHISPER_ASSERT(!!kv_self.ctx);
-
-    ggml_allocr * alloc = wstate.alloc_decode.alloc;
-
-    const int n_ctx   = kv_self.size;
-    const int n_state = hparams.n_text_state;
-    const int n_head  = hparams.n_text_head;
-    const int n_layer = hparams.n_text_layer;
-
-    const int n_tokens    = batch.n_tokens;
-    const int n_audio_ctx = wstate.exp_n_audio_ctx > 0 ? wstate.exp_n_audio_ctx : hparams.n_audio_ctx;
-
-    const int32_t n_kv     = ggml_allocr_is_measure(alloc) ? n_ctx            : kv_self.n;
-    const int32_t kv_head  = ggml_allocr_is_measure(alloc) ? n_ctx - n_tokens : kv_self.head;
-
-    //WHISPER_LOG_DEBUG("%s: n_past = %d, n_tokens = %d, n_audio_ctx = %d, n_ctx = %d\n", __func__, n_past, n_tokens, n_audio_ctx, n_ctx);
-
-    struct ggml_init_params params = {
-        /*.mem_size   =*/ wstate.alloc_decode.meta.size(),
-        /*.mem_buffer =*/ wstate.alloc_decode.meta.data(),
-        /*.no_alloc   =*/ true,
-    };
-
-    struct ggml_context * ctx0 = ggml_init(params);
-
-    ggml_cgraph * gf = ggml_new_graph_custom(ctx0, WHISPER_MAX_NODES, false);
-
-    struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
-    ggml_allocr_alloc(alloc, embd);
-
-    if (!ggml_allocr_is_measure(alloc)) {
-        ggml_backend_tensor_set(embd, batch.token, 0, n_tokens*ggml_element_size(embd));
-    }
-
-    struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
-    ggml_allocr_alloc(alloc, position);
-
-    if (!ggml_allocr_is_measure(alloc)) {
-        for (int i = 0; i < n_tokens; ++i) {
-            const int32_t val = batch.pos[i];
-            ggml_backend_tensor_set(position, &val, i*sizeof(int32_t), sizeof(int32_t));
-        }
-    }
-
-    const float KQscale = pow(float(n_state)/n_head, -0.25);
-
-    struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
-    ggml_allocr_alloc(alloc, KQ_mask);
-
-    if (!ggml_allocr_is_measure(alloc)) {
-        wstate.inp_mask.resize(n_kv*n_tokens);
-
-        float * data = wstate.inp_mask.data();
-        memset(data, 0, ggml_nbytes(KQ_mask));
-
-        for (int h = 0; h < 1; ++h) {
-            for (int j = 0; j < n_tokens; ++j) {
-                const whisper_pos    pos    = batch.pos[j];
-                const whisper_seq_id seq_id = batch.seq_id[j][0];
-
-                for (int i = 0; i < n_kv; ++i) {
-                    if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
-                        data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
-                    }
-                }
-            }
-        }
-
-        ggml_backend_tensor_set(KQ_mask, wstate.inp_mask.data(), 0, ggml_nelements(KQ_mask)*sizeof(float));
-    }
-
-    // token encoding + position encoding
-    struct ggml_tensor * cur =
-        ggml_add(ctx0,
-                ggml_get_rows(ctx0, model.d_te, embd),
-                ggml_get_rows(ctx0, model.d_pe, position));
-
-    struct ggml_tensor * inpL = cur;
-
-    for (int il = 0; il < n_layer; ++il) {
-        const auto & layer = model.layers_decoder[il];
-
-        // norm
-        {
-            cur = ggml_norm(ctx0, inpL, hparams.eps);
-
-            // cur = ln_0_w*cur + ln_0_b
-            cur = ggml_add(ctx0,
-                    ggml_mul(ctx0,
-                        cur,
-                        layer.attn_ln_0_w),
-                    layer.attn_ln_0_b);
-        }
-
-        // self-attention
-        {
-            struct ggml_tensor * Qcur = ggml_mul_mat(ctx0,
-                    layer.attn_q_w,
-                    cur);
-
-            Qcur = ggml_add(ctx0,
-                        Qcur,
-                        layer.attn_q_b);
-
-            Qcur = ggml_scale(ctx0, Qcur, KQscale);
-
-            // note: no bias for Key
-            struct ggml_tensor * Kcur = ggml_mul_mat(ctx0,
-                    layer.attn_k_w,
-                    cur);
-
-            Kcur = ggml_scale(ctx0, Kcur, KQscale);
-
-            // store key and value to memory
-            {
-                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0,
-                        layer.attn_v_w,
-                        cur);
-
-                Vcur = ggml_add(ctx0,
-                            Vcur,
-                            layer.attn_v_b);
-
-                Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_state, n_tokens));
-
-                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_state, (ggml_element_size(kv_self.k)*n_state)*(il*n_ctx + kv_head));
-                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_state,
-                        (   n_ctx)*ggml_element_size(kv_self.v),
-                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_state + kv_head*ggml_element_size(kv_self.v));
-
-                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
-                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
-            }
-
-            // ------
-
-            struct ggml_tensor * Q =
-                ggml_permute(ctx0,
-                        ggml_reshape_3d(ctx0, Qcur, n_state/n_head, n_head, n_tokens),
-                        0, 2, 1, 3);
-
-            struct ggml_tensor * K =
-                ggml_view_3d(ctx0, kv_self.k,
-                        n_state/n_head, n_kv, n_head,
-                        ggml_element_size(kv_self.k)*n_state,
-                        ggml_element_size(kv_self.k)*n_state/n_head,
-                        ggml_element_size(kv_self.k)*n_state*n_ctx*il);
-
-            // K * Q
-            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
-
-            //struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
-
-            //struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ, n_past);
-            struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ, KQ_mask);
-
-            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
-
-            struct ggml_tensor * V =
-                ggml_view_3d(ctx0, kv_self.v,
-                        n_kv, n_state/n_head, n_head,
-                        n_ctx*ggml_element_size(kv_self.v),
-                        n_ctx*ggml_element_size(kv_self.v)*n_state/n_head,
-                        n_ctx*ggml_element_size(kv_self.v)*n_state*il);
-
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
-
-            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
-
-            cur = ggml_cpy(ctx0,
-                    KQV_merged,
-                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_tokens));
-        }
-
-        // projection
-        {
-            cur = ggml_mul_mat(ctx0,
-                    layer.attn_ln_1_w,
-                    cur);
-
-            cur = ggml_add(ctx0,
-                    cur,
-                    layer.attn_ln_1_b);
-        }
-
-        // add the input
-        struct ggml_tensor * inpCA = ggml_add(ctx0, cur, inpL);
-
-        // norm
-        {
-            cur = ggml_norm(ctx0, inpCA, hparams.eps); // note: we use inpCA here
-
-            // cur = ln_0_w*cur + ln_0_b
-            cur = ggml_add(ctx0,
-                    ggml_mul(ctx0,
-                        cur,
-                        layer.cross_attn_ln_0_w),
-                    layer.cross_attn_ln_0_b);
-        }
-
-        // cross-attention
-        {
-            struct ggml_tensor * Qcur = ggml_mul_mat(ctx0,
-                    layer.cross_attn_q_w,
-                    cur);
-
-            Qcur = ggml_add(ctx0,
-                        Qcur,
-                        layer.cross_attn_q_b);
-
-            Qcur = ggml_scale(ctx0, Qcur, KQscale);
-
-            // Kcross is already scaled
-            struct ggml_tensor * Kcross =
-                ggml_view_3d(ctx0, wstate.kv_cross.k,
-                        n_state/n_head, n_audio_ctx, n_head,
-                        ggml_element_size(wstate.kv_cross.k)*n_state,
-                        ggml_element_size(wstate.kv_cross.k)*n_state/n_head,
-                        ggml_element_size(wstate.kv_cross.k)*n_state*n_audio_ctx*il);
-
-            //struct ggml_tensor * Vcross =
-            //    ggml_reshape_3d(ctx0,
-            //            ggml_view_1d(ctx0, wstate.kv_cross.v, n_audio_ctx*n_state, il*n_audio_ctx*ggml_element_size(wstate.kv_cross.v)*n_state),
-            //            n_state/n_head, n_head, n_audio_ctx);
-
-            //struct ggml_tensor * V_trans =
-            //    ggml_cpy(ctx0,
-            //            ggml_permute(ctx0, Vcross, 1, 2, 0, 3),
-            //            ggml_new_tensor_3d(ctx0, Vcross->type, n_audio_ctx, n_state/n_head, n_head));
-
-            struct ggml_tensor * V =
-                ggml_view_3d(ctx0, wstate.kv_cross.v,
-                        n_audio_ctx, n_state/n_head, n_head,
-                        n_audio_ctx*ggml_element_size(wstate.kv_cross.v),
-                        n_audio_ctx*ggml_element_size(wstate.kv_cross.v)*n_state/n_head,
-                        n_audio_ctx*ggml_element_size(wstate.kv_cross.v)*n_state*il);
-
-            // ------
-
-            struct ggml_tensor * Q =
-                ggml_permute(ctx0,
-                        ggml_reshape_3d(ctx0, Qcur, n_state/n_head, n_head, n_tokens),
-                        0, 2, 1, 3);
-
-            // K * Q
-            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, Kcross, Q);
-
-            //struct ggml_tensor * KQ_scaled =
-            //    ggml_scale(ctx0,
-            //            KQ,
-            //            ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
-            //            );
-
-            // no masking for cross-attention
-            //struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
-
-            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ);
-
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
-
-            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
-
-            // cur = KQV_merged.contiguous().view(n_state, n_tokens)
-            cur = ggml_cpy(ctx0,
-                    KQV_merged,
-                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_tokens));
-        }
-
-        // projection
-        {
-            cur = ggml_mul_mat(ctx0,
-                    layer.cross_attn_ln_1_w,
-                    cur);
-
-            cur = ggml_add(ctx0,
-                    cur,
-                    layer.cross_attn_ln_1_b);
-        }
-
-        // add the input
-        cur = ggml_add(ctx0, cur, inpCA);
-
-        struct ggml_tensor * inpFF = cur;
-
-        // feed-forward network
-        {
-            // norm
-            {
-                cur = ggml_norm(ctx0, inpFF, hparams.eps);
-
-                // cur = mlp_ln_w*cur + mlp_ln_b
-                cur = ggml_add(ctx0,
-                        ggml_mul(ctx0,
-                            cur,
-                            layer.mlp_ln_w),
-                        layer.mlp_ln_b);
-            }
-
-            // fully connected
-            cur = ggml_mul_mat(ctx0,
-                    layer.mlp_0_w,
-                    cur);
-
-            cur = ggml_add(ctx0,
-                    cur,
-                    layer.mlp_0_b);
-
-            // GELU activation
-            cur = ggml_gelu(ctx0, cur);
-
-            // projection
-            cur = ggml_mul_mat(ctx0,
-                    layer.mlp_1_w,
-                    cur);
-
-            cur = ggml_add(ctx0,
-                    cur,
-                    layer.mlp_1_b);
-        }
-
-        inpL = ggml_add(ctx0, cur, inpFF);
-    }
-
-    cur = inpL;
-
-    // norm
-    {
-        cur = ggml_norm(ctx0, cur, hparams.eps);
-
-        cur = ggml_add(ctx0,
-                ggml_mul(ctx0,
-                    cur,
-                    model.d_ln_w),
-                model.d_ln_b);
-    }
-
-    // compute logits only for the last token
-    // comment this line to compute logits for all n_tokens
-    // might be useful in the future
-    //cur = ggml_view_2d(ctx0, cur, cur->ne[0], 1, cur->nb[1], (cur->ne[1] - 1)*cur->nb[1]);
-
-    struct ggml_tensor * logits = ggml_mul_mat(ctx0, model.d_te, cur);
-
-    ggml_build_forward_expand(gf, logits);
-
-    ggml_free(ctx0);
-
-    return gf;
-}
-
-// evaluate the decoder
-//
-// given text prompt + audio features -> computes the logits for the next token
-//
-//   - model:      the model
-//   - n_threads:  number of threads to use
-//   - tokens:     text prompt
-//   - n_tokens:   number of tokens in the prompt
-//   - n_past:     number of past tokens to prefix the prompt with
-//
-static bool whisper_decode_internal(
-        whisper_context & wctx,
-          whisper_state & wstate,
-    const whisper_batch & batch,
-              const int   n_threads,
- whisper_abort_callback   abort_callback,
-                   void * abort_callback_data) {
-    const int64_t t_start_us = ggml_time_us();
-
-    const auto & model   = wctx.model;
-    const auto & hparams = model.hparams;
-
-    const int n_vocab  = hparams.n_vocab;
-    const int n_tokens = batch.n_tokens;
-
-    auto & logits_out = wstate.logits;
-
-    struct ggml_tensor * logits;
-
-    // find KV slot for the batch
-    {
-        auto & kv_self = wstate.kv_self;
-
-        if (!whisper_kv_cache_find_slot(kv_self, batch)) {
-            return false;
-        }
-
-        kv_self.n = whisper_kv_cache_cell_max(kv_self);
-        //kv_self.n = std::min((int32_t) hparams.n_text_ctx, std::max(32, whisper_kv_cache_cell_max(kv_self)));
-        //printf("n_tokens = %5d, kv_self.head = %5d, kv_self.n = %5d, seq_id = %5d\n", batch.n_tokens, kv_self.head, kv_self.n, batch.seq_id[0][0]);
-    }
-
-    // decoder
-    {
-        auto & alloc = wstate.alloc_decode.alloc;
-
-        ggml_allocr_reset(alloc);
-
-        ggml_cgraph * gf = whisper_build_graph_decoder(wctx, wstate, batch);
-
-        ggml_allocr_alloc_graph(alloc, gf);
-
-        logits = gf->nodes[gf->n_nodes - 1];
-
-        if (!ggml_graph_compute_helper(wstate.backend, gf, n_threads)) {
-            return false;
-        }
-    }
-
-    logits_out.resize(n_tokens*n_vocab);
-    for (int i = 0; i < n_tokens; i++) {
-        if (batch.logits[i] == 0) {
-            continue;
-        }
-        ggml_backend_tensor_get(logits, logits_out.data() + (n_vocab*i), sizeof(float)*(n_vocab*i), sizeof(float)*n_vocab);
-    }
-
-    if (batch.n_tokens > 1) {
-        //printf("%s: used_mem = %f MB, %f MB, %f MB %f MB %f MB\n", __func__,
-        //        ggml_used_mem(ctx0)/1e6,
-        //        wstate.get_buf_max_mem(0)/1e6,
-        //        wstate.get_buf_max_mem(1)/1e6,
-        //        wstate.get_buf_max_mem(2)/1e6,
-        //        wstate.get_buf_max_mem(3)/1e6);
-    }
-
-    if (batch.n_tokens == 1) {
-        wstate.t_decode_us += ggml_time_us() - t_start_us;
-        wstate.n_decode++;
-    } else if (batch.n_tokens < 16) {
-        wstate.t_batchd_us += ggml_time_us() - t_start_us;
-        wstate.n_batchd += n_tokens;
-    } else {
-        wstate.t_prompt_us += ggml_time_us() - t_start_us;
-        wstate.n_prompt += n_tokens;
-    }
-
-    return !(abort_callback && abort_callback(abort_callback_data));
-}
-
-//  500 -> 00:05.000
-// 6000 -> 01:00.000
-static std::string to_timestamp(int64_t t, bool comma = false) {
-    int64_t msec = t * 10;
-    int64_t hr = msec / (1000 * 60 * 60);
-    msec = msec - hr * (1000 * 60 * 60);
-    int64_t min = msec / (1000 * 60);
-    msec = msec - min * (1000 * 60);
-    int64_t sec = msec / 1000;
-    msec = msec - sec * 1000;
-
-    char buf[32];
-    snprintf(buf, sizeof(buf), "%02d:%02d:%02d%s%03d", (int) hr, (int) min, (int) sec, comma ? "," : ".", (int) msec);
-
-    return std::string(buf);
-}
-
-#define SIN_COS_N_COUNT WHISPER_N_FFT
-static float sin_vals[SIN_COS_N_COUNT];
-static float cos_vals[SIN_COS_N_COUNT];
-
-// In FFT, we frequently use sine and cosine operations with the same values.
-// We can use precalculated values to speed up the process.
-static void fill_sin_cos_table() {
-    static bool is_filled = false;
-    if (is_filled) return;
-    for (int i = 0; i < SIN_COS_N_COUNT; i++) {
-        double theta = (2*M_PI*i)/SIN_COS_N_COUNT;
-        sin_vals[i] = sinf(theta);
-        cos_vals[i] = cosf(theta);
-    }
-    is_filled = true;
-}
-
-// naive Discrete Fourier Transform
-// input is real-valued
-// output is complex-valued
-static void dft(const std::vector<float> & in, std::vector<float> & out) {
-    int N = in.size();
-
-    out.resize(N*2);
-    const int sin_cos_step = SIN_COS_N_COUNT / N;
-
-    for (int k = 0; k < N; k++) {
-        float re = 0;
-        float im = 0;
-
-        for (int n = 0; n < N; n++) {
-            int idx = (k * n * sin_cos_step) % (SIN_COS_N_COUNT); // t = 2*M_PI*k*n/N
-            re += in[n]*cos_vals[idx]; // cos(t)
-            im -= in[n]*sin_vals[idx]; // sin(t)
-        }
-
-        out[k*2 + 0] = re;
-        out[k*2 + 1] = im;
-    }
-}
-
-// Cooley-Tukey FFT
-// poor man's implementation - use something better
-// input is real-valued
-// output is complex-valued
-static void fft(const std::vector<float> & in, std::vector<float> & out) {
-    out.resize(in.size()*2);
-
-    int N = in.size();
-
-    if (N == 1) {
-        out[0] = in[0];
-        out[1] = 0;
-        return;
-    }
-
-    if (N%2 == 1) {
-        dft(in, out);
-        return;
-    }
-
-    std::vector<float> even;
-    std::vector<float> odd;
-
-    even.reserve(N/2);
-    odd.reserve(N/2);
-
-    for (int i = 0; i < N; i++) {
-        if (i % 2 == 0) {
-            even.push_back(in[i]);
-        } else {
-            odd.push_back(in[i]);
-        }
-    }
-
-    std::vector<float> even_fft;
-    std::vector<float> odd_fft;
-
-    fft(even, even_fft);
-    fft(odd, odd_fft);
-
-    const int sin_cos_step = SIN_COS_N_COUNT / N;
-    for (int k = 0; k < N/2; k++) {
-        int idx = k * sin_cos_step; // t = 2*M_PI*k/N
-        float re = cos_vals[idx]; // cos(t)
-        float im = -sin_vals[idx]; // sin(t)
-
-        float re_odd = odd_fft[2*k + 0];
-        float im_odd = odd_fft[2*k + 1];
-
-        out[2*k + 0] = even_fft[2*k + 0] + re*re_odd - im*im_odd;
-        out[2*k + 1] = even_fft[2*k + 1] + re*im_odd + im*re_odd;
-
-        out[2*(k + N/2) + 0] = even_fft[2*k + 0] - re*re_odd + im*im_odd;
-        out[2*(k + N/2) + 1] = even_fft[2*k + 1] - re*im_odd - im*re_odd;
-    }
-}
-
-static bool hann_window(int length, bool periodic, std::vector<float> & output) {
-    if (output.size() < static_cast<size_t>(length)) {
-        output.resize(length);
-    }
-    int offset = -1;
-    if (periodic) {
-        offset = 0;
-    }
-    for (int i = 0; i < length; i++) {
-        output[i] = 0.5*(1.0 - cosf((2.0*M_PI*i)/(length + offset)));
-    }
-
-    return true;
-}
-
-static void log_mel_spectrogram_worker_thread(int ith, const std::vector<float> & hann, const std::vector<float> & samples,
-                                              int n_samples, int frame_size, int frame_step, int n_threads,
-                                              const whisper_filters & filters, whisper_mel & mel) {
-    std::vector<float> fft_in(frame_size, 0.0);
-    std::vector<float> fft_out(2 * frame_step);
-    // make sure n_fft == 1 + (WHISPER_N_FFT / 2), bin_0 to bin_nyquist
-    int n_fft = 1 + (frame_size / 2);
-    int i = ith;
-
-    // calculate FFT only when fft_in are not all zero
-    for (; i < std::min(n_samples / frame_step + 1, mel.n_len); i += n_threads) {
-        const int offset = i * frame_step;
-
-        // apply Hanning window (~10% faster)
-        for (int j = 0; j < std::min(frame_size, n_samples - offset); j++) {
-            fft_in[j] = hann[j] * samples[offset + j];
-        }
-        // fill the rest with zeros
-        if (n_samples - offset < frame_size) {
-            std::fill(fft_in.begin() + (n_samples - offset), fft_in.end(), 0.0);
-        }
-
-        // FFT
-        fft(fft_in, fft_out);
-
-        // Calculate modulus^2 of complex numbers
-        // Use pow(fft_out[2 * j + 0], 2) + pow(fft_out[2 * j + 1], 2) causes inference quality problem? Interesting.
-        for (int j = 0; j < frame_size; j++) {
-            fft_out[j] = (fft_out[2 * j + 0] * fft_out[2 * j + 0] + fft_out[2 * j + 1] * fft_out[2 * j + 1]);
-        }
-
-        // mel spectrogram
-        for (int j = 0; j < mel.n_mel; j++) {
-            double sum = 0.0;
-
-            // unroll loop (suggested by GH user @lunixbochs)
-            int k = 0;
-            for (k = 0; k < n_fft - 3; k += 4) {
-                sum +=
-                        fft_out[k + 0] * filters.data[j * n_fft + k + 0] +
-                        fft_out[k + 1] * filters.data[j * n_fft + k + 1] +
-                        fft_out[k + 2] * filters.data[j * n_fft + k + 2] +
-                        fft_out[k + 3] * filters.data[j * n_fft + k + 3];
-            }
-
-            // handle n_fft remainder
-            for (; k < n_fft; k++) {
-                sum += fft_out[k] * filters.data[j * n_fft + k];
-            }
-
-            sum = log10(std::max(sum, 1e-10));
-
-            mel.data[j * mel.n_len + i] = sum;
-        }
-    }
-
-    // Otherwise fft_out are all zero
-    double sum = log10(1e-10);
-    for (; i < mel.n_len; i += n_threads) {
-        for (int j = 0; j < mel.n_mel; j++) {
-            mel.data[j * mel.n_len + i] = sum;
-        }
-    }
-}
-
-// ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L110-L157
-static bool log_mel_spectrogram(
-              whisper_state & wstate,
-              const float * samples,
-              const int   n_samples,
-              const int   /*sample_rate*/,
-              const int   frame_size,
-              const int   frame_step,
-              const int   n_mel,
-              const int   n_threads,
-              const whisper_filters & filters,
-              const bool   debug,
-              whisper_mel & mel) {
-    const int64_t t_start_us = ggml_time_us();
-
-    // Hanning window (Use cosf to eliminate difference)
-    // ref: https://pytorch.org/docs/stable/generated/torch.hann_window.html
-    // ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L147
-    std::vector<float> hann;
-    hann_window(frame_size, true, hann);
-
-
-    // Calculate the length of padding
-    int64_t stage_1_pad = WHISPER_SAMPLE_RATE * 30;
-    int64_t stage_2_pad = frame_size / 2;
-
-    // Initialize a vector and copy data from C array to it.
-    std::vector<float> samples_padded;
-    samples_padded.resize(n_samples + stage_1_pad + stage_2_pad * 2);
-    std::copy(samples, samples + n_samples, samples_padded.begin() + stage_2_pad);
-
-    // pad 30 seconds of zeros at the end of audio (480,000 samples) + reflective pad 200 samples at the end of audio
-    std::fill(samples_padded.begin() + n_samples + stage_2_pad, samples_padded.begin() + n_samples + stage_1_pad + 2 * stage_2_pad, 0);
-
-    // reflective pad 200 samples at the beginning of audio
-    std::reverse_copy(samples + 1, samples + 1 + stage_2_pad, samples_padded.begin());
-
-    mel.n_mel     = n_mel;
-    // https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/SpectralOps.cpp#L936
-    // Calculate number of frames + remove the last frame
-    mel.n_len     = (samples_padded.size() - frame_size) / frame_step;
-    // Calculate semi-padded sample length to ensure compatibility
-    mel.n_len_org = 1 + (n_samples + stage_2_pad - frame_size) / frame_step;
-    mel.data.resize(mel.n_mel * mel.n_len);
-
-
-    {
-        std::vector<std::thread> workers(n_threads - 1);
-        for (int iw = 0; iw < n_threads - 1; ++iw) {
-            workers[iw] = std::thread(
-                    log_mel_spectrogram_worker_thread, iw + 1, std::cref(hann), samples_padded,
-                    n_samples + stage_2_pad, frame_size, frame_step, n_threads,
-                    std::cref(filters), std::ref(mel));
-        }
-
-        // main thread
-        log_mel_spectrogram_worker_thread(0, hann, samples_padded, n_samples + stage_2_pad, frame_size, frame_step, n_threads, filters, mel);
-
-        for (int iw = 0; iw < n_threads - 1; ++iw) {
-            workers[iw].join();
-        }
-    }
-
-    // clamping and normalization
-    double mmax = -1e20;
-    for (int i = 0; i < mel.n_mel*mel.n_len; i++) {
-        if (mel.data[i] > mmax) {
-            mmax = mel.data[i];
-        }
-    }
-
-    mmax -= 8.0;
-
-    for (int i = 0; i < mel.n_mel*mel.n_len; i++) {
-        if (mel.data[i] < mmax) {
-            mel.data[i] = mmax;
-        }
-
-        mel.data[i] = (mel.data[i] + 4.0)/4.0;
-    }
-
-    wstate.t_mel_us += ggml_time_us() - t_start_us;
-
-    // Dump log_mel_spectrogram
-    if (debug) {
-        std::ofstream outFile("log_mel_spectrogram.json");
-        outFile << "[";
-        for (uint64_t i = 0; i < mel.data.size() - 1; i++) {
-            outFile << mel.data[i] << ", ";
-        }
-        outFile << mel.data[mel.data.size() - 1] << "]";
-        outFile.close();
-    }
-
-    return true;
-}
-
-// split text into tokens
-//
-// ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53
-//
-// Regex (Python):
-// r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
-//
-// Regex (C++):
-// R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)"
-//
-static std::vector<whisper_vocab::id> tokenize(const whisper_vocab & vocab, const std::string & text) {
-    std::vector<std::string> words;
-
-    // first split the text into words
-    {
-        std::string str = text;
-        std::string pat = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
-
-        std::regex re(pat);
-        std::smatch m;
-
-        while (std::regex_search(str, m, re)) {
-            for (auto x : m) {
-                words.push_back(x);
-            }
-            str = m.suffix();
-        }
-    }
-
-    // find the longest tokens that form the words:
-    std::vector<whisper_vocab::id> tokens;
-    for (const auto & word : words) {
-        if (word.empty()) continue;
-
-        int i = 0;
-        int n = word.size();
-        while (i < n) {
-            int j = n;
-            bool found = false;
-            while (j > i) {
-                auto sub = word.substr(i, j-i);
-                auto it = vocab.token_to_id.find(sub);
-                if (it != vocab.token_to_id.end()) {
-                    tokens.push_back(it->second);
-                    i = j;
-                    found = true;
-                    break;
-                }
-                --j;
-            }
-            if (!found) {
-                WHISPER_LOG_ERROR("unknown token\n");
-                ++i;
-            }
-        }
-    }
-
-    return tokens;
-}
-
-//
-// interface implementation
-//
-
-#ifdef WHISPER_USE_COREML
-// replace .bin with -encoder.mlmodelc
-static std::string whisper_get_coreml_path_encoder(std::string path_bin) {
-    auto pos = path_bin.rfind('.');
-    if (pos != std::string::npos) {
-        path_bin = path_bin.substr(0, pos);
-    }
-
-    // match "-qx_x"
-    pos = path_bin.rfind('-');
-    if (pos != std::string::npos) {
-        auto sub = path_bin.substr(pos);
-        if (sub.size() == 5 && sub[1] == 'q' && sub[3] == '_') {
-            path_bin = path_bin.substr(0, pos);
-        }
-    }
-
-    path_bin += "-encoder.mlmodelc";
-
-    return path_bin;
-}
-#endif
-
-#ifdef WHISPER_USE_OPENVINO
-// replace .bin with-encoder-openvino.xml
-static std::string whisper_openvino_get_path_encoder(std::string path_bin) {
-    auto pos = path_bin.rfind('.');
-    if (pos != std::string::npos) {
-        path_bin = path_bin.substr(0, pos);
-    }
-
-    path_bin += "-encoder-openvino.xml";
-
-    return path_bin;
-}
-
-static std::string whisper_openvino_get_path_cache(std::string path_bin) {
-    auto pos = path_bin.rfind('.');
-    if (pos != std::string::npos) {
-        path_bin = path_bin.substr(0, pos);
-    }
-
-    path_bin += "-encoder-openvino-cache";
-
-    return path_bin;
-}
-#endif
-
-struct whisper_state * whisper_init_state(whisper_context * ctx) {
-    fill_sin_cos_table();
-
-    whisper_state * state = new whisper_state;
-
-    state->backend = whisper_backend_init(ctx->params);
-
-    // at this point, we don't know yet how many decoders will be used, so we overallocate 3x ctx
-    // in theory, there can be a case where this is not enough, but in practice it should always be enough
-    const int factor = 3;
-
-    if (!kv_cache_init(ctx->model.hparams, state->kv_self, ctx->backend, ctx->itype, factor*ctx->model.hparams.n_text_ctx)) {
-        WHISPER_LOG_ERROR("%s: kv_cache_init() failed for self-attention cache\n", __func__);
-        delete state;
-        return nullptr;
-    }
-
-    {
-        const size_t memory_size = ggml_nbytes(state->kv_self.k) + ggml_nbytes(state->kv_self.v);
-        WHISPER_LOG_INFO("%s: kv self size  = %7.2f MB\n", __func__, memory_size / 1e6);
-    }
-
-    if (!kv_cache_init(ctx->model.hparams, state->kv_cross, ctx->backend, ctx->itype, ctx->model.hparams.n_audio_ctx)) {
-        WHISPER_LOG_ERROR("%s: kv_cache_init() failed for cross-attention cache\n", __func__);
-        delete state;
-        return nullptr;
-    }
-
-    {
-        const size_t memory_size = ggml_nbytes(state->kv_cross.k) + ggml_nbytes(state->kv_cross.v);
-        WHISPER_LOG_INFO("%s: kv cross size = %7.2f MB\n", __func__, memory_size / 1e6);
-    }
-
-#ifdef WHISPER_USE_COREML
-    const auto path_coreml = whisper_get_coreml_path_encoder(ctx->path_model);
-
-    WHISPER_LOG_INFO("%s: loading Core ML model from '%s'\n", __func__, path_coreml.c_str());
-    WHISPER_LOG_INFO("%s: first run on a device may take a while ...\n", __func__);
-
-    state->ctx_coreml = whisper_coreml_init(path_coreml.c_str());
-    if (!state->ctx_coreml) {
-        WHISPER_LOG_ERROR("%s: failed to load Core ML model from '%s'\n", __func__, path_coreml.c_str());
-#ifndef WHISPER_COREML_ALLOW_FALLBACK
-        delete state;
-        return nullptr;
-#endif
-    } else {
-        WHISPER_LOG_INFO("%s: Core ML model loaded\n", __func__);
-    }
-#endif
-
-    state->logits.reserve(ctx->vocab.n_vocab * ctx->model.hparams.n_text_ctx);
-
-    state->batch = whisper_batch_init(ctx->model.hparams.n_text_ctx, WHISPER_MAX_DECODERS);
-
-    // TAGS: WHISPER_DECODER_INIT
-    state->decoders[0].sequence.tokens.reserve(ctx->model.hparams.n_text_ctx);
-
-    state->decoders[0].probs.reserve    (ctx->vocab.n_vocab);
-    state->decoders[0].logits.reserve   (ctx->vocab.n_vocab);
-    state->decoders[0].logprobs.reserve (ctx->vocab.n_vocab);
-    state->decoders[0].logits_id.reserve(ctx->model.hparams.n_vocab);
-
-    state->decoders[0].rng = std::mt19937(0);
-
-    // conv allocator
-    {
-        whisper_allocr_graph_init(state->alloc_conv, ctx->backend,
-                [&]() {
-                    return whisper_build_graph_conv(*ctx, *state, 0);
-                });
-
-        WHISPER_LOG_INFO("%s: compute buffer (conv)   = %7.2f MB\n", __func__, whisper_allocr_size(state->alloc_conv) / 1e6);
-    }
-
-    // encoder allocator
-    if (!whisper_encode_external(*state)) {
-        whisper_allocr_graph_init(state->alloc_encode, ctx->backend,
-                [&]() {
-                    return whisper_build_graph_encoder(*ctx, *state);
-                });
-
-        WHISPER_LOG_INFO("%s: compute buffer (encode) = %7.2f MB\n", __func__, whisper_allocr_size(state->alloc_encode) / 1e6);
-    }
-
-    // cross allocator
-    {
-        whisper_allocr_graph_init(state->alloc_cross, ctx->backend,
-                [&]() {
-                    return whisper_build_graph_cross(*ctx, *state);
-                });
-
-        WHISPER_LOG_INFO("%s: compute buffer (cross)  = %7.2f MB\n", __func__, whisper_allocr_size(state->alloc_cross) / 1e6);
-    }
-
-    // decoder allocator
-    {
-        whisper_allocr_graph_init(state->alloc_decode, ctx->backend,
-                [&]() {
-                    const auto & hparams = ctx->model.hparams;
-
-                    // TODO: make sure this is the worst-case scenario
-                    const int n_tokens = hparams.n_text_ctx;
-                    const int n_past   = 0;
-
-                    whisper_batch_prep_legacy(state->batch, nullptr, n_tokens, n_past, 0);
-
-                    return whisper_build_graph_decoder(*ctx, *state, state->batch);
-                });
-
-        WHISPER_LOG_INFO("%s: compute buffer (decode) = %7.2f MB\n", __func__, whisper_allocr_size(state->alloc_decode) / 1e6);
-    }
-
-    whisper_allocr_graph_realloc(state->alloc_conv,   ctx->backend);
-    whisper_allocr_graph_realloc(state->alloc_encode, ctx->backend);
-    whisper_allocr_graph_realloc(state->alloc_cross,  ctx->backend);
-    whisper_allocr_graph_realloc(state->alloc_decode, ctx->backend);
-
-    return state;
-}
-
-int whisper_ctx_init_openvino_encoder(
-        struct whisper_context * ctx,
-                    const char * model_path,
-                    const char * device,
-                    const char * cache_dir) {
-#ifndef WHISPER_USE_OPENVINO
-    (void)(ctx);
-    (void)(model_path);
-    (void)(device);
-    (void)(cache_dir);
-
-    return 1;
-#else
-    if (!model_path && ctx->path_model.empty()) {
-        WHISPER_LOG_ERROR("%s: model_path is nullptr, and ctx has no model_path set.\n", __func__);
-        return 1;
-    }
-
-    std::string path_encoder;
-    if (!model_path) {
-        //if model_path is not set, attempt to find it in the same directory as ggml-<model>.bin model
-        path_encoder = whisper_openvino_get_path_encoder(ctx->path_model);
-    } else {
-        path_encoder = model_path;
-    }
-
-    std::string path_cache;
-    if (!cache_dir) {
-        //if cache_dir is not set, set it as a dir residing next to ggml-<model>.bin
-        path_cache = whisper_openvino_get_path_cache(ctx->path_model);
-    } else {
-        path_cache = cache_dir;
-    }
-
-    WHISPER_LOG_INFO("%s: loading OpenVINO model from '%s'\n", __func__, path_encoder.c_str());
-    WHISPER_LOG_INFO("%s: first run on a device may take a while ...\n", __func__);
-
-    ctx->state->ctx_openvino = whisper_openvino_init(path_encoder.c_str(), device, path_cache.c_str());
-    if (!ctx->state->ctx_openvino) {
-        WHISPER_LOG_ERROR("%s: failed to init OpenVINO encoder from '%s'\n", __func__, path_encoder.c_str());
-        return 1;
-    } else {
-        WHISPER_LOG_INFO("%s: OpenVINO model loaded\n", __func__);
-    }
-
-    return 0;
-#endif
-}
-
-struct whisper_context_params whisper_context_default_params() {
-    struct whisper_context_params result = {
-        /*.use_gpu    =*/ true,
-    };
-    return result;
-}
-
-struct whisper_context * whisper_init_from_file_with_params_no_state(const char * path_model, struct whisper_context_params params) {
-    WHISPER_LOG_INFO("%s: loading model from '%s'\n", __func__, path_model);
-
-    auto fin = std::ifstream(path_model, std::ios::binary);
-    if (!fin) {
-        WHISPER_LOG_ERROR("%s: failed to open '%s'\n", __func__, path_model);
-        return nullptr;
-    }
-
-    whisper_model_loader loader = {};
-
-    loader.context = &fin;
-
-    loader.read = [](void * ctx, void * output, size_t read_size) {
-        std::ifstream * fin = (std::ifstream*)ctx;
-        fin->read((char *)output, read_size);
-        return read_size;
-    };
-
-    loader.eof = [](void * ctx) {
-        std::ifstream * fin = (std::ifstream*)ctx;
-        return fin->eof();
-    };
-
-    loader.close = [](void * ctx) {
-        std::ifstream * fin = (std::ifstream*)ctx;
-        fin->close();
-    };
-
-    auto ctx = whisper_init_with_params_no_state(&loader, params);
-
-    if (ctx) {
-        ctx->path_model = path_model;
-    }
-
-    return ctx;
-}
-
-struct whisper_context * whisper_init_from_buffer_with_params_no_state(void * buffer, size_t buffer_size, struct whisper_context_params params) {
-    struct buf_context {
-        uint8_t* buffer;
-        size_t size;
-        size_t current_offset;
-    };
-
-    buf_context ctx = { reinterpret_cast<uint8_t*>(buffer), buffer_size, 0 };
-
-    WHISPER_LOG_INFO("%s: loading model from buffer\n", __func__);
-
-    whisper_model_loader loader = {};
-
-    loader.context = &ctx;
-
-    loader.read = [](void * ctx, void * output, size_t read_size) {
-        buf_context * buf = reinterpret_cast<buf_context *>(ctx);
-
-        size_t size_to_copy = buf->current_offset + read_size < buf->size ? read_size : buf->size - buf->current_offset;
-
-        memcpy(output, buf->buffer + buf->current_offset, size_to_copy);
-        buf->current_offset += size_to_copy;
-
-        return size_to_copy;
-    };
-
-    loader.eof = [](void * ctx) {
-        buf_context * buf = reinterpret_cast<buf_context *>(ctx);
-
-        return buf->current_offset >= buf->size;
-    };
-
-    loader.close = [](void * /*ctx*/) { };
-
-    return whisper_init_with_params_no_state(&loader, params);
-}
-
-struct whisper_context * whisper_init_with_params_no_state(struct whisper_model_loader * loader, struct whisper_context_params params) {
-    ggml_time_init();
-
-    whisper_context * ctx = new whisper_context;
-    ctx->params = params;
-
-    if (!whisper_model_load(loader, *ctx)) {
-        loader->close(loader->context);
-        WHISPER_LOG_ERROR("%s: failed to load model\n", __func__);
-        delete ctx;
-        return nullptr;
-    }
-
-    loader->close(loader->context);
-
-    return ctx;
-}
-
-struct whisper_context * whisper_init_from_file_with_params(const char * path_model, struct whisper_context_params params) {
-    whisper_context * ctx = whisper_init_from_file_with_params_no_state(path_model, params);
-    if (!ctx) {
-        return nullptr;
-    }
-
-    ctx->state = whisper_init_state(ctx);
-    if (!ctx->state) {
-        whisper_free(ctx);
-        return nullptr;
-    }
-
-    return ctx;
-}
-
-struct whisper_context * whisper_init_from_buffer_with_params(void * buffer, size_t buffer_size, struct whisper_context_params params) {
-    whisper_context * ctx = whisper_init_from_buffer_with_params_no_state(buffer, buffer_size, params);
-    if (!ctx) {
-        return nullptr;
-    }
-
-    ctx->state = whisper_init_state(ctx);
-    if (!ctx->state) {
-        whisper_free(ctx);
-        return nullptr;
-    }
-
-    return ctx;
-}
-
-struct whisper_context * whisper_init_with_params(struct whisper_model_loader * loader, struct whisper_context_params params) {
-    whisper_context * ctx = whisper_init_with_params_no_state(loader, params);
-    if (!ctx) {
-        return nullptr;
-    }
-
-    ctx->state = whisper_init_state(ctx);
-    if (!ctx->state) {
-        whisper_free(ctx);
-        return nullptr;
-    }
-
-    return ctx;
-}
-
-struct whisper_context * whisper_init_from_file(const char * path_model) {
-    return whisper_init_from_file_with_params(path_model, whisper_context_default_params());
-}
-
-struct whisper_context * whisper_init_from_buffer(void * buffer, size_t buffer_size) {
-    return whisper_init_from_buffer_with_params(buffer, buffer_size, whisper_context_default_params());
-}
-
-struct whisper_context * whisper_init(struct whisper_model_loader * loader) {
-    return whisper_init_with_params(loader, whisper_context_default_params());
-}
-
-struct whisper_context * whisper_init_from_file_no_state(const char * path_model) {
-    return whisper_init_from_file_with_params_no_state(path_model, whisper_context_default_params());
-}
-
-struct whisper_context * whisper_init_from_buffer_no_state(void * buffer, size_t buffer_size) {
-    return whisper_init_from_buffer_with_params_no_state(buffer, buffer_size, whisper_context_default_params());
-}
-
-struct whisper_context * whisper_init_no_state(struct whisper_model_loader * loader) {
-    return whisper_init_with_params_no_state(loader, whisper_context_default_params());
-}
-
-void whisper_free_state(struct whisper_state * state)
-{
-    if (state) {
-        kv_cache_free(state->kv_self);
-        kv_cache_free(state->kv_cross);
-
-#ifdef WHISPER_USE_COREML
-        if (state->ctx_coreml != nullptr) {
-            whisper_coreml_free(state->ctx_coreml);
-            state->ctx_coreml = nullptr;
-        }
-#endif
-
-#ifdef WHISPER_USE_OPENVINO
-        if (state->ctx_openvino != nullptr) {
-            whisper_openvino_free(state->ctx_openvino);
-            state->ctx_openvino = nullptr;
-        }
-#endif
-
-        whisper_batch_free(state->batch);
-
-        whisper_allocr_free(state->alloc_conv);
-        whisper_allocr_free(state->alloc_encode);
-        whisper_allocr_free(state->alloc_cross);
-        whisper_allocr_free(state->alloc_decode);
-
-        ggml_backend_free(state->backend);
-
-        delete state;
-    }
-}
-
-void whisper_free(struct whisper_context * ctx) {
-    if (ctx) {
-        if (ctx->model.ctx) {
-            ggml_free(ctx->model.ctx);
-        }
-
-        for (auto & buffer : ctx->model.buffers) {
-            if (buffer) {
-                ggml_backend_buffer_free(buffer);
-            }
-        }
-
-        whisper_free_state(ctx->state);
-
-        ggml_backend_free(ctx->backend);
-
-        delete ctx;
-    }
-}
-
-void whisper_free_context_params(struct whisper_context_params * params) {
-    if (params) {
-        delete params;
-    }
-}
-
-void whisper_free_params(struct whisper_full_params * params) {
-    if (params) {
-        delete params;
-    }
-}
-
-int whisper_pcm_to_mel_with_state(struct whisper_context * ctx, struct whisper_state * state, const float * samples, int n_samples, int n_threads) {
-    if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, WHISPER_N_FFT, WHISPER_HOP_LENGTH, ctx->model.filters.n_mel, n_threads, ctx->model.filters, false, state->mel)) {
-        WHISPER_LOG_ERROR("%s: failed to compute mel spectrogram\n", __func__);
-        return -1;
-    }
-
-    return 0;
-}
-
-int whisper_pcm_to_mel(struct whisper_context * ctx, const float * samples, int n_samples, int n_threads) {
-    return whisper_pcm_to_mel_with_state(ctx, ctx->state, samples, n_samples, n_threads);
-}
-
-// same as whisper_pcm_to_mel, but applies a Phase Vocoder to speed up the audio x2 (PV without phase lock is not good)
-int whisper_pcm_to_mel_phase_vocoder_with_state(struct whisper_context * ctx, struct whisper_state * state, const float * samples, int n_samples, int n_threads) {
-    if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, 2 * WHISPER_N_FFT, 2 * WHISPER_HOP_LENGTH, ctx->model.filters.n_mel, n_threads, ctx->model.filters, false, state->mel)) {
-        WHISPER_LOG_ERROR("%s: failed to compute mel spectrogram\n", __func__);
-        return -1;
-    }
-
-    return 0;
-}
-
-// same as whisper_pcm_to_mel, but applies a Phase Vocoder to speed up the audio x2 (PV without phase lock is not good)
-int whisper_pcm_to_mel_phase_vocoder(struct whisper_context * ctx, const float * samples, int n_samples, int n_threads) {
-    return whisper_pcm_to_mel_phase_vocoder_with_state(ctx, ctx->state, samples, n_samples, n_threads);
-}
-
-// same as whisper_pcm_to_mel, but applies WSOLA to speed up the audio x2
-// TODO
-
-// same as whisper_pcm_to_mel, but applies HPTSM to speed up the audio x2
-// TODO
-
-// same as whisper_pcm_to_mel, but applies PV (with phase lock) to speed up the audio x2
-// TODO
-
-int whisper_set_mel_with_state(
-        struct whisper_context * ctx,
-          struct whisper_state * state,
-                   const float * data,
-                           int   n_len,
-                           int   n_mel) {
-    if (n_mel != ctx->model.filters.n_mel) {
-        WHISPER_LOG_ERROR("%s: invalid number of mel bands: %d (expected %d)\n", __func__, n_mel, ctx->model.filters.n_mel);
-        return -1;
-    }
-
-    state->mel.n_len     = n_len;
-    state->mel.n_len_org = n_len;
-    state->mel.n_mel     = n_mel;
-
-    state->mel.data.resize(n_len*n_mel);
-    memcpy(state->mel.data.data(), data, n_len*n_mel*sizeof(float));
-
-    return 0;
-}
-
-int whisper_set_mel(
-        struct whisper_context * ctx,
-        const float * data,
-        int n_len,
-        int n_mel) {
-    return whisper_set_mel_with_state(ctx, ctx->state, data, n_len, n_mel);
-}
-
-int whisper_encode_with_state(struct whisper_context * ctx, struct whisper_state * state, int offset, int n_threads) {
-    if (!whisper_encode_internal(*ctx, *state, offset, n_threads, nullptr, nullptr)) {
-        WHISPER_LOG_ERROR("%s: failed to eval\n", __func__);
-        return -1;
-    }
-
-    return 0;
-}
-
-int whisper_encode(struct whisper_context * ctx, int offset, int n_threads) {
-    if (!whisper_encode_internal(*ctx, *ctx->state, offset, n_threads, nullptr, nullptr)) {
-        WHISPER_LOG_ERROR("%s: failed to eval\n", __func__);
-        return -1;
-    }
-
-    return 0;
-}
-
-int whisper_decode_with_state(struct whisper_context * ctx, struct whisper_state * state, const whisper_token * tokens, int n_tokens, int n_past, int n_threads) {
-    whisper_batch_prep_legacy(state->batch, tokens, n_tokens, n_past, 0);
-
-    whisper_kv_cache_seq_rm(state->kv_self, 0, n_past, -1);
-
-    if (!whisper_decode_internal(*ctx, *state, state->batch, n_threads, nullptr, nullptr)) {
-        WHISPER_LOG_ERROR("%s: failed to eval\n", __func__);
-        return 1;
-    }
-
-    return 0;
-}
-
-int whisper_decode(struct whisper_context * ctx, const whisper_token * tokens, int n_tokens, int n_past, int n_threads) {
-    if (ctx->state == nullptr) {
-        WHISPER_LOG_ERROR("%s: ERROR state was not loaded.\n", __func__);
-        return -1;
-    }
-
-    return whisper_decode_with_state(ctx, ctx->state, tokens, n_tokens, n_past, n_threads);
-}
-
-int whisper_tokenize(struct whisper_context * ctx, const char * text, whisper_token * tokens, int n_max_tokens) {
-    const auto res = tokenize(ctx->vocab, text);
-
-    if (n_max_tokens < (int) res.size()) {
-        WHISPER_LOG_ERROR("%s: too many resulting tokens: %d (max %d)\n", __func__, (int) res.size(), n_max_tokens);
-        return -1;
-    }
-
-    for (int i = 0; i < (int) res.size(); i++) {
-        tokens[i] = res[i];
-    }
-
-    return res.size();
-}
-
-int whisper_lang_max_id() {
-    auto max_id = 0;
-    for (const auto & kv : g_lang) {
-        max_id = std::max(max_id, kv.second.first);
-    }
-
-    return max_id;
-}
-
-int whisper_lang_id(const char * lang) {
-    if (!g_lang.count(lang)) {
-        for (const auto & kv : g_lang) {
-            if (kv.second.second == lang) {
-                return kv.second.first;
-            }
-        }
-
-        WHISPER_LOG_ERROR("%s: unknown language '%s'\n", __func__, lang);
-        return -1;
-    }
-    return g_lang.at(lang).first;
-}
-
-const char * whisper_lang_str(int id) {
-    for (const auto & kv : g_lang) {
-        if (kv.second.first == id) {
-            return kv.first.c_str();
-        }
-    }
-
-    WHISPER_LOG_ERROR("%s: unknown language id %d\n", __func__, id);
-    return nullptr;
-}
-
-const char * whisper_lang_str_full(int id) {
-   for (const auto & kv : g_lang) {
-        if (kv.second.first == id) {
-            return kv.second.second.c_str();
-        }
-    }
-
-    WHISPER_LOG_ERROR("%s: unknown language id %d\n", __func__, id);
-    return nullptr;
-}
-
-int whisper_lang_auto_detect_with_state(
-        struct whisper_context * ctx,
-          struct whisper_state * state,
-                           int   offset_ms,
-                           int   n_threads,
-                         float * lang_probs) {
-    const int seek = offset_ms/10;
-
-    if (seek < 0) {
-        WHISPER_LOG_ERROR("%s: offset %dms is before the start of the audio\n", __func__, offset_ms);
-        return -1;
-    }
-
-    if (seek >= state->mel.n_len_org) {
-        WHISPER_LOG_ERROR("%s: offset %dms is past the end of the audio (%dms)\n", __func__, offset_ms, state->mel.n_len_org*10);
-        return -2;
-    }
-
-    // run the encoder
-    if (whisper_encode_with_state(ctx, state, seek, n_threads) != 0) {
-        WHISPER_LOG_ERROR("%s: failed to encode\n", __func__);
-        return -6;
-    }
-
-    const std::vector<whisper_token> prompt = { whisper_token_sot(ctx) };
-
-    if (whisper_decode_with_state(ctx, state, prompt.data(), prompt.size(), 0, n_threads) != 0) {
-        WHISPER_LOG_ERROR("%s: failed to decode\n", __func__);
-        return -7;
-    }
-
-    auto & logits_id = state->decoders[0].logits_id;
-    logits_id.clear();
-
-    for (const auto & kv : g_lang) {
-        const auto token_lang = whisper_token_lang(ctx, kv.second.first);
-        logits_id.emplace_back(state->logits[token_lang], kv.second.first);
-    }
-
-    // sort descending
-    {
-        using pair_type = std::remove_reference<decltype(logits_id)>::type::value_type;
-        std::sort(logits_id.begin(), logits_id.end(), [](const pair_type & a, const pair_type & b) {
-            return a.first > b.first;
-        });
-    }
-
-    // softmax
-    {
-        const auto max = logits_id[0].first;
-
-        double sum = 0.0f;
-        for (auto & kv : logits_id) {
-            kv.first = exp(kv.first - max);
-            sum += kv.first;
-        }
-
-        for (auto & kv : logits_id) {
-            kv.first /= sum;
-        }
-    }
-
-    {
-        for (const auto & prob : logits_id) {
-            if (lang_probs) {
-                lang_probs[prob.second] = prob.first;
-            }
-
-            //printf("%s: lang %2d (%3s): %f\n", __func__, prob.second, whisper_lang_str(prob.second), prob.first);
-        }
-    }
-
-    return logits_id[0].second;
-}
-
-int whisper_lang_auto_detect(
-        struct whisper_context * ctx,
-                           int   offset_ms,
-                           int   n_threads,
-                         float * lang_probs) {
-    return whisper_lang_auto_detect_with_state(ctx, ctx->state, offset_ms, n_threads, lang_probs);
-}
-
-int whisper_model_n_vocab(struct whisper_context * ctx) {
-    return ctx->model.hparams.n_vocab;
-}
-
-int whisper_model_n_audio_ctx(struct whisper_context * ctx) {
-    return ctx->model.hparams.n_audio_ctx;
-}
-
-int whisper_model_n_audio_state(struct whisper_context * ctx) {
-    return ctx->model.hparams.n_audio_state;
-}
-
-int whisper_model_n_audio_head(struct whisper_context * ctx) {
-    return ctx->model.hparams.n_audio_head;
-}
-
-int whisper_model_n_audio_layer(struct whisper_context * ctx) {
-    return ctx->model.hparams.n_audio_layer;
-}
-
-int whisper_model_n_text_ctx(struct whisper_context * ctx) {
-    return ctx->model.hparams.n_text_ctx;
-}
-
-int whisper_model_n_text_state(struct whisper_context * ctx) {
-    return ctx->model.hparams.n_text_state;
-}
-
-int whisper_model_n_text_head(struct whisper_context * ctx) {
-    return ctx->model.hparams.n_text_head;
-}
-
-int whisper_model_n_text_layer(struct whisper_context * ctx) {
-    return ctx->model.hparams.n_text_layer;
-}
-
-int whisper_model_n_mels(struct whisper_context * ctx) {
-    return ctx->model.hparams.n_mels;
-}
-
-int whisper_model_ftype(struct whisper_context * ctx) {
-    return ctx->model.hparams.ftype;
-}
-
-int whisper_model_type(struct whisper_context * ctx) {
-    return ctx->model.type;
-}
-
-const char *whisper_model_type_readable(struct whisper_context * ctx) {
-    switch (ctx->model.type) {
-    case e_model::MODEL_TINY:
-        return "tiny";
-    case e_model::MODEL_BASE:
-        return "base";
-    case e_model::MODEL_SMALL:
-        return "small";
-    case e_model::MODEL_MEDIUM:
-        return "medium";
-    case e_model::MODEL_LARGE:
-        return "large";
-    default:
-        return "unknown";
-    }
-}
-
-int whisper_n_len_from_state(struct whisper_state * state) {
-    return state->mel.n_len_org;
-}
-
-int whisper_n_len(struct whisper_context * ctx) {
-    return ctx->state->mel.n_len_org;
-}
-
-int whisper_n_vocab(struct whisper_context * ctx) {
-    return ctx->vocab.n_vocab;
-}
-
-int whisper_n_text_ctx(struct whisper_context * ctx) {
-    return ctx->model.hparams.n_text_ctx;
-}
-
-int whisper_n_audio_ctx(struct whisper_context * ctx) {
-    return ctx->model.hparams.n_audio_ctx;
-}
-
-int whisper_is_multilingual(struct whisper_context * ctx) {
-    return ctx->vocab.is_multilingual() ? 1 : 0;
-}
-
-float * whisper_get_logits(struct whisper_context * ctx) {
-    return ctx->state->logits.data();
-}
-
-float * whisper_get_logits_from_state(struct whisper_state * state) {
-    return state->logits.data();
-}
-
-const char * whisper_token_to_str(struct whisper_context * ctx, whisper_token token) {
-    return ctx->vocab.id_to_token.at(token).c_str();
-}
-
-whisper_token whisper_token_eot(struct whisper_context * ctx) {
-    return ctx->vocab.token_eot;
-}
-
-whisper_token whisper_token_sot(struct whisper_context * ctx) {
-    return ctx->vocab.token_sot;
-}
-
-whisper_token whisper_token_solm(struct whisper_context * ctx) {
-    return ctx->vocab.token_solm;
-}
-
-whisper_token whisper_token_prev(struct whisper_context * ctx) {
-    return ctx->vocab.token_prev;
-}
-
-whisper_token whisper_token_nosp(struct whisper_context * ctx) {
-    return ctx->vocab.token_nosp;
-}
-
-whisper_token whisper_token_not(struct whisper_context * ctx) {
-    return ctx->vocab.token_not;
-}
-
-whisper_token whisper_token_beg(struct whisper_context * ctx) {
-    return ctx->vocab.token_beg;
-}
-
-whisper_token whisper_token_lang(struct whisper_context * ctx, int lang_id) {
-    return whisper_token_sot(ctx) + 1 + lang_id;
-}
-
-whisper_token whisper_token_translate(struct whisper_context * ctx) {
-    return ctx->vocab.token_translate;
-}
-
-whisper_token whisper_token_transcribe(struct whisper_context * ctx) {
-    return ctx->vocab.token_transcribe;
-}
-
-void whisper_print_timings(struct whisper_context * ctx) {
-    const int64_t t_end_us = ggml_time_us();
-
-    WHISPER_LOG_INFO("\n");
-    WHISPER_LOG_INFO("%s:     load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0f);
-    if (ctx->state != nullptr) {
-
-        const int32_t n_sample = std::max(1, ctx->state->n_sample);
-        const int32_t n_encode = std::max(1, ctx->state->n_encode);
-        const int32_t n_decode = std::max(1, ctx->state->n_decode);
-        const int32_t n_batchd = std::max(1, ctx->state->n_batchd);
-        const int32_t n_prompt = std::max(1, ctx->state->n_prompt);
-
-        WHISPER_LOG_INFO("%s:     fallbacks = %3d p / %3d h\n", __func__, ctx->state->n_fail_p, ctx->state->n_fail_h);
-        WHISPER_LOG_INFO("%s:      mel time = %8.2f ms\n", __func__, ctx->state->t_mel_us / 1000.0f);
-        WHISPER_LOG_INFO("%s:   sample time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->state->t_sample_us, n_sample, 1e-3f * ctx->state->t_sample_us / n_sample);
-        WHISPER_LOG_INFO("%s:   encode time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->state->t_encode_us, n_encode, 1e-3f * ctx->state->t_encode_us / n_encode);
-        WHISPER_LOG_INFO("%s:   decode time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->state->t_decode_us, n_decode, 1e-3f * ctx->state->t_decode_us / n_decode);
-        WHISPER_LOG_INFO("%s:   batchd time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->state->t_batchd_us, n_batchd, 1e-3f * ctx->state->t_batchd_us / n_batchd);
-        WHISPER_LOG_INFO("%s:   prompt time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->state->t_prompt_us, n_prompt, 1e-3f * ctx->state->t_prompt_us / n_prompt);
-    }
-    WHISPER_LOG_INFO("%s:    total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0f);
-}
-
-void whisper_reset_timings(struct whisper_context * ctx) {
-    ctx->t_start_us = ggml_time_us();
-    if (ctx->state != nullptr) {
-        ctx->state->t_mel_us = 0;
-        ctx->state->t_sample_us = 0;
-        ctx->state->t_encode_us = 0;
-        ctx->state->t_decode_us = 0;
-        ctx->state->t_batchd_us = 0;
-        ctx->state->t_prompt_us = 0;
-        ctx->state->n_sample = 0;
-        ctx->state->n_encode = 0;
-        ctx->state->n_decode = 0;
-        ctx->state->n_batchd = 0;
-        ctx->state->n_prompt = 0;
-    }
-}
-
-static int whisper_has_coreml(void) {
-#ifdef WHISPER_USE_COREML
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-static int whisper_has_openvino(void) {
-#ifdef WHISPER_USE_OPENVINO
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-const char * whisper_print_system_info(void) {
-    static std::string s;
-
-    s  = "";
-    s += "AVX = "       + std::to_string(ggml_cpu_has_avx())       + " | ";
-    s += "AVX2 = "      + std::to_string(ggml_cpu_has_avx2())      + " | ";
-    s += "AVX512 = "    + std::to_string(ggml_cpu_has_avx512())    + " | ";
-    s += "FMA = "       + std::to_string(ggml_cpu_has_fma())       + " | ";
-    s += "NEON = "      + std::to_string(ggml_cpu_has_neon())      + " | ";
-    s += "ARM_FMA = "   + std::to_string(ggml_cpu_has_arm_fma())   + " | ";
-    s += "METAL = "     + std::to_string(ggml_cpu_has_metal())     + " | ";
-    s += "F16C = "      + std::to_string(ggml_cpu_has_f16c())      + " | ";
-    s += "FP16_VA = "   + std::to_string(ggml_cpu_has_fp16_va())   + " | ";
-    s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
-    s += "BLAS = "      + std::to_string(ggml_cpu_has_blas())      + " | ";
-    s += "SSE3 = "      + std::to_string(ggml_cpu_has_sse3())      + " | ";
-    s += "SSSE3 = "     + std::to_string(ggml_cpu_has_ssse3())     + " | ";
-    s += "VSX = "       + std::to_string(ggml_cpu_has_vsx())       + " | ";
-    s += "CUDA = "      + std::to_string(ggml_cpu_has_cublas())    + " | ";
-    s += "COREML = "    + std::to_string(whisper_has_coreml())     + " | ";
-    s += "OPENVINO = "  + std::to_string(whisper_has_openvino())   + " | ";
-
-    return s.c_str();
-}
-
-//////////////////////////////////
-// Grammar - ported from llama.cpp
-//////////////////////////////////
-
-// Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as
-// pointer. If an invalid sequence is encountered, returns `whisper_partial_utf8.n_remain == -1`.
-std::pair<std::vector<uint32_t>, whisper_partial_utf8> decode_utf8(
-        const char         * src,
-        whisper_partial_utf8   partial_start) {
-    static const int      lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
-    const char          * pos      = src;
-    std::vector<uint32_t> code_points;
-    uint32_t              value    = partial_start.value;
-    int                   n_remain = partial_start.n_remain;
-
-    // continue previous decode, if applicable
-    while (*pos != 0 && n_remain > 0) {
-        uint8_t next_byte = static_cast<uint8_t>(*pos);
-        if ((next_byte >> 6) != 2) {
-            // invalid sequence, abort
-            code_points.push_back(0);
-            return std::make_pair(std::move(code_points), whisper_partial_utf8{ 0, -1 });
-        }
-        value = (value << 6) + (next_byte & 0x3F);
-        ++pos;
-        --n_remain;
-    }
-
-    if (partial_start.n_remain > 0 && n_remain == 0) {
-        code_points.push_back(value);
-    }
-
-    // decode any subsequent utf-8 sequences, which may end in an incomplete one
-    while (*pos != 0) {
-        uint8_t  first_byte = static_cast<uint8_t>(*pos);
-        uint8_t  highbits   = first_byte >> 4;
-                 n_remain   = lookup[highbits] - 1;
-
-        if (n_remain < 0) {
-            // invalid sequence, abort
-            code_points.clear();
-            code_points.push_back(0);
-            return std::make_pair(std::move(code_points), whisper_partial_utf8{ 0, n_remain });
-        }
-
-        uint8_t  mask       = (1 << (7 - n_remain)) - 1;
-                 value      = first_byte & mask;
-        ++pos;
-        while (*pos != 0 && n_remain > 0) {
-            value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F);
-            ++pos;
-            --n_remain;
-        }
-        if (n_remain == 0) {
-            code_points.push_back(value);
-        }
-    }
-    code_points.push_back(0);
-
-    return std::make_pair(std::move(code_points), whisper_partial_utf8{ value, n_remain });
-}
-
-// returns true iff pos points to the end of one of the definitions of a rule
-static bool whisper_grammar_is_end_of_sequence(const whisper_grammar_element * pos) {
-    switch (pos->type) {
-        case WHISPER_GRETYPE_END: return true;  // NOLINT
-        case WHISPER_GRETYPE_ALT: return true;  // NOLINT
-        default:                return false;
-    }
-}
-
-// returns true iff chr satisfies the char range at pos (regular or inverse range)
-// asserts that pos is pointing to a char range element
-static std::pair<bool, const whisper_grammar_element *> whisper_grammar_match_char(
-        const whisper_grammar_element * pos,
-        const uint32_t                chr) {
-
-    bool found            = false;
-    bool is_positive_char = pos->type == WHISPER_GRETYPE_CHAR;
-
-    WHISPER_ASSERT(is_positive_char || pos->type == WHISPER_GRETYPE_CHAR_NOT); // NOLINT
-
-    do {
-        if (pos[1].type == WHISPER_GRETYPE_CHAR_RNG_UPPER) {
-            // inclusive range, e.g. [a-z]
-            found = found || (pos->value <= chr && chr <= pos[1].value);
-            pos += 2;
-        } else {
-            // exact char match, e.g. [a] or "a"
-            found = found || pos->value == chr;
-            pos += 1;
-        }
-    } while (pos->type == WHISPER_GRETYPE_CHAR_ALT);
-
-    return std::make_pair(found == is_positive_char, pos);
-}
-
-// returns true iff some continuation of the given partial UTF-8 sequence could satisfy the char
-// range at pos (regular or inverse range)
-// asserts that pos is pointing to a char range element
-static bool whisper_grammar_match_partial_char(
-        const whisper_grammar_element * pos,
-        const whisper_partial_utf8      partial_utf8) {
-
-    bool is_positive_char = pos->type == WHISPER_GRETYPE_CHAR;
-    WHISPER_ASSERT(is_positive_char || pos->type == WHISPER_GRETYPE_CHAR_NOT);
-
-    uint32_t partial_value = partial_utf8.value;
-    int      n_remain      = partial_utf8.n_remain;
-
-    // invalid sequence or 7-bit char split across 2 bytes (overlong)
-    if (n_remain < 0 || (n_remain == 1 && partial_value < 2)) {
-        return false;
-    }
-
-    // range of possible code points this partial UTF-8 sequence could complete to
-    uint32_t low  = partial_value << (n_remain * 6);
-    uint32_t high = low | ((1 << (n_remain * 6)) - 1);
-
-    if (low == 0) {
-        if (n_remain == 2) {
-            low = 1 << 11;
-        } else if (n_remain == 3) {
-            low = 1 << 16;
-        }
-    }
-
-    do {
-        if (pos[1].type == WHISPER_GRETYPE_CHAR_RNG_UPPER) {
-            // inclusive range, e.g. [a-z]
-            if (pos->value <= high && low <= pos[1].value) {
-                return is_positive_char;
-            }
-            pos += 2;
-        } else {
-            // exact char match, e.g. [a] or "a"
-            if (low <= pos->value && pos->value <= high) {
-                return is_positive_char;
-            }
-            pos += 1;
-        }
-    } while (pos->type == WHISPER_GRETYPE_CHAR_ALT);
-
-    return !is_positive_char;
-}
-
-
-// transforms a grammar pushdown stack into N possible stacks, all ending
-// at a character range (terminal element)
-static void whisper_grammar_advance_stack(
-        const std::vector<std::vector<whisper_grammar_element>>   & rules,
-        const std::vector<const whisper_grammar_element *>        & stack,
-        std::vector<std::vector<const whisper_grammar_element *>> & new_stacks) {
-
-    if (stack.empty()) {
-        new_stacks.push_back(stack);
-        return;
-    }
-
-    const whisper_grammar_element * pos = stack.back();
-
-    switch (pos->type) {
-        case WHISPER_GRETYPE_RULE_REF: {
-            const size_t                  rule_id = static_cast<size_t>(pos->value);
-            const whisper_grammar_element * subpos  = rules[rule_id].data();
-            do {
-                // init new stack without the top (pos)
-                std::vector<const whisper_grammar_element *> new_stack(stack.begin(), stack.end() - 1);
-                if (!whisper_grammar_is_end_of_sequence(pos + 1)) {
-                    // if this rule ref is followed by another element, add that to stack
-                    new_stack.push_back(pos + 1);
-                }
-                if (!whisper_grammar_is_end_of_sequence(subpos)) {
-                    // if alternate is nonempty, add to stack
-                    new_stack.push_back(subpos);
-                }
-                whisper_grammar_advance_stack(rules, new_stack, new_stacks);
-                while (!whisper_grammar_is_end_of_sequence(subpos)) {
-                    // scan to end of alternate def
-                    subpos++;
-                }
-                if (subpos->type == WHISPER_GRETYPE_ALT) {
-                    // there's another alternate def of this rule to process
-                    subpos++;
-                } else {
-                    break;
-                }
-            } while (true);
-            break;
-        }
-        case WHISPER_GRETYPE_CHAR:
-        case WHISPER_GRETYPE_CHAR_NOT:
-            new_stacks.push_back(stack);
-            break;
-        default:
-            // end of alternate (WHISPER_GRETYPE_END, WHISPER_GRETYPE_ALT) or middle of char range
-            // (WHISPER_GRETYPE_CHAR_ALT, WHISPER_GRETYPE_CHAR_RNG_UPPER); stack should never be left on
-            // those
-            WHISPER_ASSERT(false);
-    }
-}
-
-// takes a set of possible pushdown stacks on a grammar, which are required to
-// be positioned at a character range (see `whisper_grammar_advance_stack`), and
-// produces the N possible stacks if the given char is accepted at those
-// positions
-static std::vector<std::vector<const whisper_grammar_element *>> whisper_grammar_accept(
-        const std::vector<std::vector<whisper_grammar_element>>         & rules,
-        const std::vector<std::vector<const whisper_grammar_element *>> & stacks,
-        const uint32_t                                                  chr) {
-
-    std::vector<std::vector<const whisper_grammar_element *>> new_stacks;
-
-    for (const auto & stack : stacks) {
-        if (stack.empty()) {
-            continue;
-        }
-
-        auto match = whisper_grammar_match_char(stack.back(), chr);
-        if (match.first) {
-            const whisper_grammar_element * pos = match.second;
-
-            // update top of stack to next element, if any
-            std::vector<const whisper_grammar_element *> new_stack(stack.begin(), stack.end() - 1);
-            if (!whisper_grammar_is_end_of_sequence(pos)) {
-                new_stack.push_back(pos);
-            }
-            whisper_grammar_advance_stack(rules, new_stack, new_stacks);
-        }
-    }
-
-    return new_stacks;
-}
-
-static std::vector<whisper_grammar_candidate> whisper_grammar_reject_candidates(
-        const std::vector<std::vector<whisper_grammar_element>>         & rules,
-        const std::vector<std::vector<const whisper_grammar_element *>> & stacks,
-        const std::vector<whisper_grammar_candidate>                    & candidates);
-
-static std::vector<whisper_grammar_candidate> whisper_grammar_reject_candidates_for_stack(
-        const std::vector<std::vector<whisper_grammar_element>> & rules,
-        const std::vector<const whisper_grammar_element *>      & stack,
-        const std::vector<whisper_grammar_candidate>            & candidates) {
-
-    std::vector<whisper_grammar_candidate> rejects;
-
-    if (stack.empty()) {
-        for (auto tok : candidates) {
-            if (*tok.code_points != 0 || tok.partial_utf8.n_remain != 0) {
-                rejects.push_back(tok);
-            }
-        }
-        return rejects;
-    }
-
-    const whisper_grammar_element * stack_pos = stack.back();
-
-    std::vector<whisper_grammar_candidate> next_candidates;
-    for (auto tok : candidates) {
-        if (*tok.code_points == 0) {
-            // reached end of full codepoints in token, reject iff it ended in a partial sequence
-            // that cannot satisfy this position in grammar
-            if (tok.partial_utf8.n_remain != 0 && !whisper_grammar_match_partial_char(stack_pos, tok.partial_utf8)) {
-                rejects.push_back(tok);
-            }
-        } else if (whisper_grammar_match_char(stack_pos, *tok.code_points).first) {
-            next_candidates.push_back({ tok.id, tok.code_points + 1, tok.partial_utf8 });
-        } else {
-            rejects.push_back(tok);
-        }
-    }
-
-    const auto * stack_pos_after = whisper_grammar_match_char(stack_pos, 0).second;
-
-    // update top of stack to next element, if any
-    std::vector<const whisper_grammar_element *> stack_after(stack.begin(), stack.end() - 1);
-    if (!whisper_grammar_is_end_of_sequence(stack_pos_after)) {
-        stack_after.push_back(stack_pos_after);
-    }
-    std::vector<std::vector<const whisper_grammar_element *>> next_stacks;
-    whisper_grammar_advance_stack(rules, stack_after, next_stacks);
-
-    auto next_rejects = whisper_grammar_reject_candidates(rules, next_stacks, next_candidates);
-    for (auto tok : next_rejects) {
-        rejects.push_back({ tok.id, tok.code_points - 1, tok.partial_utf8 });
-    }
-
-    return rejects;
-}
-
-static std::vector<whisper_grammar_candidate> whisper_grammar_reject_candidates(
-        const std::vector<std::vector<whisper_grammar_element>>         & rules,
-        const std::vector<std::vector<const whisper_grammar_element *>> & stacks,
-        const std::vector<whisper_grammar_candidate>                    & candidates) {
-    if (candidates.empty() || stacks.empty()) {
-        return std::vector<whisper_grammar_candidate>();
-    }
-
-    auto rejects = whisper_grammar_reject_candidates_for_stack(rules, stacks.front(), candidates);
-
-    for (size_t i = 1, size = stacks.size(); i < size; ++i) {
-        rejects = whisper_grammar_reject_candidates_for_stack(rules, stacks[i], rejects);
-    }
-    return rejects;
-}
-
-static struct whisper_grammar whisper_grammar_init(
-            const whisper_grammar_element ** rules,
-                                 size_t      n_rules,
-                                 size_t      i_start_rule) {
-    const whisper_grammar_element * pos;
-
-    // copy rule definitions into vectors
-    std::vector<std::vector<whisper_grammar_element>> vec_rules(n_rules);
-    for (size_t i = 0; i < n_rules; i++) {
-        for (pos = rules[i]; pos->type != WHISPER_GRETYPE_END; pos++) {
-            vec_rules[i].push_back(*pos);
-        }
-        vec_rules[i].push_back({WHISPER_GRETYPE_END, 0});
-    }
-
-    // loop over alternates of start rule to build initial stacks
-    std::vector<std::vector<const whisper_grammar_element *>> stacks;
-    pos = rules[i_start_rule];
-    do {
-        std::vector<const whisper_grammar_element *> stack;
-        if (!whisper_grammar_is_end_of_sequence(pos)) {
-            // if alternate is nonempty, add to stack
-            stack.push_back(pos);
-        }
-        whisper_grammar_advance_stack(vec_rules, stack, stacks);
-        while (!whisper_grammar_is_end_of_sequence(pos)) {
-            // scan to end of alternate def
-            pos++;
-        }
-        if (pos->type == WHISPER_GRETYPE_ALT) {
-            // there's another alternate def of this rule to process
-            pos++;
-        } else {
-            break;
-        }
-    } while (true);
-
-    return { std::move(vec_rules), std::move(stacks), {} };
-}
-
-static void whisper_suppress_invalid_grammar(
-             whisper_context  & ctx,
-    const whisper_full_params & params,
-           std::vector<float> & logits,
-    const     whisper_grammar & grammar) {
-
-    if (grammar.rules.empty() || grammar.stacks.empty()) {
-        return;
-    }
-
-    //bool allow_eot = false;
-    //for (const auto & stack : grammar.stacks) {
-    //    if (stack.empty()) {
-    //        allow_eot = true;
-    //        break;
-    //    }
-    //}
-
-    const whisper_token eot = whisper_token_eot(&ctx);
-
-    std::vector<std::pair<std::vector<uint32_t>, whisper_partial_utf8>> candidates_decoded;
-    std::vector<whisper_grammar_candidate>                              candidates_grammar;
-
-    for (whisper_token id = 0; id < eot; ++id) {
-        const std::string & text = ctx.vocab.id_to_token[id];
-        if (!text.empty()) {
-            candidates_decoded.push_back(decode_utf8(text.c_str(), grammar.partial_utf8));
-            candidates_grammar.push_back({ id, candidates_decoded.back().first.data(), candidates_decoded.back().second });
-        }
-    }
-
-    const auto rejects = whisper_grammar_reject_candidates(grammar.rules, grammar.stacks, candidates_grammar);
-
-    for (const auto & reject : rejects) {
-        logits[reject.id] -= params.grammar_penalty;
-    }
-
-    // when the grammar allows a continuation, we penalize the end-of-text token
-    //if (!allow_eot) {
-    //    logits[eot] -= params.grammar_penalty;
-    //}
-    //fprintf(stderr, "Allowed: (%zu tokens)\n", size - rejects.size());
-}
-
-static void whisper_grammar_accept_token(whisper_context & ctx, whisper_grammar & grammar, whisper_token token) {
-    if (grammar.rules.empty() || grammar.stacks.empty()) {
-        return;
-    }
-
-    //fprintf(stderr, "Accept: '%s'\n", ctx.vocab.id_to_token[token].c_str());
-
-    const std::string & text = ctx.vocab.id_to_token[token];
-
-    if (text.rfind("[_", 0) == 0) {
-        // fprintf(stderr, " (skipped)\n");
-        return;
-    }
-    // fprintf(stderr, "\n");
-
-    // Note terminating 0 in decoded string
-    const auto   decoded     = decode_utf8(text.c_str(), grammar.partial_utf8);
-    const auto & code_points = decoded.first;
-    for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
-        grammar.stacks = whisper_grammar_accept(grammar.rules, grammar.stacks, *it);
-    }
-    grammar.partial_utf8 = decoded.second;
-}
-
-//////////////
-// END grammar
-//////////////
-
-////////////////////////////////////////////////////////////////////////////
-
-struct whisper_context_params * whisper_context_default_params_by_ref() {
-    struct whisper_context_params params = whisper_context_default_params();
-
-    struct whisper_context_params* result = new whisper_context_params();
-    *result = params;
-    return result;
-}
-
-struct whisper_full_params * whisper_full_default_params_by_ref(enum whisper_sampling_strategy strategy) {
-    struct whisper_full_params params = whisper_full_default_params(strategy);
-
-    struct whisper_full_params* result = new whisper_full_params();
-    *result = params;
-    return result;
-}
-
-struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy) {
-    struct whisper_full_params result = {
-        /*.strategy          =*/ strategy,
-
-        /*.n_threads         =*/ std::min(4, (int32_t) std::thread::hardware_concurrency()),
-        /*.n_max_text_ctx    =*/ 16384,
-        /*.offset_ms         =*/ 0,
-        /*.duration_ms       =*/ 0,
-
-        /*.translate         =*/ false,
-        /*.no_context        =*/ true,
-        /*.no_timestamps     =*/ false,
-        /*.single_segment    =*/ false,
-        /*.print_special     =*/ false,
-        /*.print_progress    =*/ true,
-        /*.print_realtime    =*/ false,
-        /*.print_timestamps  =*/ true,
-
-        /*.token_timestamps  =*/ false,
-        /*.thold_pt          =*/ 0.01f,
-        /*.thold_ptsum       =*/ 0.01f,
-        /*.max_len           =*/ 0,
-        /*.split_on_word     =*/ false,
-        /*.max_tokens        =*/ 0,
-
-        /*.speed_up          =*/ false,
-        /*.debug_mode        =*/ false,
-        /*.audio_ctx         =*/ 0,
-
-        /*.tdrz_enable       =*/ false,
-
-        /*.initial_prompt    =*/ nullptr,
-        /*.prompt_tokens     =*/ nullptr,
-        /*.prompt_n_tokens   =*/ 0,
-
-        /*.language          =*/ "en",
-        /*.detect_language   =*/ false,
-
-        /*.suppress_blank    =*/ true,
-        /*.suppress_non_speech_tokens =*/ false,
-
-        /*.temperature       =*/  0.0f,
-        /*.max_initial_ts    =*/  1.0f,
-        /*.length_penalty    =*/ -1.0f,
-
-        /*.temperature_inc   =*/  0.2f,
-        /*.entropy_thold     =*/  2.4f,
-        /*.logprob_thold     =*/ -1.0f,
-        /*.no_speech_thold   =*/  0.6f,
-
-        /*.greedy            =*/ {
-            /*.best_of   =*/ -1,
-        },
-
-        /*.beam_search      =*/ {
-            /*.beam_size =*/ -1,
-
-            /*.patience  =*/ -1.0f,
-        },
-
-        /*.new_segment_callback           =*/ nullptr,
-        /*.new_segment_callback_user_data =*/ nullptr,
-
-        /*.progress_callback           =*/ nullptr,
-        /*.progress_callback_user_data =*/ nullptr,
-
-        /*.encoder_begin_callback           =*/ nullptr,
-        /*.encoder_begin_callback_user_data =*/ nullptr,
-
-        /*.abort_callback                   =*/ nullptr,
-        /*.abort_callback_user_data         =*/ nullptr,
-
-        /*.logits_filter_callback           =*/ nullptr,
-        /*.logits_filter_callback_user_data =*/ nullptr,
-
-        /*.grammar_rules   =*/ nullptr,
-        /*.n_grammar_rules =*/ 0,
-        /*.i_start_rule    =*/ 0,
-        /*.grammar_penalty =*/ 100.0f,
-    };
-
-    switch (strategy) {
-        case WHISPER_SAMPLING_GREEDY:
-            {
-                result.greedy = {
-                    /*.best_of   =*/ 5,
-                };
-            } break;
-        case WHISPER_SAMPLING_BEAM_SEARCH:
-            {
-                result.beam_search = {
-                    /*.beam_size =*/ 5,
-
-                    /*.patience  =*/ -1.0f,
-                };
-            } break;
-    }
-
-    return result;
-}
-
-// forward declarations
-static std::vector<float> get_signal_energy(const float * signal, int n_samples, int n_samples_per_half_window);
-static void whisper_exp_compute_token_level_timestamps(
-        struct whisper_context & ctx,
-          struct whisper_state & state,
-                           int   i_segment,
-                         float   thold_pt,
-                         float   thold_ptsum);
-
-static inline bool should_split_on_word(const char * txt, bool split_on_word) {
-    if (!split_on_word) return true;
-
-    return txt[0] == ' ';
-}
-
-// wrap the last segment to max_len characters
-// returns the number of new segments
-static int whisper_wrap_segment(struct whisper_context & ctx, struct whisper_state & state, int max_len, bool split_on_word) {
-    auto segment = state.result_all.back();
-
-    int res = 1;
-    int acc = 0;
-
-    std::string text;
-
-    for (int i = 0; i < (int) segment.tokens.size(); i++) {
-        const auto & token = segment.tokens[i];
-        if (token.id >= whisper_token_eot(&ctx)) {
-            continue;
-        }
-
-        const auto txt = whisper_token_to_str(&ctx, token.id);
-        const int cur = strlen(txt);
-
-        if (acc + cur > max_len && i > 0 && should_split_on_word(txt, split_on_word)) {
-            state.result_all.back().text = std::move(text);
-            state.result_all.back().t1 = token.t0;
-            state.result_all.back().tokens.resize(i);
-            state.result_all.back().speaker_turn_next = false;
-
-            state.result_all.push_back({});
-            state.result_all.back().t0 = token.t0;
-            state.result_all.back().t1 = segment.t1;
-
-            // add tokens [i, end] to the new segment
-            state.result_all.back().tokens.insert(
-                state.result_all.back().tokens.end(),
-                    segment.tokens.begin() + i,
-                    segment.tokens.end());
-
-            state.result_all.back().speaker_turn_next = segment.speaker_turn_next;
-
-            acc = 0;
-            text = "";
-
-            segment = state.result_all.back();
-            i = -1;
-
-            res++;
-        } else {
-            acc += cur;
-            text += txt;
-        }
-    }
-
-    state.result_all.back().text = std::move(text);
-
-    return res;
-}
-
-static const std::vector<std::string> non_speech_tokens = {
-    "\"", "#", "(", ")", "*", "+", "/", ":", ";", "<", "=", ">", "@", "[", "\\", "]", "^",
-    "_", "`", "{", "|", "}", "~", "「", "」", "『", "』", "<<", ">>", "<<<", ">>>", "--",
-    "---", "-(", "-[", "('", "(\"", "((", "))", "(((", ")))", "[[", "]]", "{{", "}}", "♪♪",
-    "♪♪♪","♩", "♪", "♫", "♬", "♭", "♮", "♯"
-};
-
-// process the logits for the selected decoder
-// - applies logit filters
-// - computes logprobs and probs
-// TODO: optimize
-static void whisper_process_logits(
-              struct whisper_context & ctx,
-               struct whisper_state  & state,
-              struct whisper_decoder & decoder,
-    const struct whisper_full_params   params,
-                               float   temperature) {
-    const auto & vocab      = ctx.vocab;
-    const auto & tokens_cur = decoder.sequence.tokens;
-
-    const bool is_initial = tokens_cur.size() == 0;
-    const int  n_logits   = vocab.id_to_token.size();
-
-    WHISPER_ASSERT(n_logits == ctx.vocab.n_vocab);
-
-    // extract the logits for the last token
-    // we will be mutating, and therefore we don't want to use the ctx.logits buffer directly
-    auto & probs    = decoder.probs;
-    auto & logits   = decoder.logits;
-    auto & logprobs = decoder.logprobs;
-    {
-        logits.resize(n_logits);
-        memcpy(logits.data(), state.logits.data() + decoder.i_batch*n_logits, n_logits*sizeof(float));
-
-        if (temperature > 0.0f) {
-            for (int i = 0; i < n_logits; i++) {
-                logits[i] /= temperature;
-            }
-        }
-
-        // will be populated a bit later
-        probs.resize(n_logits);
-        logprobs.resize(n_logits);
-    }
-
-    // apply logit filters here
-    // ref: https://github.com/openai/whisper/blob/0b1ba3d46ebf7fe6f953acfd8cad62a4f851b49f/whisper/decoding.py#L480-L493
-    {
-        // suppress blank
-        // https://github.com/openai/whisper/blob/0b1ba3d46ebf7fe6f953acfd8cad62a4f851b49f/whisper/decoding.py#L388-L390
-        if (params.suppress_blank) {
-            if (is_initial) {
-                logits[vocab.token_eot]           = -INFINITY;
-                logits[vocab.token_to_id.at(" ")] = -INFINITY;
-            }
-        }
-
-        // suppress <|notimestamps|> token
-        // ref: https://github.com/openai/whisper/blob/0b1ba3d46ebf7fe6f953acfd8cad62a4f851b49f/whisper/decoding.py#L410-L412
-        logits[vocab.token_not] = -INFINITY;
-        if (params.no_timestamps) {
-            for (int i = vocab.token_beg; i < n_logits; ++i) {
-                logits[i] = -INFINITY;
-            }
-        }
-
-        // suppress sot and nosp tokens
-        logits[vocab.token_sot]  = -INFINITY;
-        logits[vocab.token_nosp] = -INFINITY; // TODO: ignore this token for now
-
-        // [TDRZ] when tinydiarize is disabled, suppress solm token
-        if (params.tdrz_enable == false) {
-            logits[vocab.token_solm] = -INFINITY;
-        }
-
-        // suppress task tokens
-        logits[vocab.token_translate]  = -INFINITY;
-        logits[vocab.token_transcribe] = -INFINITY;
-        logits[vocab.token_prev]       = -INFINITY;
-
-        // suppress lang tokens
-        for (size_t i = 0; i < g_lang.size(); ++i) {
-            logits[whisper_token_lang(&ctx, i)] = -INFINITY;
-        }
-
-        // suppress prev token
-        logits[vocab.token_prev] = -INFINITY;
-
-        if (params.logits_filter_callback) {
-            params.logits_filter_callback(&ctx, &state, tokens_cur.data(), tokens_cur.size(), logits.data(), params.logits_filter_callback_user_data);
-        }
-
-        // suppress non-speech tokens
-        // ref: https://github.com/openai/whisper/blob/7858aa9c08d98f75575035ecd6481f462d66ca27/whisper/tokenizer.py#L224-L253
-        if (params.suppress_non_speech_tokens) {
-            for (const std::string & token : non_speech_tokens) {
-                const std::string suppress_tokens[] = {token, " " + token};
-                for (const std::string & suppress_token : suppress_tokens) {
-                    if (vocab.token_to_id.find(suppress_token) != vocab.token_to_id.end()) {
-                        logits[vocab.token_to_id.at(suppress_token)] = -INFINITY;
-                    }
-                }
-            }
-
-            // allow hyphens "-" and single quotes "'" between words, but not at the beginning of a word
-            if (vocab.token_to_id.find(" -") != vocab.token_to_id.end()) {
-                logits[vocab.token_to_id.at(" -")] = -INFINITY;
-            }
-            if (vocab.token_to_id.find(" '") != vocab.token_to_id.end()) {
-                logits[vocab.token_to_id.at(" '")] = -INFINITY;
-            }
-        }
-
-        // timestamps have to appear in pairs, except directly before EOT; mask logits accordingly
-        // https://github.com/openai/whisper/blob/0b1ba3d46ebf7fe6f953acfd8cad62a4f851b49f/whisper/decoding.py#L414-L424
-        {
-            const bool last_was_timestamp        = tokens_cur.size() > 0 && tokens_cur.back().id >= vocab.token_beg;
-            const bool penultimate_was_timestamp = tokens_cur.size() < 2 || tokens_cur[tokens_cur.size() - 2].id >= vocab.token_beg;
-
-            //WHISPER_LOG_INFO("last_was_timestamp=%d penultimate_was_timestamp=%d\n", last_was_timestamp, penultimate_was_timestamp);
-
-            if (last_was_timestamp) {
-                if (penultimate_was_timestamp) {
-                    for (int i = vocab.token_beg; i < n_logits; ++i) {
-                        logits[i] = -INFINITY;
-                    }
-                } else {
-                    for (int i = 0; i < vocab.token_eot; ++i) {
-                        logits[i] = -INFINITY;
-                    }
-                }
-            }
-        }
-
-        // the initial timestamp cannot be larger than max_initial_ts
-        // ref: https://github.com/openai/whisper/blob/0b1ba3d46ebf7fe6f953acfd8cad62a4f851b49f/whisper/decoding.py#L426-L429
-        if (is_initial && params.max_initial_ts > 0.0f) {
-            const float precision = float(WHISPER_CHUNK_SIZE)/ctx.model.hparams.n_audio_ctx;
-            const int   tid0      = std::round(params.max_initial_ts/precision);
-
-            for (int i = vocab.token_beg + tid0 + 1; i < n_logits; ++i) {
-                logits[i] = -INFINITY;
-            }
-        }
-
-        // condition timestamp tokens to be increasing
-        // ref: https://github.com/openai/whisper/pull/831#issuecomment-1385910556
-        if (decoder.has_ts) {
-            const int tid0 = decoder.seek_delta/2;
-
-            for (int i = vocab.token_beg; i < vocab.token_beg + tid0; ++i) {
-                logits[i] = -INFINITY;
-            }
-        }
-
-        // populate the logprobs array (log_softmax)
-        {
-            const float logit_max = *std::max_element(logits.begin(), logits.end());
-            float logsumexp = 0.0f;
-            for (int i = 0; i < n_logits; ++i) {
-                if (logits[i] > -INFINITY) {
-                    logsumexp += expf(logits[i] - logit_max);
-                }
-            }
-            logsumexp = logf(logsumexp) + logit_max;
-
-            for (int i = 0; i < n_logits; ++i) {
-                if (logits[i] > -INFINITY) {
-                    logprobs[i] = logits[i] - logsumexp;
-                } else {
-                    logprobs[i] = -INFINITY;
-                }
-            }
-        }
-
-        // if sum of probability over timestamps is above any other token, sample timestamp
-        // ref: https://github.com/openai/whisper/blob/0b1ba3d46ebf7fe6f953acfd8cad62a4f851b49f/whisper/decoding.py#L431-L437
-        {
-            // logsumexp over timestamps
-            float timestamp_logprob = -INFINITY;
-            {
-                float logsumexp = 0.0f;
-                const float logprob_max = *std::max_element(logprobs.begin() + vocab.token_beg, logprobs.end());
-                for (int i = vocab.token_beg; i < n_logits; ++i) {
-                    if (logprobs[i] > -INFINITY) {
-                        logsumexp += expf(logprobs[i] - logprob_max);
-                    }
-                }
-                if (logsumexp > 0.0f) {
-                    timestamp_logprob = logf(logsumexp) + logprob_max;
-                }
-            }
-
-            const float max_text_token_logprob = *std::max_element(logprobs.begin(), logprobs.begin() + vocab.token_beg);
-
-            //WHISPER_LOG_INFO("timestamp_logprob=%f max_text_token_logprob=%f\n", timestamp_logprob, max_text_token_logprob);
-
-            if (timestamp_logprob > max_text_token_logprob) {
-                for (int i = 0; i < vocab.token_beg; ++i) {
-                    logits[i]   = -INFINITY;
-                    logprobs[i] = -INFINITY;
-                }
-            } else {
-                if (params.n_grammar_rules > 0) {
-                    whisper_suppress_invalid_grammar(ctx, params, logits, decoder.grammar);
-
-                    // populate the logprobs array (log_softmax)
-                    {
-                        const float logit_max = *std::max_element(logits.begin(), logits.end());
-                        float logsumexp = 0.0f;
-                        for (int i = 0; i < n_logits; ++i) {
-                            if (logits[i] > -INFINITY) {
-                                logsumexp += expf(logits[i] - logit_max);
-                            }
-                        }
-                        logsumexp = logf(logsumexp) + logit_max;
-
-                        for (int i = 0; i < n_logits; ++i) {
-                            if (logits[i] > -INFINITY) {
-                                logprobs[i] = logits[i] - logsumexp;
-                            } else {
-                                logprobs[i] = -INFINITY;
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    }
-
-    // compute probs
-    {
-        for (int i = 0; i < n_logits; ++i) {
-            if (logits[i] == -INFINITY) {
-                probs[i] = 0.0f;
-            } else {
-                probs[i] = expf(logprobs[i]);
-            }
-        }
-    }
-
-#if 0
-    // print first 100 logits - token string : logit
-    //for (int i = 0; i < 10; i++) {
-    //    const auto token   = vocab.id_to_token.at(i);
-    //    const auto prob    = probs[i];
-    //    const auto logit   = logits[i];
-    //    const auto logprob = logprobs[i];
-    //    printf("%16s : prob=%9.5f logit=%9.5f logprob=%9.5f\n", token.c_str(), prob, logit, logprob);
-    //}
-
-    // print sorted
-    {
-        std::vector<std::pair<float, int>> pairs;
-
-        for (int i = 0; i < n_logits; ++i) {
-            pairs.push_back(std::make_pair(probs[i], i));
-        }
-
-        std::sort(pairs.begin(), pairs.end(), [](const std::pair<float, int>& a, const std::pair<float, int>& b) {
-            return a.first > b.first;
-        });
-
-        for (int i = 0; i < 10; i++) {
-            const auto token   = vocab.id_to_token.at(pairs[i].second);
-            const auto prob    = pairs[i].first;
-            const auto logit   = logits[pairs[i].second];
-            const auto logprob = logprobs[pairs[i].second];
-            printf("%16s : id=%6d prob=%9.5f logit=%9.5f logprob=%9.5f '%s'\n", token.c_str(), pairs[i].second, prob, logit, logprob, token.c_str());
-        }
-
-        printf("----------------\n");
-    }
-
-    // "And", "and", " And", " and"
-    //printf("logits[\"and\"]  = %f\n", logits[vocab.token_to_id.at("and")]);
-    //printf("logits[\"And\"]  = %f\n", logits[vocab.token_to_id.at("And")]);
-    //printf("logits[\" and\"] = %f\n", logits[vocab.token_to_id.at(" and")]);
-    //printf("logits[\" And\"] = %f\n", logits[vocab.token_to_id.at(" And")]);
-    //printf("logits[\" so\"]  = %f\n", logits[vocab.token_to_id.at(" so")]);
-
-    //printf("logprobs[\"and\"]  = %f\n", logprobs[vocab.token_to_id.at("and")]);
-    //printf("logprobs[\"And\"]  = %f\n", logprobs[vocab.token_to_id.at("And")]);
-    //printf("logprobs[\" and\"] = %f\n", logprobs[vocab.token_to_id.at(" and")]);
-    //printf("logprobs[\" And\"] = %f\n", logprobs[vocab.token_to_id.at(" And")]);
-    //printf("logprobs[\" so\"]  = %f\n", logprobs[vocab.token_to_id.at(" so")]);
-
-    //printf("probs[\"and\"]  = %f\n", probs[vocab.token_to_id.at("and")]);
-    //printf("probs[\"And\"]  = %f\n", probs[vocab.token_to_id.at("And")]);
-    //printf("probs[\" and\"] = %f\n", probs[vocab.token_to_id.at(" and")]);
-    //printf("probs[\" And\"] = %f\n", probs[vocab.token_to_id.at(" And")]);
-    //printf("probs[\" so\"]  = %f\n", probs[vocab.token_to_id.at(" so")]);
-#endif
-}
-
-static whisper_token_data whisper_sample_token(
-            whisper_context & ctx,
-      const whisper_decoder & decoder,
-                       bool   best) {
-    whisper_token_data result = {
-        0, 0, 0.0f, 0.0f, 0.0f, 0.0f, -1, -1, 0.0f,
-    };
-
-    const auto & vocab = ctx.vocab;
-
-    const auto & probs    = decoder.probs;
-    const auto & logprobs = decoder.logprobs;
-
-    const int n_logits = vocab.n_vocab;
-
-    {
-        double sum_ts = 0.0;
-        double max_ts = 0.0;
-
-        for (int i = vocab.token_beg; i < n_logits; i++) {
-            if (probs[i] == -INFINITY) {
-                continue;
-            }
-
-            sum_ts += probs[i];
-            if (max_ts < probs[i]) {
-                max_ts = probs[i];
-                result.tid = i;
-            }
-        }
-
-        result.pt    = max_ts/(sum_ts + 1e-10);
-        result.ptsum = sum_ts;
-    }
-
-    if (best) {
-        for (int i = 0; i < n_logits; ++i) {
-            if (result.p < probs[i]) {
-                result.id   = i;
-                result.p    = probs[i];
-                result.plog = logprobs[i];
-            }
-        }
-    } else {
-        std::discrete_distribution<> dist(probs.begin(), probs.end());
-
-        result.id   = dist(decoder.rng);
-        result.p    = probs[result.id];
-        result.plog = logprobs[result.id];
-    }
-
-    if (result.id >= vocab.token_beg) {
-        result.tid = result.id;
-        result.pt  = result.p;
-    }
-
-    return result;
-}
-
-static std::vector<whisper_token_data> whisper_sample_token_topk(
-            whisper_context & ctx,
-            whisper_decoder & decoder,
-                        int   k) {
-    const auto & vocab = ctx.vocab;
-
-    const auto & probs    = decoder.probs;
-    const auto & logits   = decoder.logits;
-    const auto & logprobs = decoder.logprobs;
-
-    const int n_logits = vocab.n_vocab;
-
-    auto & logits_id = decoder.logits_id;
-
-    logits_id.resize(n_logits);
-    for (int i = 0; i < n_logits; ++i) {
-        logits_id[i].first = logits[i];
-        logits_id[i].second = i;
-    }
-
-    {
-        using pair_type = std::remove_reference<decltype(logits_id)>::type::value_type;
-        std::partial_sort(
-                logits_id.begin(),
-                logits_id.begin() + k, logits_id.end(),
-                [](const pair_type & a, const pair_type & b) {
-            return a.first > b.first;
-        });
-    }
-
-    std::vector<whisper_token_data> result;
-    result.reserve(k);
-
-    whisper_token tid = vocab.token_beg;
-
-    float pt    = 0.0;
-    float ptsum = 0.0;
-
-    {
-        double sum_ts = 0.0;
-        double max_ts = 0.0;
-
-        for (int i = vocab.token_beg; i < n_logits; i++) {
-            if (probs[i] == -INFINITY) {
-                continue;
-            }
-
-            sum_ts += probs[i];
-            if (max_ts < probs[i]) {
-                max_ts = probs[i];
-                tid = i;
-            }
-        }
-
-        pt    = max_ts/(sum_ts + 1e-10);
-        ptsum = sum_ts;
-    }
-
-    std::discrete_distribution<> dist(probs.begin(), probs.end());
-
-    for (int i = 0; i < k; ++i) {
-        const auto id = dist(decoder.rng);
-        //printf("XXX %d %d %f %f %f %f\n", id, tid, probs[id], logprobs[id], pt, ptsum);
-
-        result.push_back({ id, tid, probs[id], logprobs[id], pt, ptsum, -1, -1, 0.0f, });
-
-        if (result[i].id >= vocab.token_beg) {
-            result[i].tid = result[i].id;
-            result[i].pt  = result[i].p;
-        }
-    }
-
-    return result;
-}
-
-// ref: https://github.com/openai/whisper/blob/0b1ba3d46ebf7fe6f953acfd8cad62a4f851b49f/whisper/decoding.py#L178-L192
-static void whisper_sequence_score(
-        const struct whisper_full_params & params,
-                        whisper_sequence & sequence) {
-    if (sequence.result_len == 0) {
-        return;
-    }
-
-    double result = 0.0f;
-
-    for (int i = 0; i < sequence.result_len; ++i) {
-        result += sequence.tokens[i].plog;
-    }
-
-    sequence.sum_logprobs = result;
-    sequence.avg_logprobs = result/sequence.result_len;
-
-    double penalty = sequence.result_len;
-
-    if (params.length_penalty > 0.0f) {
-        penalty = pow((5.0 + penalty)/6.0, params.length_penalty);
-    }
-
-    sequence.score = result/penalty;
-
-    // compute the entropy of the sequence of the last 32 tokens
-    {
-        const int n = 32;
-
-        int cnt = 0;
-        double entropy = 0.0f;
-
-        std::map<whisper_token, int> token_counts;
-        for (int i = std::max(0, sequence.result_len - n); i < sequence.result_len; ++i) {
-            token_counts[sequence.tokens[i].id]++;
-            cnt++;
-        }
-
-        for (const auto & kv : token_counts) {
-            const auto p = kv.second/(double)cnt;
-            entropy -= p*log(p);
-
-            //WHISPER_LOG_DEBUG("entropy: %d %f %f, count %d\n", kv.first, p, log(p), kv.second);
-        }
-
-        sequence.entropy = entropy;
-    }
-}
-
-int whisper_full_with_state(
-        struct whisper_context * ctx,
-          struct whisper_state * state,
-    struct whisper_full_params   params,
-                   const float * samples,
-                           int   n_samples) {
-    // clear old results
-    auto & result_all = state->result_all;
-
-    result_all.clear();
-
-    if (n_samples > 0) {
-        // compute log mel spectrogram
-        if (params.speed_up) {
-            // TODO: Replace PV with more advanced algorithm
-            WHISPER_LOG_ERROR("%s: failed to compute log mel spectrogram\n", __func__);
-            return -1;
-        } else {
-            if (whisper_pcm_to_mel_with_state(ctx, state, samples, n_samples, params.n_threads) != 0) {
-                WHISPER_LOG_ERROR("%s: failed to compute log mel spectrogram\n", __func__);
-                return -2;
-            }
-        }
-    }
-
-    // auto-detect language if not specified
-    if (params.language == nullptr || strlen(params.language) == 0 || strcmp(params.language, "auto") == 0 || params.detect_language) {
-        std::vector<float> probs(whisper_lang_max_id() + 1, 0.0f);
-
-        const auto lang_id = whisper_lang_auto_detect_with_state(ctx, state, 0, params.n_threads, probs.data());
-        if (lang_id < 0) {
-            WHISPER_LOG_ERROR("%s: failed to auto-detect language\n", __func__);
-            return -3;
-        }
-        state->lang_id = lang_id;
-        params.language = whisper_lang_str(lang_id);
-
-        WHISPER_LOG_INFO("%s: auto-detected language: %s (p = %f)\n", __func__, params.language, probs[whisper_lang_id(params.language)]);
-        if (params.detect_language) {
-            return 0;
-        }
-    }
-
-    if (params.token_timestamps) {
-        state->t_beg    = 0;
-        state->t_last   = 0;
-        state->tid_last = 0;
-        if (n_samples > 0) {
-            state->energy = get_signal_energy(samples, n_samples, 32);
-        }
-    }
-
-    const int seek_start = params.offset_ms/10;
-    const int seek_end = params.duration_ms == 0 ? whisper_n_len_from_state(state) : seek_start + params.duration_ms/10;
-
-    // if length of spectrogram is less than 1.0s (100 frames), then return
-    // basically don't process anything that is less than 1.0s
-    // see issue #39: https://github.com/ggerganov/whisper.cpp/issues/39
-    if (seek_end < seek_start + (params.speed_up ? 50 : 100)) {
-        WHISPER_LOG_DEBUG("%s: input is too short - %d ms < 1000 ms\n", __func__, (seek_end - seek_start)*10);
-        return 0;
-    }
-
-    // a set of temperatures to use
-    // [ t0, t0 + delta, t0 + 2*delta, ..., < 1.0f + 1e-6f ]
-    std::vector<float> temperatures;
-    if (params.temperature_inc > 0.0f) {
-        for (float t = params.temperature; t < 1.0f + 1e-6f; t += params.temperature_inc) {
-            temperatures.push_back(t);
-        }
-    } else {
-        temperatures.push_back(params.temperature);
-    }
-
-    // initialize the decoders
-    int n_decoders = 1;
-
-    switch (params.strategy) {
-        case WHISPER_SAMPLING_GREEDY:
-            {
-                n_decoders = params.greedy.best_of;
-            } break;
-        case WHISPER_SAMPLING_BEAM_SEARCH:
-            {
-                n_decoders = std::max(params.greedy.best_of, params.beam_search.beam_size);
-            } break;
-    };
-
-    n_decoders = std::max(1, n_decoders);
-
-    if (n_decoders > WHISPER_MAX_DECODERS) {
-        WHISPER_LOG_ERROR("%s: too many decoders requested (%d), max = %d\n", __func__, n_decoders, WHISPER_MAX_DECODERS);
-        return -4;
-    }
-
-    // TAGS: WHISPER_DECODER_INIT
-    for (int j = 1; j < n_decoders; j++) {
-        auto & decoder = state->decoders[j];
-
-        decoder.sequence.tokens.reserve(state->decoders[0].sequence.tokens.capacity());
-
-        decoder.probs.resize   (ctx->vocab.n_vocab);
-        decoder.logits.resize  (ctx->vocab.n_vocab);
-        decoder.logprobs.resize(ctx->vocab.n_vocab);
-        decoder.logits_id.reserve(ctx->model.hparams.n_vocab);
-
-        decoder.rng = std::mt19937(0);
-    }
-
-    // the accumulated text context so far
-    auto & prompt_past = state->prompt_past;
-    if (params.no_context) {
-        prompt_past.clear();
-    }
-
-    // prepare prompt
-    {
-        std::vector<whisper_token> prompt_tokens;
-
-        // initial prompt
-        if (!params.prompt_tokens && params.initial_prompt) {
-            prompt_tokens.resize(1024);
-            prompt_tokens.resize(whisper_tokenize(ctx, params.initial_prompt, prompt_tokens.data(), prompt_tokens.size()));
-            params.prompt_tokens   = prompt_tokens.data();
-            params.prompt_n_tokens = prompt_tokens.size();
-        }
-
-        // prepend the prompt tokens to the prompt_past
-        if (params.prompt_tokens && params.prompt_n_tokens > 0) {
-            // parse tokens from the pointer
-            for (int i = 0; i < params.prompt_n_tokens; i++) {
-                prompt_past.push_back(params.prompt_tokens[i]);
-            }
-            std::rotate(prompt_past.begin(), prompt_past.end() - params.prompt_n_tokens, prompt_past.end());
-        }
-    }
-
-    // overwrite audio_ctx, max allowed is hparams.n_audio_ctx
-    if (params.audio_ctx > whisper_n_audio_ctx(ctx)) {
-        WHISPER_LOG_ERROR("%s: audio_ctx is larger than the maximum allowed (%d > %d)\n", __func__, params.audio_ctx, whisper_n_audio_ctx(ctx));
-        return -5;
-    }
-    state->exp_n_audio_ctx = params.audio_ctx;
-
-    // these tokens determine the task that will be performed
-    std::vector<whisper_token> prompt_init = { whisper_token_sot(ctx), };
-
-    if (whisper_is_multilingual(ctx)) {
-        const int lang_id = whisper_lang_id(params.language);
-        state->lang_id = lang_id;
-        prompt_init.push_back(whisper_token_lang(ctx, lang_id));
-        if (params.translate) {
-            prompt_init.push_back(whisper_token_translate(ctx));
-        } else {
-            prompt_init.push_back(whisper_token_transcribe(ctx));
-        }
-    }
-
-    // distilled models require the "no_timestamps" token
-    {
-        const bool is_distil = ctx->model.hparams.n_text_layer == 2;
-        if (is_distil && !params.no_timestamps) {
-            WHISPER_LOG_WARN("%s: using distilled model - forcing no_timestamps\n", __func__);
-            params.no_timestamps = true;
-        }
-    }
-
-    if (params.no_timestamps) {
-        prompt_init.push_back(whisper_token_not(ctx));
-    }
-
-    int seek = seek_start;
-
-    std::vector<whisper_token> prompt;
-    prompt.reserve(whisper_n_text_ctx(ctx));
-
-    struct beam_candidate {
-        int decoder_idx;
-        int seek_delta;
-
-        bool has_ts;
-
-        whisper_sequence sequence;
-        whisper_grammar grammar;
-    };
-
-    std::vector<std::vector<beam_candidate>> bc_per_dec(n_decoders);
-    std::vector<beam_candidate> beam_candidates;
-
-    // main loop
-    while (true) {
-        if (params.progress_callback) {
-            const int progress_cur = (100*(seek - seek_start))/(seek_end - seek_start);
-
-            params.progress_callback(
-                ctx, state, progress_cur, params.progress_callback_user_data);
-        }
-
-        // if only 1 second left, then stop
-        if (seek + 100 >= seek_end) {
-            break;
-        }
-
-        if (params.encoder_begin_callback) {
-            if (params.encoder_begin_callback(ctx, state, params.encoder_begin_callback_user_data) == false) {
-                WHISPER_LOG_ERROR("%s: encoder_begin_callback returned false - aborting\n", __func__);
-                break;
-            }
-        }
-
-        // encode audio features starting at offset seek
-        if (!whisper_encode_internal(*ctx, *state, seek, params.n_threads, params.abort_callback, params.abort_callback_user_data)) {
-            WHISPER_LOG_ERROR("%s: failed to encode\n", __func__);
-            return -6;
-        }
-
-        // if there is a very short audio segment left to process, we remove any past prompt since it tends
-        // to confuse the decoder and often make it repeat or hallucinate stuff
-        if (seek > seek_start && seek + 500 >= seek_end) {
-            prompt_past.clear();
-        }
-
-        int best_decoder_id = 0;
-
-        for (int it = 0; it < (int) temperatures.size(); ++it) {
-            const float t_cur = temperatures[it];
-
-            int n_decoders_cur = 1;
-
-            switch (params.strategy) {
-                case whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY:
-                    {
-                        if (t_cur > 0.0f) {
-                            n_decoders_cur = params.greedy.best_of;
-                        }
-                    } break;
-                case whisper_sampling_strategy::WHISPER_SAMPLING_BEAM_SEARCH:
-                    {
-                        if (t_cur > 0.0f) {
-                            n_decoders_cur = params.greedy.best_of;
-                        } else {
-                            n_decoders_cur = params.beam_search.beam_size;
-                        }
-                    } break;
-            };
-
-            n_decoders_cur = std::max(1, n_decoders_cur);
-
-            WHISPER_LOG_DEBUG("\n%s: strategy = %d, decoding with %d decoders, temperature = %.2f\n", __func__, params.strategy, n_decoders_cur, t_cur);
-
-            // TAGS: WHISPER_DECODER_INIT
-            for (int j = 0; j < n_decoders_cur; ++j) {
-                auto & decoder = state->decoders[j];
-
-                decoder.sequence.tokens.clear();
-                decoder.sequence.result_len       = 0;
-                decoder.sequence.sum_logprobs_all = 0.0;
-                decoder.sequence.sum_logprobs     = -INFINITY;
-                decoder.sequence.avg_logprobs     = -INFINITY;
-                decoder.sequence.entropy          = 0.0;
-                decoder.sequence.score            = -INFINITY;
-
-                decoder.seek_delta = 100*WHISPER_CHUNK_SIZE;
-
-                decoder.failed    = false;
-                decoder.completed = false;
-                decoder.has_ts    = false;
-
-                if (params.grammar_rules != nullptr) {
-                    decoder.grammar = whisper_grammar_init(params.grammar_rules, params.n_grammar_rules, params.i_start_rule);
-                } else {
-                    decoder.grammar = {};
-                }
-            }
-
-            // init prompt and kv cache for the current iteration
-            // TODO: do not recompute the prompt if it is the same as previous time
-            {
-                prompt.clear();
-
-                // if we have already generated some text, use it as a prompt to condition the next generation
-                if (!prompt_past.empty() && t_cur < 0.5f && params.n_max_text_ctx > 0) {
-                    int n_take = std::min(std::min(params.n_max_text_ctx, whisper_n_text_ctx(ctx)/2), int(prompt_past.size()));
-
-                    prompt = { whisper_token_prev(ctx) };
-                    prompt.insert(prompt.begin() + 1, prompt_past.end() - n_take, prompt_past.end());
-                }
-
-                // init new transcription with sot, language (opt) and task tokens
-                prompt.insert(prompt.end(), prompt_init.begin(), prompt_init.end());
-
-                // print the prompt
-                WHISPER_LOG_DEBUG("\n\n");
-                for (int i = 0; i < (int) prompt.size(); i++) {
-                    WHISPER_LOG_DEBUG("%s: prompt[%d] = %s\n", __func__, i, ctx->vocab.id_to_token.at(prompt[i]).c_str());
-                }
-                WHISPER_LOG_DEBUG("\n\n");
-
-                whisper_kv_cache_clear(state->kv_self);
-
-                whisper_batch_prep_legacy(state->batch, prompt.data(), prompt.size(), 0, 0);
-
-                if (!whisper_decode_internal(*ctx, *state, state->batch, params.n_threads, params.abort_callback, params.abort_callback_user_data)) {
-                    WHISPER_LOG_ERROR("%s: failed to decode\n", __func__);
-                    return -7;
-                }
-
-                {
-                    const int64_t t_start_sample_us = ggml_time_us();
-
-                    state->decoders[0].i_batch = prompt.size() - 1;
-
-                    whisper_process_logits(*ctx, *state, state->decoders[0], params, t_cur);
-
-                    for (int j = 1; j < n_decoders_cur; ++j) {
-                        auto & decoder = state->decoders[j];
-
-                        whisper_kv_cache_seq_cp(state->kv_self, 0, j, -1, -1);
-
-                        memcpy(decoder.probs.data(),    state->decoders[0].probs.data(),    decoder.probs.size()*sizeof(decoder.probs[0]));
-                        memcpy(decoder.logits.data(),   state->decoders[0].logits.data(),   decoder.logits.size()*sizeof(decoder.logits[0]));
-                        memcpy(decoder.logprobs.data(), state->decoders[0].logprobs.data(), decoder.logprobs.size()*sizeof(decoder.logprobs[0]));
-                    }
-
-                    state->t_sample_us += ggml_time_us() - t_start_sample_us;
-                }
-            }
-
-            for (int i = 0, n_max = whisper_n_text_ctx(ctx)/2 - 4; i < n_max; ++i) {
-                const int64_t t_start_sample_us = ggml_time_us();
-
-                if (params.strategy == whisper_sampling_strategy::WHISPER_SAMPLING_BEAM_SEARCH) {
-                    for (auto & bc : bc_per_dec) {
-                        bc.clear();
-                    }
-                }
-
-                // sampling
-                // TODO: avoid memory allocations, optimize, avoid threads?
-                {
-                    std::atomic<int> j_cur(0);
-
-                    auto process = [&]() {
-                        while (true) {
-                            const int j = j_cur.fetch_add(1);
-
-                            if (j >= n_decoders_cur) {
-                                break;
-                            }
-
-                            auto & decoder = state->decoders[j];
-
-                            if (decoder.completed || decoder.failed) {
-                                continue;
-                            }
-
-                            switch (params.strategy) {
-                                case whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY:
-                                    {
-                                        if (t_cur < 1e-6f) {
-                                            decoder.sequence.tokens.push_back(whisper_sample_token(*ctx, decoder, true));
-                                        } else {
-                                            decoder.sequence.tokens.push_back(whisper_sample_token(*ctx, decoder, false));
-                                        }
-
-                                        decoder.sequence.sum_logprobs_all += decoder.sequence.tokens.back().plog;
-                                    } break;
-                                case whisper_sampling_strategy::WHISPER_SAMPLING_BEAM_SEARCH:
-                                    {
-                                        const auto tokens_new = whisper_sample_token_topk(*ctx, decoder, params.beam_search.beam_size);
-
-                                        for (const auto & token : tokens_new) {
-                                            bc_per_dec[j].push_back({ j, decoder.seek_delta, decoder.has_ts, decoder.sequence, decoder.grammar, });
-                                            bc_per_dec[j].back().sequence.tokens.push_back(token);
-                                            bc_per_dec[j].back().sequence.sum_logprobs_all += token.plog;
-                                        }
-                                    } break;
-                            };
-                        }
-                    };
-
-                    const int n_threads = std::min(params.n_threads, n_decoders_cur);
-
-                    if (n_threads == 1) {
-                        process();
-                    } else {
-                        std::vector<std::thread> threads(n_threads - 1);
-
-                        for (int t = 0; t < n_threads - 1; ++t) {
-                            threads[t] = std::thread(process);
-                        }
-
-                        process();
-
-                        for (int t = 0; t < n_threads - 1; ++t) {
-                            threads[t].join();
-                        }
-                    }
-                }
-
-                beam_candidates.clear();
-                for (const auto & bc : bc_per_dec) {
-                    beam_candidates.insert(beam_candidates.end(), bc.begin(), bc.end());
-
-                    if (!bc.empty()) {
-                        state->n_sample += 1;
-                    }
-                }
-
-                // for beam-search, choose the top candidates and update the KV caches
-                if (params.strategy == whisper_sampling_strategy::WHISPER_SAMPLING_BEAM_SEARCH) {
-                    std::sort(
-                            beam_candidates.begin(),
-                            beam_candidates.end(),
-                            [](const beam_candidate & a, const beam_candidate & b) {
-                        return a.sequence.sum_logprobs_all > b.sequence.sum_logprobs_all;
-                    });
-
-                    uint32_t cur_c = 0;
-
-                    for (int j = 0; j < n_decoders_cur; ++j) {
-                        auto & decoder = state->decoders[j];
-
-                        if (decoder.completed || decoder.failed) {
-                            continue;
-                        }
-
-                        if (cur_c >= beam_candidates.size()) {
-                            cur_c = 0;
-                        }
-
-                        auto & cur = beam_candidates[cur_c++];
-
-                        while (beam_candidates.size() > cur_c && beam_candidates[cur_c].sequence.sum_logprobs_all == cur.sequence.sum_logprobs_all && i > 0) {
-                            ++cur_c;
-                        }
-
-                        decoder.seek_delta = cur.seek_delta;
-                        decoder.has_ts     = cur.has_ts;
-                        decoder.sequence   = cur.sequence;
-                        decoder.grammar    = cur.grammar;
-
-                        whisper_kv_cache_seq_cp(state->kv_self, cur.decoder_idx, WHISPER_MAX_DECODERS + j, -1, -1);
-
-                        WHISPER_LOG_DEBUG("%s: beam search: decoder %d: from decoder %d: token = %10s, plog = %8.5f, sum_logprobs = %8.5f\n",
-                                __func__, j, cur.decoder_idx, ctx->vocab.id_to_token.at(decoder.sequence.tokens.back().id).c_str(), decoder.sequence.tokens.back().plog, decoder.sequence.sum_logprobs_all);
-                    }
-
-                    for (int j = 0; j < n_decoders_cur; ++j) {
-                        auto & decoder = state->decoders[j];
-
-                        if (decoder.completed || decoder.failed) {
-                            continue;
-                        }
-
-                        whisper_kv_cache_seq_rm(state->kv_self, j,                           -1, -1);
-                        whisper_kv_cache_seq_cp(state->kv_self, WHISPER_MAX_DECODERS + j, j, -1, -1);
-                        whisper_kv_cache_seq_rm(state->kv_self, WHISPER_MAX_DECODERS + j,    -1, -1);
-                    }
-                }
-
-                // update the decoder state
-                // - check if the sequence is completed
-                // - check if the sequence is failed
-                // - update sliding window based on timestamp tokens
-                for (int j = 0; j < n_decoders_cur; ++j) {
-                    auto & decoder = state->decoders[j];
-
-                    if (decoder.completed || decoder.failed) {
-                        continue;
-                    }
-
-                    auto & has_ts     = decoder.has_ts;
-                    auto & failed     = decoder.failed;
-                    auto & completed  = decoder.completed;
-                    auto & seek_delta = decoder.seek_delta;
-                    auto & result_len = decoder.sequence.result_len;
-
-                    {
-                        const auto & token = decoder.sequence.tokens.back();
-
-                        // timestamp token - update sliding window
-                        if (token.id > whisper_token_beg(ctx)) {
-                            const int seek_delta_new = 2*(token.id - whisper_token_beg(ctx));
-
-                            // do not allow to go back in time
-                            if (has_ts && seek_delta > seek_delta_new && result_len < i) {
-                                WHISPER_LOG_DEBUG("%s: decoder %d: failed due to seek_delta (%d > %d)\n", __func__, j, seek_delta, seek_delta_new);
-                                failed = true; // TODO: maybe this is not a failure ?
-                                continue;
-                            }
-
-                            seek_delta = seek_delta_new;
-                            result_len = i + 1;
-                            has_ts = true;
-                        }
-
-                        whisper_grammar_accept_token(*ctx, decoder.grammar, token.id);
-
-#ifdef WHISPER_DEBUG
-                        {
-                            const auto tt = token.pt > 0.10 ? ctx->vocab.id_to_token.at(token.tid) : "[?]";
-                            WHISPER_LOG_DEBUG("%s: id = %3d, decoder = %d, token = %6d, p = %6.3f, ts = %10s, %6.3f, result_len = %4d '%s'\n",
-                                    __func__, i, j, token.id, token.p, tt.c_str(), token.pt, result_len, ctx->vocab.id_to_token.at(token.id).c_str());
-                        }
-#endif
-
-                        // end of segment
-                        if (token.id == whisper_token_eot(ctx) ||               // end of text token
-                           (params.max_tokens > 0 && i >= params.max_tokens) || // max tokens per segment reached
-                           (has_ts && seek + seek_delta + 100 >= seek_end)      // end of audio reached
-                           ) {
-                            if (result_len == 0 && !params.no_timestamps) {
-                                if (seek + seek_delta + 100 >= seek_end) {
-                                    result_len = i + 1;
-                                } else {
-                                    WHISPER_LOG_DEBUG("%s: decoder %d failed (result_len = 0)\n", __func__, j);
-                                    failed = true;
-                                    continue;
-                                }
-                            }
-
-                            if (params.single_segment || params.no_timestamps) {
-                                result_len = i + 1;
-                                seek_delta = 100*WHISPER_CHUNK_SIZE;
-                            }
-
-                            WHISPER_LOG_DEBUG("%s: decoder %d completed\n", __func__, j);
-                            completed = true;
-                            continue;
-                        }
-
-                        // TESTS: if no tensors are loaded, it means we are running tests
-                        if (ctx->model.n_loaded == 0) {
-                            seek_delta = 100*WHISPER_CHUNK_SIZE;
-                            completed = true;
-                            continue;
-                        }
-                    }
-
-                    // sometimes, the decoding can get stuck in a repetition loop
-                    // this is an attempt to mitigate such cases - we flag the decoding as failed and use a fallback strategy
-                    if (i == n_max - 1 && (result_len == 0 || seek_delta < 100*WHISPER_CHUNK_SIZE/2)) {
-                        WHISPER_LOG_DEBUG("%s: decoder %d: failed due to repetition loop\n", __func__, j);
-                        failed = true;
-                        continue;
-                    }
-                }
-
-                // check if all decoders have finished (i.e. completed or failed)
-                {
-                    bool completed_all = true;
-
-                    for (int j = 0; j < n_decoders_cur; ++j) {
-                        auto & decoder = state->decoders[j];
-
-                        if (decoder.completed || decoder.failed) {
-                            continue;
-                        }
-
-                        completed_all = false;
-                    }
-
-                    if (completed_all) {
-                        break;
-                    }
-                }
-
-                state->t_sample_us += ggml_time_us() - t_start_sample_us;
-
-                // obtain logits for the next token
-                {
-                    auto & batch = state->batch;
-
-                    batch.n_tokens = 0;
-
-                    const int n_past = prompt.size() + i;
-
-                    for (int j = 0; j < n_decoders_cur; ++j) {
-                        auto & decoder = state->decoders[j];
-
-                        if (decoder.failed || decoder.completed) {
-                            continue;
-                        }
-
-                        //WHISPER_LOG_DEBUG("%s: decoder %d: token %d, seek_delta %d\n", __func__, j, decoder.sequence.tokens.back().id, decoder.seek_delta);
-
-                        decoder.i_batch = batch.n_tokens;
-
-                        batch.token   [batch.n_tokens]    = decoder.sequence.tokens.back().id;
-                        batch.pos     [batch.n_tokens]    = n_past;
-                        batch.n_seq_id[batch.n_tokens]    = 1;
-                        batch.seq_id  [batch.n_tokens][0] = j;
-                        batch.logits  [batch.n_tokens]    = 1;
-                        batch.n_tokens++;
-                    }
-
-                    assert(batch.n_tokens > 0);
-
-                    if (!whisper_decode_internal(*ctx, *state, state->batch, params.n_threads, params.abort_callback, params.abort_callback_user_data)) {
-                        WHISPER_LOG_ERROR("%s: failed to decode\n", __func__);
-                        return -8;
-                    }
-
-                    const int64_t t_start_sample_us = ggml_time_us();
-
-                    // TODO: avoid memory allocations, optimize, avoid threads?
-                    {
-                        std::atomic<int> j_cur(0);
-
-                        auto process = [&]() {
-                            while (true) {
-                                const int j = j_cur.fetch_add(1);
-
-                                if (j >= n_decoders_cur) {
-                                    break;
-                                }
-
-                                auto & decoder = state->decoders[j];
-
-                                if (decoder.failed || decoder.completed) {
-                                    continue;
-                                }
-
-                                whisper_process_logits(*ctx, *state, decoder, params, t_cur);
-                            }
-                        };
-
-                        const int n_threads = std::min(params.n_threads, n_decoders_cur);
-
-                        if (n_threads == 1) {
-                            process();
-                        } else {
-                            std::vector<std::thread> threads(n_threads - 1);
-
-                            for (int t = 0; t < n_threads - 1; ++t) {
-                                threads[t] = std::thread(process);
-                            }
-
-                            process();
-
-                            for (int t = 0; t < n_threads - 1; ++t) {
-                                threads[t].join();
-                            }
-                        }
-                    }
-
-                    state->t_sample_us += ggml_time_us() - t_start_sample_us;
-                }
-            }
-
-            // rank the resulting sequences and select the best one
-            {
-                double best_score = -INFINITY;
-
-                for (int j = 0; j < n_decoders_cur; ++j) {
-                    auto & decoder = state->decoders[j];
-
-                    if (decoder.failed) {
-                        continue;
-                    }
-
-                    decoder.sequence.tokens.resize(decoder.sequence.result_len);
-                    whisper_sequence_score(params, decoder.sequence);
-
-                    WHISPER_LOG_DEBUG("%s: decoder %2d: score = %8.5f, result_len = %3d, avg_logprobs = %8.5f, entropy = %8.5f\n",
-                            __func__, j, decoder.sequence.score, decoder.sequence.result_len, decoder.sequence.avg_logprobs, decoder.sequence.entropy);
-
-                    if (decoder.sequence.result_len > 32 && decoder.sequence.entropy < params.entropy_thold) {
-                        WHISPER_LOG_DEBUG("%s: decoder %2d: failed due to entropy %8.5f < %8.5f\n",
-                                __func__, j, decoder.sequence.entropy, params.entropy_thold);
-
-                        decoder.failed = true;
-                        state->n_fail_h++;
-
-                        continue;
-                    }
-
-                    if (best_score < decoder.sequence.score) {
-                        best_score = decoder.sequence.score;
-                        best_decoder_id = j;
-                    }
-                }
-
-                WHISPER_LOG_DEBUG("%s: best decoder = %d\n", __func__, best_decoder_id);
-            }
-
-            bool success = true;
-
-            // was the decoding successful for the current temperature?
-            // do fallback only if:
-            // - we are not at the last temperature
-            if (it != (int) temperatures.size() - 1) {
-                const auto & decoder = state->decoders[best_decoder_id];
-
-                if (decoder.failed || decoder.sequence.avg_logprobs < params.logprob_thold) {
-                    WHISPER_LOG_DEBUG("%s: failed due to avg_logprobs %8.5f < %8.5f\n", __func__, decoder.sequence.avg_logprobs, params.logprob_thold);
-                    success = false;
-                    state->n_fail_p++;
-                }
-            }
-
-            if (success) {
-                //for (auto & token : ctx->decoders[best_decoder_id].sequence.tokens) {
-                //    WHISPER_LOG_DEBUG("%s: token = %d, p = %6.3f, pt = %6.3f, ts = %s, str = %s\n", __func__, token.id, token.p, token.pt, ctx->vocab.id_to_token.at(token.tid).c_str(), ctx->vocab.id_to_token.at(token.id).c_str());
-                //}
-
-                break;
-            }
-
-            WHISPER_LOG_DEBUG("\n%s: failed to decode with temperature = %.2f\n", __func__, t_cur);
-        }
-
-        // output results through a user-provided callback
-        {
-            const auto & best_decoder = state->decoders[best_decoder_id];
-
-            const auto seek_delta = best_decoder.seek_delta;
-            const auto result_len = best_decoder.sequence.result_len;
-
-            const auto & tokens_cur = best_decoder.sequence.tokens;
-
-            //WHISPER_LOG_DEBUG("prompt_init.size() = %d, prompt.size() = %d, result_len = %d, seek_delta = %d\n", prompt_init.size(), prompt.size(), result_len, seek_delta);
-
-            // update prompt_past
-            prompt_past.clear();
-            if (prompt.front() == whisper_token_prev(ctx)) {
-                prompt_past.insert(prompt_past.end(), prompt.begin() + 1, prompt.end() - prompt_init.size());
-            }
-
-            for (int i = 0; i < result_len; ++i) {
-                prompt_past.push_back(tokens_cur[i].id);
-            }
-
-            if (!tokens_cur.empty() && ctx->model.n_loaded > 0) {
-                int  i0 = 0;
-                auto t0 = seek + 2*(tokens_cur.front().tid - whisper_token_beg(ctx));
-
-                std::string text;
-                bool speaker_turn_next = false;
-
-                for (int i = 0; i < (int) tokens_cur.size(); i++) {
-                    //printf("%s: %18s %6.3f %18s %6.3f\n", __func__,
-                    //        ctx->vocab.id_to_token[tokens_cur[i].id].c_str(), tokens_cur[i].p,
-                    //        ctx->vocab.id_to_token[tokens_cur[i].tid].c_str(), tokens_cur[i].pt);
-
-                    if (params.print_special || tokens_cur[i].id < whisper_token_eot(ctx)) {
-                        text += whisper_token_to_str(ctx, tokens_cur[i].id);
-                    }
-
-                    // [TDRZ] record if speaker turn was predicted after current segment
-                    if (params.tdrz_enable && tokens_cur[i].id == whisper_token_solm(ctx)) {
-                        speaker_turn_next = true;
-                    }
-
-                    if (tokens_cur[i].id > whisper_token_beg(ctx) && !params.single_segment) {
-                        const auto t1 = seek + 2*(tokens_cur[i].tid - whisper_token_beg(ctx));
-
-                        if (!text.empty()) {
-                            const auto tt0 = params.speed_up ? 2*t0 : t0;
-                            const auto tt1 = params.speed_up ? 2*t1 : t1;
-
-                            if (params.print_realtime) {
-                                if (params.print_timestamps) {
-                                    printf("[%s --> %s]  %s\n", to_timestamp(tt0).c_str(), to_timestamp(tt1).c_str(), text.c_str());
-                                } else {
-                                    printf("%s", text.c_str());
-                                    fflush(stdout);
-                                }
-                            }
-
-                            //printf("tt0 = %d, tt1 = %d, text = %s, token = %s, token_id = %d, tid = %d\n", tt0, tt1, text.c_str(), ctx->vocab.id_to_token[tokens_cur[i].id].c_str(), tokens_cur[i].id, tokens_cur[i].tid);
-
-                            result_all.push_back({ tt0, tt1, text, {}, speaker_turn_next });
-                            for (int j = i0; j <= i; j++) {
-                                result_all.back().tokens.push_back(tokens_cur[j]);
-                            }
-
-                            int n_new = 1;
-
-                            if (params.token_timestamps) {
-                                whisper_exp_compute_token_level_timestamps(
-                                        *ctx, *state, result_all.size() - 1, params.thold_pt, params.thold_ptsum);
-
-                                if (params.max_len > 0) {
-                                    n_new = whisper_wrap_segment(*ctx, *state, params.max_len, params.split_on_word);
-                                }
-                            }
-                            if (params.new_segment_callback) {
-                                params.new_segment_callback(ctx, state, n_new, params.new_segment_callback_user_data);
-                            }
-                        }
-                        text = "";
-                        while (i < (int) tokens_cur.size() && tokens_cur[i].id > whisper_token_beg(ctx)) {
-                            i++;
-                        }
-                        i--;
-                        t0 = t1;
-                        i0 = i + 1;
-                        speaker_turn_next = false;
-                    }
-                }
-
-                if (!text.empty()) {
-                    const auto t1 = seek + seek_delta;
-
-                    const auto tt0 = params.speed_up ? 2*t0 : t0;
-                    const auto tt1 = params.speed_up ? 2*t1 : t1;
-
-                    if (params.print_realtime) {
-                        if (params.print_timestamps) {
-                            printf("[%s --> %s]  %s\n", to_timestamp(tt0).c_str(), to_timestamp(tt1).c_str(), text.c_str());
-                        } else {
-                            printf("%s", text.c_str());
-                            fflush(stdout);
-                        }
-                    }
-
-                    result_all.push_back({ tt0, tt1, text, {} , speaker_turn_next });
-                    for (int j = i0; j < (int) tokens_cur.size(); j++) {
-                        result_all.back().tokens.push_back(tokens_cur[j]);
-                    }
-
-                    int n_new = 1;
-
-                    if (params.token_timestamps) {
-                        whisper_exp_compute_token_level_timestamps(
-                                *ctx, *state, result_all.size() - 1, params.thold_pt, params.thold_ptsum);
-
-                        if (params.max_len > 0) {
-                            n_new = whisper_wrap_segment(*ctx, *state, params.max_len, params.split_on_word);
-                        }
-                    }
-                    if (params.new_segment_callback) {
-                        params.new_segment_callback(ctx, state, n_new, params.new_segment_callback_user_data);
-                    }
-                }
-            }
-
-            // update audio window
-            seek += seek_delta;
-
-            WHISPER_LOG_DEBUG("seek = %d, seek_delta = %d\n", seek, seek_delta);
-        }
-    }
-
-    return 0;
-}
-
-int whisper_full(
-        struct whisper_context * ctx,
-    struct whisper_full_params   params,
-                   const float * samples,
-                           int   n_samples) {
-    return whisper_full_with_state(ctx, ctx->state, params, samples, n_samples);
-}
-
-int whisper_full_parallel(
-        struct whisper_context * ctx,
-        struct whisper_full_params params,
-        const float * samples,
-        int n_samples,
-        int n_processors) {
-    if (n_processors == 1) {
-        return whisper_full(ctx, params, samples, n_samples);
-    }
-    int ret = 0;
-
-    // prepare separate states for each thread
-    std::vector<whisper_state*> states;
-
-    const int offset_samples = (WHISPER_SAMPLE_RATE*params.offset_ms)/1000;
-    const int n_samples_per_processor = (n_samples - offset_samples)/n_processors;
-
-    // the calling thread will process the first chunk
-    // while the other threads will process the remaining chunks
-
-    std::vector<std::thread> workers(n_processors - 1);
-    for (int i = 0; i < n_processors - 1; ++i) {
-        // create a new state for each thread
-        states.push_back(whisper_init_state(ctx));
-
-        const int start_samples = offset_samples + (i + 1)*n_samples_per_processor;
-        const int n_samples_cur = (i == n_processors - 2) ? n_samples - start_samples : n_samples_per_processor;
-
-        auto params_cur = params;
-
-        params_cur.offset_ms = 0;
-        params_cur.print_progress = false;
-        params_cur.print_realtime = false;
-
-        params_cur.new_segment_callback = nullptr;
-        params_cur.new_segment_callback_user_data = nullptr;
-
-        params_cur.progress_callback = nullptr;
-        params_cur.progress_callback_user_data = nullptr;
-
-        workers[i] = std::thread(whisper_full_with_state, ctx, states[i], std::move(params_cur), samples + start_samples, n_samples_cur);
-    }
-
-    {
-        auto params_cur = params;
-
-        // We need to disable the print real-time for this one as well, otherwise it will show only for the first chunk.
-        params_cur.print_realtime = false;
-
-        // Run the first transformation using default state but only for the first chunk.
-        ret = whisper_full_with_state(ctx, ctx->state, std::move(params_cur), samples, offset_samples + n_samples_per_processor);
-    }
-
-    for (int i = 0; i < n_processors - 1; ++i) {
-        workers[i].join();
-    }
-
-    const int64_t offset_t = (int64_t) params.offset_ms/10.0;
-
-    // combine results into result_state->result_all from all other states
-    for (int i = 0; i < n_processors - 1; ++i) {
-        auto& results_i = states[i]->result_all;
-
-        for (auto& result : results_i) {
-            // correct the segment timestamp taking into account the offset
-            result.t0 += 100 * ((i + 1) * n_samples_per_processor) / WHISPER_SAMPLE_RATE + offset_t;
-            result.t1 += 100 * ((i + 1) * n_samples_per_processor) / WHISPER_SAMPLE_RATE + offset_t;
-
-            // make sure that segments are not overlapping
-            if (!ctx->state->result_all.empty()) {
-                result.t0 = std::max(result.t0, ctx->state->result_all.back().t1);
-            }
-
-            ctx->state->result_all.push_back(std::move(result));
-
-            // call the new_segment_callback for each segment
-            if (params.new_segment_callback) {
-                params.new_segment_callback(ctx, ctx->state, 1, params.new_segment_callback_user_data);
-            }
-        }
-
-        ctx->state->t_mel_us += states[i]->t_mel_us;
-
-        ctx->state->t_sample_us += states[i]->t_sample_us;
-        ctx->state->t_encode_us += states[i]->t_encode_us;
-        ctx->state->t_decode_us += states[i]->t_decode_us;
-        ctx->state->t_batchd_us += states[i]->t_batchd_us;
-        ctx->state->t_prompt_us += states[i]->t_prompt_us;
-
-        ctx->state->n_sample += states[i]->n_sample;
-        ctx->state->n_encode += states[i]->n_encode;
-        ctx->state->n_decode += states[i]->n_decode;
-        ctx->state->n_batchd += states[i]->n_batchd;
-        ctx->state->n_prompt += states[i]->n_prompt;
-
-        whisper_free_state(states[i]);
-    }
-
-    // average the timings
-    ctx->state->t_mel_us    /= n_processors;
-    ctx->state->t_sample_us /= n_processors;
-    ctx->state->t_encode_us /= n_processors;
-    ctx->state->t_decode_us /= n_processors;
-
-    // print information about the audio boundaries
-    WHISPER_LOG_WARN("\n");
-    WHISPER_LOG_WARN("%s: the audio has been split into %d chunks at the following times:\n", __func__, n_processors);
-    for (int i = 0; i < n_processors - 1; ++i) {
-        WHISPER_LOG_WARN("%s: split %d - %s\n", __func__, (i + 1), to_timestamp(100*((i + 1)*n_samples_per_processor)/WHISPER_SAMPLE_RATE + offset_t).c_str());
-    }
-    WHISPER_LOG_WARN("%s: the transcription quality may be degraded near these boundaries\n", __func__);
-
-    return ret;
-}
-
-int whisper_full_n_segments_from_state(struct whisper_state * state) {
-    return state->result_all.size();
-}
-
-int whisper_full_n_segments(struct whisper_context * ctx) {
-    return ctx->state->result_all.size();
-}
-
-int whisper_full_lang_id_from_state(struct whisper_state * state) {
-    return state->lang_id;
-}
-
-int whisper_full_lang_id(struct whisper_context * ctx) {
-    return ctx->state->lang_id;
-}
-
-int64_t whisper_full_get_segment_t0_from_state(struct whisper_state * state, int i_segment) {
-    return state->result_all[i_segment].t0;
-}
-
-int64_t whisper_full_get_segment_t0(struct whisper_context * ctx, int i_segment) {
-    return ctx->state->result_all[i_segment].t0;
-}
-
-int64_t whisper_full_get_segment_t1_from_state(struct whisper_state * state, int i_segment) {
-    return state->result_all[i_segment].t1;
-}
-
-int64_t whisper_full_get_segment_t1(struct whisper_context * ctx, int i_segment) {
-    return ctx->state->result_all[i_segment].t1;
-}
-
-bool whisper_full_get_segment_speaker_turn_next_from_state(struct whisper_state * state, int i_segment) {
-    return state->result_all[i_segment].speaker_turn_next;
-}
-
-bool whisper_full_get_segment_speaker_turn_next(struct whisper_context * ctx, int i_segment) {
-    return ctx->state->result_all[i_segment].speaker_turn_next;
-}
-
-const char * whisper_full_get_segment_text_from_state(struct whisper_state * state, int i_segment) {
-    return state->result_all[i_segment].text.c_str();
-}
-
-const char * whisper_full_get_segment_text(struct whisper_context * ctx, int i_segment) {
-    return ctx->state->result_all[i_segment].text.c_str();
-}
-
-int whisper_full_n_tokens_from_state(struct whisper_state * state, int i_segment) {
-    return state->result_all[i_segment].tokens.size();
-}
-
-int whisper_full_n_tokens(struct whisper_context * ctx, int i_segment) {
-    return ctx->state->result_all[i_segment].tokens.size();
-}
-
-const char * whisper_full_get_token_text_from_state(struct whisper_context * ctx, struct whisper_state * state, int i_segment, int i_token) {
-    return ctx->vocab.id_to_token[state->result_all[i_segment].tokens[i_token].id].c_str();
-}
-
-const char* whisper_full_get_token_text(struct whisper_context * ctx, int i_segment, int i_token) {
-    return ctx->vocab.id_to_token[ctx->state->result_all[i_segment].tokens[i_token].id].c_str();
-}
-
-whisper_token whisper_full_get_token_id_from_state(struct whisper_state * state, int i_segment, int i_token) {
-    return state->result_all[i_segment].tokens[i_token].id;
-}
-
-whisper_token whisper_full_get_token_id(struct whisper_context * ctx, int i_segment, int i_token) {
-    return ctx->state->result_all[i_segment].tokens[i_token].id;
-}
-
-struct whisper_token_data whisper_full_get_token_data_from_state(struct whisper_state * state, int i_segment, int i_token) {
-    return state->result_all[i_segment].tokens[i_token];
-}
-
-struct whisper_token_data whisper_full_get_token_data(struct whisper_context * ctx, int i_segment, int i_token) {
-    return ctx->state->result_all[i_segment].tokens[i_token];
-}
-
-float whisper_full_get_token_p_from_state(struct whisper_state * state, int i_segment, int i_token) {
-    return state->result_all[i_segment].tokens[i_token].p;
-}
-
-float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int i_token) {
-    return ctx->state->result_all[i_segment].tokens[i_token].p;
-}
-
-// =================================================================================================
-
-//
-// Temporary interface needed for exposing ggml interface
-// Will be removed in the future when ggml becomes a separate library
-//
-
-WHISPER_API int whisper_bench_memcpy(int n_threads) {
-    fputs(whisper_bench_memcpy_str(n_threads), stderr);
-    return 0;
-}
-
-WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {
-    static std::string s;
-    s = "";
-    char strbuf[256];
-
-    ggml_time_init();
-
-    size_t n    = 20;
-    size_t arr  = n_threads > 0 ? 1024llu : n_threads; // trick to avoid compiler optimizations
-
-    // 1GB array
-    const size_t size = arr*1e6;
-
-    double sum  = 0.0;
-
-    // heat-up
-    {
-        char * src = (char *) malloc(size);
-        char * dst = (char *) malloc(size);
-
-        for (size_t i = 0; i < size; i++) src[i] = i;
-
-        memcpy(dst, src, size); // heat-up
-
-        double tsum = 0.0;
-
-        for (size_t i = 0; i < n; i++) {
-            const int64_t t0 = ggml_time_us();
-
-            memcpy(dst, src, size);
-
-            const int64_t t1 = ggml_time_us();
-
-            tsum += (t1 - t0)*1e-6;
-
-            src[rand() % size] = rand() % 256;
-        }
-
-        snprintf(strbuf, sizeof(strbuf), "memcpy: %7.2f GB/s (heat-up)\n", (double) (n*size)/(tsum*1e9));
-        s += strbuf;
-
-        // needed to prevent the compiler from optimizing the memcpy away
-        {
-            for (size_t i = 0; i < size; i++) sum += dst[i];
-        }
-
-        free(src);
-        free(dst);
-    }
-
-    // single-thread
-    {
-        char * src = (char *) malloc(size);
-        char * dst = (char *) malloc(size);
-
-        for (size_t i = 0; i < size; i++) src[i] = i;
-
-        memcpy(dst, src, size); // heat-up
-
-        double tsum = 0.0;
-
-        for (size_t i = 0; i < n; i++) {
-            const int64_t t0 = ggml_time_us();
-
-            memcpy(dst, src, size);
-
-            const int64_t t1 = ggml_time_us();
-
-            tsum += (t1 - t0)*1e-6;
-
-            src[rand() % size] = rand() % 256;
-        }
-
-        snprintf(strbuf, sizeof(strbuf), "memcpy: %7.2f GB/s ( 1 thread)\n", (double) (n*size)/(tsum*1e9));
-        s += strbuf;
-
-        // needed to prevent the compiler from optimizing the memcpy away
-        {
-            for (size_t i = 0; i < size; i++) sum += dst[i];
-        }
-
-        free(src);
-        free(dst);
-    }
-
-    // multi-thread
-
-    for (int32_t k = 1; k <= n_threads; k++) {
-        char * src = (char *) malloc(size);
-        char * dst = (char *) malloc(size);
-
-        for (size_t i = 0; i < size; i++) src[i] = i;
-
-        memcpy(dst, src, size); // heat-up
-
-        double tsum = 0.0;
-
-        auto helper = [&](int th) {
-            const int64_t i0 = (th + 0)*size/k;
-            const int64_t i1 = (th + 1)*size/k;
-
-            for (size_t i = 0; i < n; i++) {
-                memcpy(dst + i0, src + i0, i1 - i0);
-
-                src[i0 + rand() % (i1 - i0)] = rand() % 256;
-            };
-        };
-
-        const int64_t t0 = ggml_time_us();
-
-        std::vector<std::thread> threads(k - 1);
-        for (int32_t th = 0; th < k - 1; ++th) {
-            threads[th] = std::thread(helper, th);
-        }
-
-        helper(k - 1);
-
-        for (int32_t th = 0; th < k - 1; ++th) {
-            threads[th].join();
-        }
-
-        const int64_t t1 = ggml_time_us();
-
-        tsum += (t1 - t0)*1e-6;
-
-        snprintf(strbuf, sizeof(strbuf), "memcpy: %7.2f GB/s (%2d thread)\n", (double) (n*size)/(tsum*1e9), k);
-        s += strbuf;
-
-        // needed to prevent the compiler from optimizing the memcpy away
-        {
-            for (size_t i = 0; i < size; i++) sum += dst[i];
-        }
-
-        free(src);
-        free(dst);
-    }
-
-    snprintf(strbuf, sizeof(strbuf), "sum:    %f\n", sum);
-    s += strbuf;
-
-    return s.c_str();
-}
-
-WHISPER_API int whisper_bench_ggml_mul_mat(int n_threads) {
-    fputs(whisper_bench_ggml_mul_mat_str(n_threads), stderr);
-    return 0;
-}
-
-WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
-    static std::string s;
-    s = "";
-    char strbuf[256];
-
-    ggml_time_init();
-
-    const int n_max = 128;
-
-    const std::vector<size_t> sizes = {
-        64, 128, 256, 512, 1024, 2048, 4096,
-    };
-
-    const size_t N_max = sizes.back();
-
-    // a: N*N*sizeof(float)
-    // b: N*N*sizeof(float)
-    // c: N*N*sizeof(float)
-    // when F16 is used, there is an extra work buffer of size N*N*sizeof(float)
-    std::vector<uint8_t> buf(3llu*N_max*N_max*sizeof(float) + 3*ggml_tensor_overhead() + ggml_graph_overhead());
-    std::vector<uint8_t> work;
-
-    // put a bunch of random data in the buffer
-    for (size_t i = 0; i < buf.size(); i++) buf[i] = i;
-
-    for (int j = 0; j < (int) sizes.size(); j++) {
-        int n_q4_0 = 0;
-        int n_q4_1 = 0;
-        int n_q5_0 = 0;
-        int n_q5_1 = 0;
-        int n_q8_0 = 0;
-        int n_fp16 = 0;
-        int n_fp32 = 0;
-
-        // GFLOPS/s
-        double s_q4_0 = 0.0;
-        double s_q4_1 = 0.0;
-        double s_q5_0 = 0.0;
-        double s_q5_1 = 0.0;
-        double s_q8_0 = 0.0;
-        double s_fp16 = 0.0;
-        double s_fp32 = 0.0;
-
-        const size_t N = sizes[j];
-
-        for (int k = 0; k < 7; ++k) {
-            const ggml_type wtype =
-                k == 0 ? GGML_TYPE_Q4_0 :
-                k == 1 ? GGML_TYPE_Q4_1 :
-                k == 2 ? GGML_TYPE_Q5_0 :
-                k == 3 ? GGML_TYPE_Q5_1 :
-                k == 4 ? GGML_TYPE_Q8_0 :
-                k == 5 ? GGML_TYPE_F16  : GGML_TYPE_F32;
-
-            double & s = k == 0 ? s_q4_0 : k == 1 ? s_q4_1 : k == 2 ? s_q5_0 : k == 3 ? s_q5_1 : k == 4 ? s_q8_0 : k == 5 ? s_fp16 : /*k == 6*/ s_fp32;
-            int    & n = k == 0 ? n_q4_0 : k == 1 ? n_q4_1 : k == 2 ? n_q5_0 : k == 3 ? n_q5_1 : k == 4 ? n_q8_0 : k == 5 ? n_fp16 : /*k == 6*/ n_fp32;
-
-            struct ggml_init_params gparams = {
-                /*.mem_size   =*/ buf.size(),
-                /*.mem_buffer =*/ buf.data(),
-                /*.no_alloc   =*/ false,
-            };
-
-            struct ggml_context * ctx0 = ggml_init(gparams);
-
-            struct ggml_tensor * a = ggml_new_tensor_2d(ctx0, wtype,         N, N);
-            struct ggml_tensor * b = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, N, N);
-
-            struct ggml_tensor * c = ggml_mul_mat(ctx0, a, b);
-
-            struct ggml_cgraph * gf = ggml_new_graph(ctx0);
-
-            ggml_build_forward_expand(gf, c);
-
-            double tsum = 0.0;
-
-            // heat-up
-            ggml_graph_compute_helper(gf, work, n_threads, nullptr, nullptr);
-
-            for (int i = 0; i < n_max; ++i) {
-                const int64_t t0 = ggml_time_us();
-
-                ggml_graph_compute_helper(gf, work, n_threads, nullptr, nullptr);
-
-                const int64_t t1 = ggml_time_us();
-
-                tsum += (t1 - t0)*1e-6;
-                n++;
-
-                if (tsum > 1.0 && n >= 3) {
-                    break;
-                }
-            }
-
-            ggml_free(ctx0);
-
-            s = ((2.0*N*N*N*n)/tsum)*1e-9;
-        }
-
-        // Q4_0 | Q4_1
-        snprintf(strbuf, sizeof(strbuf), "%4zu x %4zu: Q4_0 %7.1f GFLOPS (%3d runs) | Q4_1 %7.1f GFLOPS (%3d runs)\n",
-                N, N, s_q4_0, n_q4_0, s_q4_1, n_q4_1);
-        s += strbuf;
-
-        // Q5_0 | Q5_1 | Q8_0
-        snprintf(strbuf, sizeof(strbuf), "%4zu x %4zu: Q5_0 %7.1f GFLOPS (%3d runs) | Q5_1 %7.1f GFLOPS (%3d runs) | Q8_0 %7.1f GFLOPS (%3d runs)\n",
-                N, N, s_q5_0, n_q5_0, s_q5_1, n_q5_1, s_q8_0, n_q8_0);
-        s += strbuf;
-
-        // F16 | F32
-        snprintf(strbuf, sizeof(strbuf), "%4zu x %4zu: F16  %7.1f GFLOPS (%3d runs) | F32  %7.1f GFLOPS (%3d runs)\n",
-                N, N, s_fp16, n_fp16, s_fp32, n_fp32);
-        s += strbuf;
-    }
-
-    return s.c_str();
-}
-
-// =================================================================================================
-
-// =================================================================================================
-
-//
-// Experimental stuff below
-//
-// Not sure if these should be part of the library at all, because the quality of the results is not
-// guaranteed. Might get removed at some point unless a robust algorithm implementation is found
-//
-
-// =================================================================================================
-
-//
-// token-level timestamps
-//
-
-static int timestamp_to_sample(int64_t t, int n_samples) {
-    return std::max(0, std::min((int) n_samples - 1, (int) ((t*WHISPER_SAMPLE_RATE)/100)));
-}
-
-static int64_t sample_to_timestamp(int i_sample) {
-    return (100ll*i_sample)/WHISPER_SAMPLE_RATE;
-}
-
-// a cost-function / heuristic that is high for text that takes longer to pronounce
-// obviously, can be improved
-static float voice_length(const std::string & text) {
-    float res = 0.0f;
-
-    for (char c : text) {
-        if (c == ' ') {
-            res += 0.01f;
-        } else if (c == ',') {
-            res += 2.00f;
-        } else if (c == '.') {
-            res += 3.00f;
-        } else if (c == '!') {
-            res += 3.00f;
-        } else if (c == '?') {
-            res += 3.00f;
-        } else if (c >= '0' && c <= '9') {
-            res += 3.00f;
-        } else {
-            res += 1.00f;
-        }
-    }
-
-    return res;
-}
-
-// average the fabs of the signal
-static std::vector<float> get_signal_energy(const float * signal, int n_samples, int n_samples_per_half_window) {
-    const int hw = n_samples_per_half_window;
-
-    std::vector<float> result(n_samples);
-
-    for (int i = 0; i < n_samples; i++) {
-        float sum = 0;
-        for (int j = -hw; j <= hw; j++) {
-            if (i + j >= 0 && i + j < n_samples) {
-                sum += fabs(signal[i + j]);
-            }
-        }
-        result[i] = sum/(2*hw + 1);
-    }
-
-    return result;
-}
-
-static void whisper_exp_compute_token_level_timestamps(
-        struct whisper_context & ctx,
-          struct whisper_state & state,
-                           int   i_segment,
-                         float   thold_pt,
-                         float   thold_ptsum) {
-    auto & segment = state.result_all[i_segment];
-    auto & tokens  = segment.tokens;
-
-    const int n_samples = state.energy.size();
-
-    if (n_samples == 0) {
-        WHISPER_LOG_ERROR("%s: no signal data available\n", __func__);
-        return;
-    }
-
-    const int64_t t0 = segment.t0;
-    const int64_t t1 = segment.t1;
-
-    const int n = tokens.size();
-
-    if (n == 0) {
-        return;
-    }
-
-    if (n == 1) {
-        tokens[0].t0 = t0;
-        tokens[0].t1 = t1;
-
-        return;
-    }
-
-    auto & t_beg    = state.t_beg;
-    auto & t_last   = state.t_last;
-    auto & tid_last = state.tid_last;
-
-    for (int j = 0; j < n; ++j) {
-        auto & token = tokens[j];
-
-        if (j == 0) {
-            if (token.id == whisper_token_beg(&ctx)) {
-                tokens[j    ].t0 = t0;
-                tokens[j    ].t1 = t0;
-                tokens[j + 1].t0 = t0;
-
-                t_beg    = t0;
-                t_last   = t0;
-                tid_last = whisper_token_beg(&ctx);
-            } else {
-                tokens[j    ].t0 = t_last;
-            }
-        }
-
-        const int64_t tt = t_beg + 2*(token.tid - whisper_token_beg(&ctx));
-
-        tokens[j].id    = token.id;
-        tokens[j].tid   = token.tid;
-        tokens[j].p     = token.p;
-        tokens[j].pt    = token.pt;
-        tokens[j].ptsum = token.ptsum;
-
-        tokens[j].vlen = voice_length(whisper_token_to_str(&ctx, token.id));
-
-        if (token.pt > thold_pt && token.ptsum > thold_ptsum && token.tid > tid_last && tt <= t1) {
-            if (j > 0) {
-                tokens[j - 1].t1 = tt;
-            }
-            tokens[j].t0 = tt;
-            tid_last = token.tid;
-        }
-    }
-
-    tokens[n - 2].t1 = t1;
-    tokens[n - 1].t0 = t1;
-    tokens[n - 1].t1 = t1;
-
-    t_last = t1;
-
-    // find intervals of tokens with unknown timestamps
-    // fill the timestamps by proportionally splitting the interval based on the token voice lengths
-    {
-        int p0 = 0;
-        int p1 = 0;
-
-        while (true) {
-            while (p1 < n && tokens[p1].t1 < 0) {
-                p1++;
-            }
-
-            if (p1 >= n) {
-                p1--;
-            }
-
-            //printf("p0=%d p1=%d t0=%lld t1=%lld\n", p0, p1, tokens[p0].t0, tokens[p1].t1);
-
-            if (p1 > p0) {
-                double psum = 0.0;
-                for (int j = p0; j <= p1; j++) {
-                    psum += tokens[j].vlen;
-                }
-
-                //printf("analyzing %d - %d, psum = %f\n", p0, p1, psum);
-
-                const double dt = tokens[p1].t1 - tokens[p0].t0;
-
-                // split the time proportionally to the voice length
-                for (int j = p0 + 1; j <= p1; j++) {
-                    const double ct = tokens[j - 1].t0 + dt*tokens[j - 1].vlen/psum;
-
-                    tokens[j - 1].t1 = ct;
-                    tokens[j    ].t0 = ct;
-                }
-            }
-
-            p1++;
-            p0 = p1;
-            if (p1 >= n) {
-                break;
-            }
-        }
-    }
-
-    // fix up (just in case)
-    for (int j = 0; j < n - 1; j++) {
-        if (tokens[j].t1 < 0) {
-            tokens[j + 1].t0 = tokens[j].t1;
-        }
-
-        if (j > 0) {
-            if (tokens[j - 1].t1 > tokens[j].t0) {
-                tokens[j].t0 = tokens[j - 1].t1;
-                tokens[j].t1 = std::max(tokens[j].t0, tokens[j].t1);
-            }
-        }
-    }
-
-    // VAD
-    // expand or contract tokens based on voice activity
-    {
-        const int hw = WHISPER_SAMPLE_RATE/8;
-
-        for (int j = 0; j < n; j++) {
-            if (tokens[j].id >= whisper_token_eot(&ctx)) {
-                continue;
-            }
-
-            int s0 = timestamp_to_sample(tokens[j].t0, n_samples);
-            int s1 = timestamp_to_sample(tokens[j].t1, n_samples);
-
-            const int ss0 = std::max(s0 - hw, 0);
-            const int ss1 = std::min(s1 + hw, n_samples);
-
-            const int ns = ss1 - ss0;
-
-            float sum = 0.0f;
-
-            for (int k = ss0; k < ss1; k++) {
-                sum += state.energy[k];
-            }
-
-            const float thold = 0.5*sum/ns;
-
-            {
-                int k = s0;
-                if (state.energy[k] > thold && j > 0) {
-                    while (k > 0 && state.energy[k] > thold) {
-                        k--;
-                    }
-                    tokens[j].t0 = sample_to_timestamp(k);
-                    if (tokens[j].t0 < tokens[j - 1].t1) {
-                        tokens[j].t0 = tokens[j - 1].t1;
-                    } else {
-                        s0 = k;
-                    }
-                } else {
-                    while (state.energy[k] < thold && k < s1) {
-                        k++;
-                    }
-                    s0 = k;
-                    tokens[j].t0 = sample_to_timestamp(k);
-                }
-            }
-
-            {
-                int k = s1;
-                if (state.energy[k] > thold) {
-                    while (k < n_samples - 1 && state.energy[k] > thold) {
-                        k++;
-                    }
-                    tokens[j].t1 = sample_to_timestamp(k);
-                    if (j < ns - 1 && tokens[j].t1 > tokens[j + 1].t0) {
-                        tokens[j].t1 = tokens[j + 1].t0;
-                    } else {
-                        s1 = k;
-                    }
-                } else {
-                    while (state.energy[k] < thold && k > s0) {
-                        k--;
-                    }
-                    s1 = k;
-                    tokens[j].t1 = sample_to_timestamp(k);
-                }
-            }
-        }
-    }
-
-    // fixed token expand (optional)
-    //{
-    //    const int t_expand = 0;
-
-    //    for (int j = 0; j < n; j++) {
-    //        if (j > 0) {
-    //            tokens[j].t0 = std::max(0, (int) (tokens[j].t0 - t_expand));
-    //        }
-    //        if (j < n - 1) {
-    //            tokens[j].t1 = tokens[j].t1 + t_expand;
-    //        }
-    //    }
-    //}
-
-    // debug info
-    //for (int j = 0; j < n; ++j) {
-    //    const auto & token = tokens[j];
-    //    const auto tt = token.pt > thold_pt && token.ptsum > 0.01 ? whisper_token_to_str(&ctx, token.tid) : "[?]";
-    //    printf("%s: %10s %6.3f %6.3f %6.3f %6.3f %5d %5d '%s'\n", __func__,
-    //            tt, token.p, token.pt, token.ptsum, token.vlen, (int) token.t0, (int) token.t1, whisper_token_to_str(&ctx, token.id));
-
-    //    if (tokens[j].id >= whisper_token_eot(&ctx)) {
-    //        continue;
-    //    }
-    //}
-}
-
-void whisper_log_set(ggml_log_callback log_callback, void * user_data) {
-    g_state.log_callback = log_callback ? log_callback : whisper_log_callback_default;
-    g_state.log_callback_user_data = user_data;
-}
-
-GGML_ATTRIBUTE_FORMAT(2, 3)
-static void whisper_log_internal(ggml_log_level level, const char * format, ...) {
-    va_list args;
-    va_start(args, format);
-    char buffer[1024];
-    int len = vsnprintf(buffer, 1024, format, args);
-    if (len < 1024) {
-        g_state.log_callback(level, buffer, g_state.log_callback_user_data);
-    } else {
-        char* buffer2 = new char[len+1];
-        vsnprintf(buffer2, len+1, format, args);
-        buffer2[len] = 0;
-        g_state.log_callback(level, buffer2, g_state.log_callback_user_data);
-        delete[] buffer2;
-    }
-    va_end(args);
-}
-
-static void whisper_log_callback_default(ggml_log_level level, const char * text, void * user_data) {
-    (void) level;
-    (void) user_data;
-    fputs(text, stderr);
-    fflush(stderr);
-}
diff --git a/ggml/examples/whisper/whisper.h b/ggml/examples/whisper/whisper.h
deleted file mode 100644
index 3143cea..0000000
--- a/ggml/examples/whisper/whisper.h
+++ /dev/null
@@ -1,625 +0,0 @@
-#ifndef WHISPER_H
-#define WHISPER_H
-
-#include "ggml.h"
-
-#include <stddef.h>
-#include <stdint.h>
-#include <stdbool.h>
-
-#ifdef __GNUC__
-#    define WHISPER_DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
-#elif defined(_MSC_VER)
-#    define WHISPER_DEPRECATED(func, hint) __declspec(deprecated(hint)) func
-#else
-#    define WHISPER_DEPRECATED(func, hint) func
-#endif
-
-#ifdef WHISPER_SHARED
-#    ifdef _WIN32
-#        ifdef WHISPER_BUILD
-#            define WHISPER_API __declspec(dllexport)
-#        else
-#            define WHISPER_API __declspec(dllimport)
-#        endif
-#    else
-#        define WHISPER_API __attribute__ ((visibility ("default")))
-#    endif
-#else
-#    define WHISPER_API
-#endif
-
-#define WHISPER_SAMPLE_RATE 16000
-#define WHISPER_N_FFT       400
-#define WHISPER_HOP_LENGTH  160
-#define WHISPER_CHUNK_SIZE  30
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-    //
-    // C interface
-    //
-    // The following interface is thread-safe as long as the sample whisper_context is not used by multiple threads
-    // concurrently.
-    //
-    // Basic usage:
-    //
-    //     #include "whisper.h"
-    //
-    //     ...
-    //
-    //     whisper_context_params cparams = whisper_context_default_params();
-    //
-    //     struct whisper_context * ctx = whisper_init_from_file_with_params("/path/to/ggml-base.en.bin", cparams);
-    //
-    //     if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
-    //         fprintf(stderr, "failed to process audio\n");
-    //         return 7;
-    //     }
-    //
-    //     const int n_segments = whisper_full_n_segments(ctx);
-    //     for (int i = 0; i < n_segments; ++i) {
-    //         const char * text = whisper_full_get_segment_text(ctx, i);
-    //         printf("%s", text);
-    //     }
-    //
-    //     whisper_free(ctx);
-    //
-    //     ...
-    //
-    // This is a demonstration of the most straightforward usage of the library.
-    // "pcmf32" contains the RAW audio data in 32-bit floating point format.
-    //
-    // The interface also allows for more fine-grained control over the computation, but it requires a deeper
-    // understanding of how the model works.
-    //
-
-    struct whisper_context;
-    struct whisper_state;
-    struct whisper_full_params;
-
-    typedef int32_t whisper_pos;
-    typedef int32_t whisper_token;
-    typedef int32_t whisper_seq_id;
-
-    struct whisper_context_params {
-        bool  use_gpu;
-    };
-
-    typedef struct whisper_token_data {
-        whisper_token id;  // token id
-        whisper_token tid; // forced timestamp token id
-
-        float p;           // probability of the token
-        float plog;        // log probability of the token
-        float pt;          // probability of the timestamp token
-        float ptsum;       // sum of probabilities of all timestamp tokens
-
-        // token-level timestamp data
-        // do not use if you haven't computed token-level timestamps
-        int64_t t0;        // start time of the token
-        int64_t t1;        //   end time of the token
-
-        float vlen;        // voice length of the token
-    } whisper_token_data;
-
-    typedef struct whisper_model_loader {
-        void * context;
-
-        size_t (*read)(void * ctx, void * output, size_t read_size);
-        bool    (*eof)(void * ctx);
-        void  (*close)(void * ctx);
-    } whisper_model_loader;
-
-    // grammar element type
-    enum whisper_gretype {
-        // end of rule definition
-        WHISPER_GRETYPE_END            = 0,
-
-        // start of alternate definition for rule
-        WHISPER_GRETYPE_ALT            = 1,
-
-        // non-terminal element: reference to rule
-        WHISPER_GRETYPE_RULE_REF       = 2,
-
-        // terminal element: character (code point)
-        WHISPER_GRETYPE_CHAR           = 3,
-
-        // inverse char(s) ([^a], [^a-b] [^abc])
-        WHISPER_GRETYPE_CHAR_NOT       = 4,
-
-        // modifies a preceding WHISPER_GRETYPE_CHAR or LLAMA_GRETYPE_CHAR_ALT to
-        // be an inclusive range ([a-z])
-        WHISPER_GRETYPE_CHAR_RNG_UPPER = 5,
-
-        // modifies a preceding WHISPER_GRETYPE_CHAR or
-        // WHISPER_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
-        WHISPER_GRETYPE_CHAR_ALT       = 6,
-    };
-
-    typedef struct whisper_grammar_element {
-        enum whisper_gretype type;
-        uint32_t             value; // Unicode code point or rule ID
-    } whisper_grammar_element;
-
-    // Various functions for loading a ggml whisper model.
-    // Allocate (almost) all memory needed for the model.
-    // Return NULL on failure
-    WHISPER_API struct whisper_context * whisper_init_from_file_with_params  (const char * path_model,              struct whisper_context_params params);
-    WHISPER_API struct whisper_context * whisper_init_from_buffer_with_params(void * buffer, size_t buffer_size,    struct whisper_context_params params);
-    WHISPER_API struct whisper_context * whisper_init_with_params            (struct whisper_model_loader * loader, struct whisper_context_params params);
-
-    // These are the same as the above, but the internal state of the context is not allocated automatically
-    // It is the responsibility of the caller to allocate the state using whisper_init_state() (#523)
-    WHISPER_API struct whisper_context * whisper_init_from_file_with_params_no_state  (const char * path_model,              struct whisper_context_params params);
-    WHISPER_API struct whisper_context * whisper_init_from_buffer_with_params_no_state(void * buffer, size_t buffer_size,    struct whisper_context_params params);
-    WHISPER_API struct whisper_context * whisper_init_with_params_no_state            (struct whisper_model_loader * loader, struct whisper_context_params params);
-
-    WHISPER_DEPRECATED(
-        WHISPER_API struct whisper_context * whisper_init_from_file(const char * path_model),
-        "use whisper_init_from_file_with_params instead"
-    );
-    WHISPER_DEPRECATED(
-        WHISPER_API struct whisper_context * whisper_init_from_buffer(void * buffer, size_t buffer_size),
-        "use whisper_init_from_buffer_with_params instead"
-    );
-    WHISPER_DEPRECATED(
-        WHISPER_API struct whisper_context * whisper_init(struct whisper_model_loader * loader),
-        "use whisper_init_with_params instead"
-    );
-    WHISPER_DEPRECATED(
-        WHISPER_API struct whisper_context * whisper_init_from_file_no_state(const char * path_model),
-        "use whisper_init_from_file_with_params_no_state instead"
-    );
-    WHISPER_DEPRECATED(
-        WHISPER_API struct whisper_context * whisper_init_from_buffer_no_state(void * buffer, size_t buffer_size),
-        "use whisper_init_from_buffer_with_params_no_state instead"
-    );
-    WHISPER_DEPRECATED(
-        WHISPER_API struct whisper_context * whisper_init_no_state(struct whisper_model_loader * loader),
-        "use whisper_init_with_params_no_state instead"
-    );
-
-    WHISPER_API struct whisper_state * whisper_init_state(struct whisper_context * ctx);
-
-    // Given a context, enable use of OpenVINO for encode inference.
-    // model_path: Optional path to OpenVINO encoder IR model. If set to nullptr,
-    //                      the path will be generated from the ggml model path that was passed
-    //                      in to whisper_init_from_file. For example, if 'path_model' was
-    //                      "/path/to/ggml-base.en.bin", then OpenVINO IR model path will be
-    //                      assumed to be "/path/to/ggml-base.en-encoder-openvino.xml".
-    // device: OpenVINO device to run inference on ("CPU", "GPU", etc.)
-    // cache_dir: Optional cache directory that can speed up init time, especially for
-    //                     GPU, by caching compiled 'blobs' there.
-    //                     Set to nullptr if not used.
-    // Returns 0 on success. If OpenVINO is not enabled in build, this simply returns 1.
-    WHISPER_API int whisper_ctx_init_openvino_encoder(
-        struct whisper_context * ctx,
-                    const char * model_path,
-                    const char * device,
-                    const char * cache_dir);
-
-    // Frees all allocated memory
-    WHISPER_API void whisper_free      (struct whisper_context * ctx);
-    WHISPER_API void whisper_free_state(struct whisper_state * state);
-    WHISPER_API void whisper_free_params(struct whisper_full_params * params);
-    WHISPER_API void whisper_free_context_params(struct whisper_context_params * params);
-
-    // Convert RAW PCM audio to log mel spectrogram.
-    // The resulting spectrogram is stored inside the default state of the provided whisper context.
-    // Returns 0 on success
-    WHISPER_API int whisper_pcm_to_mel(
-            struct whisper_context * ctx,
-                       const float * samples,
-                               int   n_samples,
-                               int   n_threads);
-
-    WHISPER_API int whisper_pcm_to_mel_with_state(
-            struct whisper_context * ctx,
-              struct whisper_state * state,
-                       const float * samples,
-                               int   n_samples,
-                               int   n_threads);
-
-    // Convert RAW PCM audio to log mel spectrogram but applies a Phase Vocoder to speed up the audio x2.
-    // The resulting spectrogram is stored inside the default state of the provided whisper context.
-    // Returns 0 on success
-    WHISPER_API int whisper_pcm_to_mel_phase_vocoder(
-        struct whisper_context * ctx,
-                   const float * samples,
-                           int   n_samples,
-                           int   n_threads);
-
-    WHISPER_API int whisper_pcm_to_mel_phase_vocoder_with_state(
-        struct whisper_context * ctx,
-          struct whisper_state * state,
-                   const float * samples,
-                           int   n_samples,
-                           int   n_threads);
-
-    // This can be used to set a custom log mel spectrogram inside the default state of the provided whisper context.
-    // Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram.
-    // n_mel must be 80
-    // Returns 0 on success
-    WHISPER_API int whisper_set_mel(
-            struct whisper_context * ctx,
-                       const float * data,
-                               int   n_len,
-                               int   n_mel);
-
-    WHISPER_API int whisper_set_mel_with_state(
-            struct whisper_context * ctx,
-              struct whisper_state * state,
-                       const float * data,
-                               int   n_len,
-                               int   n_mel);
-
-    // Run the Whisper encoder on the log mel spectrogram stored inside the default state in the provided whisper context.
-    // Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first.
-    // offset can be used to specify the offset of the first frame in the spectrogram.
-    // Returns 0 on success
-    WHISPER_API int whisper_encode(
-            struct whisper_context * ctx,
-                               int   offset,
-                               int   n_threads);
-
-    WHISPER_API int whisper_encode_with_state(
-            struct whisper_context * ctx,
-              struct whisper_state * state,
-                               int   offset,
-                               int   n_threads);
-
-    // Run the Whisper decoder to obtain the logits and probabilities for the next token.
-    // Make sure to call whisper_encode() first.
-    // tokens + n_tokens is the provided context for the decoder.
-    // n_past is the number of tokens to use from previous decoder calls.
-    // Returns 0 on success
-    // TODO: add support for multiple decoders
-    WHISPER_API int whisper_decode(
-            struct whisper_context * ctx,
-               const whisper_token * tokens,
-                               int   n_tokens,
-                               int   n_past,
-                               int   n_threads);
-
-    WHISPER_API int whisper_decode_with_state(
-            struct whisper_context * ctx,
-              struct whisper_state * state,
-               const whisper_token * tokens,
-                               int   n_tokens,
-                               int   n_past,
-                               int   n_threads);
-
-    // Convert the provided text into tokens.
-    // The tokens pointer must be large enough to hold the resulting tokens.
-    // Returns the number of tokens on success, no more than n_max_tokens
-    // Returns -1 on failure
-    // TODO: not sure if correct
-    WHISPER_API int whisper_tokenize(
-            struct whisper_context * ctx,
-                        const char * text,
-                     whisper_token * tokens,
-                               int   n_max_tokens);
-
-    // Largest language id (i.e. number of available languages - 1)
-    WHISPER_API int whisper_lang_max_id();
-
-    // Return the id of the specified language, returns -1 if not found
-    // Examples:
-    //   "de" -> 2
-    //   "german" -> 2
-    WHISPER_API int whisper_lang_id(const char * lang);
-
-    // Return the short string of the specified language id (e.g. 2 -> "de"), returns nullptr if not found
-    WHISPER_API const char * whisper_lang_str(int id);
-
-    // Return the short string of the specified language name (e.g. 2 -> "german"), returns nullptr if not found
-    WHISPER_API const char * whisper_lang_str_full(int id);
-
-    // Use mel data at offset_ms to try and auto-detect the spoken language
-    // Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first
-    // Returns the top language id or negative on failure
-    // If not null, fills the lang_probs array with the probabilities of all languages
-    // The array must be whisper_lang_max_id() + 1 in size
-    // ref: https://github.com/openai/whisper/blob/main/whisper/decoding.py#L18-L69
-    WHISPER_API int whisper_lang_auto_detect(
-            struct whisper_context * ctx,
-                               int   offset_ms,
-                               int   n_threads,
-                             float * lang_probs);
-
-    WHISPER_API int whisper_lang_auto_detect_with_state(
-            struct whisper_context * ctx,
-              struct whisper_state * state,
-                               int   offset_ms,
-                               int   n_threads,
-                             float * lang_probs);
-
-    WHISPER_API int whisper_n_len           (struct whisper_context * ctx); // mel length
-    WHISPER_API int whisper_n_len_from_state(struct whisper_state * state); // mel length
-    WHISPER_API int whisper_n_vocab         (struct whisper_context * ctx);
-    WHISPER_API int whisper_n_text_ctx      (struct whisper_context * ctx);
-    WHISPER_API int whisper_n_audio_ctx     (struct whisper_context * ctx);
-    WHISPER_API int whisper_is_multilingual (struct whisper_context * ctx);
-
-    WHISPER_API int whisper_model_n_vocab      (struct whisper_context * ctx);
-    WHISPER_API int whisper_model_n_audio_ctx  (struct whisper_context * ctx);
-    WHISPER_API int whisper_model_n_audio_state(struct whisper_context * ctx);
-    WHISPER_API int whisper_model_n_audio_head (struct whisper_context * ctx);
-    WHISPER_API int whisper_model_n_audio_layer(struct whisper_context * ctx);
-    WHISPER_API int whisper_model_n_text_ctx   (struct whisper_context * ctx);
-    WHISPER_API int whisper_model_n_text_state (struct whisper_context * ctx);
-    WHISPER_API int whisper_model_n_text_head  (struct whisper_context * ctx);
-    WHISPER_API int whisper_model_n_text_layer (struct whisper_context * ctx);
-    WHISPER_API int whisper_model_n_mels       (struct whisper_context * ctx);
-    WHISPER_API int whisper_model_ftype        (struct whisper_context * ctx);
-    WHISPER_API int whisper_model_type         (struct whisper_context * ctx);
-
-    // Token logits obtained from the last call to whisper_decode()
-    // The logits for the last token are stored in the last row
-    // Rows: n_tokens
-    // Cols: n_vocab
-    WHISPER_API float * whisper_get_logits           (struct whisper_context * ctx);
-    WHISPER_API float * whisper_get_logits_from_state(struct whisper_state * state);
-
-    // Token Id -> String. Uses the vocabulary in the provided context
-    WHISPER_API const char * whisper_token_to_str(struct whisper_context * ctx, whisper_token token);
-    WHISPER_API const char * whisper_model_type_readable(struct whisper_context * ctx);
-
-
-    // Special tokens
-    WHISPER_API whisper_token whisper_token_eot (struct whisper_context * ctx);
-    WHISPER_API whisper_token whisper_token_sot (struct whisper_context * ctx);
-    WHISPER_API whisper_token whisper_token_solm(struct whisper_context * ctx);
-    WHISPER_API whisper_token whisper_token_prev(struct whisper_context * ctx);
-    WHISPER_API whisper_token whisper_token_nosp(struct whisper_context * ctx);
-    WHISPER_API whisper_token whisper_token_not (struct whisper_context * ctx);
-    WHISPER_API whisper_token whisper_token_beg (struct whisper_context * ctx);
-    WHISPER_API whisper_token whisper_token_lang(struct whisper_context * ctx, int lang_id);
-
-    // Task tokens
-    WHISPER_API whisper_token whisper_token_translate (struct whisper_context * ctx);
-    WHISPER_API whisper_token whisper_token_transcribe(struct whisper_context * ctx);
-
-    // Performance information from the default state.
-    WHISPER_API void whisper_print_timings(struct whisper_context * ctx);
-    WHISPER_API void whisper_reset_timings(struct whisper_context * ctx);
-
-    // Print system information
-    WHISPER_API const char * whisper_print_system_info(void);
-
-    ////////////////////////////////////////////////////////////////////////////
-
-    // Available sampling strategies
-    enum whisper_sampling_strategy {
-        WHISPER_SAMPLING_GREEDY,      // similar to OpenAI's GreedyDecoder
-        WHISPER_SAMPLING_BEAM_SEARCH, // similar to OpenAI's BeamSearchDecoder
-    };
-
-    // Text segment callback
-    // Called on every newly generated text segment
-    // Use the whisper_full_...() functions to obtain the text segments
-    typedef void (*whisper_new_segment_callback)(struct whisper_context * ctx, struct whisper_state * state, int n_new, void * user_data);
-
-    // Progress callback
-    typedef void (*whisper_progress_callback)(struct whisper_context * ctx, struct whisper_state * state, int progress, void * user_data);
-
-    // Encoder begin callback
-    // If not NULL, called before the encoder starts
-    // If it returns false, the computation is aborted
-    typedef bool (*whisper_encoder_begin_callback)(struct whisper_context * ctx, struct whisper_state * state, void * user_data);
-
-    // Abort callback
-    // If not NULL, called before ggml computation
-    // If it returns true, the computation is aborted
-    typedef bool (*whisper_abort_callback)(void * user_data);
-
-    // Logits filter callback
-    // Can be used to modify the logits before sampling
-    // If not NULL, called after applying temperature to logits
-    typedef void (*whisper_logits_filter_callback)(
-            struct whisper_context * ctx,
-              struct whisper_state * state,
-          const whisper_token_data * tokens,
-                               int   n_tokens,
-                             float * logits,
-                              void * user_data);
-
-    // Parameters for the whisper_full() function
-    // If you change the order or add new parameters, make sure to update the default values in whisper.cpp:
-    // whisper_full_default_params()
-    struct whisper_full_params {
-        enum whisper_sampling_strategy strategy;
-
-        int n_threads;
-        int n_max_text_ctx;     // max tokens to use from past text as prompt for the decoder
-        int offset_ms;          // start offset in ms
-        int duration_ms;        // audio duration to process in ms
-
-        bool translate;
-        bool no_context;        // do not use past transcription (if any) as initial prompt for the decoder
-        bool no_timestamps;     // do not generate timestamps
-        bool single_segment;    // force single segment output (useful for streaming)
-        bool print_special;     // print special tokens (e.g. <SOT>, <EOT>, <BEG>, etc.)
-        bool print_progress;    // print progress information
-        bool print_realtime;    // print results from within whisper.cpp (avoid it, use callback instead)
-        bool print_timestamps;  // print timestamps for each text segment when printing realtime
-
-        // [EXPERIMENTAL] token-level timestamps
-        bool  token_timestamps; // enable token-level timestamps
-        float thold_pt;         // timestamp token probability threshold (~0.01)
-        float thold_ptsum;      // timestamp token sum probability threshold (~0.01)
-        int   max_len;          // max segment length in characters
-        bool  split_on_word;    // split on word rather than on token (when used with max_len)
-        int   max_tokens;       // max tokens per segment (0 = no limit)
-
-        // [EXPERIMENTAL] speed-up techniques
-        // note: these can significantly reduce the quality of the output
-        bool speed_up;          // speed-up the audio by 2x using Phase Vocoder
-        bool debug_mode;        // enable debug_mode provides extra info (eg. Dump log_mel)
-        int  audio_ctx;         // overwrite the audio context size (0 = use default)
-
-        // [EXPERIMENTAL] [TDRZ] tinydiarize
-        bool tdrz_enable;       // enable tinydiarize speaker turn detection
-
-        // tokens to provide to the whisper decoder as initial prompt
-        // these are prepended to any existing text context from a previous call
-        const char * initial_prompt;
-        const whisper_token * prompt_tokens;
-        int prompt_n_tokens;
-
-        // for auto-detection, set to nullptr, "" or "auto"
-        const char * language;
-        bool detect_language;
-
-        // common decoding parameters:
-        bool suppress_blank;    // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/decoding.py#L89
-        bool suppress_non_speech_tokens; // ref: https://github.com/openai/whisper/blob/7858aa9c08d98f75575035ecd6481f462d66ca27/whisper/tokenizer.py#L224-L253
-
-        float temperature;      // initial decoding temperature, ref: https://ai.stackexchange.com/a/32478
-        float max_initial_ts;   // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/decoding.py#L97
-        float length_penalty;   // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L267
-
-        // fallback parameters
-        // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L274-L278
-        float temperature_inc;
-        float entropy_thold;    // similar to OpenAI's "compression_ratio_threshold"
-        float logprob_thold;
-        float no_speech_thold;  // TODO: not implemented
-
-        struct {
-            int best_of;    // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L264
-        } greedy;
-
-        struct {
-            int beam_size;  // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L265
-
-            float patience; // TODO: not implemented, ref: https://arxiv.org/pdf/2204.05424.pdf
-        } beam_search;
-
-        // called for every newly generated text segment
-        whisper_new_segment_callback new_segment_callback;
-        void * new_segment_callback_user_data;
-
-        // called on each progress update
-        whisper_progress_callback progress_callback;
-        void * progress_callback_user_data;
-
-        // called each time before the encoder starts
-        whisper_encoder_begin_callback encoder_begin_callback;
-        void * encoder_begin_callback_user_data;
-
-        // called each time before ggml computation starts
-        whisper_abort_callback abort_callback;
-        void * abort_callback_user_data;
-
-        // called by each decoder to filter obtained logits
-        whisper_logits_filter_callback logits_filter_callback;
-        void * logits_filter_callback_user_data;
-
-        const whisper_grammar_element ** grammar_rules;
-        size_t                           n_grammar_rules;
-        size_t                           i_start_rule;
-        float                            grammar_penalty;
-    };
-
-    // NOTE: this function allocates memory, and it is the responsibility of the caller to free the pointer - see whisper_free_context_params & whisper_free_params()
-    WHISPER_API struct whisper_context_params * whisper_context_default_params_by_ref();
-    WHISPER_API struct whisper_context_params whisper_context_default_params(void);
-    WHISPER_API struct whisper_full_params * whisper_full_default_params_by_ref(enum whisper_sampling_strategy strategy);
-    WHISPER_API struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy);
-
-    // Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
-    // Not thread safe for same context
-    // Uses the specified decoding strategy to obtain the text.
-    WHISPER_API int whisper_full(
-                struct whisper_context * ctx,
-            struct whisper_full_params   params,
-                           const float * samples,
-                                   int   n_samples);
-
-    WHISPER_API int whisper_full_with_state(
-                struct whisper_context * ctx,
-                  struct whisper_state * state,
-            struct whisper_full_params   params,
-                           const float * samples,
-                                   int   n_samples);
-
-    // Split the input audio in chunks and process each chunk separately using whisper_full_with_state()
-    // Result is stored in the default state of the context
-    // Not thread safe if executed in parallel on the same context.
-    // It seems this approach can offer some speedup in some cases.
-    // However, the transcription accuracy can be worse at the beginning and end of each chunk.
-    WHISPER_API int whisper_full_parallel(
-                struct whisper_context * ctx,
-            struct whisper_full_params   params,
-                           const float * samples,
-                                   int   n_samples,
-                                   int   n_processors);
-
-    // Number of generated text segments
-    // A segment can be a few words, a sentence, or even a paragraph.
-    WHISPER_API int whisper_full_n_segments           (struct whisper_context * ctx);
-    WHISPER_API int whisper_full_n_segments_from_state(struct whisper_state * state);
-
-    // Language id associated with the context's default state
-    WHISPER_API int whisper_full_lang_id(struct whisper_context * ctx);
-
-    // Language id associated with the provided state
-    WHISPER_API int whisper_full_lang_id_from_state(struct whisper_state * state);
-
-    // Get the start and end time of the specified segment
-    WHISPER_API int64_t whisper_full_get_segment_t0           (struct whisper_context * ctx, int i_segment);
-    WHISPER_API int64_t whisper_full_get_segment_t0_from_state(struct whisper_state * state, int i_segment);
-
-    WHISPER_API int64_t whisper_full_get_segment_t1           (struct whisper_context * ctx, int i_segment);
-    WHISPER_API int64_t whisper_full_get_segment_t1_from_state(struct whisper_state * state, int i_segment);
-
-    // Get whether the next segment is predicted as a speaker turn
-    WHISPER_API bool whisper_full_get_segment_speaker_turn_next(struct whisper_context * ctx, int i_segment);
-    WHISPER_API bool whisper_full_get_segment_speaker_turn_next_from_state(struct whisper_state * state, int i_segment);
-
-    // Get the text of the specified segment
-    WHISPER_API const char * whisper_full_get_segment_text           (struct whisper_context * ctx, int i_segment);
-    WHISPER_API const char * whisper_full_get_segment_text_from_state(struct whisper_state * state, int i_segment);
-
-    // Get number of tokens in the specified segment
-    WHISPER_API int whisper_full_n_tokens           (struct whisper_context * ctx, int i_segment);
-    WHISPER_API int whisper_full_n_tokens_from_state(struct whisper_state * state, int i_segment);
-
-    // Get the token text of the specified token in the specified segment
-    WHISPER_API const char * whisper_full_get_token_text           (struct whisper_context * ctx, int i_segment, int i_token);
-    WHISPER_API const char * whisper_full_get_token_text_from_state(struct whisper_context * ctx, struct whisper_state * state, int i_segment, int i_token);
-
-    WHISPER_API whisper_token whisper_full_get_token_id           (struct whisper_context * ctx, int i_segment, int i_token);
-    WHISPER_API whisper_token whisper_full_get_token_id_from_state(struct whisper_state * state, int i_segment, int i_token);
-
-    // Get token data for the specified token in the specified segment
-    // This contains probabilities, timestamps, etc.
-    WHISPER_API whisper_token_data whisper_full_get_token_data           (struct whisper_context * ctx, int i_segment, int i_token);
-    WHISPER_API whisper_token_data whisper_full_get_token_data_from_state(struct whisper_state * state, int i_segment, int i_token);
-
-    // Get the probability of the specified token in the specified segment
-    WHISPER_API float whisper_full_get_token_p           (struct whisper_context * ctx, int i_segment, int i_token);
-    WHISPER_API float whisper_full_get_token_p_from_state(struct whisper_state * state, int i_segment, int i_token);
-
-    ////////////////////////////////////////////////////////////////////////////
-
-    // Temporary helpers needed for exposing ggml interface
-
-    WHISPER_API int          whisper_bench_memcpy          (int n_threads);
-    WHISPER_API const char * whisper_bench_memcpy_str      (int n_threads);
-    WHISPER_API int          whisper_bench_ggml_mul_mat    (int n_threads);
-    WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads);
-
-    // Control logging output; default behavior is to print to stderr
-
-    WHISPER_API void whisper_log_set(ggml_log_callback log_callback, void * user_data);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/ggml/examples/yolo/CMakeLists.txt b/ggml/examples/yolo/CMakeLists.txt
deleted file mode 100644
index 0c068ce..0000000
--- a/ggml/examples/yolo/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-#
-# yolov3-tiny
-
-set(TEST_TARGET yolov3-tiny)
-add_executable(${TEST_TARGET} yolov3-tiny.cpp yolo-image.cpp)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml common)
diff --git a/ggml/examples/yolo/README.md b/ggml/examples/yolo/README.md
deleted file mode 100644
index 0e69dc9..0000000
--- a/ggml/examples/yolo/README.md
+++ /dev/null
@@ -1,52 +0,0 @@
-This example shows how to implement YOLO object detection with ggml using pretrained model.
-
-# YOLOv3-tiny
-
-Download the model weights:
-
-```bash
-$ wget https://pjreddie.com/media/files/yolov3-tiny.weights
-$ sha1sum yolov3-tiny.weights 
-40f3c11883bef62fd850213bc14266632ed4414f  yolov3-tiny.weights
-```
-
-Convert the weights to GGUF format:
-
-```bash
-$ ./convert-yolov3-tiny.py yolov3-tiny.weights
-yolov3-tiny.weights converted to yolov3-tiny.gguf
-```
-
-Object detection:
-
-```bash
-$ wget https://raw.githubusercontent.com/pjreddie/darknet/master/data/dog.jpg
-$ ./yolov3-tiny -m yolov3-tiny.gguf -i dog.jpg
-Layer  0 output shape:  416 x 416 x   16 x   1
-Layer  1 output shape:  208 x 208 x   16 x   1
-Layer  2 output shape:  208 x 208 x   32 x   1
-Layer  3 output shape:  104 x 104 x   32 x   1
-Layer  4 output shape:  104 x 104 x   64 x   1
-Layer  5 output shape:   52 x  52 x   64 x   1
-Layer  6 output shape:   52 x  52 x  128 x   1
-Layer  7 output shape:   26 x  26 x  128 x   1
-Layer  8 output shape:   26 x  26 x  256 x   1
-Layer  9 output shape:   13 x  13 x  256 x   1
-Layer 10 output shape:   13 x  13 x  512 x   1
-Layer 11 output shape:   13 x  13 x  512 x   1
-Layer 12 output shape:   13 x  13 x 1024 x   1
-Layer 13 output shape:   13 x  13 x  256 x   1
-Layer 14 output shape:   13 x  13 x  512 x   1
-Layer 15 output shape:   13 x  13 x  255 x   1
-Layer 18 output shape:   13 x  13 x  128 x   1
-Layer 19 output shape:   26 x  26 x  128 x   1
-Layer 20 output shape:   26 x  26 x  384 x   1
-Layer 21 output shape:   26 x  26 x  256 x   1
-Layer 22 output shape:   26 x  26 x  255 x   1
-dog: 57%
-car: 52%
-truck: 56%
-car: 62%
-bicycle: 59%
-Detected objects saved in 'predictions.jpg' (time: 0.357000 sec.)
-```
\ No newline at end of file
diff --git a/ggml/examples/yolo/convert-yolov3-tiny.py b/ggml/examples/yolo/convert-yolov3-tiny.py
deleted file mode 100644
index 4c47991..0000000
--- a/ggml/examples/yolo/convert-yolov3-tiny.py
+++ /dev/null
@@ -1,53 +0,0 @@
-#!/usr/bin/env python3
-import sys
-import gguf
-import numpy as np
-
-def save_conv2d_layer(f, gguf_writer, prefix, inp_c, filters, size, batch_normalize=True):
-    biases = np.fromfile(f, dtype=np.float32, count=filters)
-    gguf_writer.add_tensor(prefix + "_biases", biases, raw_shape=(1, filters, 1, 1))
-
-    if batch_normalize:
-        scales = np.fromfile(f, dtype=np.float32, count=filters)
-        gguf_writer.add_tensor(prefix + "_scales", scales, raw_shape=(1, filters, 1, 1))
-        rolling_mean = np.fromfile(f, dtype=np.float32, count=filters)
-        gguf_writer.add_tensor(prefix + "_rolling_mean", rolling_mean, raw_shape=(1, filters, 1, 1))
-        rolling_variance = np.fromfile(f, dtype=np.float32, count=filters)
-        gguf_writer.add_tensor(prefix + "_rolling_variance", rolling_variance, raw_shape=(1, filters, 1, 1))
-
-    weights_count = filters * inp_c * size * size
-    l0_weights = np.fromfile(f, dtype=np.float32, count=weights_count)
-    ## ggml doesn't support f32 convolution yet, use f16 instead
-    l0_weights = l0_weights.astype(np.float16)
-    gguf_writer.add_tensor(prefix + "_weights", l0_weights, raw_shape=(filters, inp_c, size, size))
-
-
-if __name__ == '__main__':
-    if len(sys.argv) != 2:
-        print("Usage: %s <yolov3-tiny.weights>" % sys.argv[0])
-        sys.exit(1)
-    outfile = 'yolov3-tiny.gguf'
-    gguf_writer = gguf.GGUFWriter(outfile, 'yolov3-tiny')
-
-    f = open(sys.argv[1], 'rb')
-    f.read(20) # skip header
-    save_conv2d_layer(f, gguf_writer, "l0", 3, 16, 3)
-    save_conv2d_layer(f, gguf_writer, "l1", 16, 32, 3)
-    save_conv2d_layer(f, gguf_writer, "l2", 32, 64, 3)
-    save_conv2d_layer(f, gguf_writer, "l3", 64, 128, 3)
-    save_conv2d_layer(f, gguf_writer, "l4", 128, 256, 3)
-    save_conv2d_layer(f, gguf_writer, "l5", 256, 512, 3)
-    save_conv2d_layer(f, gguf_writer, "l6", 512, 1024, 3)
-    save_conv2d_layer(f, gguf_writer, "l7", 1024, 256, 1)
-    save_conv2d_layer(f, gguf_writer, "l8", 256, 512, 3)
-    save_conv2d_layer(f, gguf_writer, "l9", 512, 255, 1, batch_normalize=False)
-    save_conv2d_layer(f, gguf_writer, "l10", 256, 128, 1)
-    save_conv2d_layer(f, gguf_writer, "l11", 384, 256, 3)
-    save_conv2d_layer(f, gguf_writer, "l12", 256, 255, 1, batch_normalize=False)
-    f.close()
-    
-    gguf_writer.write_header_to_file()
-    gguf_writer.write_kv_data_to_file()
-    gguf_writer.write_tensors_to_file()
-    gguf_writer.close()
-    print("{} converted to {}".format(sys.argv[1], outfile))
diff --git a/ggml/examples/yolo/data/coco.names b/ggml/examples/yolo/data/coco.names
deleted file mode 100644
index ca76c80..0000000
--- a/ggml/examples/yolo/data/coco.names
+++ /dev/null
@@ -1,80 +0,0 @@
-person
-bicycle
-car
-motorbike
-aeroplane
-bus
-train
-truck
-boat
-traffic light
-fire hydrant
-stop sign
-parking meter
-bench
-bird
-cat
-dog
-horse
-sheep
-cow
-elephant
-bear
-zebra
-giraffe
-backpack
-umbrella
-handbag
-tie
-suitcase
-frisbee
-skis
-snowboard
-sports ball
-kite
-baseball bat
-baseball glove
-skateboard
-surfboard
-tennis racket
-bottle
-wine glass
-cup
-fork
-knife
-spoon
-bowl
-banana
-apple
-sandwich
-orange
-broccoli
-carrot
-hot dog
-pizza
-donut
-cake
-chair
-sofa
-pottedplant
-bed
-diningtable
-toilet
-tvmonitor
-laptop
-mouse
-remote
-keyboard
-cell phone
-microwave
-oven
-toaster
-sink
-refrigerator
-book
-clock
-vase
-scissors
-teddy bear
-hair drier
-toothbrush
diff --git a/ggml/examples/yolo/data/labels/100_0.png b/ggml/examples/yolo/data/labels/100_0.png
deleted file mode 100644
index 77878ec..0000000
--- a/ggml/examples/yolo/data/labels/100_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/100_1.png b/ggml/examples/yolo/data/labels/100_1.png
deleted file mode 100644
index 86181c0..0000000
--- a/ggml/examples/yolo/data/labels/100_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/100_2.png b/ggml/examples/yolo/data/labels/100_2.png
deleted file mode 100644
index 5d43046..0000000
--- a/ggml/examples/yolo/data/labels/100_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/100_3.png b/ggml/examples/yolo/data/labels/100_3.png
deleted file mode 100644
index 353401f..0000000
--- a/ggml/examples/yolo/data/labels/100_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/100_4.png b/ggml/examples/yolo/data/labels/100_4.png
deleted file mode 100644
index d22f891..0000000
--- a/ggml/examples/yolo/data/labels/100_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/100_5.png b/ggml/examples/yolo/data/labels/100_5.png
deleted file mode 100644
index 7f65183..0000000
--- a/ggml/examples/yolo/data/labels/100_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/100_6.png b/ggml/examples/yolo/data/labels/100_6.png
deleted file mode 100644
index 18eb173..0000000
--- a/ggml/examples/yolo/data/labels/100_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/100_7.png b/ggml/examples/yolo/data/labels/100_7.png
deleted file mode 100644
index b45964b..0000000
--- a/ggml/examples/yolo/data/labels/100_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/101_0.png b/ggml/examples/yolo/data/labels/101_0.png
deleted file mode 100644
index 8739bc7..0000000
--- a/ggml/examples/yolo/data/labels/101_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/101_1.png b/ggml/examples/yolo/data/labels/101_1.png
deleted file mode 100644
index 5a39331..0000000
--- a/ggml/examples/yolo/data/labels/101_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/101_2.png b/ggml/examples/yolo/data/labels/101_2.png
deleted file mode 100644
index c223477..0000000
--- a/ggml/examples/yolo/data/labels/101_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/101_3.png b/ggml/examples/yolo/data/labels/101_3.png
deleted file mode 100644
index 09f50e3..0000000
--- a/ggml/examples/yolo/data/labels/101_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/101_4.png b/ggml/examples/yolo/data/labels/101_4.png
deleted file mode 100644
index 7d7960b..0000000
--- a/ggml/examples/yolo/data/labels/101_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/101_5.png b/ggml/examples/yolo/data/labels/101_5.png
deleted file mode 100644
index 08d8003..0000000
--- a/ggml/examples/yolo/data/labels/101_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/101_6.png b/ggml/examples/yolo/data/labels/101_6.png
deleted file mode 100644
index 9d554db..0000000
--- a/ggml/examples/yolo/data/labels/101_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/101_7.png b/ggml/examples/yolo/data/labels/101_7.png
deleted file mode 100644
index 37855f0..0000000
--- a/ggml/examples/yolo/data/labels/101_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/102_0.png b/ggml/examples/yolo/data/labels/102_0.png
deleted file mode 100644
index 7b8b0fb..0000000
--- a/ggml/examples/yolo/data/labels/102_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/102_1.png b/ggml/examples/yolo/data/labels/102_1.png
deleted file mode 100644
index ade5aa5..0000000
--- a/ggml/examples/yolo/data/labels/102_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/102_2.png b/ggml/examples/yolo/data/labels/102_2.png
deleted file mode 100644
index 2068f0b..0000000
--- a/ggml/examples/yolo/data/labels/102_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/102_3.png b/ggml/examples/yolo/data/labels/102_3.png
deleted file mode 100644
index 00e494e..0000000
--- a/ggml/examples/yolo/data/labels/102_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/102_4.png b/ggml/examples/yolo/data/labels/102_4.png
deleted file mode 100644
index 37637d3..0000000
--- a/ggml/examples/yolo/data/labels/102_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/102_5.png b/ggml/examples/yolo/data/labels/102_5.png
deleted file mode 100644
index a86b69b..0000000
--- a/ggml/examples/yolo/data/labels/102_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/102_6.png b/ggml/examples/yolo/data/labels/102_6.png
deleted file mode 100644
index 7d6889e..0000000
--- a/ggml/examples/yolo/data/labels/102_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/102_7.png b/ggml/examples/yolo/data/labels/102_7.png
deleted file mode 100644
index a9a7381..0000000
--- a/ggml/examples/yolo/data/labels/102_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/103_0.png b/ggml/examples/yolo/data/labels/103_0.png
deleted file mode 100644
index 17cc959..0000000
--- a/ggml/examples/yolo/data/labels/103_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/103_1.png b/ggml/examples/yolo/data/labels/103_1.png
deleted file mode 100644
index 175794c..0000000
--- a/ggml/examples/yolo/data/labels/103_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/103_2.png b/ggml/examples/yolo/data/labels/103_2.png
deleted file mode 100644
index 415038d..0000000
--- a/ggml/examples/yolo/data/labels/103_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/103_3.png b/ggml/examples/yolo/data/labels/103_3.png
deleted file mode 100644
index 1101880..0000000
--- a/ggml/examples/yolo/data/labels/103_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/103_4.png b/ggml/examples/yolo/data/labels/103_4.png
deleted file mode 100644
index b8bf32e..0000000
--- a/ggml/examples/yolo/data/labels/103_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/103_5.png b/ggml/examples/yolo/data/labels/103_5.png
deleted file mode 100644
index 6c81a57..0000000
--- a/ggml/examples/yolo/data/labels/103_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/103_6.png b/ggml/examples/yolo/data/labels/103_6.png
deleted file mode 100644
index ce580c0..0000000
--- a/ggml/examples/yolo/data/labels/103_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/103_7.png b/ggml/examples/yolo/data/labels/103_7.png
deleted file mode 100644
index 1919f01..0000000
--- a/ggml/examples/yolo/data/labels/103_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/104_0.png b/ggml/examples/yolo/data/labels/104_0.png
deleted file mode 100644
index 4ce801b..0000000
--- a/ggml/examples/yolo/data/labels/104_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/104_1.png b/ggml/examples/yolo/data/labels/104_1.png
deleted file mode 100644
index cdc2ba6..0000000
--- a/ggml/examples/yolo/data/labels/104_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/104_2.png b/ggml/examples/yolo/data/labels/104_2.png
deleted file mode 100644
index aa88c84..0000000
--- a/ggml/examples/yolo/data/labels/104_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/104_3.png b/ggml/examples/yolo/data/labels/104_3.png
deleted file mode 100644
index d574267..0000000
--- a/ggml/examples/yolo/data/labels/104_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/104_4.png b/ggml/examples/yolo/data/labels/104_4.png
deleted file mode 100644
index 71f9662..0000000
--- a/ggml/examples/yolo/data/labels/104_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/104_5.png b/ggml/examples/yolo/data/labels/104_5.png
deleted file mode 100644
index 3159b0e..0000000
--- a/ggml/examples/yolo/data/labels/104_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/104_6.png b/ggml/examples/yolo/data/labels/104_6.png
deleted file mode 100644
index 018e0e1..0000000
--- a/ggml/examples/yolo/data/labels/104_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/104_7.png b/ggml/examples/yolo/data/labels/104_7.png
deleted file mode 100644
index 39fbe16..0000000
--- a/ggml/examples/yolo/data/labels/104_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/105_0.png b/ggml/examples/yolo/data/labels/105_0.png
deleted file mode 100644
index 388be1a..0000000
--- a/ggml/examples/yolo/data/labels/105_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/105_1.png b/ggml/examples/yolo/data/labels/105_1.png
deleted file mode 100644
index 284a130..0000000
--- a/ggml/examples/yolo/data/labels/105_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/105_2.png b/ggml/examples/yolo/data/labels/105_2.png
deleted file mode 100644
index d378969..0000000
--- a/ggml/examples/yolo/data/labels/105_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/105_3.png b/ggml/examples/yolo/data/labels/105_3.png
deleted file mode 100644
index f9ebb66..0000000
--- a/ggml/examples/yolo/data/labels/105_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/105_4.png b/ggml/examples/yolo/data/labels/105_4.png
deleted file mode 100644
index afb260b..0000000
--- a/ggml/examples/yolo/data/labels/105_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/105_5.png b/ggml/examples/yolo/data/labels/105_5.png
deleted file mode 100644
index 53274dd..0000000
--- a/ggml/examples/yolo/data/labels/105_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/105_6.png b/ggml/examples/yolo/data/labels/105_6.png
deleted file mode 100644
index 6b952ea..0000000
--- a/ggml/examples/yolo/data/labels/105_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/105_7.png b/ggml/examples/yolo/data/labels/105_7.png
deleted file mode 100644
index 02b1b7d..0000000
--- a/ggml/examples/yolo/data/labels/105_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/106_0.png b/ggml/examples/yolo/data/labels/106_0.png
deleted file mode 100644
index 7075259..0000000
--- a/ggml/examples/yolo/data/labels/106_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/106_1.png b/ggml/examples/yolo/data/labels/106_1.png
deleted file mode 100644
index 66e73ab..0000000
--- a/ggml/examples/yolo/data/labels/106_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/106_2.png b/ggml/examples/yolo/data/labels/106_2.png
deleted file mode 100644
index aaf88c6..0000000
--- a/ggml/examples/yolo/data/labels/106_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/106_3.png b/ggml/examples/yolo/data/labels/106_3.png
deleted file mode 100644
index bc6b981..0000000
--- a/ggml/examples/yolo/data/labels/106_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/106_4.png b/ggml/examples/yolo/data/labels/106_4.png
deleted file mode 100644
index b662b7e..0000000
--- a/ggml/examples/yolo/data/labels/106_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/106_5.png b/ggml/examples/yolo/data/labels/106_5.png
deleted file mode 100644
index 43b4561..0000000
--- a/ggml/examples/yolo/data/labels/106_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/106_6.png b/ggml/examples/yolo/data/labels/106_6.png
deleted file mode 100644
index e667664..0000000
--- a/ggml/examples/yolo/data/labels/106_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/106_7.png b/ggml/examples/yolo/data/labels/106_7.png
deleted file mode 100644
index 727f3f6..0000000
--- a/ggml/examples/yolo/data/labels/106_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/107_0.png b/ggml/examples/yolo/data/labels/107_0.png
deleted file mode 100644
index 9c8d836..0000000
--- a/ggml/examples/yolo/data/labels/107_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/107_1.png b/ggml/examples/yolo/data/labels/107_1.png
deleted file mode 100644
index 7dc3d2b..0000000
--- a/ggml/examples/yolo/data/labels/107_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/107_2.png b/ggml/examples/yolo/data/labels/107_2.png
deleted file mode 100644
index d27cf78..0000000
--- a/ggml/examples/yolo/data/labels/107_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/107_3.png b/ggml/examples/yolo/data/labels/107_3.png
deleted file mode 100644
index 04b6c22..0000000
--- a/ggml/examples/yolo/data/labels/107_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/107_4.png b/ggml/examples/yolo/data/labels/107_4.png
deleted file mode 100644
index cbc4515..0000000
--- a/ggml/examples/yolo/data/labels/107_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/107_5.png b/ggml/examples/yolo/data/labels/107_5.png
deleted file mode 100644
index bc9730c..0000000
--- a/ggml/examples/yolo/data/labels/107_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/107_6.png b/ggml/examples/yolo/data/labels/107_6.png
deleted file mode 100644
index ffbe415..0000000
--- a/ggml/examples/yolo/data/labels/107_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/107_7.png b/ggml/examples/yolo/data/labels/107_7.png
deleted file mode 100644
index 95f81a5..0000000
--- a/ggml/examples/yolo/data/labels/107_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/108_0.png b/ggml/examples/yolo/data/labels/108_0.png
deleted file mode 100644
index ce05b22..0000000
--- a/ggml/examples/yolo/data/labels/108_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/108_1.png b/ggml/examples/yolo/data/labels/108_1.png
deleted file mode 100644
index 70d3895..0000000
--- a/ggml/examples/yolo/data/labels/108_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/108_2.png b/ggml/examples/yolo/data/labels/108_2.png
deleted file mode 100644
index 7a95038..0000000
--- a/ggml/examples/yolo/data/labels/108_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/108_3.png b/ggml/examples/yolo/data/labels/108_3.png
deleted file mode 100644
index 81637b3..0000000
--- a/ggml/examples/yolo/data/labels/108_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/108_4.png b/ggml/examples/yolo/data/labels/108_4.png
deleted file mode 100644
index 0ea83f2..0000000
--- a/ggml/examples/yolo/data/labels/108_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/108_5.png b/ggml/examples/yolo/data/labels/108_5.png
deleted file mode 100644
index 98d79d8..0000000
--- a/ggml/examples/yolo/data/labels/108_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/108_6.png b/ggml/examples/yolo/data/labels/108_6.png
deleted file mode 100644
index 25303aa..0000000
--- a/ggml/examples/yolo/data/labels/108_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/108_7.png b/ggml/examples/yolo/data/labels/108_7.png
deleted file mode 100644
index 6fb03a5..0000000
--- a/ggml/examples/yolo/data/labels/108_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/109_0.png b/ggml/examples/yolo/data/labels/109_0.png
deleted file mode 100644
index 8169faf..0000000
--- a/ggml/examples/yolo/data/labels/109_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/109_1.png b/ggml/examples/yolo/data/labels/109_1.png
deleted file mode 100644
index 16bc989..0000000
--- a/ggml/examples/yolo/data/labels/109_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/109_2.png b/ggml/examples/yolo/data/labels/109_2.png
deleted file mode 100644
index cf8cf63..0000000
--- a/ggml/examples/yolo/data/labels/109_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/109_3.png b/ggml/examples/yolo/data/labels/109_3.png
deleted file mode 100644
index 5f6757b..0000000
--- a/ggml/examples/yolo/data/labels/109_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/109_4.png b/ggml/examples/yolo/data/labels/109_4.png
deleted file mode 100644
index 5d9fdab..0000000
--- a/ggml/examples/yolo/data/labels/109_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/109_5.png b/ggml/examples/yolo/data/labels/109_5.png
deleted file mode 100644
index be6b733..0000000
--- a/ggml/examples/yolo/data/labels/109_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/109_6.png b/ggml/examples/yolo/data/labels/109_6.png
deleted file mode 100644
index a02a117..0000000
--- a/ggml/examples/yolo/data/labels/109_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/109_7.png b/ggml/examples/yolo/data/labels/109_7.png
deleted file mode 100644
index 1349d93..0000000
--- a/ggml/examples/yolo/data/labels/109_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/110_0.png b/ggml/examples/yolo/data/labels/110_0.png
deleted file mode 100644
index 49a2582..0000000
--- a/ggml/examples/yolo/data/labels/110_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/110_1.png b/ggml/examples/yolo/data/labels/110_1.png
deleted file mode 100644
index 10630cc..0000000
--- a/ggml/examples/yolo/data/labels/110_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/110_2.png b/ggml/examples/yolo/data/labels/110_2.png
deleted file mode 100644
index f9004da..0000000
--- a/ggml/examples/yolo/data/labels/110_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/110_3.png b/ggml/examples/yolo/data/labels/110_3.png
deleted file mode 100644
index b0d6ae1..0000000
--- a/ggml/examples/yolo/data/labels/110_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/110_4.png b/ggml/examples/yolo/data/labels/110_4.png
deleted file mode 100644
index 8f57528..0000000
--- a/ggml/examples/yolo/data/labels/110_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/110_5.png b/ggml/examples/yolo/data/labels/110_5.png
deleted file mode 100644
index b335e49..0000000
--- a/ggml/examples/yolo/data/labels/110_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/110_6.png b/ggml/examples/yolo/data/labels/110_6.png
deleted file mode 100644
index 450fbb6..0000000
--- a/ggml/examples/yolo/data/labels/110_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/110_7.png b/ggml/examples/yolo/data/labels/110_7.png
deleted file mode 100644
index c7b4754..0000000
--- a/ggml/examples/yolo/data/labels/110_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/111_0.png b/ggml/examples/yolo/data/labels/111_0.png
deleted file mode 100644
index a0e5883..0000000
--- a/ggml/examples/yolo/data/labels/111_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/111_1.png b/ggml/examples/yolo/data/labels/111_1.png
deleted file mode 100644
index 9c17d57..0000000
--- a/ggml/examples/yolo/data/labels/111_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/111_2.png b/ggml/examples/yolo/data/labels/111_2.png
deleted file mode 100644
index 4771086..0000000
--- a/ggml/examples/yolo/data/labels/111_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/111_3.png b/ggml/examples/yolo/data/labels/111_3.png
deleted file mode 100644
index bfba021..0000000
--- a/ggml/examples/yolo/data/labels/111_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/111_4.png b/ggml/examples/yolo/data/labels/111_4.png
deleted file mode 100644
index 7a4aeda..0000000
--- a/ggml/examples/yolo/data/labels/111_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/111_5.png b/ggml/examples/yolo/data/labels/111_5.png
deleted file mode 100644
index 180a9c5..0000000
--- a/ggml/examples/yolo/data/labels/111_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/111_6.png b/ggml/examples/yolo/data/labels/111_6.png
deleted file mode 100644
index e1482cd..0000000
--- a/ggml/examples/yolo/data/labels/111_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/111_7.png b/ggml/examples/yolo/data/labels/111_7.png
deleted file mode 100644
index c9355d0..0000000
--- a/ggml/examples/yolo/data/labels/111_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/112_0.png b/ggml/examples/yolo/data/labels/112_0.png
deleted file mode 100644
index d732efe..0000000
--- a/ggml/examples/yolo/data/labels/112_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/112_1.png b/ggml/examples/yolo/data/labels/112_1.png
deleted file mode 100644
index 11243af..0000000
--- a/ggml/examples/yolo/data/labels/112_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/112_2.png b/ggml/examples/yolo/data/labels/112_2.png
deleted file mode 100644
index c3c4220..0000000
--- a/ggml/examples/yolo/data/labels/112_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/112_3.png b/ggml/examples/yolo/data/labels/112_3.png
deleted file mode 100644
index f830185..0000000
--- a/ggml/examples/yolo/data/labels/112_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/112_4.png b/ggml/examples/yolo/data/labels/112_4.png
deleted file mode 100644
index a4b184a..0000000
--- a/ggml/examples/yolo/data/labels/112_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/112_5.png b/ggml/examples/yolo/data/labels/112_5.png
deleted file mode 100644
index 02f97a2..0000000
--- a/ggml/examples/yolo/data/labels/112_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/112_6.png b/ggml/examples/yolo/data/labels/112_6.png
deleted file mode 100644
index 2001b39..0000000
--- a/ggml/examples/yolo/data/labels/112_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/112_7.png b/ggml/examples/yolo/data/labels/112_7.png
deleted file mode 100644
index 9a14a71..0000000
--- a/ggml/examples/yolo/data/labels/112_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/113_0.png b/ggml/examples/yolo/data/labels/113_0.png
deleted file mode 100644
index e36c84e..0000000
--- a/ggml/examples/yolo/data/labels/113_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/113_1.png b/ggml/examples/yolo/data/labels/113_1.png
deleted file mode 100644
index 4c81c21..0000000
--- a/ggml/examples/yolo/data/labels/113_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/113_2.png b/ggml/examples/yolo/data/labels/113_2.png
deleted file mode 100644
index b29b8d0..0000000
--- a/ggml/examples/yolo/data/labels/113_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/113_3.png b/ggml/examples/yolo/data/labels/113_3.png
deleted file mode 100644
index 6233584..0000000
--- a/ggml/examples/yolo/data/labels/113_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/113_4.png b/ggml/examples/yolo/data/labels/113_4.png
deleted file mode 100644
index d1373f9..0000000
--- a/ggml/examples/yolo/data/labels/113_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/113_5.png b/ggml/examples/yolo/data/labels/113_5.png
deleted file mode 100644
index ecfa110..0000000
--- a/ggml/examples/yolo/data/labels/113_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/113_6.png b/ggml/examples/yolo/data/labels/113_6.png
deleted file mode 100644
index c4e0d98..0000000
--- a/ggml/examples/yolo/data/labels/113_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/113_7.png b/ggml/examples/yolo/data/labels/113_7.png
deleted file mode 100644
index 55ac8cf..0000000
--- a/ggml/examples/yolo/data/labels/113_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/114_0.png b/ggml/examples/yolo/data/labels/114_0.png
deleted file mode 100644
index 9d42671..0000000
--- a/ggml/examples/yolo/data/labels/114_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/114_1.png b/ggml/examples/yolo/data/labels/114_1.png
deleted file mode 100644
index 277eed4..0000000
--- a/ggml/examples/yolo/data/labels/114_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/114_2.png b/ggml/examples/yolo/data/labels/114_2.png
deleted file mode 100644
index 9c18af3..0000000
--- a/ggml/examples/yolo/data/labels/114_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/114_3.png b/ggml/examples/yolo/data/labels/114_3.png
deleted file mode 100644
index 412f320..0000000
--- a/ggml/examples/yolo/data/labels/114_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/114_4.png b/ggml/examples/yolo/data/labels/114_4.png
deleted file mode 100644
index 65912f0..0000000
--- a/ggml/examples/yolo/data/labels/114_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/114_5.png b/ggml/examples/yolo/data/labels/114_5.png
deleted file mode 100644
index fdf7bd2..0000000
--- a/ggml/examples/yolo/data/labels/114_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/114_6.png b/ggml/examples/yolo/data/labels/114_6.png
deleted file mode 100644
index 8f6b639..0000000
--- a/ggml/examples/yolo/data/labels/114_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/114_7.png b/ggml/examples/yolo/data/labels/114_7.png
deleted file mode 100644
index acf9180..0000000
--- a/ggml/examples/yolo/data/labels/114_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/115_0.png b/ggml/examples/yolo/data/labels/115_0.png
deleted file mode 100644
index 4767c45..0000000
--- a/ggml/examples/yolo/data/labels/115_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/115_1.png b/ggml/examples/yolo/data/labels/115_1.png
deleted file mode 100644
index 79f5d39..0000000
--- a/ggml/examples/yolo/data/labels/115_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/115_2.png b/ggml/examples/yolo/data/labels/115_2.png
deleted file mode 100644
index 963c068..0000000
--- a/ggml/examples/yolo/data/labels/115_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/115_3.png b/ggml/examples/yolo/data/labels/115_3.png
deleted file mode 100644
index bc0d24f..0000000
--- a/ggml/examples/yolo/data/labels/115_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/115_4.png b/ggml/examples/yolo/data/labels/115_4.png
deleted file mode 100644
index a39b0f0..0000000
--- a/ggml/examples/yolo/data/labels/115_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/115_5.png b/ggml/examples/yolo/data/labels/115_5.png
deleted file mode 100644
index 63e0078..0000000
--- a/ggml/examples/yolo/data/labels/115_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/115_6.png b/ggml/examples/yolo/data/labels/115_6.png
deleted file mode 100644
index 344ef84..0000000
--- a/ggml/examples/yolo/data/labels/115_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/115_7.png b/ggml/examples/yolo/data/labels/115_7.png
deleted file mode 100644
index c6e743c..0000000
--- a/ggml/examples/yolo/data/labels/115_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/116_0.png b/ggml/examples/yolo/data/labels/116_0.png
deleted file mode 100644
index bf9b263..0000000
--- a/ggml/examples/yolo/data/labels/116_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/116_1.png b/ggml/examples/yolo/data/labels/116_1.png
deleted file mode 100644
index 2fb12a9..0000000
--- a/ggml/examples/yolo/data/labels/116_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/116_2.png b/ggml/examples/yolo/data/labels/116_2.png
deleted file mode 100644
index dd04491..0000000
--- a/ggml/examples/yolo/data/labels/116_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/116_3.png b/ggml/examples/yolo/data/labels/116_3.png
deleted file mode 100644
index 0314e85..0000000
--- a/ggml/examples/yolo/data/labels/116_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/116_4.png b/ggml/examples/yolo/data/labels/116_4.png
deleted file mode 100644
index cf919a6..0000000
--- a/ggml/examples/yolo/data/labels/116_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/116_5.png b/ggml/examples/yolo/data/labels/116_5.png
deleted file mode 100644
index c9e04ed..0000000
--- a/ggml/examples/yolo/data/labels/116_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/116_6.png b/ggml/examples/yolo/data/labels/116_6.png
deleted file mode 100644
index 1d0ab79..0000000
--- a/ggml/examples/yolo/data/labels/116_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/116_7.png b/ggml/examples/yolo/data/labels/116_7.png
deleted file mode 100644
index 1fcdaa6..0000000
--- a/ggml/examples/yolo/data/labels/116_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/117_0.png b/ggml/examples/yolo/data/labels/117_0.png
deleted file mode 100644
index 917b4a2..0000000
--- a/ggml/examples/yolo/data/labels/117_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/117_1.png b/ggml/examples/yolo/data/labels/117_1.png
deleted file mode 100644
index 59cae8a..0000000
--- a/ggml/examples/yolo/data/labels/117_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/117_2.png b/ggml/examples/yolo/data/labels/117_2.png
deleted file mode 100644
index d7c7f33..0000000
--- a/ggml/examples/yolo/data/labels/117_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/117_3.png b/ggml/examples/yolo/data/labels/117_3.png
deleted file mode 100644
index 18679c3..0000000
--- a/ggml/examples/yolo/data/labels/117_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/117_4.png b/ggml/examples/yolo/data/labels/117_4.png
deleted file mode 100644
index bceedf9..0000000
--- a/ggml/examples/yolo/data/labels/117_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/117_5.png b/ggml/examples/yolo/data/labels/117_5.png
deleted file mode 100644
index c0bddab..0000000
--- a/ggml/examples/yolo/data/labels/117_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/117_6.png b/ggml/examples/yolo/data/labels/117_6.png
deleted file mode 100644
index 731fd14..0000000
--- a/ggml/examples/yolo/data/labels/117_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/117_7.png b/ggml/examples/yolo/data/labels/117_7.png
deleted file mode 100644
index c04ed0b..0000000
--- a/ggml/examples/yolo/data/labels/117_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/118_0.png b/ggml/examples/yolo/data/labels/118_0.png
deleted file mode 100644
index 4b8dfc6..0000000
--- a/ggml/examples/yolo/data/labels/118_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/118_1.png b/ggml/examples/yolo/data/labels/118_1.png
deleted file mode 100644
index fa886ac..0000000
--- a/ggml/examples/yolo/data/labels/118_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/118_2.png b/ggml/examples/yolo/data/labels/118_2.png
deleted file mode 100644
index 958ff5e..0000000
--- a/ggml/examples/yolo/data/labels/118_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/118_3.png b/ggml/examples/yolo/data/labels/118_3.png
deleted file mode 100644
index 241b788..0000000
--- a/ggml/examples/yolo/data/labels/118_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/118_4.png b/ggml/examples/yolo/data/labels/118_4.png
deleted file mode 100644
index 9399152..0000000
--- a/ggml/examples/yolo/data/labels/118_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/118_5.png b/ggml/examples/yolo/data/labels/118_5.png
deleted file mode 100644
index 279c954..0000000
--- a/ggml/examples/yolo/data/labels/118_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/118_6.png b/ggml/examples/yolo/data/labels/118_6.png
deleted file mode 100644
index 9379cb9..0000000
--- a/ggml/examples/yolo/data/labels/118_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/118_7.png b/ggml/examples/yolo/data/labels/118_7.png
deleted file mode 100644
index b34ca8a..0000000
--- a/ggml/examples/yolo/data/labels/118_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/119_0.png b/ggml/examples/yolo/data/labels/119_0.png
deleted file mode 100644
index ae03b91..0000000
--- a/ggml/examples/yolo/data/labels/119_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/119_1.png b/ggml/examples/yolo/data/labels/119_1.png
deleted file mode 100644
index 7794fb6..0000000
--- a/ggml/examples/yolo/data/labels/119_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/119_2.png b/ggml/examples/yolo/data/labels/119_2.png
deleted file mode 100644
index 976c49b..0000000
--- a/ggml/examples/yolo/data/labels/119_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/119_3.png b/ggml/examples/yolo/data/labels/119_3.png
deleted file mode 100644
index 2e0160e..0000000
--- a/ggml/examples/yolo/data/labels/119_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/119_4.png b/ggml/examples/yolo/data/labels/119_4.png
deleted file mode 100644
index 0540927..0000000
--- a/ggml/examples/yolo/data/labels/119_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/119_5.png b/ggml/examples/yolo/data/labels/119_5.png
deleted file mode 100644
index 4649a59..0000000
--- a/ggml/examples/yolo/data/labels/119_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/119_6.png b/ggml/examples/yolo/data/labels/119_6.png
deleted file mode 100644
index eb5e0fb..0000000
--- a/ggml/examples/yolo/data/labels/119_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/119_7.png b/ggml/examples/yolo/data/labels/119_7.png
deleted file mode 100644
index 5615e77..0000000
--- a/ggml/examples/yolo/data/labels/119_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/120_0.png b/ggml/examples/yolo/data/labels/120_0.png
deleted file mode 100644
index 64fd621..0000000
--- a/ggml/examples/yolo/data/labels/120_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/120_1.png b/ggml/examples/yolo/data/labels/120_1.png
deleted file mode 100644
index e13ecef..0000000
--- a/ggml/examples/yolo/data/labels/120_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/120_2.png b/ggml/examples/yolo/data/labels/120_2.png
deleted file mode 100644
index b6ae604..0000000
--- a/ggml/examples/yolo/data/labels/120_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/120_3.png b/ggml/examples/yolo/data/labels/120_3.png
deleted file mode 100644
index 13a4c3c..0000000
--- a/ggml/examples/yolo/data/labels/120_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/120_4.png b/ggml/examples/yolo/data/labels/120_4.png
deleted file mode 100644
index af550a8..0000000
--- a/ggml/examples/yolo/data/labels/120_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/120_5.png b/ggml/examples/yolo/data/labels/120_5.png
deleted file mode 100644
index 38474e4..0000000
--- a/ggml/examples/yolo/data/labels/120_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/120_6.png b/ggml/examples/yolo/data/labels/120_6.png
deleted file mode 100644
index e356cde..0000000
--- a/ggml/examples/yolo/data/labels/120_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/120_7.png b/ggml/examples/yolo/data/labels/120_7.png
deleted file mode 100644
index 5a585b4..0000000
--- a/ggml/examples/yolo/data/labels/120_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/121_0.png b/ggml/examples/yolo/data/labels/121_0.png
deleted file mode 100644
index 2c20381..0000000
--- a/ggml/examples/yolo/data/labels/121_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/121_1.png b/ggml/examples/yolo/data/labels/121_1.png
deleted file mode 100644
index 820c17d..0000000
--- a/ggml/examples/yolo/data/labels/121_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/121_2.png b/ggml/examples/yolo/data/labels/121_2.png
deleted file mode 100644
index fabda00..0000000
--- a/ggml/examples/yolo/data/labels/121_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/121_3.png b/ggml/examples/yolo/data/labels/121_3.png
deleted file mode 100644
index 79ac0b1..0000000
--- a/ggml/examples/yolo/data/labels/121_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/121_4.png b/ggml/examples/yolo/data/labels/121_4.png
deleted file mode 100644
index 7ded729..0000000
--- a/ggml/examples/yolo/data/labels/121_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/121_5.png b/ggml/examples/yolo/data/labels/121_5.png
deleted file mode 100644
index 5d59b5a..0000000
--- a/ggml/examples/yolo/data/labels/121_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/121_6.png b/ggml/examples/yolo/data/labels/121_6.png
deleted file mode 100644
index 49a63f3..0000000
--- a/ggml/examples/yolo/data/labels/121_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/121_7.png b/ggml/examples/yolo/data/labels/121_7.png
deleted file mode 100644
index fea7b9e..0000000
--- a/ggml/examples/yolo/data/labels/121_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/122_0.png b/ggml/examples/yolo/data/labels/122_0.png
deleted file mode 100644
index 76ed270..0000000
--- a/ggml/examples/yolo/data/labels/122_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/122_1.png b/ggml/examples/yolo/data/labels/122_1.png
deleted file mode 100644
index 57f2857..0000000
--- a/ggml/examples/yolo/data/labels/122_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/122_2.png b/ggml/examples/yolo/data/labels/122_2.png
deleted file mode 100644
index d9f27cd..0000000
--- a/ggml/examples/yolo/data/labels/122_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/122_3.png b/ggml/examples/yolo/data/labels/122_3.png
deleted file mode 100644
index c065790..0000000
--- a/ggml/examples/yolo/data/labels/122_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/122_4.png b/ggml/examples/yolo/data/labels/122_4.png
deleted file mode 100644
index 62b891e..0000000
--- a/ggml/examples/yolo/data/labels/122_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/122_5.png b/ggml/examples/yolo/data/labels/122_5.png
deleted file mode 100644
index 276222e..0000000
--- a/ggml/examples/yolo/data/labels/122_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/122_6.png b/ggml/examples/yolo/data/labels/122_6.png
deleted file mode 100644
index 0b2924e..0000000
--- a/ggml/examples/yolo/data/labels/122_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/122_7.png b/ggml/examples/yolo/data/labels/122_7.png
deleted file mode 100644
index 8708778..0000000
--- a/ggml/examples/yolo/data/labels/122_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/123_0.png b/ggml/examples/yolo/data/labels/123_0.png
deleted file mode 100644
index 95f92aa..0000000
--- a/ggml/examples/yolo/data/labels/123_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/123_1.png b/ggml/examples/yolo/data/labels/123_1.png
deleted file mode 100644
index e0e4fdd..0000000
--- a/ggml/examples/yolo/data/labels/123_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/123_2.png b/ggml/examples/yolo/data/labels/123_2.png
deleted file mode 100644
index 6a9c0b0..0000000
--- a/ggml/examples/yolo/data/labels/123_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/123_3.png b/ggml/examples/yolo/data/labels/123_3.png
deleted file mode 100644
index 8783377..0000000
--- a/ggml/examples/yolo/data/labels/123_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/123_4.png b/ggml/examples/yolo/data/labels/123_4.png
deleted file mode 100644
index c9de4f7..0000000
--- a/ggml/examples/yolo/data/labels/123_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/123_5.png b/ggml/examples/yolo/data/labels/123_5.png
deleted file mode 100644
index 8deaa3c..0000000
--- a/ggml/examples/yolo/data/labels/123_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/123_6.png b/ggml/examples/yolo/data/labels/123_6.png
deleted file mode 100644
index 47ca880..0000000
--- a/ggml/examples/yolo/data/labels/123_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/123_7.png b/ggml/examples/yolo/data/labels/123_7.png
deleted file mode 100644
index bf3a5d7..0000000
--- a/ggml/examples/yolo/data/labels/123_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/124_0.png b/ggml/examples/yolo/data/labels/124_0.png
deleted file mode 100644
index a54db7c..0000000
--- a/ggml/examples/yolo/data/labels/124_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/124_1.png b/ggml/examples/yolo/data/labels/124_1.png
deleted file mode 100644
index c545361..0000000
--- a/ggml/examples/yolo/data/labels/124_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/124_2.png b/ggml/examples/yolo/data/labels/124_2.png
deleted file mode 100644
index c5a6a76..0000000
--- a/ggml/examples/yolo/data/labels/124_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/124_3.png b/ggml/examples/yolo/data/labels/124_3.png
deleted file mode 100644
index 37b9e82..0000000
--- a/ggml/examples/yolo/data/labels/124_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/124_4.png b/ggml/examples/yolo/data/labels/124_4.png
deleted file mode 100644
index 0521fe2..0000000
--- a/ggml/examples/yolo/data/labels/124_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/124_5.png b/ggml/examples/yolo/data/labels/124_5.png
deleted file mode 100644
index a5ad765..0000000
--- a/ggml/examples/yolo/data/labels/124_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/124_6.png b/ggml/examples/yolo/data/labels/124_6.png
deleted file mode 100644
index 2dbf0f1..0000000
--- a/ggml/examples/yolo/data/labels/124_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/124_7.png b/ggml/examples/yolo/data/labels/124_7.png
deleted file mode 100644
index 0730ca0..0000000
--- a/ggml/examples/yolo/data/labels/124_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/125_0.png b/ggml/examples/yolo/data/labels/125_0.png
deleted file mode 100644
index 67e37b2..0000000
--- a/ggml/examples/yolo/data/labels/125_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/125_1.png b/ggml/examples/yolo/data/labels/125_1.png
deleted file mode 100644
index 4dc1ea5..0000000
--- a/ggml/examples/yolo/data/labels/125_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/125_2.png b/ggml/examples/yolo/data/labels/125_2.png
deleted file mode 100644
index 6f3babe..0000000
--- a/ggml/examples/yolo/data/labels/125_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/125_3.png b/ggml/examples/yolo/data/labels/125_3.png
deleted file mode 100644
index 4c7a26c..0000000
--- a/ggml/examples/yolo/data/labels/125_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/125_4.png b/ggml/examples/yolo/data/labels/125_4.png
deleted file mode 100644
index 149eaa9..0000000
--- a/ggml/examples/yolo/data/labels/125_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/125_5.png b/ggml/examples/yolo/data/labels/125_5.png
deleted file mode 100644
index 8ae40f7..0000000
--- a/ggml/examples/yolo/data/labels/125_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/125_6.png b/ggml/examples/yolo/data/labels/125_6.png
deleted file mode 100644
index 5893043..0000000
--- a/ggml/examples/yolo/data/labels/125_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/125_7.png b/ggml/examples/yolo/data/labels/125_7.png
deleted file mode 100644
index 47bd85b..0000000
--- a/ggml/examples/yolo/data/labels/125_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/126_0.png b/ggml/examples/yolo/data/labels/126_0.png
deleted file mode 100644
index f602708..0000000
--- a/ggml/examples/yolo/data/labels/126_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/126_1.png b/ggml/examples/yolo/data/labels/126_1.png
deleted file mode 100644
index cb8b76e..0000000
--- a/ggml/examples/yolo/data/labels/126_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/126_2.png b/ggml/examples/yolo/data/labels/126_2.png
deleted file mode 100644
index ccef7a2..0000000
--- a/ggml/examples/yolo/data/labels/126_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/126_3.png b/ggml/examples/yolo/data/labels/126_3.png
deleted file mode 100644
index 9fcf7ab..0000000
--- a/ggml/examples/yolo/data/labels/126_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/126_4.png b/ggml/examples/yolo/data/labels/126_4.png
deleted file mode 100644
index d3f01bf..0000000
--- a/ggml/examples/yolo/data/labels/126_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/126_5.png b/ggml/examples/yolo/data/labels/126_5.png
deleted file mode 100644
index 00c57f1..0000000
--- a/ggml/examples/yolo/data/labels/126_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/126_6.png b/ggml/examples/yolo/data/labels/126_6.png
deleted file mode 100644
index 2c2dbb1..0000000
--- a/ggml/examples/yolo/data/labels/126_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/126_7.png b/ggml/examples/yolo/data/labels/126_7.png
deleted file mode 100644
index 55ee5a7..0000000
--- a/ggml/examples/yolo/data/labels/126_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/32_0.png b/ggml/examples/yolo/data/labels/32_0.png
deleted file mode 100644
index 22be2de..0000000
--- a/ggml/examples/yolo/data/labels/32_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/32_1.png b/ggml/examples/yolo/data/labels/32_1.png
deleted file mode 100644
index 21abfa4..0000000
--- a/ggml/examples/yolo/data/labels/32_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/32_2.png b/ggml/examples/yolo/data/labels/32_2.png
deleted file mode 100644
index eae41b9..0000000
--- a/ggml/examples/yolo/data/labels/32_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/32_3.png b/ggml/examples/yolo/data/labels/32_3.png
deleted file mode 100644
index fcc007e..0000000
--- a/ggml/examples/yolo/data/labels/32_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/32_4.png b/ggml/examples/yolo/data/labels/32_4.png
deleted file mode 100644
index f4c498c..0000000
--- a/ggml/examples/yolo/data/labels/32_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/32_5.png b/ggml/examples/yolo/data/labels/32_5.png
deleted file mode 100644
index 16248c9..0000000
--- a/ggml/examples/yolo/data/labels/32_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/32_6.png b/ggml/examples/yolo/data/labels/32_6.png
deleted file mode 100644
index 1e50183..0000000
--- a/ggml/examples/yolo/data/labels/32_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/32_7.png b/ggml/examples/yolo/data/labels/32_7.png
deleted file mode 100644
index 81bd1d5..0000000
--- a/ggml/examples/yolo/data/labels/32_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/33_0.png b/ggml/examples/yolo/data/labels/33_0.png
deleted file mode 100644
index d1a1141..0000000
--- a/ggml/examples/yolo/data/labels/33_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/33_1.png b/ggml/examples/yolo/data/labels/33_1.png
deleted file mode 100644
index a13dd33..0000000
--- a/ggml/examples/yolo/data/labels/33_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/33_2.png b/ggml/examples/yolo/data/labels/33_2.png
deleted file mode 100644
index 57201b2..0000000
--- a/ggml/examples/yolo/data/labels/33_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/33_3.png b/ggml/examples/yolo/data/labels/33_3.png
deleted file mode 100644
index 0210160..0000000
--- a/ggml/examples/yolo/data/labels/33_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/33_4.png b/ggml/examples/yolo/data/labels/33_4.png
deleted file mode 100644
index fd181d1..0000000
--- a/ggml/examples/yolo/data/labels/33_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/33_5.png b/ggml/examples/yolo/data/labels/33_5.png
deleted file mode 100644
index ed4387a..0000000
--- a/ggml/examples/yolo/data/labels/33_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/33_6.png b/ggml/examples/yolo/data/labels/33_6.png
deleted file mode 100644
index 1126292..0000000
--- a/ggml/examples/yolo/data/labels/33_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/33_7.png b/ggml/examples/yolo/data/labels/33_7.png
deleted file mode 100644
index 1d67d55..0000000
--- a/ggml/examples/yolo/data/labels/33_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/34_0.png b/ggml/examples/yolo/data/labels/34_0.png
deleted file mode 100644
index 7b24cfa..0000000
--- a/ggml/examples/yolo/data/labels/34_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/34_1.png b/ggml/examples/yolo/data/labels/34_1.png
deleted file mode 100644
index 39f7edc..0000000
--- a/ggml/examples/yolo/data/labels/34_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/34_2.png b/ggml/examples/yolo/data/labels/34_2.png
deleted file mode 100644
index 00c4e87..0000000
--- a/ggml/examples/yolo/data/labels/34_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/34_3.png b/ggml/examples/yolo/data/labels/34_3.png
deleted file mode 100644
index 79d5176..0000000
--- a/ggml/examples/yolo/data/labels/34_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/34_4.png b/ggml/examples/yolo/data/labels/34_4.png
deleted file mode 100644
index d524830..0000000
--- a/ggml/examples/yolo/data/labels/34_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/34_5.png b/ggml/examples/yolo/data/labels/34_5.png
deleted file mode 100644
index 25a7e43..0000000
--- a/ggml/examples/yolo/data/labels/34_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/34_6.png b/ggml/examples/yolo/data/labels/34_6.png
deleted file mode 100644
index bf3b613..0000000
--- a/ggml/examples/yolo/data/labels/34_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/34_7.png b/ggml/examples/yolo/data/labels/34_7.png
deleted file mode 100644
index 1148f2d..0000000
--- a/ggml/examples/yolo/data/labels/34_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/35_0.png b/ggml/examples/yolo/data/labels/35_0.png
deleted file mode 100644
index ff572a8..0000000
--- a/ggml/examples/yolo/data/labels/35_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/35_1.png b/ggml/examples/yolo/data/labels/35_1.png
deleted file mode 100644
index 1bba494..0000000
--- a/ggml/examples/yolo/data/labels/35_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/35_2.png b/ggml/examples/yolo/data/labels/35_2.png
deleted file mode 100644
index b7d708e..0000000
--- a/ggml/examples/yolo/data/labels/35_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/35_3.png b/ggml/examples/yolo/data/labels/35_3.png
deleted file mode 100644
index 302cc73..0000000
--- a/ggml/examples/yolo/data/labels/35_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/35_4.png b/ggml/examples/yolo/data/labels/35_4.png
deleted file mode 100644
index 9e11a20..0000000
--- a/ggml/examples/yolo/data/labels/35_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/35_5.png b/ggml/examples/yolo/data/labels/35_5.png
deleted file mode 100644
index 9ab56b3..0000000
--- a/ggml/examples/yolo/data/labels/35_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/35_6.png b/ggml/examples/yolo/data/labels/35_6.png
deleted file mode 100644
index d75876c..0000000
--- a/ggml/examples/yolo/data/labels/35_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/35_7.png b/ggml/examples/yolo/data/labels/35_7.png
deleted file mode 100644
index 1bbdc9f..0000000
--- a/ggml/examples/yolo/data/labels/35_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/36_0.png b/ggml/examples/yolo/data/labels/36_0.png
deleted file mode 100644
index 929660a..0000000
--- a/ggml/examples/yolo/data/labels/36_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/36_1.png b/ggml/examples/yolo/data/labels/36_1.png
deleted file mode 100644
index 646e770..0000000
--- a/ggml/examples/yolo/data/labels/36_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/36_2.png b/ggml/examples/yolo/data/labels/36_2.png
deleted file mode 100644
index ba6603c..0000000
--- a/ggml/examples/yolo/data/labels/36_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/36_3.png b/ggml/examples/yolo/data/labels/36_3.png
deleted file mode 100644
index 56e5ba3..0000000
--- a/ggml/examples/yolo/data/labels/36_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/36_4.png b/ggml/examples/yolo/data/labels/36_4.png
deleted file mode 100644
index e87046c..0000000
--- a/ggml/examples/yolo/data/labels/36_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/36_5.png b/ggml/examples/yolo/data/labels/36_5.png
deleted file mode 100644
index 5a7672d..0000000
--- a/ggml/examples/yolo/data/labels/36_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/36_6.png b/ggml/examples/yolo/data/labels/36_6.png
deleted file mode 100644
index b16b2a2..0000000
--- a/ggml/examples/yolo/data/labels/36_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/36_7.png b/ggml/examples/yolo/data/labels/36_7.png
deleted file mode 100644
index 1f94312..0000000
--- a/ggml/examples/yolo/data/labels/36_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/37_0.png b/ggml/examples/yolo/data/labels/37_0.png
deleted file mode 100644
index 44b7f4e..0000000
--- a/ggml/examples/yolo/data/labels/37_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/37_1.png b/ggml/examples/yolo/data/labels/37_1.png
deleted file mode 100644
index 36712dc..0000000
--- a/ggml/examples/yolo/data/labels/37_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/37_2.png b/ggml/examples/yolo/data/labels/37_2.png
deleted file mode 100644
index 759aed9..0000000
--- a/ggml/examples/yolo/data/labels/37_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/37_3.png b/ggml/examples/yolo/data/labels/37_3.png
deleted file mode 100644
index 03253dd..0000000
--- a/ggml/examples/yolo/data/labels/37_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/37_4.png b/ggml/examples/yolo/data/labels/37_4.png
deleted file mode 100644
index ed5e4bd..0000000
--- a/ggml/examples/yolo/data/labels/37_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/37_5.png b/ggml/examples/yolo/data/labels/37_5.png
deleted file mode 100644
index fe20590..0000000
--- a/ggml/examples/yolo/data/labels/37_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/37_6.png b/ggml/examples/yolo/data/labels/37_6.png
deleted file mode 100644
index 11f34eb..0000000
--- a/ggml/examples/yolo/data/labels/37_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/37_7.png b/ggml/examples/yolo/data/labels/37_7.png
deleted file mode 100644
index dee9ba3..0000000
--- a/ggml/examples/yolo/data/labels/37_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/38_0.png b/ggml/examples/yolo/data/labels/38_0.png
deleted file mode 100644
index 47c51e1..0000000
--- a/ggml/examples/yolo/data/labels/38_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/38_1.png b/ggml/examples/yolo/data/labels/38_1.png
deleted file mode 100644
index 3abcc8b..0000000
--- a/ggml/examples/yolo/data/labels/38_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/38_2.png b/ggml/examples/yolo/data/labels/38_2.png
deleted file mode 100644
index 47a85e9..0000000
--- a/ggml/examples/yolo/data/labels/38_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/38_3.png b/ggml/examples/yolo/data/labels/38_3.png
deleted file mode 100644
index 1af926e..0000000
--- a/ggml/examples/yolo/data/labels/38_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/38_4.png b/ggml/examples/yolo/data/labels/38_4.png
deleted file mode 100644
index 783342a..0000000
--- a/ggml/examples/yolo/data/labels/38_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/38_5.png b/ggml/examples/yolo/data/labels/38_5.png
deleted file mode 100644
index 20b9222..0000000
--- a/ggml/examples/yolo/data/labels/38_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/38_6.png b/ggml/examples/yolo/data/labels/38_6.png
deleted file mode 100644
index f68e4d9..0000000
--- a/ggml/examples/yolo/data/labels/38_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/38_7.png b/ggml/examples/yolo/data/labels/38_7.png
deleted file mode 100644
index c6ffc54..0000000
--- a/ggml/examples/yolo/data/labels/38_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/39_0.png b/ggml/examples/yolo/data/labels/39_0.png
deleted file mode 100644
index 860f831..0000000
--- a/ggml/examples/yolo/data/labels/39_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/39_1.png b/ggml/examples/yolo/data/labels/39_1.png
deleted file mode 100644
index 021b232..0000000
--- a/ggml/examples/yolo/data/labels/39_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/39_2.png b/ggml/examples/yolo/data/labels/39_2.png
deleted file mode 100644
index 7ac7b15..0000000
--- a/ggml/examples/yolo/data/labels/39_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/39_3.png b/ggml/examples/yolo/data/labels/39_3.png
deleted file mode 100644
index 82baf94..0000000
--- a/ggml/examples/yolo/data/labels/39_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/39_4.png b/ggml/examples/yolo/data/labels/39_4.png
deleted file mode 100644
index 4fb4186..0000000
--- a/ggml/examples/yolo/data/labels/39_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/39_5.png b/ggml/examples/yolo/data/labels/39_5.png
deleted file mode 100644
index 1c613c5..0000000
--- a/ggml/examples/yolo/data/labels/39_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/39_6.png b/ggml/examples/yolo/data/labels/39_6.png
deleted file mode 100644
index e570bcd..0000000
--- a/ggml/examples/yolo/data/labels/39_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/39_7.png b/ggml/examples/yolo/data/labels/39_7.png
deleted file mode 100644
index 5cb309f..0000000
--- a/ggml/examples/yolo/data/labels/39_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/40_0.png b/ggml/examples/yolo/data/labels/40_0.png
deleted file mode 100644
index 346c633..0000000
--- a/ggml/examples/yolo/data/labels/40_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/40_1.png b/ggml/examples/yolo/data/labels/40_1.png
deleted file mode 100644
index 7dfa7b5..0000000
--- a/ggml/examples/yolo/data/labels/40_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/40_2.png b/ggml/examples/yolo/data/labels/40_2.png
deleted file mode 100644
index a2b0e0f..0000000
--- a/ggml/examples/yolo/data/labels/40_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/40_3.png b/ggml/examples/yolo/data/labels/40_3.png
deleted file mode 100644
index f727425..0000000
--- a/ggml/examples/yolo/data/labels/40_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/40_4.png b/ggml/examples/yolo/data/labels/40_4.png
deleted file mode 100644
index 4c89b55..0000000
--- a/ggml/examples/yolo/data/labels/40_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/40_5.png b/ggml/examples/yolo/data/labels/40_5.png
deleted file mode 100644
index 35b4fcd..0000000
--- a/ggml/examples/yolo/data/labels/40_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/40_6.png b/ggml/examples/yolo/data/labels/40_6.png
deleted file mode 100644
index 215972b..0000000
--- a/ggml/examples/yolo/data/labels/40_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/40_7.png b/ggml/examples/yolo/data/labels/40_7.png
deleted file mode 100644
index 0d8eb55..0000000
--- a/ggml/examples/yolo/data/labels/40_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/41_0.png b/ggml/examples/yolo/data/labels/41_0.png
deleted file mode 100644
index 42dd387..0000000
--- a/ggml/examples/yolo/data/labels/41_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/41_1.png b/ggml/examples/yolo/data/labels/41_1.png
deleted file mode 100644
index 0e1da96..0000000
--- a/ggml/examples/yolo/data/labels/41_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/41_2.png b/ggml/examples/yolo/data/labels/41_2.png
deleted file mode 100644
index 31cf47d..0000000
--- a/ggml/examples/yolo/data/labels/41_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/41_3.png b/ggml/examples/yolo/data/labels/41_3.png
deleted file mode 100644
index e512186..0000000
--- a/ggml/examples/yolo/data/labels/41_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/41_4.png b/ggml/examples/yolo/data/labels/41_4.png
deleted file mode 100644
index 150cb57..0000000
--- a/ggml/examples/yolo/data/labels/41_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/41_5.png b/ggml/examples/yolo/data/labels/41_5.png
deleted file mode 100644
index 7ac1a18..0000000
--- a/ggml/examples/yolo/data/labels/41_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/41_6.png b/ggml/examples/yolo/data/labels/41_6.png
deleted file mode 100644
index 4bdd2f0..0000000
--- a/ggml/examples/yolo/data/labels/41_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/41_7.png b/ggml/examples/yolo/data/labels/41_7.png
deleted file mode 100644
index 50568ba..0000000
--- a/ggml/examples/yolo/data/labels/41_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/42_0.png b/ggml/examples/yolo/data/labels/42_0.png
deleted file mode 100644
index f41cd84..0000000
--- a/ggml/examples/yolo/data/labels/42_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/42_1.png b/ggml/examples/yolo/data/labels/42_1.png
deleted file mode 100644
index 4aee8e6..0000000
--- a/ggml/examples/yolo/data/labels/42_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/42_2.png b/ggml/examples/yolo/data/labels/42_2.png
deleted file mode 100644
index e045f1d..0000000
--- a/ggml/examples/yolo/data/labels/42_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/42_3.png b/ggml/examples/yolo/data/labels/42_3.png
deleted file mode 100644
index edc194b..0000000
--- a/ggml/examples/yolo/data/labels/42_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/42_4.png b/ggml/examples/yolo/data/labels/42_4.png
deleted file mode 100644
index e417c3e..0000000
--- a/ggml/examples/yolo/data/labels/42_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/42_5.png b/ggml/examples/yolo/data/labels/42_5.png
deleted file mode 100644
index d6593c6..0000000
--- a/ggml/examples/yolo/data/labels/42_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/42_6.png b/ggml/examples/yolo/data/labels/42_6.png
deleted file mode 100644
index ffcaaed..0000000
--- a/ggml/examples/yolo/data/labels/42_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/42_7.png b/ggml/examples/yolo/data/labels/42_7.png
deleted file mode 100644
index 4895af7..0000000
--- a/ggml/examples/yolo/data/labels/42_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/43_0.png b/ggml/examples/yolo/data/labels/43_0.png
deleted file mode 100644
index af5295e..0000000
--- a/ggml/examples/yolo/data/labels/43_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/43_1.png b/ggml/examples/yolo/data/labels/43_1.png
deleted file mode 100644
index 0e20f34..0000000
--- a/ggml/examples/yolo/data/labels/43_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/43_2.png b/ggml/examples/yolo/data/labels/43_2.png
deleted file mode 100644
index be2fbb3..0000000
--- a/ggml/examples/yolo/data/labels/43_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/43_3.png b/ggml/examples/yolo/data/labels/43_3.png
deleted file mode 100644
index 5183eb1..0000000
--- a/ggml/examples/yolo/data/labels/43_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/43_4.png b/ggml/examples/yolo/data/labels/43_4.png
deleted file mode 100644
index 526b500..0000000
--- a/ggml/examples/yolo/data/labels/43_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/43_5.png b/ggml/examples/yolo/data/labels/43_5.png
deleted file mode 100644
index 03d62ff..0000000
--- a/ggml/examples/yolo/data/labels/43_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/43_6.png b/ggml/examples/yolo/data/labels/43_6.png
deleted file mode 100644
index b245a4c..0000000
--- a/ggml/examples/yolo/data/labels/43_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/43_7.png b/ggml/examples/yolo/data/labels/43_7.png
deleted file mode 100644
index e21688a..0000000
--- a/ggml/examples/yolo/data/labels/43_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/44_0.png b/ggml/examples/yolo/data/labels/44_0.png
deleted file mode 100644
index 63e66ff..0000000
--- a/ggml/examples/yolo/data/labels/44_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/44_1.png b/ggml/examples/yolo/data/labels/44_1.png
deleted file mode 100644
index 49f2fe2..0000000
--- a/ggml/examples/yolo/data/labels/44_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/44_2.png b/ggml/examples/yolo/data/labels/44_2.png
deleted file mode 100644
index 32fd2de..0000000
--- a/ggml/examples/yolo/data/labels/44_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/44_3.png b/ggml/examples/yolo/data/labels/44_3.png
deleted file mode 100644
index bb1118f..0000000
--- a/ggml/examples/yolo/data/labels/44_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/44_4.png b/ggml/examples/yolo/data/labels/44_4.png
deleted file mode 100644
index cd80bbd..0000000
--- a/ggml/examples/yolo/data/labels/44_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/44_5.png b/ggml/examples/yolo/data/labels/44_5.png
deleted file mode 100644
index f83e115..0000000
--- a/ggml/examples/yolo/data/labels/44_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/44_6.png b/ggml/examples/yolo/data/labels/44_6.png
deleted file mode 100644
index 1f8ad89..0000000
--- a/ggml/examples/yolo/data/labels/44_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/44_7.png b/ggml/examples/yolo/data/labels/44_7.png
deleted file mode 100644
index 8c5d632..0000000
--- a/ggml/examples/yolo/data/labels/44_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/45_0.png b/ggml/examples/yolo/data/labels/45_0.png
deleted file mode 100644
index 701bdc8..0000000
--- a/ggml/examples/yolo/data/labels/45_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/45_1.png b/ggml/examples/yolo/data/labels/45_1.png
deleted file mode 100644
index 976103f..0000000
--- a/ggml/examples/yolo/data/labels/45_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/45_2.png b/ggml/examples/yolo/data/labels/45_2.png
deleted file mode 100644
index 4bc50cf..0000000
--- a/ggml/examples/yolo/data/labels/45_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/45_3.png b/ggml/examples/yolo/data/labels/45_3.png
deleted file mode 100644
index 335c2c7..0000000
--- a/ggml/examples/yolo/data/labels/45_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/45_4.png b/ggml/examples/yolo/data/labels/45_4.png
deleted file mode 100644
index 8fc2738..0000000
--- a/ggml/examples/yolo/data/labels/45_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/45_5.png b/ggml/examples/yolo/data/labels/45_5.png
deleted file mode 100644
index acc4c09..0000000
--- a/ggml/examples/yolo/data/labels/45_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/45_6.png b/ggml/examples/yolo/data/labels/45_6.png
deleted file mode 100644
index 0077104..0000000
--- a/ggml/examples/yolo/data/labels/45_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/45_7.png b/ggml/examples/yolo/data/labels/45_7.png
deleted file mode 100644
index d486ccd..0000000
--- a/ggml/examples/yolo/data/labels/45_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/46_0.png b/ggml/examples/yolo/data/labels/46_0.png
deleted file mode 100644
index 44fa798..0000000
--- a/ggml/examples/yolo/data/labels/46_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/46_1.png b/ggml/examples/yolo/data/labels/46_1.png
deleted file mode 100644
index 0f35bbb..0000000
--- a/ggml/examples/yolo/data/labels/46_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/46_2.png b/ggml/examples/yolo/data/labels/46_2.png
deleted file mode 100644
index e3355f5..0000000
--- a/ggml/examples/yolo/data/labels/46_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/46_3.png b/ggml/examples/yolo/data/labels/46_3.png
deleted file mode 100644
index 9226458..0000000
--- a/ggml/examples/yolo/data/labels/46_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/46_4.png b/ggml/examples/yolo/data/labels/46_4.png
deleted file mode 100644
index 77a841c..0000000
--- a/ggml/examples/yolo/data/labels/46_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/46_5.png b/ggml/examples/yolo/data/labels/46_5.png
deleted file mode 100644
index e86cbaf..0000000
--- a/ggml/examples/yolo/data/labels/46_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/46_6.png b/ggml/examples/yolo/data/labels/46_6.png
deleted file mode 100644
index 5011ded..0000000
--- a/ggml/examples/yolo/data/labels/46_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/46_7.png b/ggml/examples/yolo/data/labels/46_7.png
deleted file mode 100644
index 53c8871..0000000
--- a/ggml/examples/yolo/data/labels/46_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/47_0.png b/ggml/examples/yolo/data/labels/47_0.png
deleted file mode 100644
index 9b7c050..0000000
--- a/ggml/examples/yolo/data/labels/47_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/47_1.png b/ggml/examples/yolo/data/labels/47_1.png
deleted file mode 100644
index 75ba112..0000000
--- a/ggml/examples/yolo/data/labels/47_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/47_2.png b/ggml/examples/yolo/data/labels/47_2.png
deleted file mode 100644
index 2f07656..0000000
--- a/ggml/examples/yolo/data/labels/47_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/47_3.png b/ggml/examples/yolo/data/labels/47_3.png
deleted file mode 100644
index 3fd0728..0000000
--- a/ggml/examples/yolo/data/labels/47_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/47_4.png b/ggml/examples/yolo/data/labels/47_4.png
deleted file mode 100644
index 9e503eb..0000000
--- a/ggml/examples/yolo/data/labels/47_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/47_5.png b/ggml/examples/yolo/data/labels/47_5.png
deleted file mode 100644
index fddef15..0000000
--- a/ggml/examples/yolo/data/labels/47_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/47_6.png b/ggml/examples/yolo/data/labels/47_6.png
deleted file mode 100644
index 0117b55..0000000
--- a/ggml/examples/yolo/data/labels/47_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/47_7.png b/ggml/examples/yolo/data/labels/47_7.png
deleted file mode 100644
index fff1323..0000000
--- a/ggml/examples/yolo/data/labels/47_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/48_0.png b/ggml/examples/yolo/data/labels/48_0.png
deleted file mode 100644
index 4775310..0000000
--- a/ggml/examples/yolo/data/labels/48_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/48_1.png b/ggml/examples/yolo/data/labels/48_1.png
deleted file mode 100644
index 231cd3f..0000000
--- a/ggml/examples/yolo/data/labels/48_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/48_2.png b/ggml/examples/yolo/data/labels/48_2.png
deleted file mode 100644
index b653e9c..0000000
--- a/ggml/examples/yolo/data/labels/48_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/48_3.png b/ggml/examples/yolo/data/labels/48_3.png
deleted file mode 100644
index d148a75..0000000
--- a/ggml/examples/yolo/data/labels/48_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/48_4.png b/ggml/examples/yolo/data/labels/48_4.png
deleted file mode 100644
index 9e1b0d5..0000000
--- a/ggml/examples/yolo/data/labels/48_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/48_5.png b/ggml/examples/yolo/data/labels/48_5.png
deleted file mode 100644
index f554133..0000000
--- a/ggml/examples/yolo/data/labels/48_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/48_6.png b/ggml/examples/yolo/data/labels/48_6.png
deleted file mode 100644
index d841928..0000000
--- a/ggml/examples/yolo/data/labels/48_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/48_7.png b/ggml/examples/yolo/data/labels/48_7.png
deleted file mode 100644
index 145ed36..0000000
--- a/ggml/examples/yolo/data/labels/48_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/49_0.png b/ggml/examples/yolo/data/labels/49_0.png
deleted file mode 100644
index f979710..0000000
--- a/ggml/examples/yolo/data/labels/49_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/49_1.png b/ggml/examples/yolo/data/labels/49_1.png
deleted file mode 100644
index 0cb26d4..0000000
--- a/ggml/examples/yolo/data/labels/49_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/49_2.png b/ggml/examples/yolo/data/labels/49_2.png
deleted file mode 100644
index 0684a71..0000000
--- a/ggml/examples/yolo/data/labels/49_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/49_3.png b/ggml/examples/yolo/data/labels/49_3.png
deleted file mode 100644
index ce89eb7..0000000
--- a/ggml/examples/yolo/data/labels/49_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/49_4.png b/ggml/examples/yolo/data/labels/49_4.png
deleted file mode 100644
index a8f6b14..0000000
--- a/ggml/examples/yolo/data/labels/49_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/49_5.png b/ggml/examples/yolo/data/labels/49_5.png
deleted file mode 100644
index c7e8033..0000000
--- a/ggml/examples/yolo/data/labels/49_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/49_6.png b/ggml/examples/yolo/data/labels/49_6.png
deleted file mode 100644
index a091704..0000000
--- a/ggml/examples/yolo/data/labels/49_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/49_7.png b/ggml/examples/yolo/data/labels/49_7.png
deleted file mode 100644
index f385b75..0000000
--- a/ggml/examples/yolo/data/labels/49_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/50_0.png b/ggml/examples/yolo/data/labels/50_0.png
deleted file mode 100644
index 6af169d..0000000
--- a/ggml/examples/yolo/data/labels/50_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/50_1.png b/ggml/examples/yolo/data/labels/50_1.png
deleted file mode 100644
index a3c0300..0000000
--- a/ggml/examples/yolo/data/labels/50_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/50_2.png b/ggml/examples/yolo/data/labels/50_2.png
deleted file mode 100644
index 1ea6ff6..0000000
--- a/ggml/examples/yolo/data/labels/50_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/50_3.png b/ggml/examples/yolo/data/labels/50_3.png
deleted file mode 100644
index 3c0bf1d..0000000
--- a/ggml/examples/yolo/data/labels/50_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/50_4.png b/ggml/examples/yolo/data/labels/50_4.png
deleted file mode 100644
index eb6a9d9..0000000
--- a/ggml/examples/yolo/data/labels/50_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/50_5.png b/ggml/examples/yolo/data/labels/50_5.png
deleted file mode 100644
index acdb5ed..0000000
--- a/ggml/examples/yolo/data/labels/50_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/50_6.png b/ggml/examples/yolo/data/labels/50_6.png
deleted file mode 100644
index c74ac5a..0000000
--- a/ggml/examples/yolo/data/labels/50_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/50_7.png b/ggml/examples/yolo/data/labels/50_7.png
deleted file mode 100644
index 6d3f530..0000000
--- a/ggml/examples/yolo/data/labels/50_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/51_0.png b/ggml/examples/yolo/data/labels/51_0.png
deleted file mode 100644
index 48bad97..0000000
--- a/ggml/examples/yolo/data/labels/51_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/51_1.png b/ggml/examples/yolo/data/labels/51_1.png
deleted file mode 100644
index 7703a74..0000000
--- a/ggml/examples/yolo/data/labels/51_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/51_2.png b/ggml/examples/yolo/data/labels/51_2.png
deleted file mode 100644
index 804e21b..0000000
--- a/ggml/examples/yolo/data/labels/51_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/51_3.png b/ggml/examples/yolo/data/labels/51_3.png
deleted file mode 100644
index 5b08757..0000000
--- a/ggml/examples/yolo/data/labels/51_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/51_4.png b/ggml/examples/yolo/data/labels/51_4.png
deleted file mode 100644
index 30727fb..0000000
--- a/ggml/examples/yolo/data/labels/51_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/51_5.png b/ggml/examples/yolo/data/labels/51_5.png
deleted file mode 100644
index de240ea..0000000
--- a/ggml/examples/yolo/data/labels/51_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/51_6.png b/ggml/examples/yolo/data/labels/51_6.png
deleted file mode 100644
index c252b01..0000000
--- a/ggml/examples/yolo/data/labels/51_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/51_7.png b/ggml/examples/yolo/data/labels/51_7.png
deleted file mode 100644
index 806f99b..0000000
--- a/ggml/examples/yolo/data/labels/51_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/52_0.png b/ggml/examples/yolo/data/labels/52_0.png
deleted file mode 100644
index a554b31..0000000
--- a/ggml/examples/yolo/data/labels/52_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/52_1.png b/ggml/examples/yolo/data/labels/52_1.png
deleted file mode 100644
index 936c84f..0000000
--- a/ggml/examples/yolo/data/labels/52_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/52_2.png b/ggml/examples/yolo/data/labels/52_2.png
deleted file mode 100644
index 57e3f05..0000000
--- a/ggml/examples/yolo/data/labels/52_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/52_3.png b/ggml/examples/yolo/data/labels/52_3.png
deleted file mode 100644
index 6a8f99e..0000000
--- a/ggml/examples/yolo/data/labels/52_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/52_4.png b/ggml/examples/yolo/data/labels/52_4.png
deleted file mode 100644
index 91f97f0..0000000
--- a/ggml/examples/yolo/data/labels/52_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/52_5.png b/ggml/examples/yolo/data/labels/52_5.png
deleted file mode 100644
index a78504f..0000000
--- a/ggml/examples/yolo/data/labels/52_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/52_6.png b/ggml/examples/yolo/data/labels/52_6.png
deleted file mode 100644
index f52bd2c..0000000
--- a/ggml/examples/yolo/data/labels/52_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/52_7.png b/ggml/examples/yolo/data/labels/52_7.png
deleted file mode 100644
index 621b8e0..0000000
--- a/ggml/examples/yolo/data/labels/52_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/53_0.png b/ggml/examples/yolo/data/labels/53_0.png
deleted file mode 100644
index a750615..0000000
--- a/ggml/examples/yolo/data/labels/53_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/53_1.png b/ggml/examples/yolo/data/labels/53_1.png
deleted file mode 100644
index 1c4e097..0000000
--- a/ggml/examples/yolo/data/labels/53_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/53_2.png b/ggml/examples/yolo/data/labels/53_2.png
deleted file mode 100644
index f523856..0000000
--- a/ggml/examples/yolo/data/labels/53_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/53_3.png b/ggml/examples/yolo/data/labels/53_3.png
deleted file mode 100644
index 85505f8..0000000
--- a/ggml/examples/yolo/data/labels/53_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/53_4.png b/ggml/examples/yolo/data/labels/53_4.png
deleted file mode 100644
index 14e8da1..0000000
--- a/ggml/examples/yolo/data/labels/53_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/53_5.png b/ggml/examples/yolo/data/labels/53_5.png
deleted file mode 100644
index 79781fc..0000000
--- a/ggml/examples/yolo/data/labels/53_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/53_6.png b/ggml/examples/yolo/data/labels/53_6.png
deleted file mode 100644
index a6e1863..0000000
--- a/ggml/examples/yolo/data/labels/53_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/53_7.png b/ggml/examples/yolo/data/labels/53_7.png
deleted file mode 100644
index bff5eb6..0000000
--- a/ggml/examples/yolo/data/labels/53_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/54_0.png b/ggml/examples/yolo/data/labels/54_0.png
deleted file mode 100644
index 98ce423..0000000
--- a/ggml/examples/yolo/data/labels/54_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/54_1.png b/ggml/examples/yolo/data/labels/54_1.png
deleted file mode 100644
index 643ef36..0000000
--- a/ggml/examples/yolo/data/labels/54_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/54_2.png b/ggml/examples/yolo/data/labels/54_2.png
deleted file mode 100644
index 2c3894e..0000000
--- a/ggml/examples/yolo/data/labels/54_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/54_3.png b/ggml/examples/yolo/data/labels/54_3.png
deleted file mode 100644
index 6190fdc..0000000
--- a/ggml/examples/yolo/data/labels/54_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/54_4.png b/ggml/examples/yolo/data/labels/54_4.png
deleted file mode 100644
index 92afc20..0000000
--- a/ggml/examples/yolo/data/labels/54_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/54_5.png b/ggml/examples/yolo/data/labels/54_5.png
deleted file mode 100644
index feab352..0000000
--- a/ggml/examples/yolo/data/labels/54_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/54_6.png b/ggml/examples/yolo/data/labels/54_6.png
deleted file mode 100644
index c46698f..0000000
--- a/ggml/examples/yolo/data/labels/54_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/54_7.png b/ggml/examples/yolo/data/labels/54_7.png
deleted file mode 100644
index 175dbeb..0000000
--- a/ggml/examples/yolo/data/labels/54_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/55_0.png b/ggml/examples/yolo/data/labels/55_0.png
deleted file mode 100644
index 644838b..0000000
--- a/ggml/examples/yolo/data/labels/55_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/55_1.png b/ggml/examples/yolo/data/labels/55_1.png
deleted file mode 100644
index b84e0eb..0000000
--- a/ggml/examples/yolo/data/labels/55_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/55_2.png b/ggml/examples/yolo/data/labels/55_2.png
deleted file mode 100644
index 3896d08..0000000
--- a/ggml/examples/yolo/data/labels/55_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/55_3.png b/ggml/examples/yolo/data/labels/55_3.png
deleted file mode 100644
index 21e9266..0000000
--- a/ggml/examples/yolo/data/labels/55_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/55_4.png b/ggml/examples/yolo/data/labels/55_4.png
deleted file mode 100644
index d6aed1b..0000000
--- a/ggml/examples/yolo/data/labels/55_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/55_5.png b/ggml/examples/yolo/data/labels/55_5.png
deleted file mode 100644
index 1f60c36..0000000
--- a/ggml/examples/yolo/data/labels/55_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/55_6.png b/ggml/examples/yolo/data/labels/55_6.png
deleted file mode 100644
index 2a7f1fb..0000000
--- a/ggml/examples/yolo/data/labels/55_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/55_7.png b/ggml/examples/yolo/data/labels/55_7.png
deleted file mode 100644
index 80c527b..0000000
--- a/ggml/examples/yolo/data/labels/55_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/56_0.png b/ggml/examples/yolo/data/labels/56_0.png
deleted file mode 100644
index 8a1e0c7..0000000
--- a/ggml/examples/yolo/data/labels/56_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/56_1.png b/ggml/examples/yolo/data/labels/56_1.png
deleted file mode 100644
index a0a81a0..0000000
--- a/ggml/examples/yolo/data/labels/56_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/56_2.png b/ggml/examples/yolo/data/labels/56_2.png
deleted file mode 100644
index 367f20e..0000000
--- a/ggml/examples/yolo/data/labels/56_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/56_3.png b/ggml/examples/yolo/data/labels/56_3.png
deleted file mode 100644
index cf56f8f..0000000
--- a/ggml/examples/yolo/data/labels/56_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/56_4.png b/ggml/examples/yolo/data/labels/56_4.png
deleted file mode 100644
index e57cce7..0000000
--- a/ggml/examples/yolo/data/labels/56_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/56_5.png b/ggml/examples/yolo/data/labels/56_5.png
deleted file mode 100644
index c206547..0000000
--- a/ggml/examples/yolo/data/labels/56_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/56_6.png b/ggml/examples/yolo/data/labels/56_6.png
deleted file mode 100644
index 040692a..0000000
--- a/ggml/examples/yolo/data/labels/56_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/56_7.png b/ggml/examples/yolo/data/labels/56_7.png
deleted file mode 100644
index 4c27c10..0000000
--- a/ggml/examples/yolo/data/labels/56_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/57_0.png b/ggml/examples/yolo/data/labels/57_0.png
deleted file mode 100644
index e20a46f..0000000
--- a/ggml/examples/yolo/data/labels/57_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/57_1.png b/ggml/examples/yolo/data/labels/57_1.png
deleted file mode 100644
index 17cd262..0000000
--- a/ggml/examples/yolo/data/labels/57_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/57_2.png b/ggml/examples/yolo/data/labels/57_2.png
deleted file mode 100644
index 98fad29..0000000
--- a/ggml/examples/yolo/data/labels/57_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/57_3.png b/ggml/examples/yolo/data/labels/57_3.png
deleted file mode 100644
index ba80aa0..0000000
--- a/ggml/examples/yolo/data/labels/57_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/57_4.png b/ggml/examples/yolo/data/labels/57_4.png
deleted file mode 100644
index e982243..0000000
--- a/ggml/examples/yolo/data/labels/57_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/57_5.png b/ggml/examples/yolo/data/labels/57_5.png
deleted file mode 100644
index e947ff8..0000000
--- a/ggml/examples/yolo/data/labels/57_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/57_6.png b/ggml/examples/yolo/data/labels/57_6.png
deleted file mode 100644
index c4ae076..0000000
--- a/ggml/examples/yolo/data/labels/57_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/57_7.png b/ggml/examples/yolo/data/labels/57_7.png
deleted file mode 100644
index bd4ca16..0000000
--- a/ggml/examples/yolo/data/labels/57_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/58_0.png b/ggml/examples/yolo/data/labels/58_0.png
deleted file mode 100644
index d9e35f7..0000000
--- a/ggml/examples/yolo/data/labels/58_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/58_1.png b/ggml/examples/yolo/data/labels/58_1.png
deleted file mode 100644
index 0dd6dd0..0000000
--- a/ggml/examples/yolo/data/labels/58_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/58_2.png b/ggml/examples/yolo/data/labels/58_2.png
deleted file mode 100644
index b9f1319..0000000
--- a/ggml/examples/yolo/data/labels/58_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/58_3.png b/ggml/examples/yolo/data/labels/58_3.png
deleted file mode 100644
index 0604337..0000000
--- a/ggml/examples/yolo/data/labels/58_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/58_4.png b/ggml/examples/yolo/data/labels/58_4.png
deleted file mode 100644
index 7283e03..0000000
--- a/ggml/examples/yolo/data/labels/58_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/58_5.png b/ggml/examples/yolo/data/labels/58_5.png
deleted file mode 100644
index 2df3db9..0000000
--- a/ggml/examples/yolo/data/labels/58_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/58_6.png b/ggml/examples/yolo/data/labels/58_6.png
deleted file mode 100644
index e120909..0000000
--- a/ggml/examples/yolo/data/labels/58_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/58_7.png b/ggml/examples/yolo/data/labels/58_7.png
deleted file mode 100644
index e85fc0a..0000000
--- a/ggml/examples/yolo/data/labels/58_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/59_0.png b/ggml/examples/yolo/data/labels/59_0.png
deleted file mode 100644
index 09b2120..0000000
--- a/ggml/examples/yolo/data/labels/59_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/59_1.png b/ggml/examples/yolo/data/labels/59_1.png
deleted file mode 100644
index 4b91814..0000000
--- a/ggml/examples/yolo/data/labels/59_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/59_2.png b/ggml/examples/yolo/data/labels/59_2.png
deleted file mode 100644
index cc302c2..0000000
--- a/ggml/examples/yolo/data/labels/59_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/59_3.png b/ggml/examples/yolo/data/labels/59_3.png
deleted file mode 100644
index 6921796..0000000
--- a/ggml/examples/yolo/data/labels/59_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/59_4.png b/ggml/examples/yolo/data/labels/59_4.png
deleted file mode 100644
index 4b77147..0000000
--- a/ggml/examples/yolo/data/labels/59_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/59_5.png b/ggml/examples/yolo/data/labels/59_5.png
deleted file mode 100644
index 4325e62..0000000
--- a/ggml/examples/yolo/data/labels/59_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/59_6.png b/ggml/examples/yolo/data/labels/59_6.png
deleted file mode 100644
index 135da33..0000000
--- a/ggml/examples/yolo/data/labels/59_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/59_7.png b/ggml/examples/yolo/data/labels/59_7.png
deleted file mode 100644
index 353ae33..0000000
--- a/ggml/examples/yolo/data/labels/59_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/60_0.png b/ggml/examples/yolo/data/labels/60_0.png
deleted file mode 100644
index 210e77d..0000000
--- a/ggml/examples/yolo/data/labels/60_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/60_1.png b/ggml/examples/yolo/data/labels/60_1.png
deleted file mode 100644
index 507c736..0000000
--- a/ggml/examples/yolo/data/labels/60_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/60_2.png b/ggml/examples/yolo/data/labels/60_2.png
deleted file mode 100644
index 4227322..0000000
--- a/ggml/examples/yolo/data/labels/60_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/60_3.png b/ggml/examples/yolo/data/labels/60_3.png
deleted file mode 100644
index ac4d026..0000000
--- a/ggml/examples/yolo/data/labels/60_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/60_4.png b/ggml/examples/yolo/data/labels/60_4.png
deleted file mode 100644
index 5cf89ed..0000000
--- a/ggml/examples/yolo/data/labels/60_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/60_5.png b/ggml/examples/yolo/data/labels/60_5.png
deleted file mode 100644
index 5ceb7fe..0000000
--- a/ggml/examples/yolo/data/labels/60_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/60_6.png b/ggml/examples/yolo/data/labels/60_6.png
deleted file mode 100644
index 45121a7..0000000
--- a/ggml/examples/yolo/data/labels/60_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/60_7.png b/ggml/examples/yolo/data/labels/60_7.png
deleted file mode 100644
index 33eb9e3..0000000
--- a/ggml/examples/yolo/data/labels/60_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/61_0.png b/ggml/examples/yolo/data/labels/61_0.png
deleted file mode 100644
index 72bc6bf..0000000
--- a/ggml/examples/yolo/data/labels/61_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/61_1.png b/ggml/examples/yolo/data/labels/61_1.png
deleted file mode 100644
index 50def57..0000000
--- a/ggml/examples/yolo/data/labels/61_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/61_2.png b/ggml/examples/yolo/data/labels/61_2.png
deleted file mode 100644
index a0e9a67..0000000
--- a/ggml/examples/yolo/data/labels/61_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/61_3.png b/ggml/examples/yolo/data/labels/61_3.png
deleted file mode 100644
index 0b7d149..0000000
--- a/ggml/examples/yolo/data/labels/61_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/61_4.png b/ggml/examples/yolo/data/labels/61_4.png
deleted file mode 100644
index c0b71d8..0000000
--- a/ggml/examples/yolo/data/labels/61_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/61_5.png b/ggml/examples/yolo/data/labels/61_5.png
deleted file mode 100644
index cf39e63..0000000
--- a/ggml/examples/yolo/data/labels/61_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/61_6.png b/ggml/examples/yolo/data/labels/61_6.png
deleted file mode 100644
index 95282d4..0000000
--- a/ggml/examples/yolo/data/labels/61_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/61_7.png b/ggml/examples/yolo/data/labels/61_7.png
deleted file mode 100644
index 6da5c02..0000000
--- a/ggml/examples/yolo/data/labels/61_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/62_0.png b/ggml/examples/yolo/data/labels/62_0.png
deleted file mode 100644
index bb12109..0000000
--- a/ggml/examples/yolo/data/labels/62_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/62_1.png b/ggml/examples/yolo/data/labels/62_1.png
deleted file mode 100644
index 5b82445..0000000
--- a/ggml/examples/yolo/data/labels/62_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/62_2.png b/ggml/examples/yolo/data/labels/62_2.png
deleted file mode 100644
index d4ca5a6..0000000
--- a/ggml/examples/yolo/data/labels/62_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/62_3.png b/ggml/examples/yolo/data/labels/62_3.png
deleted file mode 100644
index 24ddea8..0000000
--- a/ggml/examples/yolo/data/labels/62_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/62_4.png b/ggml/examples/yolo/data/labels/62_4.png
deleted file mode 100644
index 34130a2..0000000
--- a/ggml/examples/yolo/data/labels/62_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/62_5.png b/ggml/examples/yolo/data/labels/62_5.png
deleted file mode 100644
index 823d996..0000000
--- a/ggml/examples/yolo/data/labels/62_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/62_6.png b/ggml/examples/yolo/data/labels/62_6.png
deleted file mode 100644
index 63dd85d..0000000
--- a/ggml/examples/yolo/data/labels/62_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/62_7.png b/ggml/examples/yolo/data/labels/62_7.png
deleted file mode 100644
index 7c6158f..0000000
--- a/ggml/examples/yolo/data/labels/62_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/63_0.png b/ggml/examples/yolo/data/labels/63_0.png
deleted file mode 100644
index 2d9571e..0000000
--- a/ggml/examples/yolo/data/labels/63_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/63_1.png b/ggml/examples/yolo/data/labels/63_1.png
deleted file mode 100644
index 5376e07..0000000
--- a/ggml/examples/yolo/data/labels/63_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/63_2.png b/ggml/examples/yolo/data/labels/63_2.png
deleted file mode 100644
index 746ffa0..0000000
--- a/ggml/examples/yolo/data/labels/63_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/63_3.png b/ggml/examples/yolo/data/labels/63_3.png
deleted file mode 100644
index 58bb35a..0000000
--- a/ggml/examples/yolo/data/labels/63_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/63_4.png b/ggml/examples/yolo/data/labels/63_4.png
deleted file mode 100644
index ff1dc1a..0000000
--- a/ggml/examples/yolo/data/labels/63_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/63_5.png b/ggml/examples/yolo/data/labels/63_5.png
deleted file mode 100644
index 13e34ce..0000000
--- a/ggml/examples/yolo/data/labels/63_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/63_6.png b/ggml/examples/yolo/data/labels/63_6.png
deleted file mode 100644
index 9407c83..0000000
--- a/ggml/examples/yolo/data/labels/63_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/63_7.png b/ggml/examples/yolo/data/labels/63_7.png
deleted file mode 100644
index e9c3edf..0000000
--- a/ggml/examples/yolo/data/labels/63_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/64_0.png b/ggml/examples/yolo/data/labels/64_0.png
deleted file mode 100644
index 0a3a101..0000000
--- a/ggml/examples/yolo/data/labels/64_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/64_1.png b/ggml/examples/yolo/data/labels/64_1.png
deleted file mode 100644
index 6313960..0000000
--- a/ggml/examples/yolo/data/labels/64_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/64_2.png b/ggml/examples/yolo/data/labels/64_2.png
deleted file mode 100644
index c49565c..0000000
--- a/ggml/examples/yolo/data/labels/64_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/64_3.png b/ggml/examples/yolo/data/labels/64_3.png
deleted file mode 100644
index c697f9b..0000000
--- a/ggml/examples/yolo/data/labels/64_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/64_4.png b/ggml/examples/yolo/data/labels/64_4.png
deleted file mode 100644
index de194c7..0000000
--- a/ggml/examples/yolo/data/labels/64_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/64_5.png b/ggml/examples/yolo/data/labels/64_5.png
deleted file mode 100644
index 9905004..0000000
--- a/ggml/examples/yolo/data/labels/64_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/64_6.png b/ggml/examples/yolo/data/labels/64_6.png
deleted file mode 100644
index 7b91569..0000000
--- a/ggml/examples/yolo/data/labels/64_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/64_7.png b/ggml/examples/yolo/data/labels/64_7.png
deleted file mode 100644
index b9ebcee..0000000
--- a/ggml/examples/yolo/data/labels/64_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/65_0.png b/ggml/examples/yolo/data/labels/65_0.png
deleted file mode 100644
index c388066..0000000
--- a/ggml/examples/yolo/data/labels/65_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/65_1.png b/ggml/examples/yolo/data/labels/65_1.png
deleted file mode 100644
index 9f7d7f7..0000000
--- a/ggml/examples/yolo/data/labels/65_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/65_2.png b/ggml/examples/yolo/data/labels/65_2.png
deleted file mode 100644
index 6f76694..0000000
--- a/ggml/examples/yolo/data/labels/65_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/65_3.png b/ggml/examples/yolo/data/labels/65_3.png
deleted file mode 100644
index 3593128..0000000
--- a/ggml/examples/yolo/data/labels/65_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/65_4.png b/ggml/examples/yolo/data/labels/65_4.png
deleted file mode 100644
index 23f4814..0000000
--- a/ggml/examples/yolo/data/labels/65_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/65_5.png b/ggml/examples/yolo/data/labels/65_5.png
deleted file mode 100644
index 8eae6d7..0000000
--- a/ggml/examples/yolo/data/labels/65_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/65_6.png b/ggml/examples/yolo/data/labels/65_6.png
deleted file mode 100644
index 02fec76..0000000
--- a/ggml/examples/yolo/data/labels/65_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/65_7.png b/ggml/examples/yolo/data/labels/65_7.png
deleted file mode 100644
index c80822e..0000000
--- a/ggml/examples/yolo/data/labels/65_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/66_0.png b/ggml/examples/yolo/data/labels/66_0.png
deleted file mode 100644
index 278a401..0000000
--- a/ggml/examples/yolo/data/labels/66_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/66_1.png b/ggml/examples/yolo/data/labels/66_1.png
deleted file mode 100644
index 47c3735..0000000
--- a/ggml/examples/yolo/data/labels/66_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/66_2.png b/ggml/examples/yolo/data/labels/66_2.png
deleted file mode 100644
index dfcf8c4..0000000
--- a/ggml/examples/yolo/data/labels/66_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/66_3.png b/ggml/examples/yolo/data/labels/66_3.png
deleted file mode 100644
index d8d69a7..0000000
--- a/ggml/examples/yolo/data/labels/66_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/66_4.png b/ggml/examples/yolo/data/labels/66_4.png
deleted file mode 100644
index 842976a..0000000
--- a/ggml/examples/yolo/data/labels/66_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/66_5.png b/ggml/examples/yolo/data/labels/66_5.png
deleted file mode 100644
index 991caf8..0000000
--- a/ggml/examples/yolo/data/labels/66_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/66_6.png b/ggml/examples/yolo/data/labels/66_6.png
deleted file mode 100644
index 32c5284..0000000
--- a/ggml/examples/yolo/data/labels/66_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/66_7.png b/ggml/examples/yolo/data/labels/66_7.png
deleted file mode 100644
index 14e8609..0000000
--- a/ggml/examples/yolo/data/labels/66_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/67_0.png b/ggml/examples/yolo/data/labels/67_0.png
deleted file mode 100644
index 7f2786a..0000000
--- a/ggml/examples/yolo/data/labels/67_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/67_1.png b/ggml/examples/yolo/data/labels/67_1.png
deleted file mode 100644
index 798b421..0000000
--- a/ggml/examples/yolo/data/labels/67_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/67_2.png b/ggml/examples/yolo/data/labels/67_2.png
deleted file mode 100644
index 802aaab..0000000
--- a/ggml/examples/yolo/data/labels/67_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/67_3.png b/ggml/examples/yolo/data/labels/67_3.png
deleted file mode 100644
index 9d975ca..0000000
--- a/ggml/examples/yolo/data/labels/67_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/67_4.png b/ggml/examples/yolo/data/labels/67_4.png
deleted file mode 100644
index 94ee577..0000000
--- a/ggml/examples/yolo/data/labels/67_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/67_5.png b/ggml/examples/yolo/data/labels/67_5.png
deleted file mode 100644
index 021a877..0000000
--- a/ggml/examples/yolo/data/labels/67_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/67_6.png b/ggml/examples/yolo/data/labels/67_6.png
deleted file mode 100644
index c358732..0000000
--- a/ggml/examples/yolo/data/labels/67_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/67_7.png b/ggml/examples/yolo/data/labels/67_7.png
deleted file mode 100644
index 46acdf2..0000000
--- a/ggml/examples/yolo/data/labels/67_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/68_0.png b/ggml/examples/yolo/data/labels/68_0.png
deleted file mode 100644
index 9e86a2f..0000000
--- a/ggml/examples/yolo/data/labels/68_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/68_1.png b/ggml/examples/yolo/data/labels/68_1.png
deleted file mode 100644
index d7d3ac3..0000000
--- a/ggml/examples/yolo/data/labels/68_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/68_2.png b/ggml/examples/yolo/data/labels/68_2.png
deleted file mode 100644
index 3a48f05..0000000
--- a/ggml/examples/yolo/data/labels/68_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/68_3.png b/ggml/examples/yolo/data/labels/68_3.png
deleted file mode 100644
index 808603a..0000000
--- a/ggml/examples/yolo/data/labels/68_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/68_4.png b/ggml/examples/yolo/data/labels/68_4.png
deleted file mode 100644
index d4f9bd4..0000000
--- a/ggml/examples/yolo/data/labels/68_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/68_5.png b/ggml/examples/yolo/data/labels/68_5.png
deleted file mode 100644
index 25d8a96..0000000
--- a/ggml/examples/yolo/data/labels/68_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/68_6.png b/ggml/examples/yolo/data/labels/68_6.png
deleted file mode 100644
index 0059c7f..0000000
--- a/ggml/examples/yolo/data/labels/68_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/68_7.png b/ggml/examples/yolo/data/labels/68_7.png
deleted file mode 100644
index c2aa922..0000000
--- a/ggml/examples/yolo/data/labels/68_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/69_0.png b/ggml/examples/yolo/data/labels/69_0.png
deleted file mode 100644
index 51140e0..0000000
--- a/ggml/examples/yolo/data/labels/69_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/69_1.png b/ggml/examples/yolo/data/labels/69_1.png
deleted file mode 100644
index bcf7311..0000000
--- a/ggml/examples/yolo/data/labels/69_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/69_2.png b/ggml/examples/yolo/data/labels/69_2.png
deleted file mode 100644
index 582b289..0000000
--- a/ggml/examples/yolo/data/labels/69_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/69_3.png b/ggml/examples/yolo/data/labels/69_3.png
deleted file mode 100644
index e331123..0000000
--- a/ggml/examples/yolo/data/labels/69_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/69_4.png b/ggml/examples/yolo/data/labels/69_4.png
deleted file mode 100644
index a310f84..0000000
--- a/ggml/examples/yolo/data/labels/69_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/69_5.png b/ggml/examples/yolo/data/labels/69_5.png
deleted file mode 100644
index ad69668..0000000
--- a/ggml/examples/yolo/data/labels/69_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/69_6.png b/ggml/examples/yolo/data/labels/69_6.png
deleted file mode 100644
index a7b8f45..0000000
--- a/ggml/examples/yolo/data/labels/69_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/69_7.png b/ggml/examples/yolo/data/labels/69_7.png
deleted file mode 100644
index 7f3200c..0000000
--- a/ggml/examples/yolo/data/labels/69_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/70_0.png b/ggml/examples/yolo/data/labels/70_0.png
deleted file mode 100644
index e0fb76c..0000000
--- a/ggml/examples/yolo/data/labels/70_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/70_1.png b/ggml/examples/yolo/data/labels/70_1.png
deleted file mode 100644
index 40c8217..0000000
--- a/ggml/examples/yolo/data/labels/70_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/70_2.png b/ggml/examples/yolo/data/labels/70_2.png
deleted file mode 100644
index 63c811b..0000000
--- a/ggml/examples/yolo/data/labels/70_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/70_3.png b/ggml/examples/yolo/data/labels/70_3.png
deleted file mode 100644
index 7c93cf7..0000000
--- a/ggml/examples/yolo/data/labels/70_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/70_4.png b/ggml/examples/yolo/data/labels/70_4.png
deleted file mode 100644
index 6033fb1..0000000
--- a/ggml/examples/yolo/data/labels/70_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/70_5.png b/ggml/examples/yolo/data/labels/70_5.png
deleted file mode 100644
index bfcf028..0000000
--- a/ggml/examples/yolo/data/labels/70_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/70_6.png b/ggml/examples/yolo/data/labels/70_6.png
deleted file mode 100644
index 5fe9cdb..0000000
--- a/ggml/examples/yolo/data/labels/70_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/70_7.png b/ggml/examples/yolo/data/labels/70_7.png
deleted file mode 100644
index 027af86..0000000
--- a/ggml/examples/yolo/data/labels/70_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/71_0.png b/ggml/examples/yolo/data/labels/71_0.png
deleted file mode 100644
index 679b5e9..0000000
--- a/ggml/examples/yolo/data/labels/71_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/71_1.png b/ggml/examples/yolo/data/labels/71_1.png
deleted file mode 100644
index 6f5271b..0000000
--- a/ggml/examples/yolo/data/labels/71_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/71_2.png b/ggml/examples/yolo/data/labels/71_2.png
deleted file mode 100644
index 93dc749..0000000
--- a/ggml/examples/yolo/data/labels/71_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/71_3.png b/ggml/examples/yolo/data/labels/71_3.png
deleted file mode 100644
index 03a4bc6..0000000
--- a/ggml/examples/yolo/data/labels/71_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/71_4.png b/ggml/examples/yolo/data/labels/71_4.png
deleted file mode 100644
index 3976418..0000000
--- a/ggml/examples/yolo/data/labels/71_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/71_5.png b/ggml/examples/yolo/data/labels/71_5.png
deleted file mode 100644
index b0c4295..0000000
--- a/ggml/examples/yolo/data/labels/71_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/71_6.png b/ggml/examples/yolo/data/labels/71_6.png
deleted file mode 100644
index bb8710d..0000000
--- a/ggml/examples/yolo/data/labels/71_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/71_7.png b/ggml/examples/yolo/data/labels/71_7.png
deleted file mode 100644
index 68148a9..0000000
--- a/ggml/examples/yolo/data/labels/71_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/72_0.png b/ggml/examples/yolo/data/labels/72_0.png
deleted file mode 100644
index a2edd04..0000000
--- a/ggml/examples/yolo/data/labels/72_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/72_1.png b/ggml/examples/yolo/data/labels/72_1.png
deleted file mode 100644
index f1e8bcd..0000000
--- a/ggml/examples/yolo/data/labels/72_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/72_2.png b/ggml/examples/yolo/data/labels/72_2.png
deleted file mode 100644
index a9516f1..0000000
--- a/ggml/examples/yolo/data/labels/72_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/72_3.png b/ggml/examples/yolo/data/labels/72_3.png
deleted file mode 100644
index 5b94490..0000000
--- a/ggml/examples/yolo/data/labels/72_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/72_4.png b/ggml/examples/yolo/data/labels/72_4.png
deleted file mode 100644
index 83f7dbf..0000000
--- a/ggml/examples/yolo/data/labels/72_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/72_5.png b/ggml/examples/yolo/data/labels/72_5.png
deleted file mode 100644
index 5a40ad3..0000000
--- a/ggml/examples/yolo/data/labels/72_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/72_6.png b/ggml/examples/yolo/data/labels/72_6.png
deleted file mode 100644
index b889837..0000000
--- a/ggml/examples/yolo/data/labels/72_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/72_7.png b/ggml/examples/yolo/data/labels/72_7.png
deleted file mode 100644
index 4ec34df..0000000
--- a/ggml/examples/yolo/data/labels/72_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/73_0.png b/ggml/examples/yolo/data/labels/73_0.png
deleted file mode 100644
index e554432..0000000
--- a/ggml/examples/yolo/data/labels/73_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/73_1.png b/ggml/examples/yolo/data/labels/73_1.png
deleted file mode 100644
index 08d1924..0000000
--- a/ggml/examples/yolo/data/labels/73_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/73_2.png b/ggml/examples/yolo/data/labels/73_2.png
deleted file mode 100644
index 3c87085..0000000
--- a/ggml/examples/yolo/data/labels/73_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/73_3.png b/ggml/examples/yolo/data/labels/73_3.png
deleted file mode 100644
index 11d0f76..0000000
--- a/ggml/examples/yolo/data/labels/73_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/73_4.png b/ggml/examples/yolo/data/labels/73_4.png
deleted file mode 100644
index ebacf1f..0000000
--- a/ggml/examples/yolo/data/labels/73_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/73_5.png b/ggml/examples/yolo/data/labels/73_5.png
deleted file mode 100644
index d58377d..0000000
--- a/ggml/examples/yolo/data/labels/73_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/73_6.png b/ggml/examples/yolo/data/labels/73_6.png
deleted file mode 100644
index 293b96b..0000000
--- a/ggml/examples/yolo/data/labels/73_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/73_7.png b/ggml/examples/yolo/data/labels/73_7.png
deleted file mode 100644
index 80f409d..0000000
--- a/ggml/examples/yolo/data/labels/73_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/74_0.png b/ggml/examples/yolo/data/labels/74_0.png
deleted file mode 100644
index cc1d9f3..0000000
--- a/ggml/examples/yolo/data/labels/74_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/74_1.png b/ggml/examples/yolo/data/labels/74_1.png
deleted file mode 100644
index 9171332..0000000
--- a/ggml/examples/yolo/data/labels/74_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/74_2.png b/ggml/examples/yolo/data/labels/74_2.png
deleted file mode 100644
index c08cea8..0000000
--- a/ggml/examples/yolo/data/labels/74_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/74_3.png b/ggml/examples/yolo/data/labels/74_3.png
deleted file mode 100644
index ce8e8c8..0000000
--- a/ggml/examples/yolo/data/labels/74_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/74_4.png b/ggml/examples/yolo/data/labels/74_4.png
deleted file mode 100644
index b112fa6..0000000
--- a/ggml/examples/yolo/data/labels/74_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/74_5.png b/ggml/examples/yolo/data/labels/74_5.png
deleted file mode 100644
index c9a7bbb..0000000
--- a/ggml/examples/yolo/data/labels/74_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/74_6.png b/ggml/examples/yolo/data/labels/74_6.png
deleted file mode 100644
index 8f72aac..0000000
--- a/ggml/examples/yolo/data/labels/74_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/74_7.png b/ggml/examples/yolo/data/labels/74_7.png
deleted file mode 100644
index f85a9f9..0000000
--- a/ggml/examples/yolo/data/labels/74_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/75_0.png b/ggml/examples/yolo/data/labels/75_0.png
deleted file mode 100644
index b1f3ac4..0000000
--- a/ggml/examples/yolo/data/labels/75_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/75_1.png b/ggml/examples/yolo/data/labels/75_1.png
deleted file mode 100644
index 9f06109..0000000
--- a/ggml/examples/yolo/data/labels/75_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/75_2.png b/ggml/examples/yolo/data/labels/75_2.png
deleted file mode 100644
index 9224666..0000000
--- a/ggml/examples/yolo/data/labels/75_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/75_3.png b/ggml/examples/yolo/data/labels/75_3.png
deleted file mode 100644
index eb76489..0000000
--- a/ggml/examples/yolo/data/labels/75_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/75_4.png b/ggml/examples/yolo/data/labels/75_4.png
deleted file mode 100644
index 7dfee15..0000000
--- a/ggml/examples/yolo/data/labels/75_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/75_5.png b/ggml/examples/yolo/data/labels/75_5.png
deleted file mode 100644
index 8b081f6..0000000
--- a/ggml/examples/yolo/data/labels/75_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/75_6.png b/ggml/examples/yolo/data/labels/75_6.png
deleted file mode 100644
index 95631cf..0000000
--- a/ggml/examples/yolo/data/labels/75_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/75_7.png b/ggml/examples/yolo/data/labels/75_7.png
deleted file mode 100644
index db915ea..0000000
--- a/ggml/examples/yolo/data/labels/75_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/76_0.png b/ggml/examples/yolo/data/labels/76_0.png
deleted file mode 100644
index 5ceef1d..0000000
--- a/ggml/examples/yolo/data/labels/76_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/76_1.png b/ggml/examples/yolo/data/labels/76_1.png
deleted file mode 100644
index 5386831..0000000
--- a/ggml/examples/yolo/data/labels/76_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/76_2.png b/ggml/examples/yolo/data/labels/76_2.png
deleted file mode 100644
index 6c8393a..0000000
--- a/ggml/examples/yolo/data/labels/76_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/76_3.png b/ggml/examples/yolo/data/labels/76_3.png
deleted file mode 100644
index fbe5712..0000000
--- a/ggml/examples/yolo/data/labels/76_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/76_4.png b/ggml/examples/yolo/data/labels/76_4.png
deleted file mode 100644
index e47a6eb..0000000
--- a/ggml/examples/yolo/data/labels/76_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/76_5.png b/ggml/examples/yolo/data/labels/76_5.png
deleted file mode 100644
index f349490..0000000
--- a/ggml/examples/yolo/data/labels/76_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/76_6.png b/ggml/examples/yolo/data/labels/76_6.png
deleted file mode 100644
index e661143..0000000
--- a/ggml/examples/yolo/data/labels/76_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/76_7.png b/ggml/examples/yolo/data/labels/76_7.png
deleted file mode 100644
index 64d0834..0000000
--- a/ggml/examples/yolo/data/labels/76_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/77_0.png b/ggml/examples/yolo/data/labels/77_0.png
deleted file mode 100644
index 05aab7d..0000000
--- a/ggml/examples/yolo/data/labels/77_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/77_1.png b/ggml/examples/yolo/data/labels/77_1.png
deleted file mode 100644
index 64cb608..0000000
--- a/ggml/examples/yolo/data/labels/77_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/77_2.png b/ggml/examples/yolo/data/labels/77_2.png
deleted file mode 100644
index 2da938c..0000000
--- a/ggml/examples/yolo/data/labels/77_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/77_3.png b/ggml/examples/yolo/data/labels/77_3.png
deleted file mode 100644
index 14179d9..0000000
--- a/ggml/examples/yolo/data/labels/77_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/77_4.png b/ggml/examples/yolo/data/labels/77_4.png
deleted file mode 100644
index 894b470..0000000
--- a/ggml/examples/yolo/data/labels/77_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/77_5.png b/ggml/examples/yolo/data/labels/77_5.png
deleted file mode 100644
index a55e5b6..0000000
--- a/ggml/examples/yolo/data/labels/77_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/77_6.png b/ggml/examples/yolo/data/labels/77_6.png
deleted file mode 100644
index 4f0e714..0000000
--- a/ggml/examples/yolo/data/labels/77_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/77_7.png b/ggml/examples/yolo/data/labels/77_7.png
deleted file mode 100644
index 804e8c2..0000000
--- a/ggml/examples/yolo/data/labels/77_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/78_0.png b/ggml/examples/yolo/data/labels/78_0.png
deleted file mode 100644
index e1fede7..0000000
--- a/ggml/examples/yolo/data/labels/78_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/78_1.png b/ggml/examples/yolo/data/labels/78_1.png
deleted file mode 100644
index 23b2a74..0000000
--- a/ggml/examples/yolo/data/labels/78_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/78_2.png b/ggml/examples/yolo/data/labels/78_2.png
deleted file mode 100644
index 5471820..0000000
--- a/ggml/examples/yolo/data/labels/78_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/78_3.png b/ggml/examples/yolo/data/labels/78_3.png
deleted file mode 100644
index bc9659a..0000000
--- a/ggml/examples/yolo/data/labels/78_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/78_4.png b/ggml/examples/yolo/data/labels/78_4.png
deleted file mode 100644
index 54b95f9..0000000
--- a/ggml/examples/yolo/data/labels/78_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/78_5.png b/ggml/examples/yolo/data/labels/78_5.png
deleted file mode 100644
index 2c67a51..0000000
--- a/ggml/examples/yolo/data/labels/78_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/78_6.png b/ggml/examples/yolo/data/labels/78_6.png
deleted file mode 100644
index 44326f5..0000000
--- a/ggml/examples/yolo/data/labels/78_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/78_7.png b/ggml/examples/yolo/data/labels/78_7.png
deleted file mode 100644
index 4d5d8dc..0000000
--- a/ggml/examples/yolo/data/labels/78_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/79_0.png b/ggml/examples/yolo/data/labels/79_0.png
deleted file mode 100644
index f3f4c1e..0000000
--- a/ggml/examples/yolo/data/labels/79_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/79_1.png b/ggml/examples/yolo/data/labels/79_1.png
deleted file mode 100644
index 23d960e..0000000
--- a/ggml/examples/yolo/data/labels/79_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/79_2.png b/ggml/examples/yolo/data/labels/79_2.png
deleted file mode 100644
index ce5c52b..0000000
--- a/ggml/examples/yolo/data/labels/79_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/79_3.png b/ggml/examples/yolo/data/labels/79_3.png
deleted file mode 100644
index b80a4f2..0000000
--- a/ggml/examples/yolo/data/labels/79_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/79_4.png b/ggml/examples/yolo/data/labels/79_4.png
deleted file mode 100644
index 7fa16f8..0000000
--- a/ggml/examples/yolo/data/labels/79_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/79_5.png b/ggml/examples/yolo/data/labels/79_5.png
deleted file mode 100644
index c67c5bb..0000000
--- a/ggml/examples/yolo/data/labels/79_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/79_6.png b/ggml/examples/yolo/data/labels/79_6.png
deleted file mode 100644
index e89ecd2..0000000
--- a/ggml/examples/yolo/data/labels/79_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/79_7.png b/ggml/examples/yolo/data/labels/79_7.png
deleted file mode 100644
index d964ae4..0000000
--- a/ggml/examples/yolo/data/labels/79_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/80_0.png b/ggml/examples/yolo/data/labels/80_0.png
deleted file mode 100644
index 61403c9..0000000
--- a/ggml/examples/yolo/data/labels/80_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/80_1.png b/ggml/examples/yolo/data/labels/80_1.png
deleted file mode 100644
index 992338d..0000000
--- a/ggml/examples/yolo/data/labels/80_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/80_2.png b/ggml/examples/yolo/data/labels/80_2.png
deleted file mode 100644
index cfbd431..0000000
--- a/ggml/examples/yolo/data/labels/80_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/80_3.png b/ggml/examples/yolo/data/labels/80_3.png
deleted file mode 100644
index 5de9472..0000000
--- a/ggml/examples/yolo/data/labels/80_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/80_4.png b/ggml/examples/yolo/data/labels/80_4.png
deleted file mode 100644
index 0fecaf1..0000000
--- a/ggml/examples/yolo/data/labels/80_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/80_5.png b/ggml/examples/yolo/data/labels/80_5.png
deleted file mode 100644
index cbf65bd..0000000
--- a/ggml/examples/yolo/data/labels/80_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/80_6.png b/ggml/examples/yolo/data/labels/80_6.png
deleted file mode 100644
index ff54a38..0000000
--- a/ggml/examples/yolo/data/labels/80_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/80_7.png b/ggml/examples/yolo/data/labels/80_7.png
deleted file mode 100644
index 3d6f415..0000000
--- a/ggml/examples/yolo/data/labels/80_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/81_0.png b/ggml/examples/yolo/data/labels/81_0.png
deleted file mode 100644
index 5a2025f..0000000
--- a/ggml/examples/yolo/data/labels/81_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/81_1.png b/ggml/examples/yolo/data/labels/81_1.png
deleted file mode 100644
index 511aec0..0000000
--- a/ggml/examples/yolo/data/labels/81_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/81_2.png b/ggml/examples/yolo/data/labels/81_2.png
deleted file mode 100644
index 8e2f7e9..0000000
--- a/ggml/examples/yolo/data/labels/81_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/81_3.png b/ggml/examples/yolo/data/labels/81_3.png
deleted file mode 100644
index 52edfd4..0000000
--- a/ggml/examples/yolo/data/labels/81_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/81_4.png b/ggml/examples/yolo/data/labels/81_4.png
deleted file mode 100644
index e60c7a9..0000000
--- a/ggml/examples/yolo/data/labels/81_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/81_5.png b/ggml/examples/yolo/data/labels/81_5.png
deleted file mode 100644
index d58df4f..0000000
--- a/ggml/examples/yolo/data/labels/81_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/81_6.png b/ggml/examples/yolo/data/labels/81_6.png
deleted file mode 100644
index d31ba09..0000000
--- a/ggml/examples/yolo/data/labels/81_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/81_7.png b/ggml/examples/yolo/data/labels/81_7.png
deleted file mode 100644
index 0770576..0000000
--- a/ggml/examples/yolo/data/labels/81_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/82_0.png b/ggml/examples/yolo/data/labels/82_0.png
deleted file mode 100644
index 1bc3a98..0000000
--- a/ggml/examples/yolo/data/labels/82_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/82_1.png b/ggml/examples/yolo/data/labels/82_1.png
deleted file mode 100644
index e5931dc..0000000
--- a/ggml/examples/yolo/data/labels/82_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/82_2.png b/ggml/examples/yolo/data/labels/82_2.png
deleted file mode 100644
index a5f8bb7..0000000
--- a/ggml/examples/yolo/data/labels/82_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/82_3.png b/ggml/examples/yolo/data/labels/82_3.png
deleted file mode 100644
index 8b138a9..0000000
--- a/ggml/examples/yolo/data/labels/82_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/82_4.png b/ggml/examples/yolo/data/labels/82_4.png
deleted file mode 100644
index 8a57060..0000000
--- a/ggml/examples/yolo/data/labels/82_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/82_5.png b/ggml/examples/yolo/data/labels/82_5.png
deleted file mode 100644
index f4dc04e..0000000
--- a/ggml/examples/yolo/data/labels/82_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/82_6.png b/ggml/examples/yolo/data/labels/82_6.png
deleted file mode 100644
index 2c7073a..0000000
--- a/ggml/examples/yolo/data/labels/82_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/82_7.png b/ggml/examples/yolo/data/labels/82_7.png
deleted file mode 100644
index f4d966d..0000000
--- a/ggml/examples/yolo/data/labels/82_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/83_0.png b/ggml/examples/yolo/data/labels/83_0.png
deleted file mode 100644
index c048782..0000000
--- a/ggml/examples/yolo/data/labels/83_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/83_1.png b/ggml/examples/yolo/data/labels/83_1.png
deleted file mode 100644
index 3515f98..0000000
--- a/ggml/examples/yolo/data/labels/83_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/83_2.png b/ggml/examples/yolo/data/labels/83_2.png
deleted file mode 100644
index 4a13b8d..0000000
--- a/ggml/examples/yolo/data/labels/83_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/83_3.png b/ggml/examples/yolo/data/labels/83_3.png
deleted file mode 100644
index 0944ac8..0000000
--- a/ggml/examples/yolo/data/labels/83_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/83_4.png b/ggml/examples/yolo/data/labels/83_4.png
deleted file mode 100644
index 8b9f82f..0000000
--- a/ggml/examples/yolo/data/labels/83_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/83_5.png b/ggml/examples/yolo/data/labels/83_5.png
deleted file mode 100644
index 67f70ad..0000000
--- a/ggml/examples/yolo/data/labels/83_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/83_6.png b/ggml/examples/yolo/data/labels/83_6.png
deleted file mode 100644
index 0d3c0e1..0000000
--- a/ggml/examples/yolo/data/labels/83_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/83_7.png b/ggml/examples/yolo/data/labels/83_7.png
deleted file mode 100644
index 0daab32..0000000
--- a/ggml/examples/yolo/data/labels/83_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/84_0.png b/ggml/examples/yolo/data/labels/84_0.png
deleted file mode 100644
index ffc3eb4..0000000
--- a/ggml/examples/yolo/data/labels/84_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/84_1.png b/ggml/examples/yolo/data/labels/84_1.png
deleted file mode 100644
index 956e5eb..0000000
--- a/ggml/examples/yolo/data/labels/84_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/84_2.png b/ggml/examples/yolo/data/labels/84_2.png
deleted file mode 100644
index a1e94fa..0000000
--- a/ggml/examples/yolo/data/labels/84_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/84_3.png b/ggml/examples/yolo/data/labels/84_3.png
deleted file mode 100644
index 2369b0c..0000000
--- a/ggml/examples/yolo/data/labels/84_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/84_4.png b/ggml/examples/yolo/data/labels/84_4.png
deleted file mode 100644
index 6d82ee8..0000000
--- a/ggml/examples/yolo/data/labels/84_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/84_5.png b/ggml/examples/yolo/data/labels/84_5.png
deleted file mode 100644
index 6e95b70..0000000
--- a/ggml/examples/yolo/data/labels/84_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/84_6.png b/ggml/examples/yolo/data/labels/84_6.png
deleted file mode 100644
index 9d62561..0000000
--- a/ggml/examples/yolo/data/labels/84_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/84_7.png b/ggml/examples/yolo/data/labels/84_7.png
deleted file mode 100644
index accde30..0000000
--- a/ggml/examples/yolo/data/labels/84_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/85_0.png b/ggml/examples/yolo/data/labels/85_0.png
deleted file mode 100644
index 48e9906..0000000
--- a/ggml/examples/yolo/data/labels/85_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/85_1.png b/ggml/examples/yolo/data/labels/85_1.png
deleted file mode 100644
index 3080720..0000000
--- a/ggml/examples/yolo/data/labels/85_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/85_2.png b/ggml/examples/yolo/data/labels/85_2.png
deleted file mode 100644
index e5a9369..0000000
--- a/ggml/examples/yolo/data/labels/85_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/85_3.png b/ggml/examples/yolo/data/labels/85_3.png
deleted file mode 100644
index 5816c66..0000000
--- a/ggml/examples/yolo/data/labels/85_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/85_4.png b/ggml/examples/yolo/data/labels/85_4.png
deleted file mode 100644
index 7d18ec0..0000000
--- a/ggml/examples/yolo/data/labels/85_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/85_5.png b/ggml/examples/yolo/data/labels/85_5.png
deleted file mode 100644
index f7b5550..0000000
--- a/ggml/examples/yolo/data/labels/85_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/85_6.png b/ggml/examples/yolo/data/labels/85_6.png
deleted file mode 100644
index 7106103..0000000
--- a/ggml/examples/yolo/data/labels/85_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/85_7.png b/ggml/examples/yolo/data/labels/85_7.png
deleted file mode 100644
index 7179daf..0000000
--- a/ggml/examples/yolo/data/labels/85_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/86_0.png b/ggml/examples/yolo/data/labels/86_0.png
deleted file mode 100644
index 2412ea1..0000000
--- a/ggml/examples/yolo/data/labels/86_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/86_1.png b/ggml/examples/yolo/data/labels/86_1.png
deleted file mode 100644
index 7de1777..0000000
--- a/ggml/examples/yolo/data/labels/86_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/86_2.png b/ggml/examples/yolo/data/labels/86_2.png
deleted file mode 100644
index aae67da..0000000
--- a/ggml/examples/yolo/data/labels/86_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/86_3.png b/ggml/examples/yolo/data/labels/86_3.png
deleted file mode 100644
index e932d2e..0000000
--- a/ggml/examples/yolo/data/labels/86_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/86_4.png b/ggml/examples/yolo/data/labels/86_4.png
deleted file mode 100644
index 9a580c8..0000000
--- a/ggml/examples/yolo/data/labels/86_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/86_5.png b/ggml/examples/yolo/data/labels/86_5.png
deleted file mode 100644
index 6766a5d..0000000
--- a/ggml/examples/yolo/data/labels/86_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/86_6.png b/ggml/examples/yolo/data/labels/86_6.png
deleted file mode 100644
index 462e729..0000000
--- a/ggml/examples/yolo/data/labels/86_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/86_7.png b/ggml/examples/yolo/data/labels/86_7.png
deleted file mode 100644
index 7843c12..0000000
--- a/ggml/examples/yolo/data/labels/86_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/87_0.png b/ggml/examples/yolo/data/labels/87_0.png
deleted file mode 100644
index e7c1285..0000000
--- a/ggml/examples/yolo/data/labels/87_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/87_1.png b/ggml/examples/yolo/data/labels/87_1.png
deleted file mode 100644
index e0af6e0..0000000
--- a/ggml/examples/yolo/data/labels/87_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/87_2.png b/ggml/examples/yolo/data/labels/87_2.png
deleted file mode 100644
index 4bdce94..0000000
--- a/ggml/examples/yolo/data/labels/87_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/87_3.png b/ggml/examples/yolo/data/labels/87_3.png
deleted file mode 100644
index 2d446b9..0000000
--- a/ggml/examples/yolo/data/labels/87_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/87_4.png b/ggml/examples/yolo/data/labels/87_4.png
deleted file mode 100644
index 2719131..0000000
--- a/ggml/examples/yolo/data/labels/87_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/87_5.png b/ggml/examples/yolo/data/labels/87_5.png
deleted file mode 100644
index f635def..0000000
--- a/ggml/examples/yolo/data/labels/87_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/87_6.png b/ggml/examples/yolo/data/labels/87_6.png
deleted file mode 100644
index 25ddeb7..0000000
--- a/ggml/examples/yolo/data/labels/87_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/87_7.png b/ggml/examples/yolo/data/labels/87_7.png
deleted file mode 100644
index b26e702..0000000
--- a/ggml/examples/yolo/data/labels/87_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/88_0.png b/ggml/examples/yolo/data/labels/88_0.png
deleted file mode 100644
index 43b1427..0000000
--- a/ggml/examples/yolo/data/labels/88_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/88_1.png b/ggml/examples/yolo/data/labels/88_1.png
deleted file mode 100644
index 5703b63..0000000
--- a/ggml/examples/yolo/data/labels/88_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/88_2.png b/ggml/examples/yolo/data/labels/88_2.png
deleted file mode 100644
index b3b565a..0000000
--- a/ggml/examples/yolo/data/labels/88_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/88_3.png b/ggml/examples/yolo/data/labels/88_3.png
deleted file mode 100644
index 4afd6dd..0000000
--- a/ggml/examples/yolo/data/labels/88_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/88_4.png b/ggml/examples/yolo/data/labels/88_4.png
deleted file mode 100644
index 198f7b0..0000000
--- a/ggml/examples/yolo/data/labels/88_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/88_5.png b/ggml/examples/yolo/data/labels/88_5.png
deleted file mode 100644
index e46b641..0000000
--- a/ggml/examples/yolo/data/labels/88_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/88_6.png b/ggml/examples/yolo/data/labels/88_6.png
deleted file mode 100644
index 81830a0..0000000
--- a/ggml/examples/yolo/data/labels/88_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/88_7.png b/ggml/examples/yolo/data/labels/88_7.png
deleted file mode 100644
index a916e4e..0000000
--- a/ggml/examples/yolo/data/labels/88_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/89_0.png b/ggml/examples/yolo/data/labels/89_0.png
deleted file mode 100644
index b6f9c4b..0000000
--- a/ggml/examples/yolo/data/labels/89_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/89_1.png b/ggml/examples/yolo/data/labels/89_1.png
deleted file mode 100644
index e5e7a1e..0000000
--- a/ggml/examples/yolo/data/labels/89_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/89_2.png b/ggml/examples/yolo/data/labels/89_2.png
deleted file mode 100644
index 9f9d361..0000000
--- a/ggml/examples/yolo/data/labels/89_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/89_3.png b/ggml/examples/yolo/data/labels/89_3.png
deleted file mode 100644
index 4404891..0000000
--- a/ggml/examples/yolo/data/labels/89_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/89_4.png b/ggml/examples/yolo/data/labels/89_4.png
deleted file mode 100644
index 802bfe4..0000000
--- a/ggml/examples/yolo/data/labels/89_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/89_5.png b/ggml/examples/yolo/data/labels/89_5.png
deleted file mode 100644
index d77e019..0000000
--- a/ggml/examples/yolo/data/labels/89_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/89_6.png b/ggml/examples/yolo/data/labels/89_6.png
deleted file mode 100644
index 4a4fa6b..0000000
--- a/ggml/examples/yolo/data/labels/89_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/89_7.png b/ggml/examples/yolo/data/labels/89_7.png
deleted file mode 100644
index b3dd798..0000000
--- a/ggml/examples/yolo/data/labels/89_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/90_0.png b/ggml/examples/yolo/data/labels/90_0.png
deleted file mode 100644
index 3e10db1..0000000
--- a/ggml/examples/yolo/data/labels/90_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/90_1.png b/ggml/examples/yolo/data/labels/90_1.png
deleted file mode 100644
index 9c4e67e..0000000
--- a/ggml/examples/yolo/data/labels/90_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/90_2.png b/ggml/examples/yolo/data/labels/90_2.png
deleted file mode 100644
index 92d0167..0000000
--- a/ggml/examples/yolo/data/labels/90_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/90_3.png b/ggml/examples/yolo/data/labels/90_3.png
deleted file mode 100644
index 9d346a2..0000000
--- a/ggml/examples/yolo/data/labels/90_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/90_4.png b/ggml/examples/yolo/data/labels/90_4.png
deleted file mode 100644
index ba6012b..0000000
--- a/ggml/examples/yolo/data/labels/90_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/90_5.png b/ggml/examples/yolo/data/labels/90_5.png
deleted file mode 100644
index b664cd2..0000000
--- a/ggml/examples/yolo/data/labels/90_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/90_6.png b/ggml/examples/yolo/data/labels/90_6.png
deleted file mode 100644
index 500c099..0000000
--- a/ggml/examples/yolo/data/labels/90_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/90_7.png b/ggml/examples/yolo/data/labels/90_7.png
deleted file mode 100644
index 3e899a8..0000000
--- a/ggml/examples/yolo/data/labels/90_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/91_0.png b/ggml/examples/yolo/data/labels/91_0.png
deleted file mode 100644
index 0ad0d55..0000000
--- a/ggml/examples/yolo/data/labels/91_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/91_1.png b/ggml/examples/yolo/data/labels/91_1.png
deleted file mode 100644
index afbe0f0..0000000
--- a/ggml/examples/yolo/data/labels/91_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/91_2.png b/ggml/examples/yolo/data/labels/91_2.png
deleted file mode 100644
index d2c8899..0000000
--- a/ggml/examples/yolo/data/labels/91_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/91_3.png b/ggml/examples/yolo/data/labels/91_3.png
deleted file mode 100644
index 002032d..0000000
--- a/ggml/examples/yolo/data/labels/91_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/91_4.png b/ggml/examples/yolo/data/labels/91_4.png
deleted file mode 100644
index 795ce7d..0000000
--- a/ggml/examples/yolo/data/labels/91_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/91_5.png b/ggml/examples/yolo/data/labels/91_5.png
deleted file mode 100644
index 1d1eb22..0000000
--- a/ggml/examples/yolo/data/labels/91_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/91_6.png b/ggml/examples/yolo/data/labels/91_6.png
deleted file mode 100644
index bb31267..0000000
--- a/ggml/examples/yolo/data/labels/91_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/91_7.png b/ggml/examples/yolo/data/labels/91_7.png
deleted file mode 100644
index 70c2cf3..0000000
--- a/ggml/examples/yolo/data/labels/91_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/92_0.png b/ggml/examples/yolo/data/labels/92_0.png
deleted file mode 100644
index e730aed..0000000
--- a/ggml/examples/yolo/data/labels/92_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/92_1.png b/ggml/examples/yolo/data/labels/92_1.png
deleted file mode 100644
index 7f9b51d..0000000
--- a/ggml/examples/yolo/data/labels/92_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/92_2.png b/ggml/examples/yolo/data/labels/92_2.png
deleted file mode 100644
index f01916f..0000000
--- a/ggml/examples/yolo/data/labels/92_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/92_3.png b/ggml/examples/yolo/data/labels/92_3.png
deleted file mode 100644
index b850533..0000000
--- a/ggml/examples/yolo/data/labels/92_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/92_4.png b/ggml/examples/yolo/data/labels/92_4.png
deleted file mode 100644
index ece1682..0000000
--- a/ggml/examples/yolo/data/labels/92_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/92_5.png b/ggml/examples/yolo/data/labels/92_5.png
deleted file mode 100644
index 8216690..0000000
--- a/ggml/examples/yolo/data/labels/92_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/92_6.png b/ggml/examples/yolo/data/labels/92_6.png
deleted file mode 100644
index 9b581fa..0000000
--- a/ggml/examples/yolo/data/labels/92_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/92_7.png b/ggml/examples/yolo/data/labels/92_7.png
deleted file mode 100644
index e57b2e8..0000000
--- a/ggml/examples/yolo/data/labels/92_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/93_0.png b/ggml/examples/yolo/data/labels/93_0.png
deleted file mode 100644
index c21f522..0000000
--- a/ggml/examples/yolo/data/labels/93_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/93_1.png b/ggml/examples/yolo/data/labels/93_1.png
deleted file mode 100644
index 64cbba6..0000000
--- a/ggml/examples/yolo/data/labels/93_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/93_2.png b/ggml/examples/yolo/data/labels/93_2.png
deleted file mode 100644
index a81e1cd..0000000
--- a/ggml/examples/yolo/data/labels/93_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/93_3.png b/ggml/examples/yolo/data/labels/93_3.png
deleted file mode 100644
index 273f35c..0000000
--- a/ggml/examples/yolo/data/labels/93_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/93_4.png b/ggml/examples/yolo/data/labels/93_4.png
deleted file mode 100644
index b44e4c6..0000000
--- a/ggml/examples/yolo/data/labels/93_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/93_5.png b/ggml/examples/yolo/data/labels/93_5.png
deleted file mode 100644
index 1e96e9b..0000000
--- a/ggml/examples/yolo/data/labels/93_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/93_6.png b/ggml/examples/yolo/data/labels/93_6.png
deleted file mode 100644
index 3407e55..0000000
--- a/ggml/examples/yolo/data/labels/93_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/93_7.png b/ggml/examples/yolo/data/labels/93_7.png
deleted file mode 100644
index 428e072..0000000
--- a/ggml/examples/yolo/data/labels/93_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/94_0.png b/ggml/examples/yolo/data/labels/94_0.png
deleted file mode 100644
index baa512b..0000000
--- a/ggml/examples/yolo/data/labels/94_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/94_1.png b/ggml/examples/yolo/data/labels/94_1.png
deleted file mode 100644
index 5f1b116..0000000
--- a/ggml/examples/yolo/data/labels/94_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/94_2.png b/ggml/examples/yolo/data/labels/94_2.png
deleted file mode 100644
index 1cd051e..0000000
--- a/ggml/examples/yolo/data/labels/94_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/94_3.png b/ggml/examples/yolo/data/labels/94_3.png
deleted file mode 100644
index a0a7264..0000000
--- a/ggml/examples/yolo/data/labels/94_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/94_4.png b/ggml/examples/yolo/data/labels/94_4.png
deleted file mode 100644
index 26e82bc..0000000
--- a/ggml/examples/yolo/data/labels/94_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/94_5.png b/ggml/examples/yolo/data/labels/94_5.png
deleted file mode 100644
index 19c32ec..0000000
--- a/ggml/examples/yolo/data/labels/94_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/94_6.png b/ggml/examples/yolo/data/labels/94_6.png
deleted file mode 100644
index e753cad..0000000
--- a/ggml/examples/yolo/data/labels/94_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/94_7.png b/ggml/examples/yolo/data/labels/94_7.png
deleted file mode 100644
index 7d4dfb7..0000000
--- a/ggml/examples/yolo/data/labels/94_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/95_0.png b/ggml/examples/yolo/data/labels/95_0.png
deleted file mode 100644
index 2b25519..0000000
--- a/ggml/examples/yolo/data/labels/95_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/95_1.png b/ggml/examples/yolo/data/labels/95_1.png
deleted file mode 100644
index 6fea8c2..0000000
--- a/ggml/examples/yolo/data/labels/95_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/95_2.png b/ggml/examples/yolo/data/labels/95_2.png
deleted file mode 100644
index d170be1..0000000
--- a/ggml/examples/yolo/data/labels/95_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/95_3.png b/ggml/examples/yolo/data/labels/95_3.png
deleted file mode 100644
index fe5501e..0000000
--- a/ggml/examples/yolo/data/labels/95_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/95_4.png b/ggml/examples/yolo/data/labels/95_4.png
deleted file mode 100644
index ec87fee..0000000
--- a/ggml/examples/yolo/data/labels/95_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/95_5.png b/ggml/examples/yolo/data/labels/95_5.png
deleted file mode 100644
index efa9f0b..0000000
--- a/ggml/examples/yolo/data/labels/95_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/95_6.png b/ggml/examples/yolo/data/labels/95_6.png
deleted file mode 100644
index 6840b82..0000000
--- a/ggml/examples/yolo/data/labels/95_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/95_7.png b/ggml/examples/yolo/data/labels/95_7.png
deleted file mode 100644
index 85f887d..0000000
--- a/ggml/examples/yolo/data/labels/95_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/96_0.png b/ggml/examples/yolo/data/labels/96_0.png
deleted file mode 100644
index 2033d9b..0000000
--- a/ggml/examples/yolo/data/labels/96_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/96_1.png b/ggml/examples/yolo/data/labels/96_1.png
deleted file mode 100644
index c60d36a..0000000
--- a/ggml/examples/yolo/data/labels/96_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/96_2.png b/ggml/examples/yolo/data/labels/96_2.png
deleted file mode 100644
index 878dd1b..0000000
--- a/ggml/examples/yolo/data/labels/96_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/96_3.png b/ggml/examples/yolo/data/labels/96_3.png
deleted file mode 100644
index f7429a2..0000000
--- a/ggml/examples/yolo/data/labels/96_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/96_4.png b/ggml/examples/yolo/data/labels/96_4.png
deleted file mode 100644
index 6dff9da..0000000
--- a/ggml/examples/yolo/data/labels/96_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/96_5.png b/ggml/examples/yolo/data/labels/96_5.png
deleted file mode 100644
index 4869f9d..0000000
--- a/ggml/examples/yolo/data/labels/96_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/96_6.png b/ggml/examples/yolo/data/labels/96_6.png
deleted file mode 100644
index f6a8a69..0000000
--- a/ggml/examples/yolo/data/labels/96_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/96_7.png b/ggml/examples/yolo/data/labels/96_7.png
deleted file mode 100644
index 89b19d4..0000000
--- a/ggml/examples/yolo/data/labels/96_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/97_0.png b/ggml/examples/yolo/data/labels/97_0.png
deleted file mode 100644
index 6426224..0000000
--- a/ggml/examples/yolo/data/labels/97_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/97_1.png b/ggml/examples/yolo/data/labels/97_1.png
deleted file mode 100644
index f9a61f4..0000000
--- a/ggml/examples/yolo/data/labels/97_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/97_2.png b/ggml/examples/yolo/data/labels/97_2.png
deleted file mode 100644
index d1d02ac..0000000
--- a/ggml/examples/yolo/data/labels/97_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/97_3.png b/ggml/examples/yolo/data/labels/97_3.png
deleted file mode 100644
index 7e9e936..0000000
--- a/ggml/examples/yolo/data/labels/97_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/97_4.png b/ggml/examples/yolo/data/labels/97_4.png
deleted file mode 100644
index 21dd17e..0000000
--- a/ggml/examples/yolo/data/labels/97_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/97_5.png b/ggml/examples/yolo/data/labels/97_5.png
deleted file mode 100644
index 5e59fa2..0000000
--- a/ggml/examples/yolo/data/labels/97_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/97_6.png b/ggml/examples/yolo/data/labels/97_6.png
deleted file mode 100644
index 6f279a4..0000000
--- a/ggml/examples/yolo/data/labels/97_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/97_7.png b/ggml/examples/yolo/data/labels/97_7.png
deleted file mode 100644
index 6447e6d..0000000
--- a/ggml/examples/yolo/data/labels/97_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/98_0.png b/ggml/examples/yolo/data/labels/98_0.png
deleted file mode 100644
index 3d25931..0000000
--- a/ggml/examples/yolo/data/labels/98_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/98_1.png b/ggml/examples/yolo/data/labels/98_1.png
deleted file mode 100644
index c76706a..0000000
--- a/ggml/examples/yolo/data/labels/98_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/98_2.png b/ggml/examples/yolo/data/labels/98_2.png
deleted file mode 100644
index 6dceadb..0000000
--- a/ggml/examples/yolo/data/labels/98_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/98_3.png b/ggml/examples/yolo/data/labels/98_3.png
deleted file mode 100644
index d3cf1bc..0000000
--- a/ggml/examples/yolo/data/labels/98_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/98_4.png b/ggml/examples/yolo/data/labels/98_4.png
deleted file mode 100644
index 5d5addc..0000000
--- a/ggml/examples/yolo/data/labels/98_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/98_5.png b/ggml/examples/yolo/data/labels/98_5.png
deleted file mode 100644
index 55479d9..0000000
--- a/ggml/examples/yolo/data/labels/98_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/98_6.png b/ggml/examples/yolo/data/labels/98_6.png
deleted file mode 100644
index 4d8f3d9..0000000
--- a/ggml/examples/yolo/data/labels/98_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/98_7.png b/ggml/examples/yolo/data/labels/98_7.png
deleted file mode 100644
index cec6017..0000000
--- a/ggml/examples/yolo/data/labels/98_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/99_0.png b/ggml/examples/yolo/data/labels/99_0.png
deleted file mode 100644
index f5975c9..0000000
--- a/ggml/examples/yolo/data/labels/99_0.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/99_1.png b/ggml/examples/yolo/data/labels/99_1.png
deleted file mode 100644
index 3cdfdec..0000000
--- a/ggml/examples/yolo/data/labels/99_1.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/99_2.png b/ggml/examples/yolo/data/labels/99_2.png
deleted file mode 100644
index a0dc573..0000000
--- a/ggml/examples/yolo/data/labels/99_2.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/99_3.png b/ggml/examples/yolo/data/labels/99_3.png
deleted file mode 100644
index e183f6a..0000000
--- a/ggml/examples/yolo/data/labels/99_3.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/99_4.png b/ggml/examples/yolo/data/labels/99_4.png
deleted file mode 100644
index bc1a490..0000000
--- a/ggml/examples/yolo/data/labels/99_4.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/99_5.png b/ggml/examples/yolo/data/labels/99_5.png
deleted file mode 100644
index c568d5e..0000000
--- a/ggml/examples/yolo/data/labels/99_5.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/99_6.png b/ggml/examples/yolo/data/labels/99_6.png
deleted file mode 100644
index e274446..0000000
--- a/ggml/examples/yolo/data/labels/99_6.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/data/labels/99_7.png b/ggml/examples/yolo/data/labels/99_7.png
deleted file mode 100644
index f7b9e09..0000000
--- a/ggml/examples/yolo/data/labels/99_7.png
+++ /dev/null
Binary files differ
diff --git a/ggml/examples/yolo/yolo-image.cpp b/ggml/examples/yolo/yolo-image.cpp
deleted file mode 100644
index ceddc41..0000000
--- a/ggml/examples/yolo/yolo-image.cpp
+++ /dev/null
@@ -1,210 +0,0 @@
-#define STB_IMAGE_IMPLEMENTATION
-#include "stb_image.h"
-#define STB_IMAGE_WRITE_IMPLEMENTATION
-#include "stb_image_write.h"
-
-#include "yolo-image.h"
-
-static void draw_box(yolo_image & a, int x1, int y1, int x2, int y2, float r, float g, float b)
-{
-    if (x1 < 0) x1 = 0;
-    if (x1 >= a.w) x1 = a.w-1;
-    if (x2 < 0) x2 = 0;
-    if (x2 >= a.w) x2 = a.w-1;
-
-    if (y1 < 0) y1 = 0;
-    if (y1 >= a.h) y1 = a.h-1;
-    if (y2 < 0) y2 = 0;
-    if (y2 >= a.h) y2 = a.h-1;
-
-    for (int i = x1; i <= x2; ++i){
-        a.data[i + y1*a.w + 0*a.w*a.h] = r;
-        a.data[i + y2*a.w + 0*a.w*a.h] = r;
-
-        a.data[i + y1*a.w + 1*a.w*a.h] = g;
-        a.data[i + y2*a.w + 1*a.w*a.h] = g;
-
-        a.data[i + y1*a.w + 2*a.w*a.h] = b;
-        a.data[i + y2*a.w + 2*a.w*a.h] = b;
-    }
-    for (int i = y1; i <= y2; ++i){
-        a.data[x1 + i*a.w + 0*a.w*a.h] = r;
-        a.data[x2 + i*a.w + 0*a.w*a.h] = r;
-
-        a.data[x1 + i*a.w + 1*a.w*a.h] = g;
-        a.data[x2 + i*a.w + 1*a.w*a.h] = g;
-
-        a.data[x1 + i*a.w + 2*a.w*a.h] = b;
-        a.data[x2 + i*a.w + 2*a.w*a.h] = b;
-    }
-}
-
-void draw_box_width(yolo_image & a, int x1, int y1, int x2, int y2, int w, float r, float g, float b)
-{
-    for (int i = 0; i < w; ++i) {
-        draw_box(a, x1+i, y1+i, x2-i, y2-i, r, g, b);
-    }
-}
-
-bool save_image(const yolo_image & im, const char *name, int quality)
-{
-    uint8_t *data = (uint8_t*)calloc(im.w*im.h*im.c, sizeof(uint8_t));
-    for (int k = 0; k < im.c; ++k) {
-        for (int i = 0; i < im.w*im.h; ++i) {
-            data[i*im.c+k] = (uint8_t) (255*im.data[i + k*im.w*im.h]);
-        }
-    }
-    int success = stbi_write_jpg(name, im.w, im.h, im.c, data, quality);
-    free(data);
-    if (!success) {
-        fprintf(stderr, "Failed to write image %s\n", name);
-        return false;
-    }
-    return true;
-}
-
-bool load_image(const char *fname, yolo_image & img)
-{
-    int w, h, c;
-    uint8_t * data = stbi_load(fname, &w, &h, &c, 3);
-    if (!data) {
-        return false;
-    }
-    c = 3;
-    img.w = w;
-    img.h = h;
-    img.c = c;
-    img.data.resize(w*h*c);
-    for (int k = 0; k < c; ++k){
-        for (int j = 0; j < h; ++j){
-            for (int i = 0; i < w; ++i){
-                int dst_index = i + w*j + w*h*k;
-                int src_index = k + c*i + c*w*j;
-                img.data[dst_index] = (float)data[src_index]/255.;
-            }
-        }
-    }
-    stbi_image_free(data);
-    return true;
-}
-
-static yolo_image resize_image(const yolo_image & im, int w, int h)
-{
-    yolo_image resized(w, h, im.c);
-    yolo_image part(w, im.h, im.c);
-    float w_scale = (float)(im.w - 1) / (w - 1);
-    float h_scale = (float)(im.h - 1) / (h - 1);
-    for (int k = 0; k < im.c; ++k){
-        for (int r = 0; r < im.h; ++r) {
-            for (int c = 0; c < w; ++c) {
-                float val = 0;
-                if (c == w-1 || im.w == 1){
-                    val = im.get_pixel(im.w-1, r, k);
-                } else {
-                    float sx = c*w_scale;
-                    int ix = (int) sx;
-                    float dx = sx - ix;
-                    val = (1 - dx) * im.get_pixel(ix, r, k) + dx * im.get_pixel(ix+1, r, k);
-                }
-                part.set_pixel(c, r, k, val);
-            }
-        }
-    }
-    for (int k = 0; k < im.c; ++k){
-        for (int r = 0; r < h; ++r){
-            float sy = r*h_scale;
-            int iy = (int) sy;
-            float dy = sy - iy;
-            for (int c = 0; c < w; ++c){
-                float val = (1-dy) * part.get_pixel(c, iy, k);
-                resized.set_pixel(c, r, k, val);
-            }
-            if (r == h-1 || im.h == 1) continue;
-            for (int c = 0; c < w; ++c){
-                float val = dy * part.get_pixel(c, iy+1, k);
-                resized.add_pixel(c, r, k, val);
-            }
-        }
-    }
-    return resized;
-}
-
-static void embed_image(const yolo_image & source, yolo_image & dest, int dx, int dy)
-{
-    for (int k = 0; k < source.c; ++k) {
-        for (int y = 0; y < source.h; ++y) {
-            for (int x = 0; x < source.w; ++x) {
-                float val = source.get_pixel(x, y, k);
-                dest.set_pixel(dx+x, dy+y, k, val);
-            }
-        }
-    }
-}
-
-yolo_image letterbox_image(const yolo_image & im, int w, int h)
-{
-    int new_w = im.w;
-    int new_h = im.h;
-    if (((float)w/im.w) < ((float)h/im.h)) {
-        new_w = w;
-        new_h = (im.h * w)/im.w;
-    } else {
-        new_h = h;
-        new_w = (im.w * h)/im.h;
-    }
-    yolo_image resized = resize_image(im, new_w, new_h);
-    yolo_image boxed(w, h, im.c);
-    boxed.fill(0.5);
-    embed_image(resized, boxed, (w-new_w)/2, (h-new_h)/2);
-    return boxed;
-}
-
-static yolo_image tile_images(const yolo_image & a, const yolo_image & b, int dx)
-{
-    if (a.w == 0) {
-        return b;
-    }
-    yolo_image c(a.w + b.w + dx, (a.h > b.h) ? a.h : b.h, a.c);
-    c.fill(1.0f);
-    embed_image(a, c, 0, 0);
-    embed_image(b, c, a.w + dx, 0);
-    return c;
-}
-
-static yolo_image border_image(const yolo_image & a, int border)
-{
-    yolo_image b(a.w + 2*border, a.h + 2*border, a.c);
-    b.fill(1.0f);
-    embed_image(a, b, border, border);
-    return b;
-}
-
-yolo_image get_label(const std::vector<yolo_image> & alphabet, const std::string & label, int size)
-{
-    size = size/10;
-    size = std::min(size, 7);
-    yolo_image result(0,0,0);
-    for (int i = 0; i < (int)label.size(); ++i) {
-        int ch = label[i];
-        yolo_image img = alphabet[size*128 + ch];
-        result = tile_images(result, img, -size - 1 + (size+1)/2);
-    }
-    return border_image(result, (int)(result.h*.25));
-}
-
-void draw_label(yolo_image & im, int row, int col, const yolo_image & label, const float * rgb)
-{
-    int w = label.w;
-    int h = label.h;
-    if (row - h >= 0) {
-        row = row - h;
-    }
-    for (int j = 0; j < h && j + row < im.h; j++) {
-        for (int i = 0; i < w && i + col < im.w; i++) {
-            for (int k = 0; k < label.c; k++) {
-                float val = label.get_pixel(i, j, k);
-                im.set_pixel(i + col, j + row, k, rgb[k] * val);
-            }
-        }
-    }
-}
\ No newline at end of file
diff --git a/ggml/examples/yolo/yolo-image.h b/ggml/examples/yolo/yolo-image.h
deleted file mode 100644
index 124e1cd..0000000
--- a/ggml/examples/yolo/yolo-image.h
+++ /dev/null
@@ -1,39 +0,0 @@
-#pragma once
-
-#include <string>
-#include <vector>
-#include <cassert>
-
-struct yolo_image {
-    int w, h, c;
-    std::vector<float> data;
-
-    yolo_image() : w(0), h(0), c(0) {}
-    yolo_image(int w, int h, int c) : w(w), h(h), c(c), data(w*h*c) {}
-
-    float get_pixel(int x, int y, int c) const {
-        assert(x >= 0 && x < w && y >= 0 && y < h && c >= 0 && c < this->c);
-        return data[c*w*h + y*w + x];
-    }
-
-    void set_pixel(int x, int y, int c, float val) {
-        assert(x >= 0 && x < w && y >= 0 && y < h && c >= 0 && c < this->c);
-        data[c*w*h + y*w + x] = val;
-    }
-
-    void add_pixel(int x, int y, int c, float val) {
-        assert(x >= 0 && x < w && y >= 0 && y < h && c >= 0 && c < this->c);
-        data[c*w*h + y*w + x] += val;
-    }
-
-    void fill(float val) {
-        std::fill(data.begin(), data.end(), val);
-    }
-};
-
-bool load_image(const char *fname, yolo_image & img);
-void draw_box_width(yolo_image & a, int x1, int y1, int x2, int y2, int w, float r, float g, float b);
-yolo_image letterbox_image(const yolo_image & im, int w, int h);
-bool save_image(const yolo_image & im, const char *name, int quality);
-yolo_image get_label(const std::vector<yolo_image> & alphabet, const std::string & label, int size);
-void draw_label(yolo_image & im, int row, int col, const yolo_image & label, const float * rgb);
diff --git a/ggml/examples/yolo/yolov3-tiny.cpp b/ggml/examples/yolo/yolov3-tiny.cpp
deleted file mode 100644
index 2467bd8..0000000
--- a/ggml/examples/yolo/yolov3-tiny.cpp
+++ /dev/null
@@ -1,525 +0,0 @@
-#include "ggml/ggml.h"
-#include "yolo-image.h"
-
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <ctime>
-#include <string>
-#include <vector>
-#include <algorithm>
-#include <fstream>
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-struct conv2d_layer {
-    struct ggml_tensor * weights;
-    struct ggml_tensor * biases;
-    struct ggml_tensor * scales;
-    struct ggml_tensor * rolling_mean;
-    struct ggml_tensor * rolling_variance;
-    int padding = 1;
-    bool batch_normalize = true;
-    bool activate = true; // true for leaky relu, false for linear
-};
-
-struct yolo_model {
-    int width = 416;
-    int height = 416;
-    std::vector<conv2d_layer> conv2d_layers;
-    struct ggml_context * ctx;
-};
-
-struct yolo_layer {
-    int classes = 80;
-    std::vector<int> mask;
-    std::vector<float> anchors;
-    struct ggml_tensor * predictions;
-
-    yolo_layer(int classes, const std::vector<int> & mask, const std::vector<float> & anchors, struct ggml_tensor * predictions)
-        : classes(classes), mask(mask), anchors(anchors), predictions(predictions)
-    { }
-
-    int entry_index(int location, int entry) const {
-        int w = predictions->ne[0];
-        int h = predictions->ne[1];
-        int n = location / (w*h);
-        int loc = location % (w*h);
-        return n*w*h*(4+classes+1) + entry*w*h + loc;
-    }
-};
-
-struct box {
-    float x, y, w, h;
-};
-
-struct detection {
-    box bbox;
-    std::vector<float> prob;
-    float objectness;
-};
-
-static bool load_model(const std::string & fname, yolo_model & model) {
-    struct gguf_init_params params = {
-        /*.no_alloc   =*/ false,
-        /*.ctx        =*/ &model.ctx,
-    };
-    gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
-    if (!ctx) {
-        fprintf(stderr, "%s: gguf_init_from_file() failed\n", __func__);
-        return false;
-    }
-    model.width  = 416;
-    model.height = 416;
-    model.conv2d_layers.resize(13);
-    model.conv2d_layers[7].padding = 0;
-    model.conv2d_layers[9].padding = 0;
-    model.conv2d_layers[9].batch_normalize = false;
-    model.conv2d_layers[9].activate = false;
-    model.conv2d_layers[10].padding = 0;
-    model.conv2d_layers[12].padding = 0;
-    model.conv2d_layers[12].batch_normalize = false;
-    model.conv2d_layers[12].activate = false;
-    for (int i = 0; i < (int)model.conv2d_layers.size(); i++) {
-        char name[256];
-        snprintf(name, sizeof(name), "l%d_weights", i);
-        model.conv2d_layers[i].weights = ggml_get_tensor(model.ctx, name);
-        snprintf(name, sizeof(name), "l%d_biases", i);
-        model.conv2d_layers[i].biases = ggml_get_tensor(model.ctx, name);
-        if (model.conv2d_layers[i].batch_normalize) {
-            snprintf(name, sizeof(name), "l%d_scales", i);
-            model.conv2d_layers[i].scales = ggml_get_tensor(model.ctx, name);
-            snprintf(name, sizeof(name), "l%d_rolling_mean", i);
-            model.conv2d_layers[i].rolling_mean = ggml_get_tensor(model.ctx, name);
-            snprintf(name, sizeof(name), "l%d_rolling_variance", i);
-            model.conv2d_layers[i].rolling_variance = ggml_get_tensor(model.ctx, name);
-        }
-    }
-    return true;
-}
-
-static bool load_labels(const char * filename, std::vector<std::string> & labels)
-{
-    std::ifstream file_in(filename);
-    if (!file_in) {
-        return false;
-    }
-    std::string line;
-    while (std::getline(file_in, line)) {
-        labels.push_back(line);
-    }
-    GGML_ASSERT(labels.size() == 80);
-    return true;
-}
-
-static bool load_alphabet(std::vector<yolo_image> & alphabet)
-{
-    alphabet.resize(8 * 128);
-    for (int j = 0; j < 8; j++) {
-        for (int i = 32; i < 127; i++) {
-            char fname[256];
-            sprintf(fname, "data/labels/%d_%d.png", i, j);
-            if (!load_image(fname, alphabet[j*128 + i])) {
-                fprintf(stderr, "Cannot load '%s'\n", fname);
-                return false;
-            }
-        }
-    }
-    return true;
-}
-
-static ggml_tensor * apply_conv2d(ggml_context * ctx, ggml_tensor * input, const conv2d_layer & layer)
-{
-    struct ggml_tensor * result = ggml_conv_2d(ctx, layer.weights, input, 1, 1, layer.padding, layer.padding, 1, 1);
-    if (layer.batch_normalize) {
-        result = ggml_sub(ctx, result, ggml_repeat(ctx, layer.rolling_mean, result));
-        result = ggml_div(ctx, result, ggml_sqrt(ctx, ggml_repeat(ctx, layer.rolling_variance, result)));
-        result = ggml_mul(ctx, result, ggml_repeat(ctx, layer.scales, result));
-    }
-    result = ggml_add(ctx, result, ggml_repeat(ctx, layer.biases, result));
-    if (layer.activate) {
-        result = ggml_leaky_relu(ctx, result, 0.1f, true);
-    }
-    return result;
-}
-
-static void activate_array(float * x, const int n)
-{
-    // logistic activation
-    for (int i = 0; i < n; i++) {
-        x[i] = 1./(1. + exp(-x[i]));
-    }
-}
-
-static void apply_yolo(yolo_layer & layer)
-{
-    int w = layer.predictions->ne[0];
-    int h = layer.predictions->ne[1];
-    int N = layer.mask.size();
-    float * data = ggml_get_data_f32(layer.predictions);
-    for (int n = 0; n < N; n++) {
-        int index = layer.entry_index(n*w*h, 0);
-        activate_array(data + index, 2*w*h);
-        index = layer.entry_index(n*w*h, 4);
-        activate_array(data + index, (1+layer.classes)*w*h);
-    }
-}
-
-static box get_yolo_box(const yolo_layer & layer, int n, int index, int i, int j, int lw, int lh, int w, int h, int stride)
-{
-    float * predictions = ggml_get_data_f32(layer.predictions);
-    box b;
-    b.x = (i + predictions[index + 0*stride]) / lw;
-    b.y = (j + predictions[index + 1*stride]) / lh;
-    b.w = exp(predictions[index + 2*stride]) * layer.anchors[2*n]   / w;
-    b.h = exp(predictions[index + 3*stride]) * layer.anchors[2*n+1] / h;
-    return b;
-}
-
-static void correct_yolo_box(box & b, int im_w, int im_h, int net_w, int net_h)
-{
-    int new_w = 0;
-    int new_h = 0;
-    if (((float)net_w/im_w) < ((float)net_h/im_h)) {
-        new_w = net_w;
-        new_h = (im_h * net_w)/im_w;
-    } else {
-        new_h = net_h;
-        new_w = (im_w * net_h)/im_h;
-    }
-    b.x = (b.x - (net_w - new_w)/2./net_w) / ((float)new_w/net_w);
-    b.y = (b.y - (net_h - new_h)/2./net_h) / ((float)new_h/net_h);
-    b.w *= (float)net_w/new_w;
-    b.h *= (float)net_h/new_h;
-}
-
-static void get_yolo_detections(const yolo_layer & layer, std::vector<detection> & detections, int im_w, int im_h, int netw, int neth, float thresh)
-{
-    int w = layer.predictions->ne[0];
-    int h = layer.predictions->ne[1];
-    int N = layer.mask.size();
-    float * predictions = ggml_get_data_f32(layer.predictions);
-    std::vector<detection> result;
-    for (int i = 0; i < w*h; i++) {
-        for (int n = 0; n < N; n++) {
-            int obj_index = layer.entry_index(n*w*h + i, 4);
-            float objectness = predictions[obj_index];
-            if (objectness <= thresh) {
-                continue;
-            }
-            detection det;
-            int box_index = layer.entry_index(n*w*h + i, 0);
-            int row = i / w;
-            int col = i % w;
-            det.bbox = get_yolo_box(layer, layer.mask[n], box_index, col, row, w, h, netw, neth, w*h);
-            correct_yolo_box(det.bbox, im_w, im_h, netw, neth);
-            det.objectness = objectness;
-            det.prob.resize(layer.classes);
-            for (int j = 0; j < layer.classes; j++) {
-                int class_index = layer.entry_index(n*w*h + i, 4 + 1 + j);
-                float prob = objectness*predictions[class_index];
-                det.prob[j] = (prob > thresh) ? prob : 0;
-            }
-            detections.push_back(det);
-        }
-    }
-}
-
-static float overlap(float x1, float w1, float x2, float w2)
-{
-    float l1 = x1 - w1/2;
-    float l2 = x2 - w2/2;
-    float left = l1 > l2 ? l1 : l2;
-    float r1 = x1 + w1/2;
-    float r2 = x2 + w2/2;
-    float right = r1 < r2 ? r1 : r2;
-    return right - left;
-}
-
-static float box_intersection(const box & a, const box & b)
-{
-    float w = overlap(a.x, a.w, b.x, b.w);
-    float h = overlap(a.y, a.h, b.y, b.h);
-    if (w < 0 || h < 0) return 0;
-    float area = w*h;
-    return area;
-}
-
-static float box_union(const box & a, const box & b)
-{
-    float i = box_intersection(a, b);
-    float u = a.w*a.h + b.w*b.h - i;
-    return u;
-}
-
-static float box_iou(const box & a, const box & b)
-{
-    return box_intersection(a, b)/box_union(a, b);
-}
-
-static void do_nms_sort(std::vector<detection> & dets, int classes, float thresh)
-{
-    int k = (int)dets.size()-1;
-    for (int i = 0; i <= k; ++i) {
-        if (dets[i].objectness == 0) {
-            std::swap(dets[i], dets[k]);
-            --k;
-            --i;
-        }
-    }
-    int total = k+1;
-    for (int k = 0; k < classes; ++k) {
-        std::sort(dets.begin(), dets.begin()+total, [=](const detection & a, const detection & b) {
-            return a.prob[k] > b.prob[k];
-        });
-        for (int i = 0; i < total; ++i) {
-            if (dets[i].prob[k] == 0) {
-                continue;
-            }
-            box a = dets[i].bbox;
-            for (int j = i+1; j < total; ++j){
-                box b = dets[j].bbox;
-                if (box_iou(a, b) > thresh) {
-                    dets[j].prob[k] = 0;
-                }
-            }
-        }
-    }
-}
-
-static float get_color(int c, int x, int max)
-{
-    float colors[6][3] = { {1,0,1}, {0,0,1}, {0,1,1}, {0,1,0}, {1,1,0}, {1,0,0} };
-    float ratio = ((float)x/max)*5;
-    int i = floor(ratio);
-    int j = ceil(ratio);
-    ratio -= i;
-    float r = (1-ratio) * colors[i][c] + ratio*colors[j][c];
-    return r;
-}
-
-static void draw_detections(yolo_image & im, const std::vector<detection> & dets, float thresh, const std::vector<std::string> & labels, const std::vector<yolo_image> & alphabet)
-{
-    int classes = (int)labels.size();
-    for (int i = 0; i < (int)dets.size(); i++) {
-        std::string labelstr;
-        int cl = -1;
-        for (int j = 0; j < (int)dets[i].prob.size(); j++) {
-            if (dets[i].prob[j] > thresh) {
-                if (cl < 0) {
-                    labelstr = labels[j];
-                    cl = j;
-                } else {
-                    labelstr += ", ";
-                    labelstr += labels[j];
-                }
-                printf("%s: %.0f%%\n", labels[j].c_str(), dets[i].prob[j]*100);
-            }
-        }
-        if (cl >= 0) {
-            int width = im.h * .006;
-            int offset = cl*123457 % classes;
-            float red = get_color(2,offset,classes);
-            float green = get_color(1,offset,classes);
-            float blue = get_color(0,offset,classes);
-            float rgb[3];
-
-            rgb[0] = red;
-            rgb[1] = green;
-            rgb[2] = blue;
-            box b = dets[i].bbox;
-
-            int left  = (b.x-b.w/2.)*im.w;
-            int right = (b.x+b.w/2.)*im.w;
-            int top   = (b.y-b.h/2.)*im.h;
-            int bot   = (b.y+b.h/2.)*im.h;
-
-            if (left < 0) left = 0;
-            if (right > im.w-1) right = im.w-1;
-            if (top < 0) top = 0;
-            if (bot > im.h-1) bot = im.h-1;
-
-            draw_box_width(im, left, top, right, bot, width, red, green, blue);
-            yolo_image label = get_label(alphabet, labelstr, (im.h*.03));
-            draw_label(im, top + width, left, label, rgb);
-        }
-    }
-}
-
-static void print_shape(int layer, const ggml_tensor * t)
-{
-    printf("Layer %2d output shape:  %3d x %3d x %4d x %3d\n", layer, (int)t->ne[0], (int)t->ne[1], (int)t->ne[2], (int)t->ne[3]);
-}
-
-void detect(yolo_image & img, const yolo_model & model, float thresh, const std::vector<std::string> & labels, const std::vector<yolo_image> & alphabet)
-{
-    static size_t buf_size = 20000000 * sizeof(float) * 4;
-    static void * buf = malloc(buf_size);
-
-    struct ggml_init_params params = {
-        /*.mem_size   =*/ buf_size,
-        /*.mem_buffer =*/ buf,
-        /*.no_alloc   =*/ false,
-    };
-
-    struct ggml_context * ctx0 = ggml_init(params);
-    struct ggml_cgraph * gf = ggml_new_graph(ctx0);
-    std::vector<detection> detections;
-
-    yolo_image sized = letterbox_image(img, model.width, model.height);
-    struct ggml_tensor * input = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, model.width, model.height, 3, 1);
-    std::memcpy(input->data, sized.data.data(), ggml_nbytes(input));
-    ggml_set_name(input, "input");
-
-    struct ggml_tensor * result = apply_conv2d(ctx0, input, model.conv2d_layers[0]);
-    print_shape(0, result);
-    result = ggml_pool_2d(ctx0, result, GGML_OP_POOL_MAX, 2, 2, 2, 2, 0, 0);
-    print_shape(1, result);
-    result = apply_conv2d(ctx0, result, model.conv2d_layers[1]);
-    print_shape(2, result);
-    result = ggml_pool_2d(ctx0, result, GGML_OP_POOL_MAX, 2, 2, 2, 2, 0, 0);
-    print_shape(3, result);
-    result = apply_conv2d(ctx0, result, model.conv2d_layers[2]);
-    print_shape(4, result);
-    result = ggml_pool_2d(ctx0, result, GGML_OP_POOL_MAX, 2, 2, 2, 2, 0, 0);
-    print_shape(5, result);
-    result = apply_conv2d(ctx0, result, model.conv2d_layers[3]);
-    print_shape(6, result);
-    result = ggml_pool_2d(ctx0, result, GGML_OP_POOL_MAX, 2, 2, 2, 2, 0, 0);
-    print_shape(7, result);
-    result = apply_conv2d(ctx0, result, model.conv2d_layers[4]);
-    struct ggml_tensor * layer_8 = result;
-    print_shape(8, result);
-    result = ggml_pool_2d(ctx0, result, GGML_OP_POOL_MAX, 2, 2, 2, 2, 0, 0);
-    print_shape(9, result);
-    result = apply_conv2d(ctx0, result, model.conv2d_layers[5]);
-    print_shape(10, result);
-    result = ggml_pool_2d(ctx0, result, GGML_OP_POOL_MAX, 2, 2, 1, 1, 0.5, 0.5);
-    print_shape(11, result);
-    result = apply_conv2d(ctx0, result, model.conv2d_layers[6]);
-    print_shape(12, result);
-    result = apply_conv2d(ctx0, result, model.conv2d_layers[7]);
-    struct ggml_tensor * layer_13 = result;
-    print_shape(13, result);
-    result = apply_conv2d(ctx0, result, model.conv2d_layers[8]);
-    print_shape(14, result);
-    result = apply_conv2d(ctx0, result, model.conv2d_layers[9]);
-    struct ggml_tensor * layer_15 = result;
-    print_shape(15, result);
-    result = apply_conv2d(ctx0, layer_13, model.conv2d_layers[10]);
-    print_shape(18, result);
-    result = ggml_upscale(ctx0, result, 2);
-    print_shape(19, result);
-    result = ggml_concat(ctx0, result, layer_8);
-    print_shape(20, result);
-    result = apply_conv2d(ctx0, result, model.conv2d_layers[11]);
-    print_shape(21, result);
-    result = apply_conv2d(ctx0, result, model.conv2d_layers[12]);
-    struct ggml_tensor * layer_22 = result;
-    print_shape(22, result);
-
-    ggml_build_forward_expand(gf, layer_15);
-    ggml_build_forward_expand(gf, layer_22);
-    ggml_graph_compute_with_ctx(ctx0, gf, 1);
-
-    yolo_layer yolo16{ 80, {3, 4, 5}, {10, 14, 23, 27, 37,58, 81, 82, 135, 169, 344, 319}, layer_15};
-    apply_yolo(yolo16);
-    get_yolo_detections(yolo16, detections, img.w, img.h, model.width, model.height, thresh);
-
-    yolo_layer yolo23{ 80, {0, 1, 2}, {10, 14, 23, 27, 37,58, 81, 82, 135, 169, 344, 319}, layer_22};
-    apply_yolo(yolo23);
-    get_yolo_detections(yolo23, detections, img.w, img.h, model.width, model.height, thresh);
-
-    do_nms_sort(detections, yolo23.classes, .45);
-    draw_detections(img, detections, thresh, labels, alphabet);
-    ggml_free(ctx0);
-}
-
-struct yolo_params {
-    float thresh          = 0.5;
-    std::string model     = "yolov3-tiny.gguf";
-    std::string fname_inp = "input.jpg";
-    std::string fname_out = "predictions.jpg";
-};
-
-void yolo_print_usage(int argc, char ** argv, const yolo_params & params) {
-    fprintf(stderr, "usage: %s [options]\n", argv[0]);
-    fprintf(stderr, "\n");
-    fprintf(stderr, "options:\n");
-    fprintf(stderr, "  -h, --help            show this help message and exit\n");
-    fprintf(stderr, "  -th T, --thresh T     detection threshold (default: %.2f)\n", params.thresh);
-    fprintf(stderr, "  -m FNAME, --model FNAME\n");
-    fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
-    fprintf(stderr, "  -i FNAME, --inp FNAME\n");
-    fprintf(stderr, "                        input file (default: %s)\n", params.fname_inp.c_str());
-    fprintf(stderr, "  -o FNAME, --out FNAME\n");
-    fprintf(stderr, "                        output file (default: %s)\n", params.fname_out.c_str());
-    fprintf(stderr, "\n");
-}
-
-bool yolo_params_parse(int argc, char ** argv, yolo_params & params) {
-    for (int i = 1; i < argc; i++) {
-        std::string arg = argv[i];
-
-        if (arg == "-th" || arg == "--thresh") {
-            params.thresh = std::stof(argv[++i]);
-        } else if (arg == "-m" || arg == "--model") {
-            params.model = argv[++i];
-        } else if (arg == "-i" || arg == "--inp") {
-            params.fname_inp = argv[++i];
-        } else if (arg == "-o" || arg == "--out") {
-            params.fname_out = argv[++i];
-        } else if (arg == "-h" || arg == "--help") {
-            yolo_print_usage(argc, argv, params);
-            exit(0);
-        } else {
-            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
-            yolo_print_usage(argc, argv, params);
-            exit(0);
-        }
-    }
-
-    return true;
-}
-
-int main(int argc, char *argv[])
-{
-    ggml_time_init();
-    yolo_model model;
-
-    yolo_params params;
-    if (!yolo_params_parse(argc, argv, params)) {
-        return 1;
-    }
-    if (!load_model(params.model, model)) {
-        fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
-        return 1;
-    }
-    yolo_image img(0,0,0);
-    if (!load_image(params.fname_inp.c_str(), img)) {
-        fprintf(stderr, "%s: failed to load image from '%s'\n", __func__, params.fname_inp.c_str());
-        return 1;
-    }
-    std::vector<std::string> labels;
-    if (!load_labels("data/coco.names", labels)) {
-        fprintf(stderr, "%s: failed to load labels from 'data/coco.names'\n", __func__);
-        return 1;
-    }
-    std::vector<yolo_image> alphabet;
-    if (!load_alphabet(alphabet)) {
-        fprintf(stderr, "%s: failed to load alphabet\n", __func__);
-        return 1;
-    }
-    const int64_t t_start_ms = ggml_time_ms();
-    detect(img, model, params.thresh, labels, alphabet);
-    const int64_t t_detect_ms = ggml_time_ms() - t_start_ms;
-    if (!save_image(img, params.fname_out.c_str(), 80)) {
-        fprintf(stderr, "%s: failed to save image to '%s'\n", __func__, params.fname_out.c_str());
-        return 1;
-    }
-    printf("Detected objects saved in '%s' (time: %f sec.)\n", params.fname_out.c_str(), t_detect_ms / 1000.0f);
-    ggml_free(model.ctx);
-    return 0;
-}
diff --git a/ggml/ggml.pc.in b/ggml/ggml.pc.in
deleted file mode 100644
index 5f53cb8..0000000
--- a/ggml/ggml.pc.in
+++ /dev/null
@@ -1,10 +0,0 @@
-prefix=@CMAKE_INSTALL_PREFIX@
-exec_prefix=${prefix}
-includedir=${prefix}/include
-libdir=${prefix}/lib
-
-Name: ggml
-Description: The GGML Tensor Library for Machine Learning
-Version: 0.0.0
-Cflags: -I${includedir}/ggml
-Libs: -L${libdir} -lggml
diff --git a/ggml/include/ggml/ggml-alloc.h b/ggml/include/ggml/ggml-alloc.h
deleted file mode 100644
index 4e59975..0000000
--- a/ggml/include/ggml/ggml-alloc.h
+++ /dev/null
@@ -1,94 +0,0 @@
-#pragma once
-
-#include "ggml.h"
-
-#ifdef  __cplusplus
-extern "C" {
-#endif
-
-struct ggml_backend;
-struct ggml_backend_buffer;
-struct ggml_backend_buffer_type;
-
-//
-// Legacy API
-//
-
-typedef struct ggml_allocr * ggml_allocr_t;
-
-// initialize allocator for use with CPU backend only
-GGML_API ggml_allocr_t ggml_allocr_new(void * data, size_t size, size_t alignment);
-GGML_API ggml_allocr_t ggml_allocr_new_measure(size_t alignment);
-
-// initialize allocator for use with ggml-backend
-GGML_API ggml_allocr_t ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer);
-GGML_API ggml_allocr_t ggml_allocr_new_from_backend(struct ggml_backend * backend, size_t size); // allocates an owned buffer
-GGML_API ggml_allocr_t ggml_allocr_new_measure_from_backend(struct ggml_backend * backend);
-
-GGML_API struct ggml_backend_buffer * ggml_allocr_get_buffer(ggml_allocr_t alloc);
-
-// tell the allocator to parse nodes following the order described in the list
-// you should call this if your graph are optimized to execute out-of-order
-GGML_API void   ggml_allocr_set_parse_seq(ggml_allocr_t alloc, const int * list, int n);
-
-GGML_API void   ggml_allocr_free       (ggml_allocr_t alloc);
-GGML_API bool   ggml_allocr_is_measure (ggml_allocr_t alloc);
-GGML_API void   ggml_allocr_reset      (ggml_allocr_t alloc);
-GGML_API void   ggml_allocr_alloc      (ggml_allocr_t alloc, struct ggml_tensor * tensor);
-GGML_API size_t ggml_allocr_max_size   (ggml_allocr_t alloc);
-
-GGML_API size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph * graph);
-
-//
-// ggml-backend v2 API
-//
-
-// Separate tensor and graph allocator objects
-// This is necessary for multi-backend allocation because the graph allocator needs to use multiple tensor allocators
-// The original API is kept as a wrapper around the new API
-
-// Tensor allocator
-typedef struct ggml_tallocr * ggml_tallocr_t;
-
-GGML_API ggml_tallocr_t ggml_tallocr_new(void * data, size_t size, size_t alignment);
-GGML_API ggml_tallocr_t ggml_tallocr_new_measure(size_t alignment);
-GGML_API ggml_tallocr_t ggml_tallocr_new_from_buft(struct ggml_backend_buffer_type * buft, size_t size);
-GGML_API ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size); // allocates an owned buffer
-GGML_API ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer);
-GGML_API ggml_tallocr_t ggml_tallocr_new_measure_from_buft(struct ggml_backend_buffer_type * buft);
-GGML_API ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend);
-
-GGML_API struct ggml_backend_buffer * ggml_tallocr_get_buffer(ggml_tallocr_t talloc);
-
-GGML_API void   ggml_tallocr_free       (ggml_tallocr_t talloc);
-GGML_API bool   ggml_tallocr_is_measure (ggml_tallocr_t talloc);
-GGML_API void   ggml_tallocr_reset      (ggml_tallocr_t talloc);
-GGML_API void   ggml_tallocr_alloc      (ggml_tallocr_t talloc, struct ggml_tensor * tensor);
-GGML_API size_t ggml_tallocr_max_size   (ggml_tallocr_t talloc);
-
-
-// Graph allocator
-typedef struct ggml_gallocr * ggml_gallocr_t;
-
-GGML_API ggml_gallocr_t ggml_gallocr_new(void);
-GGML_API void   ggml_gallocr_free(ggml_gallocr_t galloc);
-
-GGML_API void   ggml_gallocr_set_parse_seq(ggml_gallocr_t galloc, const int * list, int n);
-GGML_API size_t ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, ggml_tallocr_t talloc, struct ggml_cgraph * graph);
-
-// Allocate tensors from the allocators given by the hash table
-GGML_API void   ggml_gallocr_alloc_graph_n(
-                    ggml_gallocr_t galloc,
-                    struct ggml_cgraph * graph,
-                    struct ggml_hash_set hash_set,
-                    ggml_tallocr_t * hash_node_talloc);
-
-
-// Utils
-// Create a buffer and allocate all the tensors in a ggml_context
-GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, struct ggml_backend_buffer_type * buft);
-GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, struct ggml_backend * backend);
-
-#ifdef  __cplusplus
-}
-#endif
diff --git a/ggml/include/ggml/ggml-backend.h b/ggml/include/ggml/ggml-backend.h
deleted file mode 100644
index 4eb244a..0000000
--- a/ggml/include/ggml/ggml-backend.h
+++ /dev/null
@@ -1,198 +0,0 @@
-#pragma once
-
-#include "ggml.h"
-#include "ggml-alloc.h"
-
-#ifdef  __cplusplus
-extern "C" {
-#endif
-
-    typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
-    typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
-    typedef struct ggml_backend * ggml_backend_t;
-    typedef void * ggml_backend_graph_plan_t;
-
-    //
-    // Backend buffer
-    //
-
-    // buffer type
-    GGML_API const char *          ggml_backend_buft_name            (ggml_backend_buffer_type_t buft);
-    GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer    (ggml_backend_buffer_type_t buft, size_t size);
-    GGML_API size_t                ggml_backend_buft_get_alignment   (ggml_backend_buffer_type_t buft);
-    GGML_API size_t                ggml_backend_buft_get_alloc_size  (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
-    GGML_API bool                  ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
-    GGML_API bool                  ggml_backend_buft_is_host         (ggml_backend_buffer_type_t buft);
-
-    // buffer
-    enum ggml_backend_buffer_usage {
-        GGML_BACKEND_BUFFER_USAGE_ANY = 0,
-        GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1,
-    };
-
-    GGML_API const char *               ggml_backend_buffer_name          (ggml_backend_buffer_t buffer);
-    GGML_API void                       ggml_backend_buffer_free          (ggml_backend_buffer_t buffer);
-    GGML_API void *                     ggml_backend_buffer_get_base      (ggml_backend_buffer_t buffer);
-    GGML_API size_t                     ggml_backend_buffer_get_size      (ggml_backend_buffer_t buffer);
-    GGML_API void                       ggml_backend_buffer_init_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
-    GGML_API size_t                     ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
-    GGML_API size_t                     ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
-    GGML_API void                       ggml_backend_buffer_clear         (ggml_backend_buffer_t buffer, uint8_t value);
-    GGML_API bool                       ggml_backend_buffer_is_host       (ggml_backend_buffer_t buffer);
-    GGML_API void                       ggml_backend_buffer_set_usage     (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
-    GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type      (ggml_backend_buffer_t buffer);
-    GGML_API void                       ggml_backend_buffer_reset         (ggml_backend_buffer_t buffer);
-
-    //
-    // Backend
-    //
-
-
-    GGML_API const char * ggml_backend_name(ggml_backend_t backend);
-    GGML_API void         ggml_backend_free(ggml_backend_t backend);
-
-    GGML_API ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend);
-    GGML_API ggml_backend_buffer_t      ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
-    GGML_API size_t                     ggml_backend_get_alignment(ggml_backend_t backend);
-
-    GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-    GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
-
-    GGML_API void ggml_backend_tensor_set(      struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-    GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
-
-    GGML_API void ggml_backend_synchronize(ggml_backend_t backend);
-
-    GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create (ggml_backend_t backend, struct ggml_cgraph * cgraph);
-
-    GGML_API void ggml_backend_graph_plan_free   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-    GGML_API void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-    GGML_API bool ggml_backend_graph_compute     (ggml_backend_t backend, struct ggml_cgraph * cgraph);
-    GGML_API bool ggml_backend_supports_op       (ggml_backend_t backend, const struct ggml_tensor * op);
-
-    // tensor copy between different backends
-    GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
-    GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst); // automatic fallback to sync copy
-
-    //
-    // CPU backend
-    //
-
-    GGML_API ggml_backend_t ggml_backend_cpu_init(void);
-
-    GGML_API bool ggml_backend_is_cpu(ggml_backend_t backend);
-    GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);
-
-    // Create a backend buffer from an existing pointer
-    GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
-
-    GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
-
-#ifdef GGML_USE_CPU_HBM
-    GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
-#endif
-
-    //
-    // Backend registry
-    //
-
-    // The backend registry is a registry of all the available backends, and allows initializing backends in a generic way
-
-    GGML_API size_t                     ggml_backend_reg_get_count(void);
-    GGML_API size_t                     ggml_backend_reg_find_by_name(const char * name);
-    GGML_API ggml_backend_t             ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is name[:params]
-    GGML_API const char *               ggml_backend_reg_get_name(size_t i);
-    GGML_API ggml_backend_t             ggml_backend_reg_init_backend(size_t i, const char * params); // params is backend-specific
-    GGML_API ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i);
-    GGML_API ggml_backend_buffer_t      ggml_backend_reg_alloc_buffer(size_t i, size_t size);
-
-    //
-    // Backend scheduler
-    //
-
-    // The backend scheduler allows for multiple backends to be used together
-    // Handles compute buffer allocation, assignment of tensors to backends, and copying of tensors between backends
-    // The backends are selected based on:
-    // - the backend that supports the operation
-    // - the location of the pre-allocated tensors (e.g. the weights)
-    /*
-      Example usage:
-
-        sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, num_backends);
-        // sched is initialized with measure allocators and cannot be used until allocated with a measure graph
-
-        // initialize buffers from a measure graph
-        measure_graph = build_graph(sched); // use the allocr to allocate inputs as needed
-
-        // in build_graph:
-        build_graph(...) {
-            // allocating tensors in a specific backend (optional, recommended: pre-allocate inputs in a different buffer)
-            alloc_cpu = ggml_backend_sched_get_allocr(sched, backend_cpu);
-            ggml_allocr_alloc(alloc_cpu, tensor);
-
-            // manually assigning nodes to a backend (optional, shouldn't be needed in most cases)
-            struct ggml_tensor * node = ggml_mul_mat(ctx, ...);
-            ggml_backend_sched_set_node_backend(sched, node, backend_gpu);
-        }
-
-        // allocate backend buffers from measure graph
-        ggml_backend_sched_init_measure(sched, measure_graph);
-
-        // the scheduler is now ready to compute graphs
-
-        // compute
-        graph = build_graph(sched);
-        ggml_backend_sched_graph_compute(sched, graph);
-    */
-
-    struct ggml_backend_sched;
-    typedef struct ggml_backend_sched * ggml_backend_sched_t;
-
-    // Initialize a backend scheduler
-    GGML_API ggml_backend_sched_t  ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size);
-    GGML_API void                  ggml_backend_sched_free(ggml_backend_sched_t sched);
-    // Initialize backend buffers from a measure graph
-    GGML_API void                  ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
-    // Get the number of splits of the last graph
-    GGML_API int                   ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
-
-    GGML_API ggml_tallocr_t        ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend);
-    GGML_API ggml_backend_buffer_t ggml_backend_sched_get_buffer (ggml_backend_sched_t sched, ggml_backend_t backend);
-
-    GGML_API void                  ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
-    GGML_API ggml_backend_t        ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
-
-    // Allocate and compute graph on the backend scheduler
-    GGML_API void                  ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
-
-    // Reset all assignments and allocators - must be called before using the sched allocators to allocate inputs
-    GGML_API void                  ggml_backend_sched_reset(ggml_backend_sched_t sched);
-
-    //
-    // Utils
-    //
-
-    struct ggml_backend_graph_copy {
-        ggml_backend_buffer_t buffer;
-        struct ggml_context * ctx_allocated;
-        struct ggml_context * ctx_unallocated;
-        struct ggml_cgraph * graph;
-    };
-
-    // Copy a graph to a different backend
-    GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph);
-    GGML_API void                           ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy);
-
-    typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
-
-    // Compare the output of two backends
-    GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
-
-    // Tensor initialization
-    GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
-    GGML_API void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
-
-
-#ifdef  __cplusplus
-}
-#endif
diff --git a/ggml/include/ggml/ggml.h b/ggml/include/ggml/ggml.h
deleted file mode 100644
index b18ba78..0000000
--- a/ggml/include/ggml/ggml.h
+++ /dev/null
@@ -1,2259 +0,0 @@
-#pragma once
-
-//
-// GGML Tensor Library
-//
-// This documentation is still a work in progress.
-// If you wish some specific topics to be covered, feel free to drop a comment:
-//
-//   https://github.com/ggerganov/whisper.cpp/issues/40
-//
-// ## Overview
-//
-// This library implements:
-//
-//  - a set of tensor operations
-//  - automatic differentiation
-//  - basic optimization algorithms
-//
-// The aim of this library is to provide a minimalistic approach for various machine learning tasks. This includes,
-// but is not limited to, the following:
-//
-//  - linear regression
-//  - support vector machines
-//  - neural networks
-//
-// The library allows the user to define a certain function using the available tensor operations. This function
-// definition is represented internally via a computation graph. Each tensor operation in the function definition
-// corresponds to a node in the graph. Having the computation graph defined, the user can choose to compute the
-// function's value and/or its gradient with respect to the input variables. Optionally, the function can be optimized
-// using one of the available optimization algorithms.
-//
-// For example, here we define the function: f(x) = a*x^2 + b
-//
-//   {
-//       struct ggml_init_params params = {
-//           .mem_size   = 16*1024*1024,
-//           .mem_buffer = NULL,
-//       };
-//
-//       // memory allocation happens here
-//       struct ggml_context * ctx = ggml_init(params);
-//
-//       struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
-//
-//       ggml_set_param(ctx, x); // x is an input variable
-//
-//       struct ggml_tensor * a  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
-//       struct ggml_tensor * b  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
-//       struct ggml_tensor * x2 = ggml_mul(ctx, x, x);
-//       struct ggml_tensor * f  = ggml_add(ctx, ggml_mul(ctx, a, x2), b);
-//
-//       ...
-//   }
-//
-// Notice that the function definition above does not involve any actual computation. The computation is performed only
-// when the user explicitly requests it. For example, to compute the function's value at x = 2.0:
-//
-//   {
-//       ...
-//
-//       struct ggml_cgraph * gf = ggml_new_graph(ctx);
-//       ggml_build_forward_expand(gf, f);
-//
-//       // set the input variable and parameter values
-//       ggml_set_f32(x, 2.0f);
-//       ggml_set_f32(a, 3.0f);
-//       ggml_set_f32(b, 4.0f);
-//
-//       ggml_graph_compute_with_ctx(ctx, &gf, n_threads);
-//
-//       printf("f = %f\n", ggml_get_f32_1d(f, 0));
-//
-//       ...
-//   }
-//
-// The actual computation is performed in the ggml_graph_compute() function.
-//
-// The ggml_new_tensor_...() functions create new tensors. They are allocated in the memory buffer provided to the
-// ggml_init() function. You have to be careful not to exceed the memory buffer size. Therefore, you have to know
-// in advance how much memory you need for your computation. Alternatively, you can allocate a large enough memory
-// and after defining the computation graph, call the ggml_used_mem() function to find out how much memory was
-// actually needed.
-//
-// The ggml_set_param() function marks a tensor as an input variable. This is used by the automatic
-// differentiation and optimization algorithms.
-//
-// The described approach allows to define the function graph once and then compute its forward or backward graphs
-// multiple times. All computations will use the same memory buffer allocated in the ggml_init() function. This way
-// the user can avoid the memory allocation overhead at runtime.
-//
-// The library supports multi-dimensional tensors - up to 4 dimensions. The FP16 and FP32 data types are first class
-// citizens, but in theory the library can be extended to support FP8 and integer data types.
-//
-// Each tensor operation produces a new tensor. Initially the library was envisioned to support only the use of unary
-// and binary operations. Most of the available operations fall into one of these two categories. With time, it became
-// clear that the library needs to support more complex operations. The way to support these operations is not clear
-// yet, but a few examples are demonstrated in the following operations:
-//
-//   - ggml_permute()
-//   - ggml_conv_1d_1s()
-//   - ggml_conv_1d_2s()
-//
-// For each tensor operator, the library implements a forward and backward computation function. The forward function
-// computes the output tensor value given the input tensor values. The backward function computes the adjoint of the
-// input tensors given the adjoint of the output tensor. For a detailed explanation of what this means, take a
-// calculus class, or watch the following video:
-//
-//   What is Automatic Differentiation?
-//   https://www.youtube.com/watch?v=wG_nF1awSSY
-//
-//
-// ## Tensor data (struct ggml_tensor)
-//
-// The tensors are stored in memory via the ggml_tensor struct. The structure provides information about the size of
-// the tensor, the data type, and the memory buffer where the tensor data is stored. Additionally, it contains
-// pointers to the "source" tensors - i.e. the tensors that were used to compute the current tensor. For example:
-//
-//   {
-//       struct ggml_tensor * c = ggml_add(ctx, a, b);
-//
-//       assert(c->src[0] == a);
-//       assert(c->src[1] == b);
-//   }
-//
-// The multi-dimensional tensors are stored in row-major order. The ggml_tensor struct contains fields for the
-// number of elements in each dimension ("ne") as well as the number of bytes ("nb", a.k.a. stride). This allows
-// to store tensors that are not contiguous in memory, which is useful for operations such as transposition and
-// permutation. All tensor operations have to take the stride into account and not assume that the tensor is
-// contiguous in memory.
-//
-// The data of the tensor is accessed via the "data" pointer. For example:
-//
-//   {
-//       const int nx = 2;
-//       const int ny = 3;
-//
-//       struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nx, ny);
-//
-//       for (int y = 0; y < ny; y++) {
-//           for (int x = 0; x < nx; x++) {
-//               *(float *) ((char *) a->data + y*a->nb[1] + x*a->nb[0]) = x + y;
-//           }
-//       }
-//
-//       ...
-//   }
-//
-// Alternatively, there are helper functions, such as ggml_get_f32_1d() and ggml_set_f32_1d() that can be used.
-//
-// ## The matrix multiplication operator (ggml_mul_mat)
-//
-// TODO
-//
-//
-// ## Multi-threading
-//
-// TODO
-//
-//
-// ## Overview of ggml.c
-//
-// TODO
-//
-//
-// ## SIMD optimizations
-//
-// TODO
-//
-//
-// ## Debugging ggml
-//
-// TODO
-//
-//
-
-#ifdef GGML_SHARED
-#    if defined(_WIN32) && !defined(__MINGW32__)
-#        ifdef GGML_BUILD
-#            define GGML_API __declspec(dllexport)
-#        else
-#            define GGML_API __declspec(dllimport)
-#        endif
-#    else
-#        define GGML_API __attribute__ ((visibility ("default")))
-#    endif
-#else
-#    define GGML_API
-#endif
-
-// TODO: support for clang
-#ifdef __GNUC__
-#    define GGML_DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
-#elif defined(_MSC_VER)
-#    define GGML_DEPRECATED(func, hint) __declspec(deprecated(hint)) func
-#else
-#    define GGML_DEPRECATED(func, hint) func
-#endif
-
-#ifndef __GNUC__
-#    define GGML_ATTRIBUTE_FORMAT(...)
-#elif defined(__MINGW32__)
-#    define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
-#else
-#    define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
-#endif
-
-#include <stdint.h>
-#include <stddef.h>
-#include <stdbool.h>
-
-#define GGML_FILE_MAGIC   0x67676d6c // "ggml"
-#define GGML_FILE_VERSION 1
-
-#define GGML_QNT_VERSION        2    // bump this on quantization format changes
-#define GGML_QNT_VERSION_FACTOR 1000 // do not change this
-
-#define GGML_MAX_DIMS           4
-#define GGML_MAX_PARAMS         2048
-#define GGML_MAX_CONTEXTS       64
-#define GGML_MAX_SRC            10
-#ifndef GGML_MAX_NAME
-#define GGML_MAX_NAME           64
-#endif
-#define GGML_MAX_OP_PARAMS      64
-#define GGML_DEFAULT_N_THREADS  4
-#define GGML_DEFAULT_GRAPH_SIZE 2048
-#if UINTPTR_MAX == 0xFFFFFFFF
-    #define GGML_MEM_ALIGN 4
-#else
-    #define GGML_MEM_ALIGN 16
-#endif
-
-#define GGML_EXIT_SUCCESS 0
-#define GGML_EXIT_ABORTED 1
-
-#define GGUF_MAGIC "GGUF"
-
-#define GGUF_VERSION 3
-
-#define GGUF_DEFAULT_ALIGNMENT 32
-
-#define GGML_UNUSED(x) (void)(x)
-
-#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
-
-#define GGML_ASSERT(x) \
-    do { \
-        if (!(x)) { \
-            fflush(stdout); \
-            fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
-            ggml_print_backtrace(); \
-            abort(); \
-        } \
-    } while (0)
-
-#ifndef NDEBUG
-#define GGML_UNREACHABLE() GGML_ASSERT(!"statement should not be reached")
-#elif defined(__GNUC__)
-#define GGML_UNREACHABLE() __builtin_unreachable()
-#elif defined(_MSC_VER)
-#define GGML_UNREACHABLE() __assume(0)
-#else
-#define GGML_UNREACHABLE() ((void) 0)
-#endif
-
-// used to copy the number of elements and stride in bytes of tensors into local variables.
-// main purpose is to reduce code duplication and improve readability.
-//
-// example:
-//
-//    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
-//    GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb);
-//
-#define GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \
-    const type prefix##0 = (pointer)->array[0]; \
-    GGML_UNUSED(prefix##0);
-#define GGML_TENSOR_LOCALS_2(type, prefix, pointer, array) \
-    GGML_TENSOR_LOCALS_1    (type, prefix, pointer, array) \
-    const type prefix##1 = (pointer)->array[1]; \
-    GGML_UNUSED(prefix##1);
-#define GGML_TENSOR_LOCALS_3(type, prefix, pointer, array) \
-    GGML_TENSOR_LOCALS_2    (type, prefix, pointer, array) \
-    const type prefix##2 = (pointer)->array[2]; \
-    GGML_UNUSED(prefix##2);
-#define GGML_TENSOR_LOCALS(type, prefix, pointer, array) \
-    GGML_TENSOR_LOCALS_3  (type, prefix, pointer, array) \
-    const type prefix##3 = (pointer)->array[3]; \
-    GGML_UNUSED(prefix##3);
-
-#define GGML_TENSOR_UNARY_OP_LOCALS \
-    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
-    GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb) \
-    GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne) \
-    GGML_TENSOR_LOCALS(size_t,  nb,  dst,  nb)
-
-#define GGML_TENSOR_BINARY_OP_LOCALS \
-    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
-    GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb) \
-    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
-    GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb) \
-    GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne) \
-    GGML_TENSOR_LOCALS(size_t,  nb,  dst,  nb)
-
-#ifdef  __cplusplus
-extern "C" {
-#endif
-
-#if defined(__ARM_NEON) && defined(__CUDACC__)
-    typedef half ggml_fp16_t;
-#elif defined(__ARM_NEON) && !defined(_MSC_VER)
-    typedef __fp16 ggml_fp16_t;
-#else
-    typedef uint16_t ggml_fp16_t;
-#endif
-
-    // convert FP16 <-> FP32
-    GGML_API float       ggml_fp16_to_fp32(ggml_fp16_t x);
-    GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
-
-    GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n);
-    GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n);
-
-    struct ggml_object;
-    struct ggml_context;
-
-    enum ggml_type {
-        GGML_TYPE_F32  = 0,
-        GGML_TYPE_F16  = 1,
-        GGML_TYPE_Q4_0 = 2,
-        GGML_TYPE_Q4_1 = 3,
-        // GGML_TYPE_Q4_2 = 4, support has been removed
-        // GGML_TYPE_Q4_3 (5) support has been removed
-        GGML_TYPE_Q5_0 = 6,
-        GGML_TYPE_Q5_1 = 7,
-        GGML_TYPE_Q8_0 = 8,
-        GGML_TYPE_Q8_1 = 9,
-        // k-quantizations
-        GGML_TYPE_Q2_K = 10,
-        GGML_TYPE_Q3_K = 11,
-        GGML_TYPE_Q4_K = 12,
-        GGML_TYPE_Q5_K = 13,
-        GGML_TYPE_Q6_K = 14,
-        GGML_TYPE_Q8_K = 15,
-        GGML_TYPE_IQ2_XXS = 16,
-        GGML_TYPE_IQ2_XS  = 17,
-        GGML_TYPE_I8,
-        GGML_TYPE_I16,
-        GGML_TYPE_I32,
-        GGML_TYPE_COUNT,
-    };
-
-    // precision
-    enum ggml_prec {
-        GGML_PREC_DEFAULT,
-        GGML_PREC_F32,
-    };
-
-    enum ggml_backend_type {
-        GGML_BACKEND_CPU = 0,
-        GGML_BACKEND_GPU = 10,
-        GGML_BACKEND_GPU_SPLIT = 20,
-    };
-
-    // model file types
-    enum ggml_ftype {
-        GGML_FTYPE_UNKNOWN     = -1,
-        GGML_FTYPE_ALL_F32     = 0,
-        GGML_FTYPE_MOSTLY_F16  = 1,  // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q4_0 = 2,  // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q4_1 = 3,  // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
-        GGML_FTYPE_MOSTLY_Q8_0 = 7,  // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q5_0 = 8,  // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q5_1 = 9,  // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q3_K = 11, // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors
-        GGML_FTYPE_MOSTLY_IQ2_XXS = 15, // except 1d tensors
-        GGML_FTYPE_MOSTLY_IQ2_XS  = 16, // except 1d tensors
-    };
-
-    // available tensor operations:
-    enum ggml_op {
-        GGML_OP_NONE = 0,
-
-        GGML_OP_DUP,
-        GGML_OP_ADD,
-        GGML_OP_ADD1,
-        GGML_OP_ACC,
-        GGML_OP_SUB,
-        GGML_OP_MUL,
-        GGML_OP_DIV,
-        GGML_OP_SQR,
-        GGML_OP_SQRT,
-        GGML_OP_LOG,
-        GGML_OP_SUM,
-        GGML_OP_SUM_ROWS,
-        GGML_OP_MEAN,
-        GGML_OP_ARGMAX,
-        GGML_OP_REPEAT,
-        GGML_OP_REPEAT_BACK,
-        GGML_OP_CONCAT,
-        GGML_OP_SILU_BACK,
-        GGML_OP_NORM, // normalize
-        GGML_OP_RMS_NORM,
-        GGML_OP_RMS_NORM_BACK,
-        GGML_OP_GROUP_NORM,
-
-        GGML_OP_MUL_MAT,
-        GGML_OP_MUL_MAT_ID,
-        GGML_OP_OUT_PROD,
-
-        GGML_OP_SCALE,
-        GGML_OP_SET,
-        GGML_OP_CPY,
-        GGML_OP_CONT,
-        GGML_OP_RESHAPE,
-        GGML_OP_VIEW,
-        GGML_OP_PERMUTE,
-        GGML_OP_TRANSPOSE,
-        GGML_OP_GET_ROWS,
-        GGML_OP_GET_ROWS_BACK,
-        GGML_OP_DIAG,
-        GGML_OP_DIAG_MASK_INF,
-        GGML_OP_DIAG_MASK_ZERO,
-        GGML_OP_SOFT_MAX,
-        GGML_OP_SOFT_MAX_BACK,
-        GGML_OP_ROPE,
-        GGML_OP_ROPE_BACK,
-        GGML_OP_ALIBI,
-        GGML_OP_CLAMP,
-        GGML_OP_CONV_TRANSPOSE_1D,
-        GGML_OP_IM2COL,
-        GGML_OP_CONV_TRANSPOSE_2D,
-        GGML_OP_POOL_1D,
-        GGML_OP_POOL_2D,
-        GGML_OP_UPSCALE, // nearest interpolate
-        GGML_OP_PAD,
-        GGML_OP_ARGSORT,
-        GGML_OP_LEAKY_RELU,
-
-        GGML_OP_FLASH_ATTN,
-        GGML_OP_FLASH_FF,
-        GGML_OP_FLASH_ATTN_BACK,
-        GGML_OP_WIN_PART,
-        GGML_OP_WIN_UNPART,
-        GGML_OP_GET_REL_POS,
-        GGML_OP_ADD_REL_POS,
-
-        GGML_OP_UNARY,
-
-        GGML_OP_MAP_UNARY,
-        GGML_OP_MAP_BINARY,
-
-        GGML_OP_MAP_CUSTOM1_F32,
-        GGML_OP_MAP_CUSTOM2_F32,
-        GGML_OP_MAP_CUSTOM3_F32,
-
-        GGML_OP_MAP_CUSTOM1,
-        GGML_OP_MAP_CUSTOM2,
-        GGML_OP_MAP_CUSTOM3,
-
-        GGML_OP_CROSS_ENTROPY_LOSS,
-        GGML_OP_CROSS_ENTROPY_LOSS_BACK,
-
-        GGML_OP_COUNT,
-    };
-
-    enum ggml_unary_op {
-        GGML_UNARY_OP_ABS,
-        GGML_UNARY_OP_SGN,
-        GGML_UNARY_OP_NEG,
-        GGML_UNARY_OP_STEP,
-        GGML_UNARY_OP_TANH,
-        GGML_UNARY_OP_ELU,
-        GGML_UNARY_OP_RELU,
-        GGML_UNARY_OP_GELU,
-        GGML_UNARY_OP_GELU_QUICK,
-        GGML_UNARY_OP_SILU,
-
-        GGML_UNARY_OP_COUNT,
-    };
-
-    enum ggml_object_type {
-        GGML_OBJECT_TENSOR,
-        GGML_OBJECT_GRAPH,
-        GGML_OBJECT_WORK_BUFFER
-    };
-
-    enum ggml_log_level {
-        GGML_LOG_LEVEL_ERROR = 2,
-        GGML_LOG_LEVEL_WARN = 3,
-        GGML_LOG_LEVEL_INFO = 4,
-        GGML_LOG_LEVEL_DEBUG = 5
-    };
-
-    // ggml object
-    struct ggml_object {
-        size_t offs;
-        size_t size;
-
-        struct ggml_object * next;
-
-        enum ggml_object_type type;
-
-        char padding[4];
-    };
-
-    static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
-
-    // n-dimensional tensor
-    struct ggml_tensor {
-        enum ggml_type         type;
-        enum ggml_backend_type backend;
-
-        struct ggml_backend_buffer * buffer;
-
-        int64_t ne[GGML_MAX_DIMS]; // number of elements
-        size_t  nb[GGML_MAX_DIMS]; // stride in bytes:
-                                   // nb[0] = ggml_type_size(type)
-                                   // nb[1] = nb[0]   * (ne[0] / ggml_blck_size(type)) + padding
-                                   // nb[i] = nb[i-1] * ne[i-1]
-
-        // compute data
-        enum ggml_op op;
-
-        // op params - allocated as int32_t for alignment
-        int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
-
-        bool is_param;
-
-        struct ggml_tensor * grad;
-        struct ggml_tensor * src[GGML_MAX_SRC];
-
-        // performance
-        int     perf_runs;
-        int64_t perf_cycles;
-        int64_t perf_time_us;
-
-        struct ggml_tensor * view_src;
-        size_t               view_offs;
-
-        void * data;
-
-        char name[GGML_MAX_NAME];
-
-        void * extra; // extra things e.g. for ggml-cuda.cu
-
-        char padding[8];
-    };
-
-    static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
-
-    // the compute plan that needs to be prepared for ggml_graph_compute()
-    // since https://github.com/ggerganov/ggml/issues/287
-    struct ggml_cplan {
-        size_t    work_size; // size of work buffer, calculated by `ggml_graph_plan()`
-        uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
-
-        int n_threads;
-
-        // abort ggml_graph_compute when true
-        bool (*abort_callback)(void * data);
-        void * abort_callback_data;
-    };
-
-    enum ggml_cgraph_eval_order {
-        GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
-        GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
-        GGML_CGRAPH_EVAL_ORDER_COUNT
-    };
-
-    struct ggml_hash_set {
-        size_t size;
-        struct ggml_tensor ** keys;
-    };
-
-    // computation graph
-    struct ggml_cgraph {
-        int size;
-        int n_nodes;
-        int n_leafs;
-
-        struct ggml_tensor ** nodes;
-        struct ggml_tensor ** grads;
-        struct ggml_tensor ** leafs;
-
-        struct ggml_hash_set visited_hash_table;
-
-        enum ggml_cgraph_eval_order order;
-
-        // performance
-        int     perf_runs;
-        int64_t perf_cycles;
-        int64_t perf_time_us;
-    };
-
-    // scratch buffer
-    struct ggml_scratch {
-        size_t offs;
-        size_t size;
-        void * data;
-    };
-
-    struct ggml_init_params {
-        // memory pool
-        size_t mem_size;   // bytes
-        void * mem_buffer; // if NULL, memory will be allocated internally
-        bool   no_alloc;   // don't allocate memory for the tensor data
-    };
-
-
-    // compute types
-
-    // NOTE: the INIT or FINALIZE pass is not scheduled unless explicitly enabled.
-    // This behavior was changed since https://github.com/ggerganov/llama.cpp/pull/1995.
-    enum ggml_task_type {
-        GGML_TASK_INIT = 0,
-        GGML_TASK_COMPUTE,
-        GGML_TASK_FINALIZE,
-    };
-
-    struct ggml_compute_params {
-        enum ggml_task_type type;
-
-        // ith = thread index, nth = number of threads
-        int ith, nth;
-
-        // work buffer for all threads
-        size_t wsize;
-        void * wdata;
-    };
-
-    // misc
-
-    GGML_API void    ggml_time_init(void); // call this once at the beginning of the program
-    GGML_API int64_t ggml_time_ms(void);
-    GGML_API int64_t ggml_time_us(void);
-    GGML_API int64_t ggml_cycles(void);
-    GGML_API int64_t ggml_cycles_per_ms(void);
-
-    GGML_API void    ggml_print_backtrace(void);
-
-    GGML_API void    ggml_numa_init(void); // call once for better performance on NUMA systems
-    GGML_API bool    ggml_is_numa(void); // true if init detected that system has >1 NUMA node
-
-    GGML_API void    ggml_print_object (const struct ggml_object * obj);
-    GGML_API void    ggml_print_objects(const struct ggml_context * ctx);
-
-    GGML_API int64_t ggml_nelements   (const struct ggml_tensor * tensor);
-    GGML_API int64_t ggml_nrows       (const struct ggml_tensor * tensor);
-    GGML_API size_t  ggml_nbytes      (const struct ggml_tensor * tensor);
-    GGML_API size_t  ggml_nbytes_pad  (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
-
-    GGML_API int    ggml_blck_size(enum ggml_type type);
-    GGML_API size_t ggml_type_size(enum ggml_type type);             // size in bytes for all elements in a block
-    GGML_API size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
-
-    GGML_DEPRECATED(
-    GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
-    "use ggml_row_size() instead");
-
-    GGML_API const char * ggml_type_name(enum ggml_type type);
-    GGML_API const char * ggml_op_name  (enum ggml_op   op);
-    GGML_API const char * ggml_op_symbol(enum ggml_op   op);
-
-    GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op);
-    GGML_API const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name
-
-    GGML_API size_t  ggml_element_size(const struct ggml_tensor * tensor);
-
-    GGML_API bool    ggml_is_quantized(enum ggml_type type);
-
-    // TODO: temporary until model loading of ggml examples is refactored
-    GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
-
-    GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
-    GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
-    GGML_API bool ggml_is_permuted  (const struct ggml_tensor * tensor);
-    GGML_API bool ggml_is_scalar    (const struct ggml_tensor * tensor);
-    GGML_API bool ggml_is_vector    (const struct ggml_tensor * tensor);
-    GGML_API bool ggml_is_matrix    (const struct ggml_tensor * tensor);
-    GGML_API bool ggml_is_3d        (const struct ggml_tensor * tensor);
-    GGML_API int  ggml_n_dims       (const struct ggml_tensor * tensor); // returns 1 for scalars
-
-    GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
-
-    // use this to compute the memory overhead of a tensor
-    GGML_API size_t ggml_tensor_overhead(void);
-
-    // main
-
-    GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
-    GGML_API void                  ggml_free(struct ggml_context * ctx);
-
-    GGML_API size_t  ggml_used_mem(const struct ggml_context * ctx);
-
-    GGML_API size_t  ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
-    GGML_API bool    ggml_get_no_alloc(struct ggml_context * ctx);
-    GGML_API void    ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
-
-    GGML_API void *  ggml_get_mem_buffer     (const struct ggml_context * ctx);
-    GGML_API size_t  ggml_get_mem_size       (const struct ggml_context * ctx);
-    GGML_API size_t  ggml_get_max_tensor_size(const struct ggml_context * ctx);
-
-    GGML_API struct ggml_tensor * ggml_new_tensor(
-            struct ggml_context * ctx,
-            enum   ggml_type type,
-            int    n_dims,
-            const int64_t *ne);
-
-    GGML_API struct ggml_tensor * ggml_new_tensor_1d(
-            struct ggml_context * ctx,
-            enum   ggml_type type,
-            int64_t ne0);
-
-    GGML_API struct ggml_tensor * ggml_new_tensor_2d(
-            struct ggml_context * ctx,
-            enum   ggml_type type,
-            int64_t ne0,
-            int64_t ne1);
-
-    GGML_API struct ggml_tensor * ggml_new_tensor_3d(
-            struct ggml_context * ctx,
-            enum   ggml_type type,
-            int64_t ne0,
-            int64_t ne1,
-            int64_t ne2);
-
-    GGML_API struct ggml_tensor * ggml_new_tensor_4d(
-            struct ggml_context * ctx,
-            enum   ggml_type type,
-            int64_t ne0,
-            int64_t ne1,
-            int64_t ne2,
-            int64_t ne3);
-
-    GGML_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
-    GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
-
-    GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
-    GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src);
-
-    // Context tensor enumeration and lookup
-    GGML_API struct ggml_tensor * ggml_get_first_tensor(const struct ggml_context * ctx);
-    GGML_API struct ggml_tensor * ggml_get_next_tensor (const struct ggml_context * ctx, struct ggml_tensor * tensor);
-    GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
-
-    GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
-    GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
-    GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
-
-    // Converts a flat index into coordinates
-    GGML_API void    ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3);
-
-    GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
-    GGML_API void    ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
-
-    GGML_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
-    GGML_API void    ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);
-
-    GGML_API float   ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
-    GGML_API void    ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
-
-    GGML_API float   ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
-    GGML_API void    ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
-
-    GGML_API void *  ggml_get_data    (const struct ggml_tensor * tensor);
-    GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
-
-    GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
-
-    GGML_API const char *         ggml_get_name   (const struct ggml_tensor * tensor);
-    GGML_API struct ggml_tensor * ggml_set_name   (      struct ggml_tensor * tensor, const char * name);
-    GGML_ATTRIBUTE_FORMAT(2, 3)
-    GGML_API struct ggml_tensor * ggml_format_name(      struct ggml_tensor * tensor, const char * fmt, ...);
-
-    //
-    // operations on tensors with backpropagation
-    //
-
-    GGML_API struct ggml_tensor * ggml_dup(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    // in-place, returns view(a)
-    GGML_API struct ggml_tensor * ggml_dup_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    GGML_API struct ggml_tensor * ggml_add(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
-
-    GGML_API struct ggml_tensor * ggml_add_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
-
-    GGML_API struct ggml_tensor * ggml_add_cast(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            enum   ggml_type      type);
-
-    GGML_API struct ggml_tensor * ggml_add1(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
-
-    GGML_API struct ggml_tensor * ggml_add1_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
-
-    // dst = a
-    // view(dst, nb1, nb2, nb3, offset) += b
-    // return dst
-    GGML_API struct ggml_tensor * ggml_acc(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            size_t                nb1,
-            size_t                nb2,
-            size_t                nb3,
-            size_t                offset);
-
-    GGML_API struct ggml_tensor * ggml_acc_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            size_t                nb1,
-            size_t                nb2,
-            size_t                nb3,
-            size_t                offset);
-
-    GGML_API struct ggml_tensor * ggml_sub(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
-
-    GGML_API struct ggml_tensor * ggml_sub_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
-
-    GGML_API struct ggml_tensor * ggml_mul(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
-
-    GGML_API struct ggml_tensor * ggml_mul_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
-
-    GGML_API struct ggml_tensor * ggml_div(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
-
-    GGML_API struct ggml_tensor * ggml_div_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
-
-    GGML_API struct ggml_tensor * ggml_sqr(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    GGML_API struct ggml_tensor * ggml_sqr_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    GGML_API struct ggml_tensor * ggml_sqrt(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    GGML_API struct ggml_tensor * ggml_sqrt_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    GGML_API struct ggml_tensor * ggml_log(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    GGML_API struct ggml_tensor * ggml_log_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    // return scalar
-    GGML_API struct ggml_tensor * ggml_sum(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    // sums along rows, with input shape [a,b,c,d] return shape [1,b,c,d]
-    GGML_API struct ggml_tensor * ggml_sum_rows(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    // mean along rows
-    GGML_API struct ggml_tensor * ggml_mean(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    // argmax along rows
-    GGML_API struct ggml_tensor * ggml_argmax(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    // if a is the same shape as b, and a is not parameter, return a
-    // otherwise, return a new tensor: repeat(a) to fit in b
-    GGML_API struct ggml_tensor * ggml_repeat(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
-
-    // sums repetitions in a into shape of b
-    GGML_API struct ggml_tensor * ggml_repeat_back(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
-
-    // concat a and b on dim 2
-    // used in stable-diffusion
-    GGML_API struct ggml_tensor * ggml_concat(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
-
-    GGML_API struct ggml_tensor * ggml_abs(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    GGML_API struct ggml_tensor * ggml_abs_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    GGML_API struct ggml_tensor * ggml_sgn(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    GGML_API struct ggml_tensor * ggml_sgn_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    GGML_API struct ggml_tensor * ggml_neg(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    GGML_API struct ggml_tensor * ggml_neg_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    GGML_API struct ggml_tensor * ggml_step(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    GGML_API struct ggml_tensor * ggml_step_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    GGML_API struct ggml_tensor * ggml_tanh(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    GGML_API struct ggml_tensor * ggml_tanh_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    GGML_API struct ggml_tensor * ggml_elu(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    GGML_API struct ggml_tensor * ggml_elu_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    GGML_API struct ggml_tensor * ggml_relu(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    GGML_API struct ggml_tensor * ggml_leaky_relu(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a, float negative_slope, bool inplace);
-
-    GGML_API struct ggml_tensor * ggml_relu_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    GGML_API struct ggml_tensor * ggml_gelu(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    GGML_API struct ggml_tensor * ggml_gelu_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    GGML_API struct ggml_tensor * ggml_gelu_quick(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    GGML_API struct ggml_tensor * ggml_gelu_quick_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    GGML_API struct ggml_tensor * ggml_silu(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    GGML_API struct ggml_tensor * ggml_silu_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    // a - x
-    // b - dy
-    GGML_API struct ggml_tensor * ggml_silu_back(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
-
-    // normalize along rows
-    GGML_API struct ggml_tensor * ggml_norm(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            float                 eps);
-
-    GGML_API struct ggml_tensor * ggml_norm_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            float                 eps);
-
-    GGML_API struct ggml_tensor * ggml_rms_norm(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            float                 eps);
-
-    GGML_API struct ggml_tensor * ggml_rms_norm_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            float                 eps);
-
-    // group normalize along ne0*ne1*n_groups
-    // used in stable-diffusion
-    // TODO: eps is hardcoded to 1e-6 for now
-    GGML_API struct ggml_tensor * ggml_group_norm(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int                   n_groups);
-
-    GGML_API struct ggml_tensor * ggml_group_norm_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int                   n_groups);
-
-    // a - x
-    // b - dy
-    GGML_API struct ggml_tensor * ggml_rms_norm_back(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            float                 eps);
-
-    // A: k columns, n rows => [ne03, ne02, n, k]
-    // B: k columns, m rows  (i.e. we transpose it internally) => [ne03 * x, ne02 * y, m, k]
-    // result is n columns, m rows => [ne03 * x, ne02 * y, m, n]
-    GGML_API struct ggml_tensor * ggml_mul_mat(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
-
-    // change the precision of a matrix multiplication
-    // set to GGML_PREC_F32 for higher precision (useful for phi-2)
-    GGML_API void ggml_mul_mat_set_prec(
-            struct ggml_tensor * a,
-            enum ggml_prec       prec);
-
-    // indirect matrix multiplication
-    //  ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b)
-    GGML_API struct ggml_tensor * ggml_mul_mat_id(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * const as[],
-            int                   n_as,
-            struct ggml_tensor  * ids,
-            int                   id,
-            struct ggml_tensor  * b);
-
-    // A: m columns, n rows,
-    // B: p columns, n rows,
-    // result is m columns, p rows
-    GGML_API struct ggml_tensor * ggml_out_prod(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
-
-    //
-    // operations on tensors without backpropagation
-    //
-
-    GGML_API struct ggml_tensor * ggml_scale(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            float                 s);
-
-    // in-place, returns view(a)
-    GGML_API struct ggml_tensor * ggml_scale_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            float                 s);
-
-    // b -> view(a,offset,nb1,nb2,3), return modified a
-    GGML_API struct ggml_tensor * ggml_set(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            size_t                nb1,
-            size_t                nb2,
-            size_t                nb3,
-            size_t                offset);
-
-    // b -> view(a,offset,nb1,nb2,3), return view(a)
-    GGML_API struct ggml_tensor * ggml_set_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            size_t                nb1,
-            size_t                nb2,
-            size_t                nb3,
-            size_t                offset);
-
-    GGML_API struct ggml_tensor * ggml_set_1d(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            size_t                offset);
-
-    GGML_API struct ggml_tensor * ggml_set_1d_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            size_t                offset);
-
-    // b -> view(a,offset,nb1,nb2,3), return modified a
-    GGML_API struct ggml_tensor * ggml_set_2d(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            size_t                nb1,
-            size_t                offset);
-
-    // b -> view(a,offset,nb1,nb2,3), return view(a)
-    GGML_API struct ggml_tensor * ggml_set_2d_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            size_t                nb1,
-            size_t                offset);
-
-    // a -> b, return view(b)
-    GGML_API struct ggml_tensor * ggml_cpy(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
-
-    GGML_API struct ggml_tensor * ggml_cast(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            enum   ggml_type      type);
-
-    // make contiguous
-    GGML_API struct ggml_tensor * ggml_cont(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    // make contiguous, with new shape
-    GGML_API struct ggml_tensor * ggml_cont_1d(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int64_t               ne0);
-
-    GGML_API struct ggml_tensor * ggml_cont_2d(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int64_t               ne0,
-            int64_t               ne1);
-
-    GGML_API struct ggml_tensor * ggml_cont_3d(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int64_t               ne0,
-            int64_t               ne1,
-            int64_t               ne2);
-
-    GGML_API struct ggml_tensor * ggml_cont_4d(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int64_t               ne0,
-            int64_t               ne1,
-            int64_t               ne2,
-            int64_t               ne3);
-
-    // return view(a), b specifies the new shape
-    // TODO: when we start computing gradient, make a copy instead of view
-    GGML_API struct ggml_tensor * ggml_reshape(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
-
-    // return view(a)
-    // TODO: when we start computing gradient, make a copy instead of view
-    GGML_API struct ggml_tensor * ggml_reshape_1d(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int64_t               ne0);
-
-    GGML_API struct ggml_tensor * ggml_reshape_2d(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int64_t               ne0,
-            int64_t               ne1);
-
-    // return view(a)
-    // TODO: when we start computing gradient, make a copy instead of view
-    GGML_API struct ggml_tensor * ggml_reshape_3d(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int64_t               ne0,
-            int64_t               ne1,
-            int64_t               ne2);
-
-    GGML_API struct ggml_tensor * ggml_reshape_4d(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int64_t               ne0,
-            int64_t               ne1,
-            int64_t               ne2,
-            int64_t               ne3);
-
-    // offset in bytes
-    GGML_API struct ggml_tensor * ggml_view_1d(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int64_t               ne0,
-            size_t                offset);
-
-    GGML_API struct ggml_tensor * ggml_view_2d(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int64_t               ne0,
-            int64_t               ne1,
-            size_t                nb1, // row stride in bytes
-            size_t                offset);
-
-    GGML_API struct ggml_tensor * ggml_view_3d(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int64_t               ne0,
-            int64_t               ne1,
-            int64_t               ne2,
-            size_t                nb1, // row   stride in bytes
-            size_t                nb2, // slice stride in bytes
-            size_t                offset);
-
-    GGML_API struct ggml_tensor * ggml_view_4d(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int64_t               ne0,
-            int64_t               ne1,
-            int64_t               ne2,
-            int64_t               ne3,
-            size_t                nb1, // row   stride in bytes
-            size_t                nb2, // slice stride in bytes
-            size_t                nb3,
-            size_t                offset);
-
-    GGML_API struct ggml_tensor * ggml_permute(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int                   axis0,
-            int                   axis1,
-            int                   axis2,
-            int                   axis3);
-
-    // alias for ggml_permute(ctx, a, 1, 0, 2, 3)
-    GGML_API struct ggml_tensor * ggml_transpose(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    // supports 3D: a->ne[2] == b->ne[1]
-    GGML_API struct ggml_tensor * ggml_get_rows(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
-
-    GGML_API struct ggml_tensor * ggml_get_rows_back(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            struct ggml_tensor  * c);
-
-    GGML_API struct ggml_tensor * ggml_diag(
-        struct ggml_context     * ctx,
-        struct ggml_tensor      * a);
-
-    // set elements above the diagonal to -INF
-    GGML_API struct ggml_tensor * ggml_diag_mask_inf(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int                   n_past);
-
-    // in-place, returns view(a)
-    GGML_API struct ggml_tensor * ggml_diag_mask_inf_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int                   n_past);
-
-    // set elements above the diagonal to 0
-    GGML_API struct ggml_tensor * ggml_diag_mask_zero(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int                   n_past);
-
-    // in-place, returns view(a)
-    GGML_API struct ggml_tensor * ggml_diag_mask_zero_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int                   n_past);
-
-    GGML_API struct ggml_tensor * ggml_soft_max(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    // in-place, returns view(a)
-    GGML_API struct ggml_tensor * ggml_soft_max_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    // fused soft_max(a*scale + mask)
-    // mask is optional
-    GGML_API struct ggml_tensor * ggml_soft_max_ext(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * mask,
-            float                 scale);
-
-    GGML_API struct ggml_tensor * ggml_soft_max_back(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
-
-    // in-place, returns view(a)
-    GGML_API struct ggml_tensor * ggml_soft_max_back_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
-
-    // rotary position embedding
-    // if mode & 1 == 1, skip n_past elements (DEPRECATED)
-    // if mode & 2 == 1, GPT-NeoX style
-    // if mode & 4 == 1, ChatGLM style
-    //
-    // b is an int32 vector with size a->ne[2], it contains the positions
-    GGML_API struct ggml_tensor * ggml_rope(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            int                   n_dims,
-            int                   mode,
-            int                   n_ctx);
-
-    // in-place, returns view(a)
-    GGML_API struct ggml_tensor * ggml_rope_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            int                   n_dims,
-            int                   mode,
-            int                   n_ctx);
-
-    // custom RoPE
-    GGML_API struct ggml_tensor * ggml_rope_custom(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            int                   n_dims,
-            int                   mode,
-            int                   n_ctx,
-            int                   n_orig_ctx,
-            float                 freq_base,
-            float                 freq_scale,
-            float                 ext_factor,
-            float                 attn_factor,
-            float                 beta_fast,
-            float                 beta_slow);
-
-    // in-place, returns view(a)
-    GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            int                   n_dims,
-            int                   mode,
-            int                   n_ctx,
-            int                   n_orig_ctx,
-            float                 freq_base,
-            float                 freq_scale,
-            float                 ext_factor,
-            float                 attn_factor,
-            float                 beta_fast,
-            float                 beta_slow);
-
-    // compute correction dims for YaRN RoPE scaling
-    void ggml_rope_yarn_corr_dims(
-        int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);
-
-    // xPos RoPE, in-place, returns view(a)
-    GGML_API struct ggml_tensor * ggml_rope_xpos_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            int                   n_dims,
-            float                 base,
-            bool                  down);
-
-    // rotary position embedding backward, i.e compute dx from dy
-    // a - dy
-    GGML_API struct ggml_tensor * ggml_rope_back(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            int                   n_dims,
-            int                   mode,
-            int                   n_ctx,
-            int                   n_orig_ctx,
-            float                 freq_base,
-            float                 freq_scale,
-            float                 ext_factor,
-            float                 attn_factor,
-            float                 beta_fast,
-            float                 beta_slow,
-            float                 xpos_base,
-            bool                  xpos_down);
-
-    // alibi position embedding
-    // in-place, returns view(a)
-    GGML_API struct ggml_tensor * ggml_alibi(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int                   n_past,
-            int                   n_head,
-            float                 bias_max);
-
-    // clamp
-    // in-place, returns view(a)
-    GGML_API struct ggml_tensor * ggml_clamp(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            float                 min,
-            float                 max);
-
-    GGML_API struct ggml_tensor * ggml_im2col(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            int                  s0,
-            int                  s1,
-            int                  p0,
-            int                  p1,
-            int                  d0,
-            int                  d1,
-            bool                 is_2D);
-
-    GGML_API struct ggml_tensor * ggml_conv_1d(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            int                   s0,  // stride
-            int                   p0,  // padding
-            int                   d0); // dilation
-
-    // conv_1d with padding = half
-    // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
-    GGML_API struct ggml_tensor* ggml_conv_1d_ph(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            int                   s,
-            int                   d);
-
-    GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            int                   s0,
-            int                   p0,
-            int                   d0);
-
-    GGML_API struct ggml_tensor * ggml_conv_2d(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            int                   s0,
-            int                   s1,
-            int                   p0,
-            int                   p1,
-            int                   d0,
-            int                   d1);
-
-
-    // kernel size is a->ne[0] x a->ne[1]
-    // stride is equal to kernel size
-    // padding is zero
-    // example:
-    // a:     16   16    3  768
-    // b:   1024 1024    3    1
-    // res:   64   64  768    1
-    // used in sam
-    GGML_API struct ggml_tensor * ggml_conv_2d_sk_p0(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
-
-    // kernel size is a->ne[0] x a->ne[1]
-    // stride is 1
-    // padding is half
-    // example:
-    // a:      3    3    256  256
-    // b:     64   64    256    1
-    // res:   64   64    256    1
-    // used in sam
-    GGML_API struct ggml_tensor * ggml_conv_2d_s1_ph(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
-
-    GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            int                   stride);
-
-    enum ggml_op_pool {
-        GGML_OP_POOL_MAX,
-        GGML_OP_POOL_AVG,
-        GGML_OP_POOL_COUNT,
-    };
-
-    GGML_API struct ggml_tensor * ggml_pool_1d(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            enum ggml_op_pool     op,
-            int                   k0, // kernel size
-            int                   s0, // stride
-            int                   p0); // padding
-
-    // the result will have 2*p0 padding for the first dimension
-    // and 2*p1 padding for the second dimension
-    GGML_API struct ggml_tensor * ggml_pool_2d(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            enum ggml_op_pool     op,
-            int                   k0,
-            int                   k1,
-            int                   s0,
-            int                   s1,
-            float                 p0,
-            float                 p1);
-
-    // nearest interpolate
-    // used in stable-diffusion
-    GGML_API struct ggml_tensor * ggml_upscale(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int                   scale_factor);
-
-    // pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
-    GGML_API struct ggml_tensor * ggml_pad(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int                  p0,
-            int                  p1,
-            int                  p2,
-            int                  p3);
-
-    // sort rows
-    enum ggml_sort_order {
-        GGML_SORT_ASC,
-        GGML_SORT_DESC,
-    };
-
-    GGML_API struct ggml_tensor * ggml_argsort(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            enum ggml_sort_order  order);
-
-    // top k elements per row
-    GGML_API struct ggml_tensor * ggml_top_k(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int                   k);
-
-    GGML_API struct ggml_tensor * ggml_flash_attn(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * q,
-            struct ggml_tensor  * k,
-            struct ggml_tensor  * v,
-            bool                  masked);
-
-    GGML_API struct ggml_tensor * ggml_flash_attn_back(
-           struct ggml_context * ctx,
-           struct ggml_tensor  * q,
-           struct ggml_tensor  * k,
-           struct ggml_tensor  * v,
-           struct ggml_tensor  * d,
-           bool                  masked);
-
-    GGML_API struct ggml_tensor * ggml_flash_ff(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b0,
-            struct ggml_tensor  * b1,
-            struct ggml_tensor  * c0,
-            struct ggml_tensor  * c1);
-
-    // partition into non-overlapping windows with padding if needed
-    // example:
-    // a:   768   64   64    1
-    // w:    14
-    // res: 768   14   14    25
-    // used in sam
-    GGML_API struct ggml_tensor * ggml_win_part(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int                   w);
-
-    // reverse of ggml_win_part
-    // used in sam
-    GGML_API struct ggml_tensor * ggml_win_unpart(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int                   w0,
-            int                   h0,
-            int                   w);
-
-    GGML_API struct ggml_tensor * ggml_unary(
-            struct ggml_context * ctx,
-             struct ggml_tensor * a,
-             enum ggml_unary_op op);
-
-    GGML_API struct ggml_tensor * ggml_unary_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        enum ggml_unary_op op);
-
-    // used in sam
-    GGML_API struct ggml_tensor * ggml_get_rel_pos(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int                   qh,
-            int                   kh);
-
-    // used in sam
-    GGML_API struct ggml_tensor * ggml_add_rel_pos(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * pw,
-            struct ggml_tensor  * ph);
-
-    GGML_API struct ggml_tensor * ggml_add_rel_pos_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * pw,
-            struct ggml_tensor  * ph);
-
-    // custom operators
-
-    typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
-    typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
-
-    typedef void (*ggml_custom1_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *);
-    typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
-    typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
-
-    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_f32(
-            struct ggml_context        * ctx,
-            struct ggml_tensor         * a,
-                   ggml_unary_op_f32_t   fun),
-        "use ggml_map_custom1 instead");
-
-    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_inplace_f32(
-            struct ggml_context        * ctx,
-            struct ggml_tensor         * a,
-                   ggml_unary_op_f32_t   fun),
-        "use ggml_map_custom1_inplace instead");
-
-    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_f32(
-            struct ggml_context         * ctx,
-            struct ggml_tensor          * a,
-            struct ggml_tensor          * b,
-                   ggml_binary_op_f32_t   fun),
-        "use ggml_map_custom2 instead");
-
-    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_inplace_f32(
-            struct ggml_context         * ctx,
-            struct ggml_tensor          * a,
-            struct ggml_tensor          * b,
-                   ggml_binary_op_f32_t   fun),
-        "use ggml_map_custom2_inplace instead");
-
-    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_f32(
-            struct ggml_context          * ctx,
-            struct ggml_tensor           * a,
-                   ggml_custom1_op_f32_t   fun),
-        "use ggml_map_custom1 instead");
-
-    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_inplace_f32(
-            struct ggml_context          * ctx,
-            struct ggml_tensor           * a,
-                   ggml_custom1_op_f32_t   fun),
-        "use ggml_map_custom1_inplace instead");
-
-    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_f32(
-            struct ggml_context          * ctx,
-            struct ggml_tensor           * a,
-            struct ggml_tensor           * b,
-                   ggml_custom2_op_f32_t   fun),
-        "use ggml_map_custom2 instead");
-
-    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_inplace_f32(
-            struct ggml_context          * ctx,
-            struct ggml_tensor           * a,
-            struct ggml_tensor           * b,
-                   ggml_custom2_op_f32_t   fun),
-        "use ggml_map_custom2_inplace instead");
-
-    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_f32(
-            struct ggml_context          * ctx,
-            struct ggml_tensor           * a,
-            struct ggml_tensor           * b,
-            struct ggml_tensor           * c,
-                   ggml_custom3_op_f32_t   fun),
-        "use ggml_map_custom3 instead");
-
-    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_inplace_f32(
-            struct ggml_context          * ctx,
-            struct ggml_tensor           * a,
-            struct ggml_tensor           * b,
-            struct ggml_tensor           * c,
-                   ggml_custom3_op_f32_t   fun),
-        "use ggml_map_custom3_inplace instead");
-
-    // custom operators v2
-
-    typedef void (*ggml_custom1_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, int ith, int nth, void * userdata);
-    typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata);
-    typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata);
-
-    #define GGML_N_TASKS_MAX -1
-
-    GGML_API struct ggml_tensor * ggml_map_custom1(
-            struct ggml_context   * ctx,
-            struct ggml_tensor    * a,
-            ggml_custom1_op_t       fun,
-            int                     n_tasks,
-            void                  * userdata);
-
-    GGML_API struct ggml_tensor * ggml_map_custom1_inplace(
-            struct ggml_context   * ctx,
-            struct ggml_tensor    * a,
-            ggml_custom1_op_t       fun,
-            int                     n_tasks,
-            void                  * userdata);
-
-    GGML_API struct ggml_tensor * ggml_map_custom2(
-            struct ggml_context   * ctx,
-            struct ggml_tensor    * a,
-            struct ggml_tensor    * b,
-            ggml_custom2_op_t       fun,
-            int                     n_tasks,
-            void                  * userdata);
-
-    GGML_API struct ggml_tensor * ggml_map_custom2_inplace(
-            struct ggml_context   * ctx,
-            struct ggml_tensor    * a,
-            struct ggml_tensor    * b,
-            ggml_custom2_op_t       fun,
-            int                     n_tasks,
-            void                  * userdata);
-
-    GGML_API struct ggml_tensor * ggml_map_custom3(
-            struct ggml_context   * ctx,
-            struct ggml_tensor    * a,
-            struct ggml_tensor    * b,
-            struct ggml_tensor    * c,
-            ggml_custom3_op_t       fun,
-            int                     n_tasks,
-            void                  * userdata);
-
-    GGML_API struct ggml_tensor * ggml_map_custom3_inplace(
-            struct ggml_context   * ctx,
-            struct ggml_tensor    * a,
-            struct ggml_tensor    * b,
-            struct ggml_tensor    * c,
-            ggml_custom3_op_t       fun,
-            int                     n_tasks,
-            void                  * userdata);
-
-    // loss function
-
-    GGML_API struct ggml_tensor * ggml_cross_entropy_loss(
-            struct ggml_context         * ctx,
-            struct ggml_tensor          * a,
-            struct ggml_tensor          * b);
-
-    GGML_API struct ggml_tensor * ggml_cross_entropy_loss_back(
-            struct ggml_context         * ctx,
-            struct ggml_tensor          * a,
-            struct ggml_tensor          * b,
-            struct ggml_tensor          * c);
-
-    //
-    // automatic differentiation
-    //
-
-    GGML_API void ggml_set_param(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * tensor);
-
-
-    GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
-    GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep);
-
-    // graph allocation in a context
-    GGML_API struct ggml_cgraph * ggml_new_graph         (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
-    GGML_API struct ggml_cgraph * ggml_new_graph_custom  (struct ggml_context * ctx, size_t size, bool grads);
-    GGML_API struct ggml_cgraph * ggml_graph_dup         (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
-    GGML_API struct ggml_cgraph   ggml_graph_view        (struct ggml_cgraph * cgraph, int i0, int i1);
-    GGML_API void                 ggml_graph_cpy         (struct ggml_cgraph * src, struct ggml_cgraph * dst);
-    GGML_API void                 ggml_graph_reset       (struct ggml_cgraph * cgraph);  // zero grads
-    GGML_API void                 ggml_graph_clear       (struct ggml_cgraph * cgraph);
-
-    GGML_API size_t ggml_graph_overhead(void);
-    GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);
-
-    // ggml_graph_plan() has to be called before ggml_graph_compute()
-    // when plan.work_size > 0, caller must allocate memory for plan.work_data
-    GGML_API struct ggml_cplan ggml_graph_plan   (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
-    GGML_API int               ggml_graph_compute(      struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
-
-    // same as ggml_graph_compute() but the work data is allocated as a part of the context
-    // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
-    GGML_API void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
-
-    GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
-
-    GGML_API void                 ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
-    GGML_API struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);
-
-    // print info and performance information for the graph
-    GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
-
-    // dump the graph into a file using the dot format
-    GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
-
-    // build gradient checkpointing backward graph gb for gf using provided checkpoints
-    // gb_tmp will contain original backward graph with rewritten backward process nodes,
-    // but without the second forward pass nodes.
-    GGML_API void ggml_build_backward_gradient_checkpointing(
-            struct ggml_context   * ctx,
-            struct ggml_cgraph    * gf,
-            struct ggml_cgraph    * gb,
-            struct ggml_cgraph    * gb_tmp,
-            struct ggml_tensor  * * checkpoints,
-            int                     n_checkpoints);
-    //
-    // optimization
-    //
-
-    // optimization methods
-    enum ggml_opt_type {
-        GGML_OPT_ADAM,
-        GGML_OPT_LBFGS,
-    };
-
-    // linesearch methods
-    enum ggml_linesearch {
-        GGML_LINESEARCH_DEFAULT = 1,
-
-        GGML_LINESEARCH_BACKTRACKING_ARMIJO       = 0,
-        GGML_LINESEARCH_BACKTRACKING_WOLFE        = 1,
-        GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 2,
-    };
-
-    // optimization return values
-    enum ggml_opt_result {
-        GGML_OPT_OK = 0,
-        GGML_OPT_DID_NOT_CONVERGE,
-        GGML_OPT_NO_CONTEXT,
-        GGML_OPT_INVALID_WOLFE,
-        GGML_OPT_FAIL,
-        GGML_OPT_CANCEL,
-
-        GGML_LINESEARCH_FAIL = -128,
-        GGML_LINESEARCH_MINIMUM_STEP,
-        GGML_LINESEARCH_MAXIMUM_STEP,
-        GGML_LINESEARCH_MAXIMUM_ITERATIONS,
-        GGML_LINESEARCH_INVALID_PARAMETERS,
-    };
-
-    typedef void (*ggml_opt_callback)(void * data, int accum_step, float * sched, bool * cancel);
-    typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data);
-
-    // optimization parameters
-    //
-    //   see ggml.c (ggml_opt_default_params) for default values
-    //
-    struct ggml_opt_params {
-        enum ggml_opt_type type;
-
-        size_t graph_size;
-
-        int n_threads;
-
-        // delta-based convergence test
-        //
-        //   if past == 0 - disabled
-        //   if past > 0:
-        //     stop if |f(x) - f(x_past)| < delta * max(1, |f(x)|)
-        //
-        int past;
-        float delta;
-
-        // maximum number of iterations without improvement
-        //
-        //   if 0 - disabled
-        //   if > 0:
-        //     assume convergence if no cost improvement in this number of iterations
-        //
-        int max_no_improvement;
-
-        bool print_forward_graph;
-        bool print_backward_graph;
-
-        int n_gradient_accumulation;
-
-        // ADAM parameters
-        struct {
-            int n_iter;
-
-            float sched; // schedule multiplier (fixed, decay or warmup)
-            float decay; // weight decay for AdamW, use 0.0f to disable
-            int   decay_min_ndim; // minimum number of tensor dimension to apply weight decay
-            float alpha; // learning rate
-            float beta1;
-            float beta2;
-            float eps;   // epsilon for numerical stability
-            float eps_f; // epsilon for convergence test
-            float eps_g; // epsilon for convergence test
-            float gclip; // gradient clipping
-        } adam;
-
-        // LBFGS parameters
-        struct {
-            int m; // number of corrections to approximate the inv. Hessian
-            int n_iter;
-            int max_linesearch;
-
-            float eps;      // convergence tolerance
-            float ftol;     // line search tolerance
-            float wolfe;
-            float min_step;
-            float max_step;
-
-            enum ggml_linesearch linesearch;
-        } lbfgs;
-    };
-
-    struct ggml_opt_context {
-        struct ggml_context * ctx;
-        struct ggml_opt_params params;
-
-        int iter;
-        int64_t nx; // number of parameter elements
-
-        bool just_initialized;
-
-        float loss_before;
-        float loss_after;
-
-        struct {
-            struct ggml_tensor * g;  // current gradient
-            struct ggml_tensor * m;  // first moment
-            struct ggml_tensor * v;  // second moment
-            struct ggml_tensor * pf; // past function values
-            float fx_best;
-            float fx_prev;
-            int n_no_improvement;
-        } adam;
-
-        struct {
-            struct ggml_tensor * x;    // current parameters
-            struct ggml_tensor * xp;   // previous parameters
-            struct ggml_tensor * g;    // current gradient
-            struct ggml_tensor * gp;   // previous gradient
-            struct ggml_tensor * d;    // search direction
-            struct ggml_tensor * pf;   // past function values
-            struct ggml_tensor * lmal; // the L-BFGS memory alpha
-            struct ggml_tensor * lmys; // the L-BFGS memory ys
-            struct ggml_tensor * lms;  // the L-BFGS memory s
-            struct ggml_tensor * lmy;  // the L-BFGS memory y
-            float fx_best;
-            float step;
-            int j;
-            int k;
-            int end;
-            int n_no_improvement;
-        } lbfgs;
-    };
-
-    GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
-
-    // optimize the function defined by the tensor f
-    GGML_API enum ggml_opt_result ggml_opt(
-            struct ggml_context * ctx,
-            struct ggml_opt_params params,
-            struct ggml_tensor * f);
-
-    // initialize optimizer context
-    GGML_API void ggml_opt_init(
-            struct ggml_context     * ctx,
-            struct ggml_opt_context * opt,
-            struct ggml_opt_params    params,
-            int64_t                   nx);
-
-    // continue optimizing the function defined by the tensor f
-    GGML_API enum ggml_opt_result ggml_opt_resume(
-            struct ggml_context * ctx,
-            struct ggml_opt_context * opt,
-            struct ggml_tensor * f);
-
-    // continue optimizing the function defined by the tensor f
-    GGML_API enum ggml_opt_result ggml_opt_resume_g(
-            struct ggml_context * ctx,
-            struct ggml_opt_context * opt,
-            struct ggml_tensor * f,
-            struct ggml_cgraph * gf,
-            struct ggml_cgraph * gb,
-            ggml_opt_callback callback,
-            void * callback_data);
-
-    //
-    // quantization
-    //
-
-    // TODO: these would probably get removed in favor of the more general ggml_quantize_chunk
-    GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
-
-    GGML_API size_t ggml_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_iq2_xxs(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_iq2_xs (const float * src, void * dst, int n, int k, int64_t * hist);
-
-    GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
-
-    //
-    // Importance matrix
-    //
-    typedef void(*ggml_collect_imatrix_t)(const struct ggml_tensor * src0, const struct ggml_tensor * src1);
-    GGML_API void ggml_set_imatrix_collection(ggml_collect_imatrix_t imatrix_collect);
-
-    //
-    // gguf
-    //
-
-    enum gguf_type {
-        GGUF_TYPE_UINT8   = 0,
-        GGUF_TYPE_INT8    = 1,
-        GGUF_TYPE_UINT16  = 2,
-        GGUF_TYPE_INT16   = 3,
-        GGUF_TYPE_UINT32  = 4,
-        GGUF_TYPE_INT32   = 5,
-        GGUF_TYPE_FLOAT32 = 6,
-        GGUF_TYPE_BOOL    = 7,
-        GGUF_TYPE_STRING  = 8,
-        GGUF_TYPE_ARRAY   = 9,
-        GGUF_TYPE_UINT64  = 10,
-        GGUF_TYPE_INT64   = 11,
-        GGUF_TYPE_FLOAT64 = 12,
-        GGUF_TYPE_COUNT,       // marks the end of the enum
-    };
-
-    struct gguf_context;
-
-    struct gguf_init_params {
-        bool no_alloc;
-
-        // if not NULL, create a ggml_context and allocate the tensor data in it
-        struct ggml_context ** ctx;
-    };
-
-    GGML_API struct gguf_context * gguf_init_empty(void);
-    GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);
-    //GGML_API struct gguf_context * gguf_init_from_buffer(..);
-
-    GGML_API void gguf_free(struct gguf_context * ctx);
-
-    GGML_API const char * gguf_type_name(enum gguf_type type);
-
-    GGML_API int    gguf_get_version    (const struct gguf_context * ctx);
-    GGML_API size_t gguf_get_alignment  (const struct gguf_context * ctx);
-    GGML_API size_t gguf_get_data_offset(const struct gguf_context * ctx);
-    GGML_API void * gguf_get_data       (const struct gguf_context * ctx);
-
-    GGML_API int          gguf_get_n_kv(const struct gguf_context * ctx);
-    GGML_API int          gguf_find_key(const struct gguf_context * ctx, const char * key);
-    GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int key_id);
-
-    GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int key_id);
-    GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int key_id);
-
-    // will abort if the wrong type is used for the key
-    GGML_API uint8_t      gguf_get_val_u8  (const struct gguf_context * ctx, int key_id);
-    GGML_API int8_t       gguf_get_val_i8  (const struct gguf_context * ctx, int key_id);
-    GGML_API uint16_t     gguf_get_val_u16 (const struct gguf_context * ctx, int key_id);
-    GGML_API int16_t      gguf_get_val_i16 (const struct gguf_context * ctx, int key_id);
-    GGML_API uint32_t     gguf_get_val_u32 (const struct gguf_context * ctx, int key_id);
-    GGML_API int32_t      gguf_get_val_i32 (const struct gguf_context * ctx, int key_id);
-    GGML_API float        gguf_get_val_f32 (const struct gguf_context * ctx, int key_id);
-    GGML_API uint64_t     gguf_get_val_u64 (const struct gguf_context * ctx, int key_id);
-    GGML_API int64_t      gguf_get_val_i64 (const struct gguf_context * ctx, int key_id);
-    GGML_API double       gguf_get_val_f64 (const struct gguf_context * ctx, int key_id);
-    GGML_API bool         gguf_get_val_bool(const struct gguf_context * ctx, int key_id);
-    GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int key_id);
-    GGML_API const void * gguf_get_val_data(const struct gguf_context * ctx, int key_id);
-    GGML_API int          gguf_get_arr_n   (const struct gguf_context * ctx, int key_id);
-    GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id);
-    GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);
-
-    GGML_API int            gguf_get_n_tensors    (const struct gguf_context * ctx);
-    GGML_API int            gguf_find_tensor      (const struct gguf_context * ctx, const char * name);
-    GGML_API size_t         gguf_get_tensor_offset(const struct gguf_context * ctx, int i);
-    GGML_API char *         gguf_get_tensor_name  (const struct gguf_context * ctx, int i);
-    GGML_API enum ggml_type gguf_get_tensor_type  (const struct gguf_context * ctx, int i);
-
-    // overrides existing values or adds a new one
-    GGML_API void gguf_set_val_u8  (struct gguf_context * ctx, const char * key, uint8_t  val);
-    GGML_API void gguf_set_val_i8  (struct gguf_context * ctx, const char * key, int8_t   val);
-    GGML_API void gguf_set_val_u16 (struct gguf_context * ctx, const char * key, uint16_t val);
-    GGML_API void gguf_set_val_i16 (struct gguf_context * ctx, const char * key, int16_t  val);
-    GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val);
-    GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t  val);
-    GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float    val);
-    GGML_API void gguf_set_val_u64 (struct gguf_context * ctx, const char * key, uint64_t val);
-    GGML_API void gguf_set_val_i64 (struct gguf_context * ctx, const char * key, int64_t  val);
-    GGML_API void gguf_set_val_f64 (struct gguf_context * ctx, const char * key, double   val);
-    GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool     val);
-    GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val);
-    GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n);
-    GGML_API void gguf_set_arr_str (struct gguf_context * ctx, const char * key, const char ** data, int n);
-
-    // set or add KV pairs from another context
-    GGML_API void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src);
-
-    // manage tensor info
-    GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor);
-    GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type);
-    GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size);
-
-    // writing gguf files can be done in 2 ways:
-    //
-    // - write the entire gguf_context to a binary file in a single pass:
-    //
-    //   gguf_write_to_file(ctx, fname);
-    //
-    // - first prepare a file with a placeholder for the meta data, write the tensor data, then write the meta data:
-    //
-    //   FILE * f = fopen(fname, "wb");
-    //   fseek(f, gguf_get_meta_size(ctx), SEEK_SET);
-    //   fwrite(f, ...);
-    //   void * data = gguf_meta_get_meta_data(ctx);
-    //   fseek(f, 0, SEEK_SET);
-    //   fwrite(f, data, gguf_get_meta_size(ctx));
-    //   free(data);
-    //   fclose(f);
-    //
-
-    // write the entire context to a binary file
-    GGML_API void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta);
-
-    // get the size in bytes of the meta data (header, kv pairs, tensor info) including padding
-    GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx);
-    GGML_API void   gguf_get_meta_data(const struct gguf_context * ctx, void * data);
-
-    //
-    // system info
-    //
-
-    GGML_API int ggml_cpu_has_avx        (void);
-    GGML_API int ggml_cpu_has_avx_vnni   (void);
-    GGML_API int ggml_cpu_has_avx2       (void);
-    GGML_API int ggml_cpu_has_avx512     (void);
-    GGML_API int ggml_cpu_has_avx512_vbmi(void);
-    GGML_API int ggml_cpu_has_avx512_vnni(void);
-    GGML_API int ggml_cpu_has_fma        (void);
-    GGML_API int ggml_cpu_has_neon       (void);
-    GGML_API int ggml_cpu_has_arm_fma    (void);
-    GGML_API int ggml_cpu_has_metal      (void);
-    GGML_API int ggml_cpu_has_f16c       (void);
-    GGML_API int ggml_cpu_has_fp16_va    (void);
-    GGML_API int ggml_cpu_has_wasm_simd  (void);
-    GGML_API int ggml_cpu_has_blas       (void);
-    GGML_API int ggml_cpu_has_cublas     (void);
-    GGML_API int ggml_cpu_has_clblast    (void);
-    GGML_API int ggml_cpu_has_gpublas    (void);
-    GGML_API int ggml_cpu_has_sse3       (void);
-    GGML_API int ggml_cpu_has_ssse3      (void);
-    GGML_API int ggml_cpu_has_vsx        (void);
-
-    //
-    // Internal types and functions exposed for tests and benchmarks
-    //
-
-#ifdef  __cplusplus
-// restrict not standard in C++
-#define GGML_RESTRICT
-#else
-#define GGML_RESTRICT restrict
-#endif
-    typedef void (*ggml_to_float_t)  (const void  * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-    typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void  * GGML_RESTRICT y, int k);
-    typedef void (*ggml_vec_dot_t)   (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
-
-    typedef struct {
-        const char      * type_name;
-        int               blck_size;
-        size_t            type_size;
-        bool              is_quantized;
-        ggml_to_float_t   to_float;
-        ggml_from_float_t from_float;
-        ggml_from_float_t from_float_reference;
-        ggml_vec_dot_t    vec_dot;
-        enum ggml_type    vec_dot_type;
-    } ggml_type_traits_t;
-
-    GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
-
-#ifdef  __cplusplus
-}
-#endif
diff --git a/ggml/requirements.txt b/ggml/requirements.txt
deleted file mode 100644
index a9fd960..0000000
--- a/ggml/requirements.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-accelerate==0.19.0
-numpy==1.24.3
-sentencepiece==0.1.98
-torch==2.0.1
-torchaudio==2.0.2
-torchvision==0.15.2
-transformers==4.29.2
-gguf==0.4.5
diff --git a/ggml/scripts/sync-llama-am.sh b/ggml/scripts/sync-llama-am.sh
deleted file mode 100644
index 0aab3dc..0000000
--- a/ggml/scripts/sync-llama-am.sh
+++ /dev/null
@@ -1,155 +0,0 @@
-#!/bin/bash
-#
-# Synchronize llama.cpp changes to ggml
-#
-# Usage:
-#
-#   $ cd /path/to/ggml
-#   $ ./scripts/sync-llama-am.sh -skip hash0,hash1,hash2...
-#
-
-set -e
-
-sd=$(dirname $0)
-cd $sd/../
-
-SRC_GGML=$(pwd)
-SRC_LLAMA=$(cd ../llama.cpp; pwd)
-
-if [ ! -d $SRC_LLAMA ]; then
-    echo "llama.cpp not found at $SRC_LLAMA"
-    exit 1
-fi
-
-lc=$(cat $SRC_GGML/scripts/sync-llama.last)
-echo "Syncing llama.cpp changes since commit $lc"
-
-to_skip=""
-if [ "$1" == "-skip" ]; then
-    to_skip=$2
-fi
-
-cd $SRC_LLAMA
-
-git log --oneline $lc..HEAD
-git log --oneline $lc..HEAD --reverse | grep -v "(ggml/[0-9]*)" | grep -v "(whisper/[0-9]*)" | cut -d' ' -f1 > $SRC_GGML/llama-commits
-
-if [ ! -s $SRC_GGML/llama-commits ]; then
-    rm -v $SRC_GGML/llama-commits
-    echo "No new commits"
-    exit 0
-fi
-
-if [ -f $SRC_GGML/llama-src.patch ]; then
-    rm -v $SRC_GGML/llama-src.patch
-fi
-
-while read c; do
-    if [ -n "$to_skip" ]; then
-        if [[ $to_skip == *"$c"* ]]; then
-            echo "Skipping $c"
-            continue
-        fi
-    fi
-
-    git format-patch -k $c~1..$c --stdout -- \
-        ggml*.h \
-        ggml*.c \
-        ggml*.cpp \
-        ggml*.m \
-        ggml*.metal \
-        ggml*.cu \
-        tests/test-opt.cpp \
-        tests/test-grad0.cpp \
-        tests/test-quantize-fns.cpp \
-        tests/test-quantize-perf.cpp \
-        tests/test-backend-ops.cpp \
-        >> $SRC_GGML/llama-src.patch
-done < $SRC_GGML/llama-commits
-
-rm -v $SRC_GGML/llama-commits
-
-# delete files if empty
-if [ ! -s $SRC_GGML/llama-src.patch ]; then
-    rm -v $SRC_GGML/llama-src.patch
-fi
-
-cd $SRC_GGML
-
-if [ -f $SRC_GGML/llama-src.patch ]; then
-    # replace PR numbers
-    #
-    # Subject: some text (#1234)
-    # Subject: some text (llama/1234)
-    cat llama-src.patch | sed -e 's/^Subject: \(.*\) (#\([0-9]*\))/Subject: \1 (llama\/\2)/' > llama-src.patch.tmp
-    mv llama-src.patch.tmp llama-src.patch
-
-    cat llama-src.patch | sed -e 's/^\(.*\) (#\([0-9]*\))$/\1 (llama\/\2)/' > llama-src.patch.tmp
-    mv llama-src.patch.tmp llama-src.patch
-
-    # replace filenames:
-    #
-    # ggml.c              -> src/ggml.c
-    # ggml-alloc.c        -> src/ggml-alloc.c
-    # ggml-backend-impl.h -> src/ggml-backend-impl.h
-    # ggml-backend.c      -> src/ggml-backend.c
-    # ggml-cuda.cu        -> src/ggml-cuda.cu
-    # ggml-cuda.h         -> src/ggml-cuda.h
-    # ggml-impl.h         -> src/ggml-impl.h
-    # ggml-metal.h        -> src/ggml-metal.h
-    # ggml-metal.m        -> src/ggml-metal.m
-    # ggml-mpi.h          -> src/ggml-mpi.h
-    # ggml-mpi.c          -> src/ggml-mpi.c
-    # ggml-opencl.cpp     -> src/ggml-opencl.cpp
-    # ggml-opencl.h       -> src/ggml-opencl.h
-    # ggml-quants.c       -> src/ggml-quants.c
-    # ggml-quants.h       -> src/ggml-quants.h
-    # ggml.h              -> include/ggml/ggml.h
-    # ggml-alloc.h        -> include/ggml/ggml-alloc.h
-    # ggml-backend.h      -> include/ggml/ggml-backend.h
-    #
-    # tests/test-opt.cpp           -> tests/test-opt.cpp
-    # tests/test-grad0.cpp         -> tests/test-grad0.cpp
-    # tests/test-quantize-fns.cpp  -> tests/test-quantize-fns.cpp
-    # tests/test-quantize-perf.cpp -> tests/test-quantize-perf.cpp
-    # tests/test-backend-ops.cpp   -> tests/test-backend-ops.cpp
-
-    cat llama-src.patch | sed \
-        -e 's/\/ggml\.c/\/src\/ggml.c/g' \
-        -e 's/\/ggml-alloc\.c/\/src\/ggml-alloc.c/g' \
-        -e 's/\/ggml-backend-impl\.h/\/src\/ggml-backend-impl.h/g' \
-        -e 's/\/ggml-backend\.c/\/src\/ggml-backend.c/g' \
-        -e 's/\/ggml-cuda\.cu/\/src\/ggml-cuda.cu/g' \
-        -e 's/\/ggml-cuda\.h/\/src\/ggml-cuda.h/g' \
-        -e 's/\/ggml-impl\.h/\/src\/ggml-impl.h/g' \
-        -e 's/\/ggml-metal\.h/\/src\/ggml-metal.h/g' \
-        -e 's/\/ggml-metal\.m/\/src\/ggml-metal.m/g' \
-        -e 's/\/ggml-mpi\.h/\/src\/ggml-mpi.h/g' \
-        -e 's/\/ggml-mpi\.c/\/src\/ggml-mpi.c/g' \
-        -e 's/\/ggml-opencl\.cpp/\/src\/ggml-opencl.cpp/g' \
-        -e 's/\/ggml-opencl\.h/\/src\/ggml-opencl.h/g' \
-        -e 's/\/ggml-quants\.c/\/src\/ggml-quants.c/g' \
-        -e 's/\/ggml-quants\.h/\/src\/ggml-quants.h/g' \
-        -e 's/\/ggml\.h/\/include\/ggml\/ggml.h/g' \
-        -e 's/\/ggml-alloc\.h/\/include\/ggml\/ggml-alloc.h/g' \
-        -e 's/\/ggml-backend\.h/\/include\/ggml\/ggml-backend.h/g' \
-        -e 's/\/tests\/test-opt\.cpp/\/tests\/test-opt.cpp/g' \
-        -e 's/\/tests\/test-grad0\.cpp/\/tests\/test-grad0.cpp/g' \
-        -e 's/\/tests\/test-quantize-fns\.cpp/\/tests\/test-quantize-fns.cpp/g' \
-        -e 's/\/tests\/test-quantize-perf\.cpp/\/tests\/test-quantize-perf.cpp/g' \
-        -e 's/\/tests\/test-backend-ops\.cpp/\/tests\/test-backend-ops.cpp/g' \
-        > llama-src.patch.tmp
-    mv llama-src.patch.tmp llama-src.patch
-
-    git am llama-src.patch
-
-    rm -v $SRC_GGML/llama-src.patch
-fi
-
-# update last commit
-cd $SRC_LLAMA
-git log -1 --format=%H > $SRC_GGML/scripts/sync-llama.last
-
-echo "Done"
-
-exit 0
diff --git a/ggml/scripts/sync-llama.last b/ggml/scripts/sync-llama.last
deleted file mode 100644
index 43c27f5..0000000
--- a/ggml/scripts/sync-llama.last
+++ /dev/null
@@ -1 +0,0 @@
-c71d608ce7a1584bf5072f197919dd24f3a6163f
diff --git a/ggml/scripts/sync-llama.sh b/ggml/scripts/sync-llama.sh
deleted file mode 100644
index 67143c9..0000000
--- a/ggml/scripts/sync-llama.sh
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/bin/bash
-
-cp -rpv ../llama.cpp/ggml.c              src/ggml.c
-cp -rpv ../llama.cpp/ggml-alloc.c        src/ggml-alloc.c
-cp -rpv ../llama.cpp/ggml-backend-impl.h src/ggml-backend-impl.h
-cp -rpv ../llama.cpp/ggml-backend.c      src/ggml-backend.c
-cp -rpv ../llama.cpp/ggml-cuda.cu        src/ggml-cuda.cu
-cp -rpv ../llama.cpp/ggml-cuda.h         src/ggml-cuda.h
-cp -rpv ../llama.cpp/ggml-impl.h         src/ggml-impl.h
-cp -rpv ../llama.cpp/ggml-metal.h        src/ggml-metal.h
-cp -rpv ../llama.cpp/ggml-metal.m        src/ggml-metal.m
-cp -rpv ../llama.cpp/ggml-metal.metal    src/ggml-metal.metal
-#cp -rpv ../llama.cpp/ggml-mpi.h          src/ggml-mpi.h
-#cp -rpv ../llama.cpp/ggml-mpi.c          src/ggml-mpi.c
-cp -rpv ../llama.cpp/ggml-opencl.cpp     src/ggml-opencl.cpp
-cp -rpv ../llama.cpp/ggml-opencl.h       src/ggml-opencl.h
-cp -rpv ../llama.cpp/ggml-quants.c       src/ggml-quants.c
-cp -rpv ../llama.cpp/ggml-quants.h       src/ggml-quants.h
-cp -rpv ../llama.cpp/ggml.h              include/ggml/ggml.h
-cp -rpv ../llama.cpp/ggml-alloc.h        include/ggml/ggml-alloc.h
-cp -rpv ../llama.cpp/ggml-backend.h      include/ggml/ggml-backend.h
-
-cp -rpv ../llama.cpp/tests/test-opt.cpp           tests/test-opt.cpp
-cp -rpv ../llama.cpp/tests/test-grad0.cpp         tests/test-grad0.cpp
-cp -rpv ../llama.cpp/tests/test-quantize-fns.cpp  tests/test-quantize-fns.cpp
-cp -rpv ../llama.cpp/tests/test-quantize-perf.cpp tests/test-quantize-perf.cpp
-cp -rpv ../llama.cpp/tests/test-backend-ops.cpp   tests/test-backend-ops.cpp
diff --git a/ggml/scripts/sync-whisper-am.sh b/ggml/scripts/sync-whisper-am.sh
deleted file mode 100644
index dcbf8ce..0000000
--- a/ggml/scripts/sync-whisper-am.sh
+++ /dev/null
@@ -1,165 +0,0 @@
-#!/bin/bash
-#
-# Synchronize whisper.cpp changes to ggml
-#
-# Usage:
-#
-#   $ cd /path/to/ggml
-#   $ ./scripts/sync-whisper-am.sh -skip hash0,hash1,hash2...
-#
-
-set -e
-
-sd=$(dirname $0)
-cd $sd/../
-
-SRC_GGML=$(pwd)
-SRC_WHISPER=$(cd ../whisper.cpp; pwd)
-
-if [ ! -d $SRC_WHISPER ]; then
-    echo "whisper.cpp not found at $SRC_WHISPER"
-    exit 1
-fi
-
-lc=$(cat $SRC_GGML/scripts/sync-whisper.last)
-echo "Syncing whisper.cpp changes since commit $lc"
-
-to_skip=""
-if [ "$1" == "-skip" ]; then
-    to_skip=$2
-fi
-
-cd $SRC_WHISPER
-
-git log --oneline $lc..HEAD
-git log --oneline $lc..HEAD --reverse | grep -v "(ggml/[0-9]*)" | grep -v "(llama/[0-9]*)" | cut -d' ' -f1 > $SRC_GGML/whisper-commits
-
-if [ ! -s $SRC_GGML/whisper-commits ]; then
-    rm -v $SRC_GGML/whisper-commits
-    echo "No new commits"
-    exit 0
-fi
-
-if [ -f $SRC_GGML/whisper-src.patch ]; then
-    rm -v $SRC_GGML/whisper-src.patch
-fi
-
-while read c; do
-    if [ -n "$to_skip" ]; then
-        if [[ $to_skip == *"$c"* ]]; then
-            echo "Skipping $c"
-            continue
-        fi
-    fi
-
-    git format-patch -k $c~1..$c --stdout -- \
-    ggml*.h \
-    ggml*.c \
-    ggml*.cpp \
-    ggml*.m \
-    ggml*.metal \
-    ggml*.cu \
-    whisper.h \
-    whisper.cpp \
-    examples/common.h \
-    examples/common.cpp \
-    examples/common-ggml.h \
-    examples/common-ggml.cpp \
-    examples/main/main.cpp \
-    examples/quantize/quantize.cpp \
-    >> $SRC_GGML/whisper-src.patch
-done < $SRC_GGML/whisper-commits
-
-rm -v $SRC_GGML/whisper-commits
-
-# delete files if empty
-if [ ! -s $SRC_GGML/whisper-src.patch ]; then
-    rm -v $SRC_GGML/whisper-src.patch
-fi
-
-cd $SRC_GGML
-
-if [ -f $SRC_GGML/whisper-src.patch ]; then
-    # replace PR numbers
-    #
-    # Subject: some text (#1234)
-    # Subject: some text (whisper/1234)
-    cat whisper-src.patch | sed -e 's/^Subject: \(.*\) (#\([0-9]*\))/Subject: \1 (whisper\/\2)/' > whisper-src.patch.tmp
-    mv whisper-src.patch.tmp whisper-src.patch
-
-    cat whisper-src.patch | sed -e 's/^\(.*\) (#\([0-9]*\))$/\1 (whisper\/\2)/' > whisper-src.patch.tmp
-    mv whisper-src.patch.tmp whisper-src.patch
-
-    # replace filenames:
-    #
-    # ggml.c              -> src/ggml.c
-    # ggml-alloc.c        -> src/ggml-alloc.c
-    # ggml-backend-impl.h -> src/ggml-backend-impl.h
-    # ggml-backend.c      -> src/ggml-backend.c
-    # ggml-cuda.cu        -> src/ggml-cuda.cu
-    # ggml-cuda.h         -> src/ggml-cuda.h
-    # ggml-impl.h         -> src/ggml-impl.h
-    # ggml-metal.h        -> src/ggml-metal.h
-    # ggml-metal.m        -> src/ggml-metal.m
-    # ggml-mpi.h          -> src/ggml-mpi.h
-    # ggml-mpi.c          -> src/ggml-mpi.c
-    # ggml-opencl.cpp     -> src/ggml-opencl.cpp
-    # ggml-opencl.h       -> src/ggml-opencl.h
-    # ggml-quants.c       -> src/ggml-quants.c
-    # ggml-quants.h       -> src/ggml-quants.h
-    # ggml.h              -> include/ggml/ggml.h
-    # ggml-alloc.h        -> include/ggml/ggml-alloc.h
-    # ggml-backend.h      -> include/ggml/ggml-backend.h
-    #
-    # whisper.h           -> examples/whisper/whisper.h
-    # whisper.cpp         -> examples/whisper/whisper.cpp
-    #
-    # examples/common.h              -> examples/common.h
-    # examples/common.cpp            -> examples/common.cpp
-    # examples/common-ggml.h         -> examples/common-ggml.h
-    # examples/common-ggml.cpp       -> examples/common-ggml.cpp
-    # examples/main/main.cpp         -> examples/whisper/main.cpp
-    # examples/quantize/quantize.cpp -> examples/whisper/quantize.cpp
-
-    cat whisper-src.patch | sed \
-        -e 's/\/ggml\.c/\/src\/ggml.c/g' \
-        -e 's/\/ggml-alloc\.c/\/src\/ggml-alloc.c/g' \
-        -e 's/\/ggml-backend-impl\.h/\/src\/ggml-backend-impl.h/g' \
-        -e 's/\/ggml-backend\.c/\/src\/ggml-backend.c/g' \
-        -e 's/\/ggml-cuda\.cu/\/src\/ggml-cuda.cu/g' \
-        -e 's/\/ggml-cuda\.h/\/src\/ggml-cuda.h/g' \
-        -e 's/\/ggml-impl\.h/\/src\/ggml-impl.h/g' \
-        -e 's/\/ggml-metal\.h/\/src\/ggml-metal.h/g' \
-        -e 's/\/ggml-metal\.m/\/src\/ggml-metal.m/g' \
-        -e 's/\/ggml-mpi\.h/\/src\/ggml-mpi.h/g' \
-        -e 's/\/ggml-mpi\.c/\/src\/ggml-mpi.c/g' \
-        -e 's/\/ggml-opencl\.cpp/\/src\/ggml-opencl.cpp/g' \
-        -e 's/\/ggml-opencl\.h/\/src\/ggml-opencl.h/g' \
-        -e 's/\/ggml-quants\.c/\/src\/ggml-quants.c/g' \
-        -e 's/\/ggml-quants\.h/\/src\/ggml-quants.h/g' \
-        -e 's/\/ggml\.h/\/include\/ggml\/ggml.h/g' \
-        -e 's/\/ggml-alloc\.h/\/include\/ggml\/ggml-alloc.h/g' \
-        -e 's/\/ggml-backend\.h/\/include\/ggml\/ggml-backend.h/g' \
-        -e 's/\/whisper\.h/\/examples\/whisper\/whisper.h/g' \
-        -e 's/\/whisper\.cpp/\/examples\/whisper\/whisper.cpp/g' \
-        -e 's/\/examples\/common\.h/\/examples\/common.h/g' \
-        -e 's/\/examples\/common\.cpp/\/examples\/common.cpp/g' \
-        -e 's/\/examples\/common-ggml\.h/\/examples\/common-ggml.h/g' \
-        -e 's/\/examples\/common-ggml\.cpp/\/examples\/common-ggml.cpp/g' \
-        -e 's/\/examples\/main\/main\.cpp/\/examples\/whisper\/main.cpp/g' \
-        -e 's/\/examples\/quantize\/quantize\.cpp/\/examples\/whisper\/quantize.cpp/g' \
-        > whisper-src.patch.tmp
-    mv whisper-src.patch.tmp whisper-src.patch
-
-    git am whisper-src.patch
-
-    rm -v $SRC_GGML/whisper-src.patch
-fi
-
-# update last commit
-cd $SRC_WHISPER
-git log -1 --format=%H > $SRC_GGML/scripts/sync-whisper.last
-
-echo "Done"
-
-exit 0
diff --git a/ggml/scripts/sync-whisper.last b/ggml/scripts/sync-whisper.last
deleted file mode 100644
index 7aa44e5..0000000
--- a/ggml/scripts/sync-whisper.last
+++ /dev/null
@@ -1 +0,0 @@
-f001a3b7b6cd223134d9b449625354379249fa5b
diff --git a/ggml/scripts/sync-whisper.sh b/ggml/scripts/sync-whisper.sh
deleted file mode 100644
index 6976c77..0000000
--- a/ggml/scripts/sync-whisper.sh
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/bin/bash
-
-cp -rpv ../whisper.cpp/ggml.c                         src/ggml.c
-cp -rpv ../whisper.cpp/ggml-impl.h                    src/ggml-impl.h
-cp -rpv ../whisper.cpp/ggml-alloc.c                   src/ggml-alloc.c
-cp -rpv ../whisper.cpp/ggml-backend-impl.h            src/ggml-backend-impl.h
-cp -rpv ../whisper.cpp/ggml-backend.c                 src/ggml-backend.c
-cp -rpv ../whisper.cpp/ggml-cuda.cu                   src/ggml-cuda.cu
-cp -rpv ../whisper.cpp/ggml-cuda.h                    src/ggml-cuda.h
-cp -rpv ../whisper.cpp/ggml-metal.h                   src/ggml-metal.h
-cp -rpv ../whisper.cpp/ggml-metal.m                   src/ggml-metal.m
-cp -rpv ../whisper.cpp/ggml-metal.metal               src/ggml-metal.metal
-#cp -rpv ../whisper.cpp/ggml-mpi.h                     src/ggml-mpi.h
-#cp -rpv ../whisper.cpp/ggml-mpi.m                     src/ggml-mpi.m
-cp -rpv ../whisper.cpp/ggml-opencl.cpp                src/ggml-opencl.cpp
-cp -rpv ../whisper.cpp/ggml-opencl.h                  src/ggml-opencl.h
-cp -rpv ../whisper.cpp/ggml-quants.c                  src/ggml-quants.c
-cp -rpv ../whisper.cpp/ggml-quants.h                  src/ggml-quants.h
-
-cp -rpv ../whisper.cpp/ggml.h                         include/ggml/ggml.h
-cp -rpv ../whisper.cpp/ggml-alloc.h                   include/ggml/ggml-alloc.h
-cp -rpv ../whisper.cpp/ggml-backend.h                 include/ggml/ggml-backend.h
-
-cp -rpv ../whisper.cpp/examples/common.h              examples/common.h
-cp -rpv ../whisper.cpp/examples/common.cpp            examples/common.cpp
-cp -rpv ../whisper.cpp/examples/common-ggml.h         examples/common-ggml.h
-cp -rpv ../whisper.cpp/examples/common-ggml.cpp       examples/common-ggml.cpp
-
-cp -rpv ../whisper.cpp/whisper.h                      examples/whisper/whisper.h
-cp -rpv ../whisper.cpp/whisper.cpp                    examples/whisper/whisper.cpp
-cp -rpv ../whisper.cpp/examples/main/main.cpp         examples/whisper/main.cpp
-cp -rpv ../whisper.cpp/examples/quantize/quantize.cpp examples/whisper/quantize.cpp
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
deleted file mode 100644
index 5f9baa1..0000000
--- a/ggml/src/CMakeLists.txt
+++ /dev/null
@@ -1,411 +0,0 @@
-if (GGML_ALL_WARNINGS)
-    if (NOT MSVC)
-        add_compile_options(-Wunused -Wextra -Wcast-qual -Wdouble-promotion)
-        add_compile_options("$<$<COMPILE_LANGUAGE:C>:-Wshadow;-Wno-unused-function;-Wmissing-prototypes>")
-    else()
-        # todo : windows
-    endif()
-endif()
-
-# compiler flags
-
-if (NOT MSVC)
-    #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fno-math-errno -ffinite-math-only -funsafe-math-optimizations")
-endif()
-
-message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")
-
-if (NOT UNAME_S)
-    execute_process(COMMAND uname -s OUTPUT_VARIABLE UNAME_S)
-endif()
-if (NOT UNAME_P)
-    execute_process(COMMAND uname -p OUTPUT_VARIABLE UNAME_P)
-endif()
-if (NOT UNAME_M)
-    execute_process(COMMAND uname -m OUTPUT_VARIABLE UNAME_M)
-endif()
-#message(STATUS "UNAME_S: ${UNAME_S}  UNAME_P: ${UNAME_P}  UNAME_M: ${UNAME_M}")
-
-# this version of Apple ld64 is buggy
-execute_process(
-    COMMAND ${CMAKE_C_COMPILER} ${CMAKE_EXE_LINKER_FLAGS} -Wl,-v
-    ERROR_VARIABLE output
-)
-if (output MATCHES "dyld-1015\.7")
-    add_compile_definitions(HAVE_BUGGY_APPLE_LINKER)
-endif()
-
-# Mac OS + Arm can report x86_64
-# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
-if (UNAME_S MATCHES "Darwin")
-    if (NOT UNAME_P MATCHES "arm")
-        execute_process(COMMAND sysctl -n hw.optional.arm64 OUTPUT_VARIABLE SYSCTL_M)
-	if (SYSCTL_M MATCHES "1")
-            #set(UNAME_P "arm")
-            #set(UNAME_M "arm64")
-	    message(WARNING "Your arch is announced as x86_64, but it seems to actually be ARM64. Not fixing that can lead to bad performance. For more info see: https://github.com/ggerganov/whisper.cpp/issues/66\#issuecomment-#1282546789")
-	endif()
-    endif()
-endif()
-
-if (${CMAKE_SYSTEM_NAME} STREQUAL "Emscripten")
-    message(STATUS "Emscripten detected")
-elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
-    message(STATUS "ARM detected")
-    #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mcpu=apple-m1")
-elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
-    message(STATUS "PPC64 detected")
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mpower9-vector")
-else()
-    message(STATUS "x86 detected")
-    #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx -mavx2 -mfma -mf16c")
-    if (UNAME_S MATCHES "Darwin")
-        execute_process(COMMAND sysctl machdep.cpu.features OUTPUT_VARIABLE AVX1_M)
-        if (AVX1_M MATCHES "AVX1.0")
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx")
-        endif()
-        execute_process(COMMAND sysctl machdep.cpu.leaf7_features OUTPUT_VARIABLE AVX2_M)
-        if (AVX2_M MATCHES "AVX2")
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx2")
-        endif()
-        if (AVX1_M MATCHES "FMA")
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma")
-        endif()
-        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c")
-    elseif (UNAME_S MATCHES "Linux")
-        message(STATUS "Linux detected")
-        execute_process(COMMAND grep "avx " /proc/cpuinfo OUTPUT_VARIABLE AVX1_M)
-        if (AVX1_M MATCHES "avx")
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx")
-        endif()
-        execute_process(COMMAND grep "avx2 " /proc/cpuinfo OUTPUT_VARIABLE AVX2_M)
-        if (AVX2_M MATCHES "avx2")
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx2")
-        endif()
-        execute_process(COMMAND grep "fma " /proc/cpuinfo OUTPUT_VARIABLE FMA_M)
-        if (FMA_M MATCHES "fma")
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma")
-        endif()
-        execute_process(COMMAND grep "f16c " /proc/cpuinfo OUTPUT_VARIABLE F16C_M)
-        if (F16C_M MATCHES "f16c")
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c")
-        endif()
-        execute_process(COMMAND grep "sse3 " /proc/cpuinfo OUTPUT_VARIABLE SSE3_M)
-        if (SSE3_M MATCHES "sse3")
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse3")
-        endif()
-    elseif (UNAME_S MATCHES "Haiku")
-        message(STATUS "Haiku detected")
-        execute_process(COMMAND sysinfo -cpu COMMAND grep "AVX " OUTPUT_VARIABLE AVX1_M)
-        if (AVX1_M MATCHES "avx")
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx")
-        endif()
-        execute_process(COMMAND sysinfo -cpu COMMAND grep "AVX2 " OUTPUT_VARIABLE AVX2_M)
-        if (AVX2_M MATCHES "avx2")
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx2")
-        endif()
-        execute_process(COMMAND sysinfo -cpu COMMAND grep "FMA " OUTPUT_VARIABLE FMA_M)
-        if (FMA_M MATCHES "fma")
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma")
-        endif()
-        execute_process(COMMAND sysinfo -cpu COMMAND grep "F16C " OUTPUT_VARIABLE F16C_M)
-        if (F16C_M MATCHES "f16c")
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c")
-        endif()
-    elseif (MSVC)
-        if (GGML_AVX512)
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX512")
-            # MSVC has no compile-time flags enabling specific
-            # AVX512 extensions, neither it defines the
-            # macros corresponding to the extensions.
-            # Do it manually.
-            if (GGML_AVX512_VBMI)
-                add_compile_definitions(__AVX512VBMI__)
-            endif()
-            if (GGML_AVX512_VNNI)
-                add_compile_definitions(__AVX512VNNI__)
-            endif()
-        elseif (GGML_AVX2)
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX2")
-        elseif (GGML_AVX)
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX")
-        endif()
-    else()
-        set(CMAKE_C_FLAGS  "${CMAKE_C_FLAGS} -mfma -mf16c -mavx -mavx2")
-    endif()
-endif()
-
-# ggml
-
-set(TARGET ggml)
-
-# on APPLE - include Accelerate framework
-if (APPLE AND NOT GGML_NO_ACCELERATE)
-    find_library(ACCELERATE_FRAMEWORK Accelerate)
-    if (ACCELERATE_FRAMEWORK)
-        message(STATUS "Accelerate framework found")
-
-        set(GGML_EXTRA_LIBS  ${GGML_EXTRA_LIBS}  ${ACCELERATE_FRAMEWORK})
-        set(GGML_EXTRA_FLAGS ${GGML_EXTRA_FLAGS} -DGGML_USE_ACCELERATE)
-    else()
-        message(WARNING "Accelerate framework not found")
-    endif()
-endif()
-
-if (GGML_OPENBLAS)
-    set(OPENBLAS_INCLUDE_SEARCH_PATHS
-        /usr/include
-        /usr/include/openblas
-        /usr/include/openblas-base
-        /usr/local/include
-        /usr/local/include/openblas
-        /usr/local/include/openblas-base
-        /opt/OpenBLAS/include
-        $ENV{OpenBLAS_HOME}
-        $ENV{OpenBLAS_HOME}/include
-        )
-    find_path(OPENBLAS_INC NAMES cblas.h PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS})
-    find_library(OPENBLAS_LIB NAMES openblas libopenblas)
-    if (OPENBLAS_LIB)
-        message(STATUS "OpenBLAS found")
-
-        set(GGML_EXTRA_LIBS  ${GGML_EXTRA_LIBS}  ${OPENBLAS_LIB})
-        set(GGML_EXTRA_INCS  ${GGML_EXTRA_INCS}  ${OPENBLAS_INC})
-        set(GGML_EXTRA_FLAGS ${GGML_EXTRA_FLAGS} -DGGML_USE_OPENBLAS)
-    else()
-        message(WARNING "OpenBLAS not found")
-    endif()
-endif()
-
-if (GGML_CLBLAST)
-	set(CLBLAST_INCLUDE_SEARCH_PATHS
-        /usr/include
-        /usr/local/include
-	    $ENV{CLBLAST_HOME}
-	    $ENV{CLBLAST_HOME}/include
-        )
-	find_path(CLBLAST_INC NAMES clblast.h PATHS ${CLBLAST_INCLUDE_SEARCH_PATHS})
-	find_library(CLBLAST_LIB NAMES clblast)
-	find_library(OPENCL_LIB NAMES OpenCL)
-	if (CLBLAST_LIB AND OPENCL_LIB AND CLBLAST_INC)
-		message(STATUS "clBLAST found")
-
-		set(GGML_EXTRA_INCS  ${GGML_EXTRA_INCS}  ${CLBLAST_INC})
-		set(GGML_EXTRA_LIBS  ${GGML_EXTRA_LIBS}  ${CLBLAST_LIB}  ${OPENCL_LIB})
-		set(GGML_EXTRA_FLAGS ${GGML_EXTRA_FLAGS} -DGGML_USE_CLBLAST)
-
-		set(GGML_OPENCL_SOURCES ggml-opencl.cpp ggml-opencl.h)
-
-		link_libraries("-Wl,--copy-dt-needed-entries")
-    else()
-        message(WARNING "clBLAST not found")
-    endif()
-endif()
-
-if (GGML_CUBLAS)
-    cmake_minimum_required(VERSION 3.17)
-
-    find_package(CUDAToolkit)
-    if (CUDAToolkit_FOUND)
-        message(STATUS "cuBLAS found")
-
-        enable_language(CUDA)
-
-        set(GGML_CUDA_SOURCES ggml-cuda.cu ggml-cuda.h)
-
-        set(GGML_EXTRA_FLAGS ${GGML_EXTRA_FLAGS} -DGGML_USE_CUBLAS)
-
-        if (GGML_CUDA_FORCE_DMMV)
-            add_compile_definitions(GGML_CUDA_FORCE_DMMV)
-        endif()
-        if (GGML_CUDA_FORCE_MMQ)
-            add_compile_definitions(GGML_CUDA_FORCE_MMQ)
-        endif()
-
-        # required for dynamic parallelism
-        # set(CMAKE_CUDA_SEPARABLE_COMPILATION ON)
-
-        if (GGML_STATIC)
-            if (WIN32)
-                # As of 12.3.1 CUDA Tookit for Windows does not offer a static cublas library
-                set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
-            else ()
-                set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
-            endif()
-        else()
-            set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
-        endif()
-
-        set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cuda_driver)
-
-        if (CMAKE_BUILD_TYPE MATCHES Debug)
-            set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo")
-        endif()
-    else()
-        message(WARNING "cuBLAS not found")
-    endif()
-endif()
-
-if (GGML_HIPBLAS)
-    list(APPEND CMAKE_PREFIX_PATH /opt/rocm)
-
-    if (NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang")
-        message(WARNING "Only LLVM is supported for HIP, hint: CC=/opt/rocm/llvm/bin/clang")
-    endif()
-    if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
-        message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++")
-    endif()
-
-    find_package(hip)
-    find_package(hipblas)
-    find_package(rocblas)
-
-    if (${hipblas_FOUND} AND ${hip_FOUND})
-        message(STATUS "HIP and hipBLAS found")
-
-        add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS)
-
-        add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h)
-        if (BUILD_SHARED_LIBS)
-            set_target_properties(ggml-rocm PROPERTIES POSITION_INDEPENDENT_CODE ON)
-        endif()
-        if (GGML_CUDA_FORCE_DMMV)
-            target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_DMMV)
-        endif()
-        if (GGML_CUDA_FORCE_MMQ)
-            target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_MMQ)
-        endif()
-        target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${GGML_CUDA_DMMV_X})
-        target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${GGML_CUDA_MMV_Y})
-        target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${GGML_CUDA_KQUANTS_ITER})
-        set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX)
-        target_link_libraries(ggml-rocm PRIVATE hip::device PUBLIC hip::host roc::rocblas roc::hipblas)
-        target_include_directories(ggml-rocm PRIVATE . ../include ../include/ggml)
-
-        if (GGML_STATIC)
-            message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
-        endif()
-        set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ggml-rocm)
-    else()
-        message(WARNING "hipBLAS or HIP not found. Try setting CMAKE_PREFIX_PATH=/opt/rocm")
-    endif()
-endif()
-
-if (GGML_METAL)
-    find_library(FOUNDATION_LIBRARY         Foundation              REQUIRED)
-    find_library(METAL_FRAMEWORK            Metal                   REQUIRED)
-    find_library(METALKIT_FRAMEWORK         MetalKit                REQUIRED)
-    find_library(METALPERFORMANCE_FRAMEWORK MetalPerformanceShaders REQUIRED)
-
-    set(GGML_METAL_SOURCES ggml-metal.m ggml-metal.h)
-
-    set(GGML_EXTRA_FLAGS ${GGML_EXTRA_FLAGS} -DGGML_USE_METAL)
-
-    #add_compile_definitions(GGML_METAL_NDEBUG)
-
-    # get full path to the file
-    #add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/")
-
-    # copy ggml-metal.metal to bin directory
-    configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)
-
-    set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS}
-        ${FOUNDATION_LIBRARY}
-        ${METAL_FRAMEWORK}
-        ${METALKIT_FRAMEWORK}
-        ${METALPERFORMANCE_FRAMEWORK}
-        )
-endif()
-
-if (GGML_PERF)
-    set(GGML_EXTRA_FLAGS ${GGML_EXTRA_FLAGS} -DGGML_PERF)
-endif()
-
-add_library(${TARGET}
-    ggml.c
-    ggml-alloc.c
-    ggml-backend.c
-    ggml-quants.c
-    ggml-impl.h
-    ggml-backend-impl.h
-    ../include/ggml/ggml.h
-    ../include/ggml/ggml-alloc.h
-    ../include/ggml/ggml-backend.h
-    ${GGML_CUDA_SOURCES}
-    ${GGML_OPENCL_SOURCES}
-    ${GGML_METAL_SOURCES}
-    )
-
-target_include_directories(${TARGET} PUBLIC
-    .
-    ../include
-    ../include/ggml
-    ${GGML_EXTRA_INCS}
-    )
-
-find_library(MATH_LIBRARY m)
-if (MATH_LIBRARY)
-    target_link_libraries(${TARGET} PUBLIC ${MATH_LIBRARY})
-endif()
-
-target_link_libraries(${TARGET} PUBLIC ${GGML_EXTRA_LIBS} ${CMAKE_THREAD_LIBS_INIT})
-
-if (BUILD_SHARED_LIBS)
-    set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
-
-    target_link_libraries(${TARGET} PUBLIC
-        ${CMAKE_DL_LIBS}
-        )
-
-    target_compile_definitions(${TARGET} PUBLIC
-        GGML_SHARED
-        )
-
-    target_compile_definitions(${TARGET} PRIVATE
-        GGML_BUILD
-        )
-
-    if (GGML_METAL)
-        set_target_properties(${TARGET} PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal")
-    endif()
-endif()
-
-target_compile_definitions(${TARGET} PUBLIC
-    ${GGML_EXTRA_FLAGS}
-    )
-
-if (MINGW)
-    target_link_libraries(${TARGET} PUBLIC
-        stdc++
-        )
-endif()
-
-if (GGML_CUDA_SOURCES)
-    message(STATUS "GGML CUDA sources found")
-    if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
-        # Only configure gmml CUDA architectures is not globally set
-        if (NOT DEFINED GGML_CUDA_ARCHITECTURES)
-            # Not overriden by user, so set defaults
-            set(GGML_CUDA_ARCHITECTURES 52 61 70)
-        endif()
-        message(STATUS "GGML Configuring CUDA architectures ${GGML_CUDA_ARCHITECTURES}")
-        set_property(TARGET ggml  PROPERTY CUDA_ARCHITECTURES ${GGML_CUDA_ARCHITECTURES})
-    endif()
-    set_property(TARGET ggml  PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
-    if (NOT MSVC)
-        target_link_libraries(ggml PUBLIC stdc++)
-    endif()
-endif()
-
-set (GGML_PUBLIC_HEADERS
-     ${CMAKE_CURRENT_SOURCE_DIR}/../include/ggml/ggml.h
-     ${CMAKE_CURRENT_SOURCE_DIR}/../include/ggml/ggml-alloc.h
-     ${CMAKE_CURRENT_SOURCE_DIR}/../include/ggml/ggml-backend.h)
-
-set_target_properties(${TARGET} PROPERTIES
-                      PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
-
-install(TARGETS ${TARGET}
-    LIBRARY DESTINATION lib
-    PUBLIC_HEADER DESTINATION include/ggml
-    )
diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
deleted file mode 100644
index 89b85d3..0000000
--- a/ggml/src/ggml-alloc.c
+++ /dev/null
@@ -1,832 +0,0 @@
-#include "ggml-alloc.h"
-#include "ggml-backend-impl.h"
-#include "ggml.h"
-#include "ggml-impl.h"
-#include <assert.h>
-#include <limits.h>
-#include <stdarg.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#define MAX(a, b) ((a) > (b) ? (a) : (b))
-#define MAX_FREE_BLOCKS 256
-
-//#define GGML_ALLOCATOR_DEBUG
-
-//#define AT_PRINTF(...) fprintf(stderr, __VA_ARGS__)
-#define AT_PRINTF(...)
-
-// TODO: GGML_PAD ?
-static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
-    assert(alignment && !(alignment & (alignment - 1))); // power of 2
-    size_t align = (alignment - (((uintptr_t)buffer + offset) % alignment)) % alignment;
-    return offset + align;
-}
-
-struct free_block {
-    void * addr;
-    size_t size;
-};
-
-struct ggml_tallocr {
-    struct ggml_backend_buffer * buffer;
-    bool buffer_owned;
-    void * base;
-    size_t alignment;
-
-    int n_free_blocks;
-    struct free_block free_blocks[MAX_FREE_BLOCKS];
-
-    size_t max_size;
-
-    bool measure;
-
-#ifdef GGML_ALLOCATOR_DEBUG
-    struct ggml_tensor * allocated_tensors[1024];
-#endif
-};
-
-#ifdef GGML_ALLOCATOR_DEBUG
-static void add_allocated_tensor(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
-    for (int i = 0; i < 1024; i++) {
-        if (alloc->allocated_tensors[i] == NULL) {
-            alloc->allocated_tensors[i] = tensor;
-            return;
-        }
-    }
-    GGML_ASSERT(!"out of allocated_tensors");
-}
-static void remove_allocated_tensor(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
-    for (int i = 0; i < 1024; i++) {
-        if (alloc->allocated_tensors[i] == tensor ||
-            (alloc->allocated_tensors[i] != NULL && alloc->allocated_tensors[i]->data == tensor->data)) {
-            alloc->allocated_tensors[i] = NULL;
-            return;
-        }
-    }
-    printf("tried to free tensor %s not found\n", tensor->name);
-    GGML_ASSERT(!"tensor not found");
-}
-#endif
-
-// check if a tensor is allocated by this buffer
-static bool ggml_tallocr_is_own(ggml_tallocr_t alloc, const struct ggml_tensor * tensor) {
-    return tensor->buffer == alloc->buffer && (!tensor->view_src || tensor->view_src->buffer == alloc->buffer);
-}
-
-static bool ggml_is_view(struct ggml_tensor * t) {
-    return t->view_src != NULL;
-}
-
-void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
-    GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
-    GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
-
-    size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor);
-    size = aligned_offset(NULL, size, alloc->alignment);
-
-    AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
-
-    size_t max_avail = 0;
-
-    // find the best fitting free block besides the last block
-    int best_fit_block = -1;
-    size_t best_fit_size = SIZE_MAX;
-    for (int i = 0; i < alloc->n_free_blocks - 1; i++) {
-        struct free_block * block = &alloc->free_blocks[i];
-        max_avail = MAX(max_avail, block->size);
-        if (block->size >= size && block->size <= best_fit_size) {
-            best_fit_block = i;
-            best_fit_size = block->size;
-        }
-    }
-
-    if (best_fit_block == -1) {
-        // the last block is our last resort
-        struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
-        max_avail = MAX(max_avail, block->size);
-        if (block->size >= size) {
-            best_fit_block = alloc->n_free_blocks - 1;
-        } else {
-            fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
-                    __func__, size, max_avail);
-            GGML_ASSERT(!"not enough space in the buffer");
-            return;
-        }
-    }
-
-    struct free_block * block = &alloc->free_blocks[best_fit_block];
-    void * addr = block->addr;
-    block->addr = (char*)block->addr + size;
-    block->size -= size;
-    if (block->size == 0) {
-        // remove block if empty
-        alloc->n_free_blocks--;
-        for (int j = best_fit_block; j < alloc->n_free_blocks; j++) {
-            alloc->free_blocks[j] = alloc->free_blocks[j+1];
-        }
-    }
-
-    AT_PRINTF("block %d, addr %p\n", best_fit_block, addr);
-
-    tensor->data = addr;
-    tensor->buffer = alloc->buffer;
-    if (!alloc->measure) {
-        ggml_backend_buffer_init_tensor(alloc->buffer, tensor);
-    }
-
-#ifdef GGML_ALLOCATOR_DEBUG
-    add_allocated_tensor(alloc, tensor);
-    size_t cur_max = (char*)addr - (char*)alloc->base + size;
-    if (cur_max > alloc->max_size) {
-        printf("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
-        for (int i = 0; i < 1024; i++) {
-            if (alloc->allocated_tensors[i]) {
-                printf("%s (%.2f MB) ", alloc->allocated_tensors[i]->name, ggml_nbytes(alloc->allocated_tensors[i]) / 1024.0 / 1024.0);
-            }
-        }
-        printf("\n");
-    }
-#endif
-
-    alloc->max_size = MAX(alloc->max_size, (char*)addr - (char*)alloc->base + size);
-}
-
-// this is a very naive implementation, but for our case the number of free blocks should be very small
-static void ggml_tallocr_free_tensor(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
-    if (ggml_tallocr_is_own(alloc, tensor) == false) {
-        // the tensor was not allocated in this buffer
-        // this can happen because the graph allocator will try to free weights and other tensors from different buffers
-        // the easiest way to deal with this is just to ignore it
-        // AT_PRINTF("ignoring %s (their buffer: %p, our buffer: %p)\n", tensor->name, (void *)tensor->buffer, (void *)alloc->buffer);
-        return;
-    }
-
-    void * ptr = tensor->data;
-
-    size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor);
-    size = aligned_offset(NULL, size, alloc->alignment);
-    AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks);
-
-#ifdef GGML_ALLOCATOR_DEBUG
-    remove_allocated_tensor(alloc, tensor);
-#endif
-
-    // see if we can merge with an existing block
-    for (int i = 0; i < alloc->n_free_blocks; i++) {
-        struct free_block * block = &alloc->free_blocks[i];
-        // check if ptr is at the end of the block
-        if ((char*)block->addr + block->size == ptr) {
-            block->size += size;
-            // check if we can merge with the next block
-            if (i < alloc->n_free_blocks - 1 && (char*)block->addr + block->size == alloc->free_blocks[i+1].addr) {
-                block->size += alloc->free_blocks[i+1].size;
-                alloc->n_free_blocks--;
-                for (int j = i+1; j < alloc->n_free_blocks; j++) {
-                    alloc->free_blocks[j] = alloc->free_blocks[j+1];
-                }
-            }
-            return;
-        }
-        // check if ptr is at the beginning of the block
-        if ((char*)ptr + size == block->addr) {
-            block->addr = ptr;
-            block->size += size;
-            // check if we can merge with the previous block
-            if (i > 0 && (char*)alloc->free_blocks[i-1].addr + alloc->free_blocks[i-1].size == block->addr) {
-                alloc->free_blocks[i-1].size += block->size;
-                alloc->n_free_blocks--;
-                for (int j = i; j < alloc->n_free_blocks; j++) {
-                    alloc->free_blocks[j] = alloc->free_blocks[j+1];
-                }
-            }
-            return;
-        }
-    }
-    // otherwise, add a new block
-    GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
-    // insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
-    int insert_pos = 0;
-    while (insert_pos < alloc->n_free_blocks && alloc->free_blocks[insert_pos].addr < ptr) {
-        insert_pos++;
-    }
-    // shift all blocks from insert_pos onward to make room for the new block
-    for (int i = alloc->n_free_blocks; i > insert_pos; i--) {
-        alloc->free_blocks[i] = alloc->free_blocks[i-1];
-    }
-    // insert the new block
-    alloc->free_blocks[insert_pos].addr = ptr;
-    alloc->free_blocks[insert_pos].size = size;
-    alloc->n_free_blocks++;
-}
-
-void ggml_tallocr_reset(ggml_tallocr_t alloc) {
-    alloc->n_free_blocks = 1;
-    size_t align_offset = aligned_offset(alloc->base, 0, alloc->alignment);
-    alloc->free_blocks[0].addr = (char *)alloc->base + align_offset;
-
-    if (alloc->measure) {
-        alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
-    } else {
-        alloc->free_blocks[0].size = ggml_backend_buffer_get_size(alloc->buffer) - align_offset;
-        ggml_backend_buffer_reset(alloc->buffer);
-    }
-}
-
-ggml_tallocr_t ggml_tallocr_new(void * data, size_t size, size_t alignment) {
-    struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr(data, size);
-
-    ggml_tallocr_t alloc = (ggml_tallocr_t)malloc(sizeof(struct ggml_tallocr));
-
-    *alloc = (struct ggml_tallocr) {
-        /*.buffer        = */ buffer,
-        /*.buffer_owned  = */ true,
-        /*.base          = */ ggml_backend_buffer_get_base(buffer),
-        /*.alignment     = */ alignment,
-        /*.n_free_blocks = */ 0,
-        /*.free_blocks   = */ {{0}},
-        /*.max_size      = */ 0,
-        /*.measure       = */ false,
-#ifdef GGML_ALLOCATOR_DEBUG
-        /*.allocated_tensors = */ {0},
-#endif
-    };
-
-    ggml_tallocr_reset(alloc);
-
-    return alloc;
-}
-
-ggml_tallocr_t ggml_tallocr_new_measure(size_t alignment) {
-    ggml_tallocr_t alloc = ggml_tallocr_new((void *)0x1000, SIZE_MAX/2, alignment);
-    alloc->measure = true;
-
-    return alloc;
-}
-
-ggml_tallocr_t ggml_tallocr_new_measure_from_buft(struct ggml_backend_buffer_type * buft) {
-    // create a backend buffer to get the correct tensor allocation sizes
-    ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, 1);
-
-    // TODO: move alloc initialization to a common ggml_tallocr_new_impl function
-    ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(buffer);
-    alloc->buffer_owned = true;
-    alloc->measure = true;
-    ggml_tallocr_reset(alloc);
-    return alloc;
-}
-
-ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend) {
-    return ggml_tallocr_new_measure_from_buft(ggml_backend_get_default_buffer_type(backend));
-}
-
-ggml_tallocr_t ggml_tallocr_new_from_buft(struct ggml_backend_buffer_type * buft, size_t size) {
-    // create a backend buffer to get the correct tensor allocation sizes
-    ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
-    ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(buffer);
-    alloc->buffer_owned = true;
-    return alloc;
-}
-
-ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size) {
-    return ggml_tallocr_new_from_buft(ggml_backend_get_default_buffer_type(backend), size);
-}
-
-ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer) {
-    ggml_tallocr_t alloc = (ggml_tallocr_t)malloc(sizeof(struct ggml_tallocr));
-
-    *alloc = (struct ggml_tallocr) {
-        /*.buffer        = */ buffer,
-        /*.buffer_owned  = */ false,
-        /*.base          = */ ggml_backend_buffer_get_base(buffer),
-        /*.alignment     = */ ggml_backend_buffer_get_alignment(buffer),
-        /*.n_free_blocks = */ 0,
-        /*.free_blocks   = */ {{0}},
-        /*.max_size      = */ 0,
-        /*.measure       = */ false,
-#ifdef GGML_ALLOCATOR_DEBUG
-        /*.allocated_tensors = */ {0},
-#endif
-    };
-
-    ggml_tallocr_reset(alloc);
-
-    return alloc;
-}
-
-struct ggml_backend_buffer * ggml_tallocr_get_buffer(ggml_tallocr_t alloc) {
-    return alloc->buffer;
-}
-
-void ggml_tallocr_free(ggml_tallocr_t alloc) {
-    if (alloc == NULL) {
-        return;
-    }
-
-    if (alloc->buffer_owned) {
-        ggml_backend_buffer_free(alloc->buffer);
-    }
-    free(alloc);
-}
-
-bool ggml_tallocr_is_measure(ggml_tallocr_t alloc) {
-    return alloc->measure;
-}
-
-size_t ggml_tallocr_max_size(ggml_tallocr_t alloc) {
-    return alloc->max_size;
-}
-
-// graph allocator
-
-struct hash_node {
-    int n_children;
-    int n_views;
-};
-
-struct ggml_gallocr {
-    ggml_tallocr_t talloc;
-    struct ggml_hash_set hash_set;
-    struct hash_node * hash_values;
-    size_t hash_values_size;
-    ggml_tallocr_t * hash_allocs;
-    int * parse_seq;
-    int parse_seq_len;
-};
-
-ggml_gallocr_t ggml_gallocr_new(void) {
-    ggml_gallocr_t galloc = (ggml_gallocr_t)malloc(sizeof(struct ggml_gallocr));
-
-    *galloc = (struct ggml_gallocr) {
-        /*.talloc           = */ NULL,
-        /*.hash_set         = */ {0},
-        /*.hash_values      = */ NULL,
-        /*.hash_values_size = */ 0,
-        /*.hash_allocs      = */ NULL,
-        /*.parse_seq        = */ NULL,
-        /*.parse_seq_len    = */ 0,
-    };
-
-    return galloc;
-}
-
-void ggml_gallocr_free(ggml_gallocr_t galloc) {
-    if (galloc == NULL) {
-        return;
-    }
-
-    if (galloc->hash_set.keys != NULL) {
-        free(galloc->hash_set.keys);
-    }
-    if (galloc->hash_values != NULL) {
-        free(galloc->hash_values);
-    }
-    if (galloc->hash_allocs != NULL) {
-        free(galloc->hash_allocs);
-    }
-    if (galloc->parse_seq != NULL) {
-        free(galloc->parse_seq);
-    }
-    free(galloc);
-}
-
-void ggml_gallocr_set_parse_seq(ggml_gallocr_t galloc, const int * list, int n) {
-    free(galloc->parse_seq);
-    galloc->parse_seq = malloc(sizeof(int) * n);
-
-    for (int i = 0; i < n; i++) {
-        galloc->parse_seq[i] = list[i];
-    }
-    galloc->parse_seq_len = n;
-}
-
-static struct hash_node * hash_get(ggml_gallocr_t galloc, struct ggml_tensor * t) {
-    size_t i = ggml_hash_find_or_insert(galloc->hash_set, t);
-    return &galloc->hash_values[i];
-}
-
-static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
-    if (a->type != b->type) {
-        return false;
-    }
-    for (int i = 0; i < GGML_MAX_DIMS; i++) {
-        if (a->ne[i] != b->ne[i]) {
-            return false;
-        }
-        if (a->nb[i] != b->nb[i]) {
-            return false;
-        }
-    }
-    return true;
-}
-
-static bool ggml_op_can_inplace(enum ggml_op op) {
-    switch (op) {
-        case GGML_OP_SCALE:
-        case GGML_OP_DIAG_MASK_ZERO:
-        case GGML_OP_DIAG_MASK_INF:
-        case GGML_OP_ADD:
-        case GGML_OP_ADD1:
-        case GGML_OP_SUB:
-        case GGML_OP_MUL:
-        case GGML_OP_DIV:
-        case GGML_OP_SQR:
-        case GGML_OP_SQRT:
-        case GGML_OP_LOG:
-        case GGML_OP_UNARY:
-        case GGML_OP_ROPE:
-        case GGML_OP_RMS_NORM:
-        case GGML_OP_SOFT_MAX:
-            return true;
-
-        default:
-            return false;
-    }
-}
-
-static ggml_tallocr_t node_tallocr(ggml_gallocr_t galloc, struct ggml_tensor * node) {
-    if (galloc->talloc != NULL) {
-        return galloc->talloc;
-    }
-
-    return galloc->hash_allocs[ggml_hash_find_or_insert(galloc->hash_set, node)];
-}
-
-static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view, bool update_backend) {
-    ggml_tallocr_t alloc = node_tallocr(galloc, view);
-
-    GGML_ASSERT(view->view_src != NULL && view->view_src->data != NULL);
-    if (update_backend) {
-        view->backend = view->view_src->backend;
-    }
-    // views are initialized in the alloc buffer rather than the view_src buffer
-    view->buffer  = alloc->buffer;
-    view->data    = (char *)view->view_src->data + view->view_offs;
-
-    assert(ggml_tallocr_is_measure(alloc) || !view->buffer || view->buffer->buft == alloc->buffer->buft);
-
-    if (!alloc->measure) {
-        ggml_backend_buffer_init_tensor(alloc->buffer, view);
-    }
-}
-
-static void allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node) {
-    ggml_tallocr_t alloc = node_tallocr(galloc, node);
-
-    if (node->data == NULL) {
-        if (ggml_is_view(node)) {
-            init_view(galloc, node, true);
-        } else {
-            // see if we can reuse a parent's buffer (inplace)
-            if (ggml_op_can_inplace(node->op)) {
-                for (int i = 0; i < GGML_MAX_SRC; i++) {
-                    struct ggml_tensor * parent = node->src[i];
-                    if (parent == NULL) {
-                        break;
-                    }
-
-                    // if the node's data is external, then we cannot re-use it
-                    if (ggml_tallocr_is_own(alloc, parent) == false) {
-                        AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
-                        continue;
-                    }
-
-                    struct hash_node * p_hn = hash_get(galloc, parent);
-                    if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent)) {
-                        if (ggml_is_view(parent)) {
-                            struct ggml_tensor * view_src = parent->view_src;
-                            struct hash_node * view_src_hn = hash_get(galloc, view_src);
-                            if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
-                                // TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite
-                                // the parent's data that it will need later (same layout requirement). the problem is that then
-                                // we cannot free the tensor because the original address of the allocation is lost.
-                                // adding a view_src pointer to the tensor would solve this and simplify the code dealing with views
-                                // for now, we only reuse the parent's data if the offset is zero (view_src->data == parent->data)
-                                AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
-                                node->view_src = view_src;
-                                view_src_hn->n_views += 1;
-                                init_view(galloc, node, false);
-                                return;
-                            }
-                        } else {
-                            AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
-                            node->view_src = parent;
-                            p_hn->n_views += 1;
-                            init_view(galloc, node, false);
-                            return;
-                        }
-                    }
-                }
-            }
-            ggml_tallocr_alloc(alloc, node);
-        }
-    }
-}
-
-static void free_node(ggml_gallocr_t galloc, struct ggml_tensor * node) {
-    ggml_tallocr_t alloc = node_tallocr(galloc, node);
-
-    ggml_tallocr_free_tensor(alloc, node);
-}
-
-static void ggml_tallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * gf) {
-    const int * parse_seq     = galloc->parse_seq;
-    int         parse_seq_len = galloc->parse_seq_len;
-
-    // count number of children and views
-    for (int i = 0; i < gf->n_nodes; i++) {
-        struct ggml_tensor * node = gf->nodes[i];
-
-        if (ggml_is_view(node)) {
-            struct ggml_tensor * view_src = node->view_src;
-            hash_get(galloc, view_src)->n_views += 1;
-            if (node->buffer == NULL && node->data != NULL) {
-                // view of a pre-allocated tensor, didn't call init_view() yet
-                init_view(galloc, node, true);
-            }
-        }
-
-        for (int j = 0; j < GGML_MAX_SRC; j++) {
-            struct ggml_tensor * parent = node->src[j];
-            if (parent == NULL) {
-                break;
-            }
-            hash_get(galloc, parent)->n_children += 1;
-            if (ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) {
-                init_view(galloc, parent, true);
-            }
-        }
-   }
-
-    // allocate tensors
-    // if we have parse_seq then we allocate nodes following the list, and we only free nodes at barriers
-    int last_barrier_pos = 0;
-    int n_nodes = parse_seq_len ? parse_seq_len : gf->n_nodes;
-
-    for (int ind = 0; ind < n_nodes; ind++) {
-        // allocate a node if there is no parse_seq or this is not a barrier
-        if (parse_seq_len == 0 || parse_seq[ind] != -1) {
-            int i = parse_seq_len ? parse_seq[ind] : ind;
-            struct ggml_tensor * node = gf->nodes[i];
-
-            // allocate parents (leafs)
-            for (int j = 0; j < GGML_MAX_SRC; j++) {
-                struct ggml_tensor * parent = node->src[j];
-                if (parent == NULL) {
-                    break;
-                }
-                allocate_node(galloc, parent);
-            }
-
-            // allocate node
-            allocate_node(galloc, node);
-
-            AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
-            for (int j = 0; j < GGML_MAX_SRC; j++) {
-                struct ggml_tensor * parent = node->src[j];
-                if (parent == NULL) {
-                    break;
-                }
-                AT_PRINTF("%s", parent->name);
-                if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
-                    AT_PRINTF(", ");
-                }
-            }
-            AT_PRINTF("\n");
-        }
-
-        // update parents
-        // update immediately if there is no parse_seq
-        // update only at barriers if there is parse_seq
-        if ((parse_seq_len == 0) || parse_seq[ind] == -1) {
-            int update_start = parse_seq_len ? last_barrier_pos : ind;
-            int update_end   = parse_seq_len ? ind              : ind + 1;
-            for (int i = update_start; i < update_end; i++) {
-                int node_i = parse_seq_len ? parse_seq[i] : i;
-                struct ggml_tensor * node = gf->nodes[node_i];
-
-                for (int j = 0; j < GGML_MAX_SRC; j++) {
-                    struct ggml_tensor * parent = node->src[j];
-                    if (parent == NULL) {
-                        break;
-                    }
-                    struct hash_node * p_hn = hash_get(galloc, parent);
-                    p_hn->n_children -= 1;
-
-                    //AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views);
-
-                    if (p_hn->n_children == 0 && p_hn->n_views == 0) {
-                        if (ggml_is_view(parent)) {
-                            struct ggml_tensor * view_src = parent->view_src;
-                            struct hash_node * view_src_hn = hash_get(galloc, view_src);
-                            view_src_hn->n_views -= 1;
-                            AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views);
-                            if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0) {
-                                free_node(galloc, view_src);
-                            }
-                        }
-                        else {
-                            free_node(galloc, parent);
-                        }
-                    }
-                }
-            }
-            AT_PRINTF("\n");
-            if (parse_seq_len) {
-                last_barrier_pos = ind + 1;
-            }
-        }
-    }
-}
-
-size_t ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, ggml_tallocr_t talloc, struct ggml_cgraph * graph) {
-    size_t hash_size = graph->visited_hash_table.size;
-
-    // check if the hash table is initialized and large enough
-    if (galloc->hash_set.size < hash_size) {
-        if (galloc->hash_set.keys != NULL) {
-            free(galloc->hash_set.keys);
-        }
-        if (galloc->hash_values != NULL) {
-            free(galloc->hash_values);
-        }
-        galloc->hash_set.keys = malloc(sizeof(struct ggml_tensor *) * hash_size);
-        galloc->hash_set.size = hash_size;
-        galloc->hash_values = malloc(sizeof(struct hash_node) * hash_size);
-    }
-
-    // reset hash table
-    memset(galloc->hash_set.keys, 0, sizeof(struct ggml_tensor *) * hash_size);
-    memset(galloc->hash_values,   0, sizeof(struct hash_node) * hash_size);
-
-    galloc->talloc = talloc;
-    ggml_tallocr_alloc_graph_impl(galloc, graph);
-    galloc->talloc = NULL;
-
-    size_t max_size = ggml_tallocr_max_size(talloc);
-
-    return max_size;
-}
-
-void ggml_gallocr_alloc_graph_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, struct ggml_hash_set hash_set, ggml_tallocr_t * hash_node_talloc) {
-    const size_t hash_size = hash_set.size;
-
-    GGML_ASSERT(hash_size >= (size_t)(graph->n_nodes + graph->n_leafs));
-
-    galloc->talloc = NULL;
-
-    // alloc hash_values if needed
-    if (galloc->hash_values == NULL || galloc->hash_values_size < hash_size) {
-        free(galloc->hash_values);
-        galloc->hash_values      = malloc(sizeof(struct hash_node) * hash_size);
-        galloc->hash_values_size = hash_size;
-    }
-
-    // free hash_set.keys if needed
-    if (galloc->hash_set.keys != NULL) {
-        free(galloc->hash_set.keys);
-    }
-    galloc->hash_set = hash_set;
-
-    // reset hash values
-    memset(galloc->hash_values, 0, sizeof(struct hash_node) * hash_size);
-
-    galloc->hash_allocs = hash_node_talloc;
-
-    ggml_tallocr_alloc_graph_impl(galloc, graph);
-
-    // remove unowned resources
-    galloc->hash_set.keys = NULL;
-    galloc->hash_allocs = NULL;
-}
-
-// legacy API wrapper
-
-struct ggml_allocr {
-    ggml_tallocr_t talloc;
-    ggml_gallocr_t galloc;
-};
-
-static ggml_allocr_t ggml_allocr_new_impl(ggml_tallocr_t talloc) {
-    ggml_allocr_t alloc = (ggml_allocr_t)malloc(sizeof(struct ggml_allocr));
-    *alloc = (struct ggml_allocr) {
-        /*.talloc = */ talloc,
-        /*.galloc = */ ggml_gallocr_new(),
-    };
-    return alloc;
-}
-
-ggml_allocr_t ggml_allocr_new(void * data, size_t size, size_t alignment) {
-    return ggml_allocr_new_impl(ggml_tallocr_new(data, size, alignment));
-}
-
-ggml_allocr_t ggml_allocr_new_measure(size_t alignment) {
-    return ggml_allocr_new_impl(ggml_tallocr_new_measure(alignment));
-}
-
-ggml_allocr_t ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer) {
-    return ggml_allocr_new_impl(ggml_tallocr_new_from_buffer(buffer));
-}
-
-ggml_allocr_t ggml_allocr_new_from_backend(struct ggml_backend * backend, size_t size) {
-    return ggml_allocr_new_impl(ggml_tallocr_new_from_backend(backend, size));
-}
-
-ggml_allocr_t ggml_allocr_new_measure_from_backend(struct ggml_backend * backend) {
-    return ggml_allocr_new_impl(ggml_tallocr_new_measure_from_backend(backend));
-}
-
-struct ggml_backend_buffer * ggml_allocr_get_buffer(ggml_allocr_t alloc) {
-    return ggml_tallocr_get_buffer(alloc->talloc);
-}
-
-void ggml_allocr_set_parse_seq(ggml_allocr_t alloc, const int * list, int n) {
-    ggml_gallocr_set_parse_seq(alloc->galloc, list, n);
-}
-
-void ggml_allocr_free(ggml_allocr_t alloc) {
-    if (alloc == NULL) {
-        return;
-    }
-
-    ggml_gallocr_free(alloc->galloc);
-    ggml_tallocr_free(alloc->talloc);
-    free(alloc);
-}
-
-bool ggml_allocr_is_measure(ggml_allocr_t alloc) {
-    return ggml_tallocr_is_measure(alloc->talloc);
-}
-
-void ggml_allocr_reset(ggml_allocr_t alloc) {
-    ggml_tallocr_reset(alloc->talloc);
-}
-
-void ggml_allocr_alloc(ggml_allocr_t alloc, struct ggml_tensor * tensor) {
-    ggml_tallocr_alloc(alloc->talloc, tensor);
-}
-
-size_t ggml_allocr_max_size(ggml_allocr_t alloc) {
-    return ggml_tallocr_max_size(alloc->talloc);
-}
-
-size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph * graph) {
-    return ggml_gallocr_alloc_graph(alloc->galloc, alloc->talloc, graph);
-}
-
-// utils
-ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
-    GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
-
-    size_t alignment = ggml_backend_buft_get_alignment(buft);
-
-    size_t nbytes = 0;
-    for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
-        if (t->data == NULL && t->view_src == NULL) {
-            nbytes += GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
-        }
-    }
-
-    if (nbytes == 0) {
-        // all the tensors in the context are already allocated
-#ifndef NDEBUG
-        fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
-#endif
-        return NULL;
-    }
-
-    ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, nbytes);
-    if (buffer == NULL) {
-        // failed to allocate buffer
-#ifndef NDEBUG
-        fprintf(stderr, "%s: failed to allocate buffer\n", __func__);
-#endif
-        return NULL;
-    }
-
-    ggml_tallocr_t tallocr = ggml_tallocr_new_from_buffer(buffer);
-
-    for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
-        if (t->data == NULL) {
-            if (t->view_src == NULL) {
-                ggml_tallocr_alloc(tallocr, t);
-            } else {
-                ggml_backend_view_init(buffer, t);
-            }
-        } else {
-            if (t->view_src != NULL) {
-                // view of a pre-allocated tensor
-                ggml_backend_view_init(buffer, t);
-            }
-        }
-    }
-
-    ggml_tallocr_free(tallocr);
-
-    return buffer;
-}
-
-ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend) {
-    return ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_get_default_buffer_type(backend));
-}
diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h
deleted file mode 100644
index 1db3290..0000000
--- a/ggml/src/ggml-backend-impl.h
+++ /dev/null
@@ -1,116 +0,0 @@
-#pragma once
-
-// ggml-backend internal header
-
-#include "ggml-backend.h"
-
-#ifdef  __cplusplus
-extern "C" {
-#endif
-
-    //
-    // Backend buffer
-    //
-
-    // buffer type
-    typedef void * ggml_backend_buffer_type_context_t;
-
-    struct ggml_backend_buffer_type_i {
-        const char *          (*get_name)        (ggml_backend_buffer_type_t buft);
-        ggml_backend_buffer_t (*alloc_buffer)    (ggml_backend_buffer_type_t buft, size_t size);
-        size_t                (*get_alignment)   (ggml_backend_buffer_type_t buft); // tensor alignment
-        size_t                (*get_alloc_size)  (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
-        bool                  (*supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
-        // check if tensor data is in host memory
-        // should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
-        bool                  (*is_host)         (ggml_backend_buffer_type_t buft);
-    };
-
-    struct ggml_backend_buffer_type {
-        struct ggml_backend_buffer_type_i  iface;
-        ggml_backend_buffer_type_context_t context;
-    };
-
-    // buffer
-    typedef void * ggml_backend_buffer_context_t;
-
-    struct ggml_backend_buffer_i {
-        const char * (*get_name)   (ggml_backend_buffer_t buffer);
-        void         (*free_buffer)(ggml_backend_buffer_t buffer);
-        void *       (*get_base)   (ggml_backend_buffer_t buffer);
-        void         (*init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
-        void         (*set_tensor) (ggml_backend_buffer_t buffer,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-        void         (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
-        bool         (*cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer
-        void         (*clear)      (ggml_backend_buffer_t buffer, uint8_t value);
-        void         (*reset)      (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
-    };
-
-    struct ggml_backend_buffer {
-        struct ggml_backend_buffer_i  iface;
-        ggml_backend_buffer_type_t    buft;
-        ggml_backend_buffer_context_t context;
-        size_t size;
-        enum ggml_backend_buffer_usage usage;
-    };
-
-    ggml_backend_buffer_t ggml_backend_buffer_init(
-                   ggml_backend_buffer_type_t      buft,
-            struct ggml_backend_buffer_i           iface,
-                   ggml_backend_buffer_context_t   context,
-                   size_t                          size);
-
-    // do not use directly, use ggml_backend_tensor_copy instead
-    bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
-
-    //
-    // Backend
-    //
-
-    typedef void * ggml_backend_context_t;
-
-    struct ggml_backend_i {
-        const char * (*get_name)(ggml_backend_t backend);
-
-        void (*free)(ggml_backend_t backend);
-
-        // buffer allocation
-        ggml_backend_buffer_type_t (*get_default_buffer_type)(ggml_backend_t backend);
-
-        // (optional) asynchronous tensor data access
-        void (*set_tensor_async)(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-        void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
-        bool (*cpy_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * src, struct ggml_tensor * dst);
-
-        // (optional) complete all pending operations
-        void (*synchronize)(ggml_backend_t backend);
-
-        // compute graph with a plan
-        ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
-        void                      (*graph_plan_free)   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-        void                      (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-
-        // compute graph without a plan (async)
-        bool (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
-
-        // check if the backend supports an operation
-        bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
-    };
-
-    struct ggml_backend {
-        struct ggml_backend_i iface;
-
-        ggml_backend_context_t context;
-    };
-
-    //
-    // Backend registry
-    //
-
-    typedef ggml_backend_t (*ggml_backend_init_fn)(const char * params, void * user_data);
-
-    void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);
-
-#ifdef  __cplusplus
-}
-#endif
diff --git a/ggml/src/ggml-backend.c b/ggml/src/ggml-backend.c
deleted file mode 100644
index 505dbba..0000000
--- a/ggml/src/ggml-backend.c
+++ /dev/null
@@ -1,1678 +0,0 @@
-#include "ggml-backend-impl.h"
-#include "ggml-alloc.h"
-#include "ggml-impl.h"
-
-#include <assert.h>
-#include <limits.h>
-#include <stdarg.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-
-#define MAX(a, b) ((a) > (b) ? (a) : (b))
-
-
-// backend buffer type
-
-const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
-    return buft->iface.get_name(buft);
-}
-
-ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-    return buft->iface.alloc_buffer(buft, size);
-}
-
-size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
-    return buft->iface.get_alignment(buft);
-}
-
-size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
-    // get_alloc_size is optional, defaults to ggml_nbytes
-    if (buft->iface.get_alloc_size) {
-        return buft->iface.get_alloc_size(buft, tensor);
-    }
-    return ggml_nbytes(tensor);
-}
-
-bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    return buft->iface.supports_backend(buft, backend);
-}
-
-bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
-    if (buft->iface.is_host) {
-        return buft->iface.is_host(buft);
-    }
-    return false;
-}
-
-// backend buffer
-
-ggml_backend_buffer_t ggml_backend_buffer_init(
-               ggml_backend_buffer_type_t      buft,
-        struct ggml_backend_buffer_i           iface,
-               ggml_backend_buffer_context_t   context,
-               size_t                          size) {
-    ggml_backend_buffer_t buffer = malloc(sizeof(struct ggml_backend_buffer));
-
-    GGML_ASSERT(iface.get_base != NULL);
-
-    (*buffer) = (struct ggml_backend_buffer) {
-        /* .interface = */ iface,
-        /* .buft      = */ buft,
-        /* .context   = */ context,
-        /* .size      = */ size,
-        /* .usage     = */ GGML_BACKEND_BUFFER_USAGE_ANY
-    };
-
-    return buffer;
-}
-
-const char * ggml_backend_buffer_name(ggml_backend_buffer_t buffer) {
-    return buffer->iface.get_name(buffer);
-}
-
-void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
-    if (buffer == NULL) {
-        return;
-    }
-
-    if (buffer->iface.free_buffer != NULL) {
-        buffer->iface.free_buffer(buffer);
-    }
-    free(buffer);
-}
-
-size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
-    return buffer->size;
-}
-
-void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
-    void * base = buffer->iface.get_base(buffer);
-
-    GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
-
-    return base;
-}
-
-void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
-    // init_tensor is optional
-    if (buffer->iface.init_tensor) {
-        buffer->iface.init_tensor(buffer, tensor);
-    }
-}
-
-size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer) {
-    return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer));
-}
-
-size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
-    return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
-}
-
-void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
-    buffer->iface.clear(buffer, value);
-}
-
-bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
-    return ggml_backend_buft_is_host(ggml_backend_buffer_get_type(buffer));
-}
-
-void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
-    buffer->usage = usage;
-}
-
-ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
-    return buffer->buft;
-}
-
-void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer) {
-    if (buffer->iface.reset) {
-        buffer->iface.reset(buffer);
-    }
-}
-
-bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst) {
-    ggml_backend_buffer_t dst_buf = dst->view_src ? dst->view_src->buffer : dst->buffer;
-    if (dst_buf->iface.cpy_tensor) {
-        return src->buffer->iface.cpy_tensor(dst_buf, src, dst);
-    }
-    return false;
-}
-
-// backend
-
-const char * ggml_backend_name(ggml_backend_t backend) {
-    if (backend == NULL) {
-        return "NULL";
-    }
-    return backend->iface.get_name(backend);
-}
-
-void ggml_backend_free(ggml_backend_t backend) {
-    if (backend == NULL) {
-        return;
-    }
-
-    backend->iface.free(backend);
-}
-
-ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend) {
-    return backend->iface.get_default_buffer_type(backend);
-}
-
-ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size) {
-    return ggml_backend_buft_alloc_buffer(ggml_backend_get_default_buffer_type(backend), size);
-}
-
-size_t ggml_backend_get_alignment(ggml_backend_t backend) {
-    return ggml_backend_buft_get_alignment(ggml_backend_get_default_buffer_type(backend));
-}
-
-void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
-
-    if (backend->iface.set_tensor_async == NULL) {
-        ggml_backend_tensor_set(tensor, data, offset, size);
-    } else {
-        backend->iface.set_tensor_async(backend, tensor, data, offset, size);
-    }
-}
-
-void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
-
-    if (backend->iface.get_tensor_async == NULL) {
-        ggml_backend_tensor_get(tensor, data, offset, size);
-    } else {
-        backend->iface.get_tensor_async(backend, tensor, data, offset, size);
-    }
-}
-
-void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-    ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
-
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-    GGML_ASSERT(buf != NULL && "tensor buffer not set");
-    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
-
-    tensor->buffer->iface.set_tensor(buf, tensor, data, offset, size);
-}
-
-void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-    ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
-
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-    GGML_ASSERT(tensor->buffer != NULL && "tensor buffer not set");
-    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
-
-    tensor->buffer->iface.get_tensor(buf, tensor, data, offset, size);
-}
-
-void ggml_backend_synchronize(ggml_backend_t backend) {
-    if (backend->iface.synchronize == NULL) {
-        return;
-    }
-
-    backend->iface.synchronize(backend);
-}
-
-ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
-    return backend->iface.graph_plan_create(backend, cgraph);
-}
-
-void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
-    backend->iface.graph_plan_free(backend, plan);
-}
-
-void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
-    backend->iface.graph_plan_compute(backend, plan);
-}
-
-bool ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
-    return backend->iface.graph_compute(backend, cgraph);
-}
-
-bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
-    return backend->iface.supports_op(backend, op);
-}
-
-// backend copy
-
-static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
-    if (a->type != b->type) {
-        return false;
-    }
-    for (int i = 0; i < GGML_MAX_DIMS; i++) {
-        if (a->ne[i] != b->ne[i]) {
-            return false;
-        }
-        if (a->nb[i] != b->nb[i]) {
-            return false;
-        }
-    }
-    return true;
-}
-
-void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst) {
-    GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
-
-    if (src == dst) {
-        return;
-    }
-
-    if (ggml_backend_buffer_is_host(src->buffer)) {
-        ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src));
-    } else if (ggml_backend_buffer_is_host(dst->buffer)) {
-        ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
-    } else if (!ggml_backend_buffer_copy_tensor(src, dst)) {
-#ifndef NDEBUG
-        fprintf(stderr, "%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer));
-#endif
-        size_t nbytes = ggml_nbytes(src);
-        void * data = malloc(nbytes);
-        ggml_backend_tensor_get(src, data, 0, nbytes);
-        ggml_backend_tensor_set(dst, data, 0, nbytes);
-        free(data);
-    }
-}
-
-void ggml_backend_tensor_copy_async(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
-    GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
-
-    if (src == dst) {
-        return;
-    }
-
-    if (ggml_backend_buft_supports_backend(src->buffer->buft, backend) && ggml_backend_buft_supports_backend(dst->buffer->buft, backend)) {
-        if (backend->iface.cpy_tensor_async != NULL) {
-            if (backend->iface.cpy_tensor_async(backend, src, dst)) {
-                return;
-            }
-        }
-    }
-
-    size_t nbytes = ggml_nbytes(src);
-    if (ggml_backend_buffer_is_host(src->buffer)) {
-        ggml_backend_tensor_set_async(backend, dst, src->data, 0, nbytes);
-    }
-    else {
-        ggml_backend_tensor_copy(src, dst);
-    }
-}
-
-
-// backend registry
-
-#define GGML_MAX_BACKENDS_REG 16
-
-struct ggml_backend_reg {
-    char name[128];
-    ggml_backend_init_fn init_fn;
-    ggml_backend_buffer_type_t default_buffer_type;
-    void * user_data;
-};
-
-static struct ggml_backend_reg ggml_backend_registry[GGML_MAX_BACKENDS_REG];
-static size_t ggml_backend_registry_count = 0;
-
-static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data);
-
-static void ggml_backend_registry_init(void) {
-    static bool initialized = false;
-
-    if (initialized) {
-        return;
-    }
-
-    initialized = true;
-
-    ggml_backend_register("CPU", ggml_backend_reg_cpu_init, ggml_backend_cpu_buffer_type(), NULL);
-
-    // add forward decls here to avoid including the backend headers
-#ifdef GGML_USE_CUBLAS
-    extern void ggml_backend_cuda_reg_devices(void);
-    ggml_backend_cuda_reg_devices();
-#endif
-
-#ifdef GGML_USE_METAL
-    extern ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data);
-    extern ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
-    ggml_backend_register("Metal", ggml_backend_reg_metal_init, ggml_backend_metal_buffer_type(), NULL);
-#endif
-}
-
-void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
-    GGML_ASSERT(ggml_backend_registry_count < GGML_MAX_BACKENDS_REG);
-
-    size_t id = ggml_backend_registry_count;
-
-    ggml_backend_registry[id] = (struct ggml_backend_reg) {
-        /* .name                = */ {0},
-        /* .fn                  = */ init_fn,
-        /* .default_buffer_type = */ default_buffer_type,
-        /* .user_data           = */ user_data,
-    };
-
-    snprintf(ggml_backend_registry[id].name, sizeof(ggml_backend_registry[id].name), "%s", name);
-
-#ifndef NDEBUG
-    fprintf(stderr, "%s: registered backend %s\n", __func__, name);
-#endif
-
-    ggml_backend_registry_count++;
-}
-
-size_t ggml_backend_reg_get_count(void) {
-    ggml_backend_registry_init();
-
-    return ggml_backend_registry_count;
-}
-
-size_t ggml_backend_reg_find_by_name(const char * name) {
-    ggml_backend_registry_init();
-
-    for (size_t i = 0; i < ggml_backend_registry_count; i++) {
-        // TODO: case insensitive in a portable way
-        if (strcmp(ggml_backend_registry[i].name, name) == 0) {
-            return i;
-        }
-    }
-
-    // not found
-    return SIZE_MAX;
-}
-
-// init from backend:params string
-ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str) {
-    ggml_backend_registry_init();
-
-    const char * params = strchr(backend_str, ':');
-    char backend_name[128];
-    if (params == NULL) {
-        snprintf(backend_name, sizeof(backend_name), "%s", backend_str);
-        params = "";
-    } else {
-        snprintf(backend_name, sizeof(backend_name), "%.*s", (int)(params - backend_str), backend_str);
-        params++;
-    }
-
-    size_t backend_i = ggml_backend_reg_find_by_name(backend_name);
-
-    if (backend_i == SIZE_MAX) {
-        fprintf(stderr, "%s: backend %s not found\n", __func__, backend_name);
-        return NULL;
-    }
-
-    return ggml_backend_reg_init_backend(backend_i, params);
-}
-
-const char * ggml_backend_reg_get_name(size_t i) {
-    ggml_backend_registry_init();
-
-    GGML_ASSERT(i < ggml_backend_registry_count);
-    return ggml_backend_registry[i].name;
-}
-
-ggml_backend_t ggml_backend_reg_init_backend(size_t i, const char * params) {
-    ggml_backend_registry_init();
-
-    GGML_ASSERT(i < ggml_backend_registry_count);
-    return ggml_backend_registry[i].init_fn(params, ggml_backend_registry[i].user_data);
-}
-
-ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i) {
-    ggml_backend_registry_init();
-
-    GGML_ASSERT(i < ggml_backend_registry_count);
-    return ggml_backend_registry[i].default_buffer_type;
-}
-
-ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size) {
-    ggml_backend_registry_init();
-
-    GGML_ASSERT(i < ggml_backend_registry_count);
-    return ggml_backend_buft_alloc_buffer(ggml_backend_registry[i].default_buffer_type, size);
-}
-
-// backend CPU
-
-static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t buffer) {
-    return "CPU";
-
-    GGML_UNUSED(buffer);
-}
-
-static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
-    return (void *)buffer->context;
-}
-
-static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-    free(buffer->context);
-}
-
-static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-    memcpy((char *)tensor->data + offset, data, size);
-
-    GGML_UNUSED(buffer);
-}
-
-static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-    memcpy(data, (const char *)tensor->data + offset, size);
-
-    GGML_UNUSED(buffer);
-}
-
-static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
-    if (ggml_backend_buffer_is_host(src->buffer)) {
-        memcpy(dst->data, src->data, ggml_nbytes(src));
-        return true;
-    }
-    return false;
-
-    GGML_UNUSED(buffer);
-}
-
-static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
-    memset(buffer->context, value, buffer->size);
-}
-
-static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
-    /* .get_name        = */ ggml_backend_cpu_buffer_name,
-    /* .free_buffer     = */ ggml_backend_cpu_buffer_free_buffer,
-    /* .get_base        = */ ggml_backend_cpu_buffer_get_base,
-    /* .init_tensor     = */ NULL, // no initialization required
-    /* .set_tensor      = */ ggml_backend_cpu_buffer_set_tensor,
-    /* .get_tensor      = */ ggml_backend_cpu_buffer_get_tensor,
-    /* .cpy_tensor      = */ ggml_backend_cpu_buffer_cpy_tensor,
-    /* .clear           = */ ggml_backend_cpu_buffer_clear,
-    /* .reset           = */ NULL,
-};
-
-// for buffers from ptr, free is not called
-static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
-    /* .get_name        = */ ggml_backend_cpu_buffer_name,
-    /* .free_buffer     = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
-    /* .get_base        = */ ggml_backend_cpu_buffer_get_base,
-    /* .init_tensor     = */ NULL, // no initialization required
-    /* .set_tensor      = */ ggml_backend_cpu_buffer_set_tensor,
-    /* .get_tensor      = */ ggml_backend_cpu_buffer_get_tensor,
-    /* .cpy_tensor      = */ ggml_backend_cpu_buffer_cpy_tensor,
-    /* .clear           = */ ggml_backend_cpu_buffer_clear,
-    /* .reset           = */ NULL,
-};
-
-static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512
-
-static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
-    return "CPU";
-
-    GGML_UNUSED(buft);
-}
-
-static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-    size += TENSOR_ALIGNMENT;   // malloc may return an address that is not aligned
-    void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC?
-
-    GGML_ASSERT(data != NULL && "failed to allocate buffer");
-
-    return ggml_backend_buffer_init(buft, cpu_backend_buffer_i, data, size);
-}
-
-static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
-    return TENSOR_ALIGNMENT;
-
-    GGML_UNUSED(buft);
-}
-
-static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    return ggml_backend_is_cpu(backend);
-
-    GGML_UNUSED(buft);
-}
-
-static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
-    return true;
-
-    GGML_UNUSED(buft);
-}
-
-ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
-    static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
-        /* .iface = */ {
-            /* .get_name         = */ ggml_backend_cpu_buffer_type_get_name,
-            /* .alloc_buffer     = */ ggml_backend_cpu_buffer_type_alloc_buffer,
-            /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
-            /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
-            /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
-            /* .is_host          = */ ggml_backend_cpu_buffer_type_is_host,
-        },
-        /* .context = */ NULL,
-    };
-
-    return &ggml_backend_cpu_buffer_type;
-}
-
-#ifdef GGML_USE_CPU_HBM
-
-// buffer type HBM
-
-#include <hbwmalloc.h>
-
-static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
-    return "CPU_HBM";
-
-    GGML_UNUSED(buft);
-}
-
-static const char * ggml_backend_cpu_hbm_buffer_get_name(ggml_backend_buffer_t buf) {
-    return "CPU_HBM";
-
-    GGML_UNUSED(buf);
-}
-
-static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-    hbw_free(buffer->context);
-}
-
-static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-    //void * ptr = hbw_malloc(size);
-    void * ptr;
-    int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
-    if (result != 0) {
-        fprintf(stderr, "failed to allocate HBM buffer of size %zu\n", size);
-        return NULL;
-    }
-
-    ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
-    buffer->buft = buft;
-    buffer->iface.get_name = ggml_backend_cpu_hbm_buffer_get_name;
-    buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
-
-    return buffer;
-}
-
-ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
-    static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
-        /* .iface    = */ {
-            /* .get_name         = */ ggml_backend_cpu_hbm_buffer_type_get_name,
-            /* .alloc_buffer     = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
-            /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
-            /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
-            /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
-            /* .is_host          = */ ggml_backend_cpu_buffer_type_is_host,
-        },
-        /* .context  = */ NULL,
-    };
-
-    return &ggml_backend_cpu_buffer_type_hbm;
-}
-#endif
-
-struct ggml_backend_cpu_context {
-    int n_threads;
-    void * work_data;
-    size_t work_size;
-};
-
-static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
-    return "CPU";
-
-    GGML_UNUSED(backend);
-}
-
-static void ggml_backend_cpu_free(ggml_backend_t backend) {
-    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
-    free(cpu_ctx->work_data);
-    free(cpu_ctx);
-    free(backend);
-}
-
-static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) {
-    return ggml_backend_cpu_buffer_type();
-
-    GGML_UNUSED(backend);
-}
-
-struct ggml_backend_plan_cpu {
-    struct ggml_cplan cplan;
-    struct ggml_cgraph cgraph;
-};
-
-static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) {
-    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
-
-    struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
-
-    cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
-    cpu_plan->cgraph = *cgraph; // FIXME: deep copy
-
-    if (cpu_plan->cplan.work_size > 0) {
-        cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
-    }
-
-    return cpu_plan;
-}
-
-static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
-    struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
-
-    free(cpu_plan->cplan.work_data);
-    free(cpu_plan);
-
-    GGML_UNUSED(backend);
-}
-
-static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
-    struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
-
-    ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
-
-    GGML_UNUSED(backend);
-}
-
-static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
-    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
-
-    struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
-
-    if (cpu_ctx->work_size < cplan.work_size) {
-        // TODO: may be faster to free and use malloc to avoid the copy
-        cpu_ctx->work_data = realloc(cpu_ctx->work_data, cplan.work_size);
-        cpu_ctx->work_size = cplan.work_size;
-    }
-
-    cplan.work_data = cpu_ctx->work_data;
-
-    ggml_graph_compute(cgraph, &cplan);
-    return true;
-}
-
-static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
-    switch (op->op) {
-        case GGML_OP_MUL_MAT:
-            return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
-        default:
-            return true;
-    }
-
-    GGML_UNUSED(backend);
-}
-
-static struct ggml_backend_i cpu_backend_i = {
-    /* .get_name                = */ ggml_backend_cpu_name,
-    /* .free                    = */ ggml_backend_cpu_free,
-    /* .get_default_buffer_type = */ ggml_backend_cpu_get_default_buffer_type,
-    /* .set_tensor_async        = */ NULL,
-    /* .get_tensor_async        = */ NULL,
-    /* .cpy_tensor_async        = */ NULL,
-    /* .synchronize             = */ NULL,
-    /* .graph_plan_create       = */ ggml_backend_cpu_graph_plan_create,
-    /* .graph_plan_free         = */ ggml_backend_cpu_graph_plan_free,
-    /* .graph_plan_compute      = */ ggml_backend_cpu_graph_plan_compute,
-    /* .graph_compute           = */ ggml_backend_cpu_graph_compute,
-    /* .supports_op             = */ ggml_backend_cpu_supports_op,
-};
-
-ggml_backend_t ggml_backend_cpu_init(void) {
-    struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context));
-
-    ctx->n_threads = GGML_DEFAULT_N_THREADS;
-    ctx->work_data = NULL;
-    ctx->work_size = 0;
-
-    ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend));
-
-    *cpu_backend = (struct ggml_backend) {
-        /* .interface = */ cpu_backend_i,
-        /* .context   = */ ctx
-    };
-    return cpu_backend;
-}
-
-bool ggml_backend_is_cpu(ggml_backend_t backend) {
-    return backend && backend->iface.get_name == ggml_backend_cpu_name;
-}
-
-void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
-    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
-
-    struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
-    ctx->n_threads = n_threads;
-}
-
-ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
-    return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), cpu_backend_buffer_i_from_ptr, ptr, size);
-}
-
-static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data) {
-    return ggml_backend_cpu_init();
-
-    GGML_UNUSED(params);
-    GGML_UNUSED(user_data);
-}
-
-
-// scheduler
-
-#define GGML_MAX_BACKENDS 16
-#define GGML_MAX_SPLITS 256
-#define GGML_MAX_SPLIT_INPUTS 16
-
-struct ggml_backend_sched_split {
-    ggml_tallocr_t tallocr;
-    int i_start;
-    int i_end;
-    struct ggml_tensor * inputs[GGML_MAX_SPLIT_INPUTS];
-    int n_inputs;
-    // graph view of this split
-    struct ggml_cgraph graph;
-};
-
-struct ggml_backend_sched {
-    bool is_reset; // true if the scheduler has been reset since the last graph split
-
-    int n_backends;
-    ggml_backend_t backends[GGML_MAX_BACKENDS];
-    ggml_backend_buffer_type_t bufts[GGML_MAX_BACKENDS];
-    ggml_tallocr_t  tallocs[GGML_MAX_BACKENDS];
-
-    ggml_gallocr_t galloc;
-
-    // hash keys of the nodes in the graph
-    struct ggml_hash_set    hash_set;
-    // hash values (arrays of [hash_set.size])
-    ggml_tallocr_t *        node_talloc;                     // tallocr assigned to each node (indirectly this is the backend)
-    struct ggml_tensor * (* node_copies)[GGML_MAX_BACKENDS]; // copies of each node for each destination backend
-
-    // copy of the graph with modified inputs
-    struct ggml_cgraph * graph;
-
-    struct ggml_backend_sched_split splits[GGML_MAX_SPLITS];
-    int n_splits;
-
-    struct ggml_context * ctx;
-
-    // align context_buffer to GGML_MEM_ALIGN
-    #ifdef _MSC_VER
-    __declspec(align(GGML_MEM_ALIGN))
-    #else
-    __attribute__((aligned(GGML_MEM_ALIGN)))
-    #endif
-    char context_buffer[GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
-};
-
-#define hash_id(node) ggml_hash_find_or_insert(sched->hash_set, node)
-#define node_allocr(node) sched->node_talloc[hash_id(node)]
-
-static bool ggml_is_view_op(enum ggml_op op) {
-    return op == GGML_OP_VIEW || op == GGML_OP_RESHAPE || op == GGML_OP_PERMUTE || op == GGML_OP_TRANSPOSE;
-}
-
-// returns the priority of the backend, lower is better
-static int sched_backend_prio(ggml_backend_sched_t sched, ggml_backend_t backend) {
-    for (int i = 0; i < sched->n_backends; i++) {
-        if (sched->backends[i] == backend) {
-            return i;
-        }
-    }
-    return INT_MAX;
-}
-
-static int sched_allocr_prio(ggml_backend_sched_t sched, ggml_tallocr_t allocr) {
-    for (int i = 0; i < sched->n_backends; i++) {
-        if (sched->tallocs[i] == allocr) {
-            return i;
-        }
-    }
-    return INT_MAX;
-}
-
-static ggml_tallocr_t sched_allocr_from_buffer(ggml_backend_sched_t sched, ggml_backend_buffer_t buffer) {
-    if (buffer == NULL) {
-        return NULL;
-    }
-
-    // check if this is already allocate in a allocr buffer (from user manual allocations)
-    for (int i = 0; i < sched->n_backends; i++) {
-        if (ggml_tallocr_get_buffer(sched->tallocs[i]) == buffer) {
-            return sched->tallocs[i];
-        }
-    }
-
-    // find highest prio backend that supports the buffer type
-    for (int i = 0; i < sched->n_backends; i++) {
-        if (ggml_backend_buft_supports_backend(buffer->buft, sched->backends[i])) {
-            return sched->tallocs[i];
-        }
-    }
-    GGML_ASSERT(false && "tensor buffer type not supported by any backend");
-}
-
-static ggml_backend_t get_allocr_backend(ggml_backend_sched_t sched, ggml_tallocr_t allocr) {
-    if (allocr == NULL) {
-        return NULL;
-    }
-    for (int i = 0; i < sched->n_backends; i++) {
-        if (sched->tallocs[i] == allocr) {
-            return sched->backends[i];
-        }
-    }
-    GGML_UNREACHABLE();
-}
-
-#if 0
-static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS][128]; // debug only
-#define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
-#define GET_CAUSE(node) causes[hash_id(node)]
-#else
-#define SET_CAUSE(node, ...)
-#define GET_CAUSE(node) ""
-#endif
-
-// returns the backend that should be used for the node based on the current locations
-static ggml_tallocr_t sched_allocr_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * node) {
-    // assign pre-allocated nodes to their backend
-    // dst
-    ggml_tallocr_t cur_allocr = sched_allocr_from_buffer(sched, node->buffer);
-    if (cur_allocr != NULL) {
-        SET_CAUSE(node, "1.dst");
-        return cur_allocr;
-    }
-    // view_src
-    if (node->view_src != NULL) {
-        cur_allocr = sched_allocr_from_buffer(sched, node->view_src->buffer);
-        if (cur_allocr != NULL) {
-            SET_CAUSE(node, "1.vsrc");
-            return cur_allocr;
-        }
-    }
-    // assign nodes that use weights to the backend of the weights
-    for (int i = 0; i < GGML_MAX_SRC; i++) {
-        const struct ggml_tensor * src = node->src[i];
-        if (src == NULL) {
-            break;
-        }
-        if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
-            ggml_tallocr_t src_allocr = sched_allocr_from_buffer(sched, src->buffer);
-            // operations with weights are always run on the same backend as the weights
-            SET_CAUSE(node, "1.wgt%d", i);
-            return src_allocr;
-        }
-    }
-
-    return NULL;
-}
-
-static char * fmt_size(size_t size) {
-    static char buffer[128];
-    if (size >= 1024*1024) {
-        sprintf(buffer, "%zuM", size/1024/1024);
-    } else {
-        sprintf(buffer, "%zuK", size/1024);
-    }
-    return buffer;
-}
-
-static void sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
-    int cur_split = 0;
-    for (int i = 0; i < graph->n_nodes; i++) {
-        if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
-            ggml_backend_t split_backend = get_allocr_backend(sched, sched->splits[cur_split].tallocr);
-            fprintf(stderr, "\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend),
-                sched->splits[cur_split].n_inputs);
-            for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
-                fprintf(stderr, "[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
-                    fmt_size(ggml_nbytes(sched->splits[cur_split].inputs[j])));
-            }
-            fprintf(stderr, "\n");
-            cur_split++;
-        }
-        struct ggml_tensor * node = graph->nodes[i];
-        if (ggml_is_view_op(node->op)) {
-            continue;
-        }
-        ggml_tallocr_t node_allocr = node_allocr(node);
-        ggml_backend_t node_backend = node_allocr ? get_allocr_backend(sched, node_allocr) : NULL; // FIXME:
-        fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
-            fmt_size(ggml_nbytes(node)), node_allocr ? ggml_backend_name(node_backend) : "NULL", GET_CAUSE(node));
-        for (int j = 0; j < GGML_MAX_SRC; j++) {
-            struct ggml_tensor * src = node->src[j];
-            if (src == NULL) {
-                break;
-            }
-            ggml_tallocr_t src_allocr = node_allocr(src);
-            ggml_backend_t src_backend = src_allocr ? get_allocr_backend(sched, src_allocr) : NULL;
-            fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
-                fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
-        }
-        fprintf(stderr, "\n");
-    }
-}
-
-// creates a copy of the tensor with the same memory layout
-static struct ggml_tensor * ggml_dup_tensor_layout(struct ggml_context * ctx, const struct ggml_tensor * tensor) {
-    struct ggml_tensor * dup = ggml_dup_tensor(ctx, tensor);
-    for (int i = 0; i < GGML_MAX_DIMS; i++) {
-        dup->nb[i] = tensor->nb[i];
-    }
-    return dup;
-}
-
-
-//#define DEBUG_PASS1
-//#define DEBUG_PASS2
-//#define DEBUG_PASS3
-//#define DEBUG_PASS4
-
-// assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
-static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
-    // reset splits
-    sched->n_splits = 0;
-    sched->is_reset = false;
-
-    struct ggml_init_params params = {
-        /* .mem_size =   */ sizeof(sched->context_buffer),
-        /* .mem_buffer = */ sched->context_buffer,
-        /* .no_alloc =   */ true
-    };
-
-    ggml_free(sched->ctx);
-
-    sched->ctx = ggml_init(params);
-    if (sched->ctx == NULL) {
-        fprintf(stderr, "%s: failed to initialize context\n", __func__);
-        GGML_ASSERT(false);
-    }
-
-    // pass 1: assign backends to ops with pre-allocated inputs
-    for (int i = 0; i < graph->n_leafs; i++) {
-        struct ggml_tensor * leaf = graph->leafs[i];
-        if (node_allocr(leaf) != NULL) {
-            // do not overwrite user assignments
-            continue;
-        }
-        node_allocr(leaf) = sched_allocr_from_cur(sched, leaf);
-    }
-
-    for (int i = 0; i < graph->n_nodes; i++) {
-        struct ggml_tensor * node = graph->nodes[i];
-        if (node_allocr(node) != NULL) {
-            // do not overwrite user assignments
-            continue;
-        }
-        node_allocr(node) = sched_allocr_from_cur(sched, node);
-        // src
-        for (int j = 0; j < GGML_MAX_SRC; j++) {
-            struct ggml_tensor * src = node->src[j];
-            if (src == NULL) {
-                break;
-            }
-            if (node_allocr(src) == NULL) {
-                node_allocr(src) = sched_allocr_from_cur(sched, src);
-            }
-        }
-    }
-#ifdef DEBUG_PASS1
-    fprintf(stderr, "PASS 1 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
-#endif
-
-    // pass 2: expand current backend assignments
-    // assign the same backend to adjacent nodes
-    // expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend)
-    // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
-
-    // pass 2.1 expand gpu up
-    {
-        ggml_tallocr_t cur_allocr = NULL;
-        for (int i = graph->n_nodes - 1; i >= 0; i--) {
-            struct ggml_tensor * node = graph->nodes[i];
-            if (ggml_is_view_op(node->op)) {
-                continue;
-            }
-            ggml_tallocr_t node_allocr = node_allocr(node);
-            if (node_allocr != NULL) {
-                if (sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) {
-                    // skip cpu (lowest prio backend)
-                    cur_allocr = NULL;
-                } else {
-                    cur_allocr = node_allocr;
-                }
-            } else {
-                node_allocr(node) = cur_allocr;
-                SET_CAUSE(node, "2.1");
-            }
-        }
-    }
-
-    // pass 2.2 expand gpu down
-    {
-        ggml_tallocr_t cur_allocr = NULL;
-        for (int i = 0; i < graph->n_nodes; i++) {
-            struct ggml_tensor * node = graph->nodes[i];
-            if (ggml_is_view_op(node->op)) {
-                continue;
-            }
-            ggml_tallocr_t node_allocr = node_allocr(node);
-            if (node_allocr != NULL) {
-                if (sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) {
-                    // skip cpu (lowest prio backend)
-                    cur_allocr = NULL;
-                } else {
-                    cur_allocr = node_allocr;
-                }
-            } else {
-                node_allocr(node) = cur_allocr;
-                SET_CAUSE(node, "2.2");
-            }
-        }
-    }
-
-    // pass 2.3 expand rest up
-    {
-        ggml_tallocr_t cur_allocr = NULL;
-        for (int i = graph->n_nodes - 1; i >= 0; i--) {
-            struct ggml_tensor * node = graph->nodes[i];
-            if (ggml_is_view_op(node->op)) {
-                continue;
-            }
-            ggml_tallocr_t node_allocr = node_allocr(node);
-            if (node_allocr != NULL) {
-                cur_allocr = node_allocr;
-            } else {
-                node_allocr(node) = cur_allocr;
-                SET_CAUSE(node, "2.3");
-            }
-        }
-    }
-
-    // pass 2.4 expand rest down
-    {
-        ggml_tallocr_t cur_allocr = NULL;
-        for (int i = 0; i < graph->n_nodes; i++) {
-            struct ggml_tensor * node = graph->nodes[i];
-            if (ggml_is_view_op(node->op)) {
-                continue;
-            }
-            ggml_tallocr_t node_allocr = node_allocr(node);
-            if (node_allocr != NULL) {
-                cur_allocr = node_allocr;
-            } else {
-                node_allocr(node) = cur_allocr;
-                SET_CAUSE(node, "2.4");
-            }
-        }
-    }
-#ifdef DEBUG_PASS2
-    fprintf(stderr, "PASS 2 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
-#endif
-
-    // pass 3: assign backends to remaining src from dst and view_src
-    for (int i = 0; i < graph->n_nodes; i++) {
-        struct ggml_tensor * node = graph->nodes[i];
-        ggml_tallocr_t cur_allocr = node_allocr(node);
-        if (node->view_src != NULL && cur_allocr == NULL) {
-            cur_allocr = node_allocr(node) = node_allocr(node->view_src);
-            SET_CAUSE(node, "3.vsrc");
-        }
-        for (int j = 0; j < GGML_MAX_SRC; j++) {
-            struct ggml_tensor * src = node->src[j];
-            if (src == NULL) {
-                break;
-            }
-            ggml_tallocr_t src_allocr = node_allocr(src);
-            if (src_allocr == NULL) {
-                if (src->view_src != NULL) {
-                    // views are always on the same backend as the source
-                    node_allocr(src) = node_allocr(src->view_src);
-                    SET_CAUSE(src, "3.vsrc");
-                } else {
-                    node_allocr(src) = cur_allocr;
-                    SET_CAUSE(src, "3.cur");
-                }
-            }
-        }
-    }
-#ifdef DEBUG_PASS3
-    fprintf(stderr, "PASS 3 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
-#endif
-
-    // pass 4: split graph, find tensors that need to be copied
-    {
-        int cur_split = 0;
-        // find the backend of the first split, skipping view ops
-        for (int i = 0; i < graph->n_nodes; i++) {
-            struct ggml_tensor * node = graph->nodes[i];
-            if (!ggml_is_view_op(node->op)) {
-                sched->splits[0].tallocr = node_allocr(node);
-                break;
-            }
-        }
-        sched->splits[0].i_start = 0;
-        sched->splits[0].n_inputs = 0;
-        memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK
-        ggml_tallocr_t cur_allocr = sched->splits[0].tallocr;
-        size_t cur_backend_id = sched_allocr_prio(sched, cur_allocr);
-        for (int i = 0; i < graph->n_nodes; i++) {
-            struct ggml_tensor * node = graph->nodes[i];
-
-            if (ggml_is_view_op(node->op)) {
-                continue;
-            }
-
-            ggml_tallocr_t node_allocr = node_allocr(node);
-
-            GGML_ASSERT(node_allocr != NULL); // all nodes should be assigned by now
-
-            if (node_allocr != cur_allocr) {
-                sched->splits[cur_split].i_end = i;
-                cur_split++;
-                GGML_ASSERT(cur_split < GGML_MAX_SPLITS);
-                sched->splits[cur_split].tallocr = node_allocr;
-                sched->splits[cur_split].i_start = i;
-                sched->splits[cur_split].n_inputs = 0;
-                cur_allocr = node_allocr;
-                cur_backend_id = sched_allocr_prio(sched, cur_allocr);
-            }
-
-            // find inputs that are not on the same backend
-            for (int j = 0; j < GGML_MAX_SRC; j++) {
-                struct ggml_tensor * src = node->src[j];
-                if (src == NULL) {
-                    break;
-                }
-                ggml_tallocr_t src_allocr = node_allocr(src);
-                GGML_ASSERT(src_allocr != NULL); // all inputs should be assigned by now
-                if (src_allocr != node_allocr) {
-                    // check if the input is already in the split
-                    bool found = false;
-                    for (int k = 0; k < sched->splits[cur_split].n_inputs; k++) {
-                        if (sched->splits[cur_split].inputs[k] == src) {
-                            found = true;
-                            break;
-                        }
-                    }
-
-                    if (!found) {
-                        int n_inputs = sched->splits[cur_split].n_inputs++;
-                        //printf("split %d input %d: %s (%s)\n", cur_split, n_inputs, src->name, ggml_backend_name(get_allocr_backend(sched, src_allocr)));
-                        GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
-                        sched->splits[cur_split].inputs[n_inputs] = src;
-                    }
-
-                    // create a copy of the input in the split's backend
-                    size_t id = hash_id(src);
-                    if (sched->node_copies[id][cur_backend_id] == NULL) {
-                        ggml_backend_t backend = get_allocr_backend(sched, cur_allocr);
-                        struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
-                        ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
-
-                        sched->node_copies[id][cur_backend_id] = tensor_copy;
-                        node_allocr(tensor_copy) = cur_allocr;
-                        SET_CAUSE(tensor_copy, "4.cpy");
-                    }
-                    node->src[j] = sched->node_copies[id][cur_backend_id];
-                }
-            }
-        }
-        sched->splits[cur_split].i_end = graph->n_nodes;
-        sched->n_splits = cur_split + 1;
-    }
-#ifdef DEBUG_PASS4
-    fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
-#endif
-
-#ifndef NDEBUG
-    // sanity check: all sources should have the same backend as the node
-    for (int i = 0; i < graph->n_nodes; i++) {
-        struct ggml_tensor * node = graph->nodes[i];
-        ggml_tallocr_t node_allocr = node_allocr(node);
-        if (node_allocr == NULL) {
-            fprintf(stderr, "!!!!!!! %s has no backend\n", node->name);
-        }
-        if (node->view_src != NULL && node_allocr != node_allocr(node->view_src)) {
-            fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n",
-                node->name, node_allocr ? ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL",
-                node->view_src->name, node_allocr(node->view_src) ? ggml_backend_name(get_allocr_backend(sched, node_allocr(node->view_src))) : "NULL");
-        }
-        for (int j = 0; j < GGML_MAX_SRC; j++) {
-            struct ggml_tensor * src = node->src[j];
-            if (src == NULL) {
-                break;
-            }
-            ggml_tallocr_t src_allocr = node_allocr(src);
-            if (src_allocr != node_allocr /* && src_backend != NULL */) { // ignore nulls for now
-                fprintf(stderr, "!!!! %s has backend %s, src %d (%s) has backend %s\n",
-                    node->name, node_allocr ? ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL",
-                    j, src->name, src_allocr ? ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL");
-            }
-            if (src->view_src != NULL && src_allocr != node_allocr(src->view_src)) {
-                fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n",
-                    src->name, src_allocr ? ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL",
-                    src->view_src->name, node_allocr(src->view_src) ? ggml_backend_name(get_allocr_backend(sched, node_allocr(src->view_src))) : "NULL");
-            }
-        }
-    }
-    fflush(stderr);
-#endif
-
-    // create copies of the graph for each split
-    // FIXME: avoid this copy, pass split inputs to ggml_gallocr_alloc_graph_n in some other way
-    struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_MAX_SPLIT_INPUTS, false);
-    for (int i = 0; i < sched->n_splits; i++) {
-        struct ggml_backend_sched_split * split = &sched->splits[i];
-        split->graph = ggml_graph_view(graph, split->i_start, split->i_end);
-
-        // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
-        for (int j = 0; j < split->n_inputs; j++) {
-            struct ggml_tensor * input = split->inputs[j];
-            struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][sched_allocr_prio(sched, split->tallocr)];
-            // add a dependency to the input source so that it is not freed before the copy is done
-            GGML_ASSERT(input_cpy->src[0] == NULL || input_cpy->src[0] == input);
-            input_cpy->src[0] = input;
-            graph_copy->nodes[graph_copy->n_nodes++] = input_cpy;
-        }
-
-        for (int j = split->i_start; j < split->i_end; j++) {
-            graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
-        }
-    }
-    sched->graph = graph_copy;
-}
-
-static void sched_alloc_splits(ggml_backend_sched_t sched) {
-    ggml_gallocr_alloc_graph_n(
-        sched->galloc,
-        sched->graph,
-        sched->hash_set,
-        sched->node_talloc);
-}
-
-static void sched_compute_splits(ggml_backend_sched_t sched) {
-    uint64_t copy_us[GGML_MAX_BACKENDS] = {0};
-    uint64_t compute_us[GGML_MAX_BACKENDS] = {0};
-
-    struct ggml_backend_sched_split * splits = sched->splits;
-
-    for (int i = 0; i < sched->n_splits; i++) {
-        struct ggml_backend_sched_split * split = &splits[i];
-        ggml_backend_t split_backend = get_allocr_backend(sched, split->tallocr);
-        int split_backend_id = sched_backend_prio(sched, split_backend);
-
-        // copy the input tensors to the split backend
-        uint64_t copy_start_us = ggml_time_us();
-        for (int j = 0; j < split->n_inputs; j++) {
-            struct ggml_tensor * input = split->inputs[j];
-            struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][split_backend_id];
-
-            GGML_ASSERT(input->buffer != NULL);
-            GGML_ASSERT(input_cpy->buffer != NULL);
-
-            // TODO: avoid this copy if it was already copied in a previous split, and the input didn't change
-            // this is important to avoid copying constants such as KQ_mask and inp_pos multiple times
-            ggml_backend_tensor_copy_async(split_backend, input, input_cpy);
-        }
-        //ggml_backend_synchronize(split_backend); // necessary to measure copy time
-        int64_t copy_end_us = ggml_time_us();
-        copy_us[split_backend_id] += copy_end_us - copy_start_us;
-
-#if 0
-        char split_filename[GGML_MAX_NAME];
-        snprintf(split_filename, GGML_MAX_NAME, "split_%i_%s.dot", i, ggml_backend_name(split_backend));
-        ggml_graph_dump_dot(split->graph, NULL, split_filename);
-#endif
-
-        uint64_t compute_start_us = ggml_time_us();
-        ggml_backend_graph_compute(split_backend, &split->graph);
-        //ggml_backend_synchronize(split_backend); // necessary to measure compute time
-        uint64_t compute_end_us = ggml_time_us();
-        compute_us[split_backend_id] += compute_end_us - compute_start_us;
-    }
-
-#if 0
-    // per-backend timings
-    fprintf(stderr, "sched_compute_splits times (%d splits):\n", sched->n_splits);
-    for (int i = 0; i < sched->n_backends; i++) {
-        if (copy_us[i] > 0 || compute_us[i] > 0) {
-            fprintf(stderr, "\t%5.5s: %lu us copy, %lu us compute\n", ggml_backend_name(sched->backends[i]), copy_us[i], compute_us[i]);
-        }
-    }
-#endif
-}
-
-static void sched_reset(ggml_backend_sched_t sched) {
-    for (int i = 0; i < sched->n_backends; i++) {
-        ggml_tallocr_reset(sched->tallocs[i]);
-    }
-    // reset state for the next run
-    size_t hash_size = sched->hash_set.size;
-    memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size);
-    memset(sched->node_talloc,   0, sizeof(sched->node_talloc[0])   * hash_size);
-    memset(sched->node_copies,   0, sizeof(sched->node_copies[0])   * hash_size);
-
-    sched->is_reset = true;
-}
-
-ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size) {
-    GGML_ASSERT(n_backends > 0);
-    GGML_ASSERT(n_backends <= GGML_MAX_BACKENDS);
-
-    struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);
-
-    // initialize hash table
-    sched->hash_set    = ggml_hash_set_new(graph_size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
-    sched->node_talloc = calloc(sizeof(sched->node_talloc[0]) * sched->hash_set.size, 1);
-    sched->node_copies = calloc(sizeof(sched->node_copies[0]) * sched->hash_set.size, 1);
-
-    sched->n_backends = n_backends;
-    for (int i = 0; i < n_backends; i++) {
-        sched->backends[i] = backends[i];
-        sched->bufts[i] = bufts ? bufts[i] : ggml_backend_get_default_buffer_type(backends[i]);
-    }
-
-    sched->galloc = ggml_gallocr_new();
-
-    // init measure allocs for each backend
-    for (int i = 0; i < n_backends; i++) {
-        sched->tallocs[i] = ggml_tallocr_new_measure_from_buft(sched->bufts[i]);
-    }
-
-    sched_reset(sched);
-
-    return sched;
-}
-
-void ggml_backend_sched_free(ggml_backend_sched_t sched) {
-    if (sched == NULL) {
-        return;
-    }
-    for (int i = 0; i < sched->n_backends; i++) {
-        ggml_tallocr_free(sched->tallocs[i]);
-    }
-    ggml_gallocr_free(sched->galloc);
-    ggml_free(sched->ctx);
-    free(sched->hash_set.keys);
-    free(sched->node_talloc);
-    free(sched->node_copies);
-    free(sched);
-}
-
-void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
-    GGML_ASSERT(ggml_tallocr_is_measure(sched->tallocs[0])); // can only be initialized once
-
-    sched_split_graph(sched, measure_graph);
-    sched_alloc_splits(sched);
-
-    // allocate buffers and reset allocators
-    for (int i = 0; i < sched->n_backends; i++) {
-        size_t size = ggml_tallocr_max_size(sched->tallocs[i]);
-        ggml_tallocr_free(sched->tallocs[i]);
-        sched->tallocs[i] = ggml_tallocr_new_from_buft(sched->bufts[i], size);
-    }
-
-    sched_reset(sched);
-}
-
-void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
-    GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
-
-    if (!sched->is_reset) {
-        sched_reset(sched);
-    }
-
-    sched_split_graph(sched, graph);
-    sched_alloc_splits(sched);
-    sched_compute_splits(sched);
-}
-
-void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
-    sched_reset(sched);
-}
-
-int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
-    return sched->n_splits;
-}
-
-ggml_tallocr_t ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend) {
-    int backend_index = sched_backend_prio(sched, backend);
-    GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
-    return sched->tallocs[backend_index];
-}
-
-ggml_backend_buffer_t ggml_backend_sched_get_buffer(ggml_backend_sched_t sched, ggml_backend_t backend) {
-    int backend_index = sched_backend_prio(sched, backend);
-    GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
-    return ggml_tallocr_get_buffer(sched->tallocs[backend_index]);
-}
-
-void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
-    int backend_index = sched_backend_prio(sched, backend);
-    GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
-    node_allocr(node) = sched->tallocs[backend_index];
-}
-
-ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
-    ggml_tallocr_t allocr = node_allocr(node);
-    if (allocr == NULL) {
-        return NULL;
-    }
-    return get_allocr_backend(sched, allocr);
-}
-
-// utils
-
-void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
-    GGML_ASSERT(tensor->buffer == NULL);
-    //GGML_ASSERT(tensor->data == NULL); // views of pre-allocated tensors may have the data set in ggml_new_tensor, but still need to be initialized by the backend
-    GGML_ASSERT(tensor->view_src != NULL);
-    GGML_ASSERT(tensor->view_src->buffer != NULL);
-    GGML_ASSERT(tensor->view_src->data != NULL);
-
-    tensor->buffer = buffer;
-    tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
-    tensor->backend = tensor->view_src->backend;
-    ggml_backend_buffer_init_tensor(buffer, tensor);
-}
-
-void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) {
-    GGML_ASSERT(tensor->buffer == NULL);
-    GGML_ASSERT(tensor->data == NULL);
-    GGML_ASSERT(tensor->view_src == NULL);
-    GGML_ASSERT(addr >= ggml_backend_buffer_get_base(buffer));
-    GGML_ASSERT((char *)addr + ggml_backend_buffer_get_alloc_size(buffer, tensor) <=
-                (char *)ggml_backend_buffer_get_base(buffer) + ggml_backend_buffer_get_size(buffer));
-
-    tensor->buffer = buffer;
-    tensor->data = addr;
-    ggml_backend_buffer_init_tensor(buffer, tensor);
-}
-
-static struct ggml_tensor * graph_dup_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies,
-    struct ggml_context * ctx_allocated, struct ggml_context * ctx_unallocated, struct ggml_tensor * src) {
-
-    GGML_ASSERT(src != NULL);
-    GGML_ASSERT(src->data && "graph must be allocated");
-
-    size_t id = ggml_hash_insert(hash_set, src);
-    if (id == GGML_HASHTABLE_ALREADY_EXISTS) {
-        return node_copies[ggml_hash_find(hash_set, src)];
-    }
-
-    struct ggml_tensor * dst = ggml_dup_tensor_layout(src->data && !src->view_src ? ctx_allocated : ctx_unallocated, src);
-    if (src->view_src != NULL) {
-        dst->view_src = graph_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, src->view_src);
-        dst->view_offs = src->view_offs;
-    }
-    dst->op = src->op;
-    memcpy(dst->op_params, src->op_params, sizeof(dst->op_params));
-    ggml_set_name(dst, src->name);
-
-    // copy src
-    for (int i = 0; i < GGML_MAX_SRC; i++) {
-        struct ggml_tensor * s = src->src[i];
-        if (s == NULL) {
-            break;
-        }
-        dst->src[i] = graph_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s);
-    }
-
-    node_copies[id] = dst;
-    return dst;
-}
-
-static void graph_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) {
-    size_t id = ggml_hash_find(hash_set, src);
-    if (node_init[id]) {
-        return;
-    }
-    node_init[id] = true;
-
-    struct ggml_tensor * dst = node_copies[id];
-    if (dst->view_src != NULL) {
-        graph_init_tensor(hash_set, node_copies, node_init, src->view_src);
-        ggml_backend_view_init(dst->view_src->buffer, dst);
-    }
-    else {
-        ggml_backend_tensor_copy(src, dst);
-    }
-
-    // init src
-    for (int i = 0; i < GGML_MAX_SRC; i++) {
-        struct ggml_tensor * s = src->src[i];
-        if (s == NULL) {
-            break;
-        }
-        graph_init_tensor(hash_set, node_copies, node_init, s);
-    }
-}
-
-struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
-    struct ggml_hash_set hash_set = {
-        /* .size = */ graph->visited_hash_table.size,
-        /* .keys = */ calloc(sizeof(hash_set.keys[0]) * graph->visited_hash_table.size, 1)
-    };
-    struct ggml_tensor ** node_copies = calloc(sizeof(node_copies[0]) * hash_set.size, 1);
-    bool * node_init = calloc(sizeof(node_init[0]) * hash_set.size, 1);
-
-    struct ggml_init_params params = {
-        /* .mem_size   = */ ggml_tensor_overhead()*hash_set.size + ggml_graph_overhead_custom(graph->size, false),
-        /* .mem_buffer = */ NULL,
-        /* .no_alloc   = */ true
-    };
-
-    struct ggml_context * ctx_allocated = ggml_init(params);
-    struct ggml_context * ctx_unallocated = ggml_init(params);
-
-    if (ctx_allocated == NULL || ctx_unallocated == NULL) {
-        fprintf(stderr, "failed to allocate context for graph copy\n");
-        free(hash_set.keys);
-        free(node_copies);
-        free(node_init);
-        ggml_free(ctx_allocated);
-        ggml_free(ctx_unallocated);
-        return (struct ggml_backend_graph_copy) {
-            /* .buffer           = */ NULL,
-            /* .ctx_allocated    = */ NULL,
-            /* .ctx_unallocated  = */ NULL,
-            /* .graph            = */ NULL,
-        };
-    }
-
-    // dup nodes
-    for (int i = 0; i < graph->n_nodes; i++) {
-        struct ggml_tensor * node = graph->nodes[i];
-        graph_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, node);
-    }
-
-    // allocate nodes
-    ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
-    if (buffer == NULL) {
-        fprintf(stderr, "failed to allocate buffer for graph copy\n");
-        free(hash_set.keys);
-        free(node_copies);
-        free(node_init);
-        ggml_free(ctx_allocated);
-        ggml_free(ctx_unallocated);
-        return (struct ggml_backend_graph_copy) {
-            /* .buffer           = */ NULL,
-            /* .ctx_allocated    = */ NULL,
-            /* .ctx_unallocated  = */ NULL,
-            /* .graph            = */ NULL,
-        };
-    }
-
-    //printf("copy buffer size: %zu MB\n", ggml_backend_buffer_get_size(buffer) / 1024 / 1024);
-
-    // copy data and init views
-    for (int i = 0; i < graph->n_nodes; i++) {
-        struct ggml_tensor * node = graph->nodes[i];
-        graph_init_tensor(hash_set, node_copies, node_init, node);
-    }
-
-    // build graph copy
-    struct ggml_cgraph * graph_copy = ggml_new_graph_custom(ctx_allocated, graph->size, false);
-    for (int i = 0; i < graph->n_nodes; i++) {
-        struct ggml_tensor * node = graph->nodes[i];
-        struct ggml_tensor * node_copy = node_copies[ggml_hash_find(hash_set, node)];
-        graph_copy->nodes[i] = node_copy;
-    }
-    graph_copy->n_nodes = graph->n_nodes;
-
-    free(hash_set.keys);
-    free(node_copies);
-    free(node_init);
-
-    return (struct ggml_backend_graph_copy) {
-        /* .buffer           = */ buffer,
-        /* .ctx_allocated    = */ ctx_allocated,
-        /* .ctx_unallocated  = */ ctx_unallocated,
-        /* .graph            = */ graph_copy,
-    };
-}
-
-void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy) {
-    ggml_backend_buffer_free(copy.buffer);
-    ggml_free(copy.ctx_allocated);
-    ggml_free(copy.ctx_unallocated);
-}
-
-bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data) {
-    struct ggml_backend_graph_copy copy = ggml_backend_graph_copy(backend2, graph);
-    if (copy.buffer == NULL) {
-        return false;
-    }
-
-    struct ggml_cgraph * g1 = graph;
-    struct ggml_cgraph * g2 = copy.graph;
-
-    assert(g1->n_nodes == g2->n_nodes);
-
-    for (int i = 0; i < g1->n_nodes; i++) {
-        //printf("eval %d/%d\n", i, g1->n_nodes);
-        struct ggml_tensor * t1 = g1->nodes[i];
-        struct ggml_tensor * t2 = g2->nodes[i];
-
-        assert(t1->op == t2->op && ggml_are_same_layout(t1, t2));
-
-        struct ggml_cgraph g1v = ggml_graph_view(g1, i, i + 1);
-        struct ggml_cgraph g2v = ggml_graph_view(g2, i, i + 1);
-
-        ggml_backend_graph_compute(backend1, &g1v);
-        ggml_backend_graph_compute(backend2, &g2v);
-
-        if (ggml_is_view_op(t1->op)) {
-            continue;
-        }
-
-        // compare results, calculate rms etc
-        if (!callback(i, t1, t2, user_data)) {
-            break;
-        }
-    }
-
-    ggml_backend_graph_copy_free(copy);
-
-    return true;
-}
diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
deleted file mode 100644
index b2f36c4..0000000
--- a/ggml/src/ggml-cuda.cu
+++ /dev/null
@@ -1,11031 +0,0 @@
-#include <algorithm>
-#include <assert.h>
-#include <atomic>
-#include <cinttypes>
-#include <cstddef>
-#include <cstdint>
-#include <float.h>
-#include <limits>
-#include <stdint.h>
-#include <stdio.h>
-#include <string>
-#include <vector>
-#include <map>
-#include <array>
-#include "ggml-cuda.h"
-#include "ggml.h"
-#include "ggml-backend-impl.h"
-
-#if defined(GGML_USE_HIPBLAS)
-#include <hip/hip_runtime.h>
-#include <hipblas/hipblas.h>
-#include <hip/hip_fp16.h>
-#ifdef __HIP_PLATFORM_AMD__
-// for rocblas_initialize()
-#include "rocblas/rocblas.h"
-#endif // __HIP_PLATFORM_AMD__
-#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
-#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
-#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
-#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
-#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
-#define CUBLAS_OP_N HIPBLAS_OP_N
-#define CUBLAS_OP_T HIPBLAS_OP_T
-#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
-#define CUBLAS_TF32_TENSOR_OP_MATH 0
-#define CUDA_R_16F  HIPBLAS_R_16F
-#define CUDA_R_32F  HIPBLAS_R_32F
-#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
-#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
-#define cublasCreate hipblasCreate
-#define cublasGemmEx hipblasGemmEx
-#define cublasGemmBatchedEx hipblasGemmBatchedEx
-#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
-#define cublasHandle_t hipblasHandle_t
-#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
-#define cublasSetStream hipblasSetStream
-#define cublasSgemm hipblasSgemm
-#define cublasStatus_t hipblasStatus_t
-#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
-#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
-#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
-#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
-#define cudaDeviceProp hipDeviceProp_t
-#define cudaDeviceSynchronize hipDeviceSynchronize
-#define cudaError_t hipError_t
-#define cudaEventCreateWithFlags hipEventCreateWithFlags
-#define cudaEventDisableTiming hipEventDisableTiming
-#define cudaEventRecord hipEventRecord
-#define cudaEvent_t hipEvent_t
-#define cudaEventDestroy hipEventDestroy
-#define cudaFree hipFree
-#define cudaFreeHost hipHostFree
-#define cudaGetDevice hipGetDevice
-#define cudaGetDeviceCount hipGetDeviceCount
-#define cudaGetDeviceProperties hipGetDeviceProperties
-#define cudaGetErrorString hipGetErrorString
-#define cudaGetLastError hipGetLastError
-#ifdef GGML_HIP_UMA
-#define cudaMalloc hipMallocManaged
-#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size)
-#else
-#define cudaMalloc hipMalloc
-#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
-#endif
-#define cudaMemcpy hipMemcpy
-#define cudaMemcpyAsync hipMemcpyAsync
-#define cudaMemcpyPeerAsync hipMemcpyPeerAsync
-#define cudaMemcpy2DAsync hipMemcpy2DAsync
-#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
-#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
-#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
-#define cudaMemcpyKind hipMemcpyKind
-#define cudaMemset hipMemset
-#define cudaMemsetAsync hipMemsetAsync
-#define cudaMemGetInfo hipMemGetInfo
-#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
-#define cudaSetDevice hipSetDevice
-#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
-#define cudaStreamFireAndForget hipStreamFireAndForget
-#define cudaStreamNonBlocking hipStreamNonBlocking
-#define cudaStreamSynchronize hipStreamSynchronize
-#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
-#define cudaStream_t hipStream_t
-#define cudaSuccess hipSuccess
-#define __trap abort
-#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
-#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED
-#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED
-#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE
-#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH
-#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR
-#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED
-#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
-#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED
-#else
-#include <cuda_runtime.h>
-#include <cuda.h>
-#include <cublas_v2.h>
-#include <cuda_fp16.h>
-
-#if CUDART_VERSION < 11020
-#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
-#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
-#define CUBLAS_COMPUTE_16F CUDA_R_16F
-#define CUBLAS_COMPUTE_32F CUDA_R_32F
-#define cublasComputeType_t cudaDataType_t
-#endif // CUDART_VERSION < 11020
-
-#endif // defined(GGML_USE_HIPBLAS)
-
-#define CUDART_HMAX     11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
-
-#define CC_PASCAL     600
-#define MIN_CC_DP4A   610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
-#define CC_VOLTA      700
-#define CC_OFFSET_AMD 1000000
-#define CC_RDNA1      (CC_OFFSET_AMD + 1010)
-#define CC_RDNA2      (CC_OFFSET_AMD + 1030)
-#define CC_RDNA3      (CC_OFFSET_AMD + 1100)
-
-#define GGML_CUDA_MAX_NODES 8192
-
-// define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication
-// on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
-// for large computational tasks. the drawback is that this requires some extra amount of VRAM:
-// -  7B quantum model: +100-200 MB
-// - 13B quantum model: +200-400 MB
-//
-//#define GGML_CUDA_FORCE_MMQ
-
-// TODO: improve this to be correct for more hardware
-//       for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores
-#if !defined(GGML_CUDA_FORCE_MMQ)
-#define CUDA_USE_TENSOR_CORES
-#endif
-
-// max batch size to use MMQ kernels when tensor cores are available
-#define MMQ_MAX_BATCH_SIZE 32
-
-#if defined(GGML_USE_HIPBLAS)
-#define __CUDA_ARCH__ 1300
-
-#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
-    defined(__gfx1150__) || defined(__gfx1151__)
-#define RDNA3
-#endif
-
-#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
-    defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
-#define RDNA2
-#endif
-
-#ifndef __has_builtin
-    #define __has_builtin(x) 0
-#endif
-
-typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
-static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
-    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
-    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
-#if __has_builtin(__builtin_elementwise_sub_sat)
-    const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
-    return reinterpret_cast<const int &>(c);
-#else
-    int8x4_t c;
-    int16_t tmp;
-#pragma unroll
-    for (int i = 0; i < 4; i++) {
-        tmp = va[i] - vb[i];
-        if(tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
-        if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
-        c[i] = tmp;
-    }
-    return reinterpret_cast<int &>(c);
-#endif // __has_builtin(__builtin_elementwise_sub_sat)
-}
-
-static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
-#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
-    c = __builtin_amdgcn_sdot4(a, b, c, false);
-#elif defined(RDNA3)
-    c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
-#elif defined(__gfx1010__) || defined(__gfx900__)
-    int tmp1;
-    int tmp2;
-    asm("\n \
-        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
-        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
-        v_add3_u32 %0, %1, %2, %0 \n \
-        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
-        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
-        v_add3_u32 %0, %1, %2, %0 \n \
-        "
-        : "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
-        : "v"(a), "v"(b)
-    );
-#else
-    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
-    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
-    c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
-#endif
-    return c;
-}
-#endif // defined(GGML_USE_HIPBLAS)
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
-
-[[noreturn]]
-static void ggml_cuda_error(const char * stmt, const char * func, const char * file, const int line, const char * msg) {
-    int id = -1; // in case cudaGetDevice fails
-    cudaGetDevice(&id);
-
-    fprintf(stderr, "CUDA error: %s\n", msg);
-    fprintf(stderr, "  current device: %d, in function %s at %s:%d\n", id, func, file, line);
-    fprintf(stderr, "  %s\n", stmt);
-    // abort with GGML_ASSERT to get a stack trace
-    GGML_ASSERT(!"CUDA error");
-}
-
-#define CUDA_CHECK_GEN(err, success, error_fn)                                      \
-     do {                                                                           \
-        auto err_ = (err);                                                          \
-        if (err_ != (success)) {                                                    \
-            ggml_cuda_error(#err, __func__, __FILE__, __LINE__, error_fn(err_));    \
-        }                                                                           \
-    } while (0)
-
-#define CUDA_CHECK(err) CUDA_CHECK_GEN(err, cudaSuccess, cudaGetErrorString)
-
-#if CUDART_VERSION >= 12000
-    static const char * cublas_get_error_str(const cublasStatus_t err) {
-        return cublasGetStatusString(err);
-    }
-#else
-    static const char * cublas_get_error_str(const cublasStatus_t err) {
-        switch (err) {
-            case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
-            case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
-            case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
-            case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
-            case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
-            case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
-            case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
-            case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
-            case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED";
-            default: return "unknown error";
-        }
-    }
-#endif // CUDART_VERSION >= 12000
-
-#define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, cublas_get_error_str)
-
-#if !defined(GGML_USE_HIPBLAS)
-static const char * cu_get_error_str(CUresult err) {
-    const char * err_str;
-    cuGetErrorString(err, &err_str);
-    return err_str;
-}
-#define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str)
-#endif
-
-#if CUDART_VERSION >= 11100
-#define GGML_CUDA_ASSUME(x) __builtin_assume(x)
-#else
-#define GGML_CUDA_ASSUME(x)
-#endif // CUDART_VERSION >= 11100
-
-#ifdef GGML_CUDA_F16
-typedef half dfloat; // dequantize float
-typedef half2 dfloat2;
-#else
-typedef float dfloat; // dequantize float
-typedef float2 dfloat2;
-#endif //GGML_CUDA_F16
-
-static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const int & i32) {
-    const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
-
-    int x32 = 0;
-    x32 |= x16[0] <<  0;
-    x32 |= x16[1] << 16;
-
-    return x32;
-}
-
-static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, const int & i32) {
-    const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
-
-    int x32 = 0;
-    x32 |= x16[0] <<  0;
-    x32 |= x16[1] << 16;
-
-    return x32;
-}
-
-static __device__ __forceinline__ int get_int_from_int8_aligned(const int8_t * x8, const int & i32) {
-    return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
-}
-
-static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t * x8, const int & i32) {
-    return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
-}
-
-template<typename T>
-using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int k, cudaStream_t stream);
-typedef to_t_cuda_t<float> to_fp32_cuda_t;
-typedef to_t_cuda_t<half> to_fp16_cuda_t;
-
-typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
-typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v);
-typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
-typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
-typedef void (*ggml_cuda_op_mul_mat_t)(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
-    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
-    const int64_t src1_padded_row_size, cudaStream_t stream);
-typedef void (*ggml_cuda_op_flatten_t)(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream);
-
-// QK = number of values after dequantization
-// QR = QK / number of values before dequantization
-// QI = number of 32 bit integers before dequantization
-
-#define QK4_0 32
-#define QR4_0 2
-#define QI4_0 (QK4_0 / (4 * QR4_0))
-typedef struct {
-    half    d;              // delta
-    uint8_t qs[QK4_0 / 2];  // nibbles / quants
-} block_q4_0;
-static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding");
-
-#define QK4_1 32
-#define QR4_1 2
-#define QI4_1 (QK4_1 / (4 * QR4_1))
-typedef struct {
-    half2   dm;             // dm.x = delta, dm.y = min
-    uint8_t qs[QK4_1 / 2];  // nibbles / quants
-} block_q4_1;
-static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
-
-#define QK5_0 32
-#define QR5_0 2
-#define QI5_0 (QK5_0 / (4 * QR5_0))
-typedef struct {
-    half d;                 // delta
-    uint8_t qh[4];          // 5-th bit of quants
-    uint8_t qs[QK5_0 / 2];  // nibbles / quants
-} block_q5_0;
-static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");
-
-#define QK5_1 32
-#define QR5_1 2
-#define QI5_1 (QK5_1 / (4 * QR5_1))
-typedef struct {
-    half2 dm;               // dm.x = delta, dm.y = min
-    uint8_t qh[4];          // 5-th bit of quants
-    uint8_t qs[QK5_1 / 2];  // nibbles / quants
-} block_q5_1;
-static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
-
-#define QK8_0 32
-#define QR8_0 1
-#define QI8_0 (QK8_0 / (4 * QR8_0))
-typedef struct {
-    half    d;              // delta
-    int8_t  qs[QK8_0];      // quants
-} block_q8_0;
-static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");
-
-#define QK8_1 32
-#define QR8_1 1
-#define QI8_1 (QK8_1 / (4 * QR8_1))
-typedef struct {
-    half2   ds;             // ds.x = delta, ds.y = sum
-    int8_t  qs[QK8_0];      // quants
-} block_q8_1;
-static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding");
-
-typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs);
-typedef void (*allocate_tiles_cuda_t)(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc);
-typedef void (*load_tiles_cuda_t)(
-    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
-    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row);
-typedef float (*vec_dot_q_mul_mat_cuda_t)(
-    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
-    const int * __restrict__ y_qs, const half2 * __restrict__ y_ms, const int & i, const int & j, const int & k);
-
-//================================= k-quants
-
-#ifdef GGML_QKK_64
-#define QK_K 64
-#define K_SCALE_SIZE 4
-#else
-#define QK_K 256
-#define K_SCALE_SIZE 12
-#endif
-
-#define QR2_K 4
-#define QI2_K (QK_K / (4*QR2_K))
-typedef struct {
-    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
-    uint8_t qs[QK_K/4];      // quants
-    half2 dm;                // super-block scale for quantized scales/mins
-} block_q2_K;
-static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
-
-#define QR3_K 4
-#define QI3_K (QK_K / (4*QR3_K))
-typedef struct {
-    uint8_t hmask[QK_K/8];     // quants - high bit
-    uint8_t qs[QK_K/4];        // quants - low 2 bits
-#ifdef GGML_QKK_64
-    uint8_t scales[2]; // scales, quantized with 8 bits
-#else
-    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
-#endif
-    half d;             // super-block scale
-} block_q3_K;
-//static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + K_SCALE_SIZE, "wrong q3_K block size/padding");
-
-#define QR4_K 2
-#define QI4_K (QK_K / (4*QR4_K))
-#ifdef GGML_QKK_64
-typedef struct {
-    half    dm[2];             // super-block scales/mins
-    uint8_t scales[2];         // 4-bit block scales/mins
-    uint8_t qs[QK_K/2];        // 4--bit quants
-} block_q4_K;
-static_assert(sizeof(block_q4_K) == sizeof(half2) + QK_K/2 + 2, "wrong q4_K block size/padding");
-#else
-typedef struct {
-    half2 dm;                  // super-block scale for quantized scales/mins
-    uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
-    uint8_t qs[QK_K/2];        // 4--bit quants
-} block_q4_K;
-static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
-#endif
-
-#define QR5_K 2
-#define QI5_K (QK_K / (4*QR5_K))
-#ifdef GGML_QKK_64
-typedef struct {
-    half d;                  // super-block scale
-    int8_t scales[QK_K/16];  // block scales
-    uint8_t qh[QK_K/8];      // quants, high bit
-    uint8_t qs[QK_K/2];      // quants, low 4 bits
-} block_q5_K;
-static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
-#else
-typedef struct {
-    half2 dm;                     // super-block scale for quantized scales/mins
-    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
-    uint8_t qh[QK_K/8];           // quants, high bit
-    uint8_t qs[QK_K/2];           // quants, low 4 bits
-} block_q5_K;
-static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
-#endif
-
-#define QR6_K 2
-#define QI6_K (QK_K / (4*QR6_K))
-typedef struct {
-    uint8_t ql[QK_K/2];   // quants, lower 4 bits
-    uint8_t qh[QK_K/4];   // quants, upper 2 bits
-    int8_t  scales[QK_K/16]; // scales
-    half    d;         // delta
-} block_q6_K;
-static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding");
-
-#define QR2_XXS 8
-#define QI2_XXS (QK_K / (4*QR2_XXS))
-typedef struct {
-    half d;
-    uint16_t qs[QK_K/8];
-} block_iq2_xxs;
-static_assert(sizeof(block_iq2_xxs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t), "wrong iq2_xxs block size/padding");
-
-#define QR2_XS 8
-#define QI2_XS (QK_K / (4*QR2_XS))
-typedef struct {
-    half d;
-    uint16_t qs[QK_K/8];
-    uint8_t  scales[QK_K/32];
-} block_iq2_xs;
-static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding");
-
-#define WARP_SIZE 32
-#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
-
-#define CUDA_GELU_BLOCK_SIZE 256
-#define CUDA_SILU_BLOCK_SIZE 256
-#define CUDA_TANH_BLOCK_SIZE 256
-#define CUDA_RELU_BLOCK_SIZE 256
-#define CUDA_SQR_BLOCK_SIZE 256
-#define CUDA_CPY_BLOCK_SIZE 32
-#define CUDA_SCALE_BLOCK_SIZE 256
-#define CUDA_CLAMP_BLOCK_SIZE 256
-#define CUDA_ROPE_BLOCK_SIZE 256
-#define CUDA_SOFT_MAX_BLOCK_SIZE 1024
-#define CUDA_ALIBI_BLOCK_SIZE 32
-#define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
-#define CUDA_QUANTIZE_BLOCK_SIZE 256
-#define CUDA_DEQUANTIZE_BLOCK_SIZE 256
-#define CUDA_GET_ROWS_BLOCK_SIZE 256
-#define CUDA_UPSCALE_BLOCK_SIZE 256
-#define CUDA_CONCAT_BLOCK_SIZE 256
-#define CUDA_PAD_BLOCK_SIZE 256
-#define CUDA_ACC_BLOCK_SIZE 256
-#define CUDA_IM2COL_BLOCK_SIZE 256
-
-#define CUDA_Q8_0_NE_ALIGN 2048
-
-// dmmv = dequantize_mul_mat_vec
-#ifndef GGML_CUDA_DMMV_X
-#define GGML_CUDA_DMMV_X 32
-#endif
-#ifndef GGML_CUDA_MMV_Y
-#define GGML_CUDA_MMV_Y 1
-#endif
-
-#ifndef K_QUANTS_PER_ITERATION
-#define K_QUANTS_PER_ITERATION 2
-#else
-static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
-#endif
-
-#ifndef GGML_CUDA_PEER_MAX_BATCH_SIZE
-#define GGML_CUDA_PEER_MAX_BATCH_SIZE 128
-#endif // GGML_CUDA_PEER_MAX_BATCH_SIZE
-
-#define MUL_MAT_SRC1_COL_STRIDE 128
-
-#define MAX_STREAMS 8
-static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { { nullptr } };
-
-struct ggml_tensor_extra_gpu {
-    void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
-    cudaEvent_t events[GGML_CUDA_MAX_DEVICES][MAX_STREAMS]; // events for synchronizing multiple GPUs
-};
-
-// this is faster on Windows
-// probably because the Windows CUDA libraries forget to make this check before invoking the drivers
-static void ggml_cuda_set_device(const int device) {
-    int current_device;
-    CUDA_CHECK(cudaGetDevice(&current_device));
-
-    if (device == current_device) {
-        return;
-    }
-
-    CUDA_CHECK(cudaSetDevice(device));
-}
-
-static int g_device_count = -1;
-static int g_main_device = 0;
-static std::array<float, GGML_CUDA_MAX_DEVICES> g_default_tensor_split = {};
-
-struct cuda_device_capabilities {
-    int     cc;                 // compute capability
-    size_t  smpb;               // max. shared memory per block
-    bool    vmm;                // virtual memory support
-    size_t  vmm_granularity;    // granularity of virtual memory
-};
-
-static cuda_device_capabilities g_device_caps[GGML_CUDA_MAX_DEVICES] = { {0, 0, false, 0} };
-
-static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
-
-[[noreturn]]
-static __device__ void bad_arch() {
-    printf("ERROR: ggml-cuda was compiled without support for the current GPU architecture.\n");
-    __trap();
-
-    (void) bad_arch; // suppress unused function warning
-}
-
-static __device__ __forceinline__ float warp_reduce_sum(float x) {
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
-    }
-    return x;
-}
-
-static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
-        a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
-    }
-    return a;
-}
-
-static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
-    }
-    return a;
-#else
-    (void) a;
-    bad_arch();
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
-}
-
-static __device__ __forceinline__ float warp_reduce_max(float x) {
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
-    }
-    return x;
-}
-
-static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        x = __hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
-    }
-    return x;
-#else
-    (void) x;
-    bad_arch();
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
-}
-
-static __device__ __forceinline__ float op_repeat(const float a, const float b) {
-    return b;
-    GGML_UNUSED(a);
-}
-
-static __device__ __forceinline__ float op_add(const float a, const float b) {
-    return a + b;
-}
-
-static __device__ __forceinline__ float op_mul(const float a, const float b) {
-    return a * b;
-}
-
-static __device__ __forceinline__ float op_div(const float a, const float b) {
-    return a / b;
-}
-
-template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
-static __global__ void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
-        int ne0, int ne1, int ne2, int ne3,
-        int ne10, int ne11, int ne12, int ne13,
-        /*int s0, */ int s1,  int s2,  int s3,
-        /*int s10,*/ int s11, int s12, int s13) {
-    const int i0s = blockDim.x*blockIdx.x + threadIdx.x;
-    const int i1 = (blockDim.y*blockIdx.y + threadIdx.y);
-    const int i2 = (blockDim.z*blockIdx.z + threadIdx.z) / ne3;
-    const int i3 = (blockDim.z*blockIdx.z + threadIdx.z) % ne3;
-
-    if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
-        return;
-    }
-
-    const int i11 = i1 % ne11;
-    const int i12 = i2 % ne12;
-    const int i13 = i3 % ne13;
-
-    const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
-    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
-    const size_t i_dst  = i_src0;
-
-    const src0_t * src0_row = src0 + i_src0;
-    const src1_t * src1_row = src1 + i_src1;
-    dst_t * dst_row = dst + i_dst;
-
-    for (int i0 = i0s; i0 < ne0; i0 += blockDim.x*gridDim.x) {
-        const int i10 = i0 % ne10;
-        dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
-    }
-}
-
-template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
-static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst,
-        int ne0, int ne1, int ne2, int ne3,
-        int ne10, int ne11, int ne12, int ne13,
-        /*int s0, */ int s1,  int s2,  int s3,
-        /*int s10,*/ int s11, int s12, int s13) {
-
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-
-    const int i3 = i/(ne2*ne1*ne0);
-    const int i2 = (i/(ne1*ne0)) % ne2;
-    const int i1 = (i/ne0) % ne1;
-    const int i0 = i % ne0;
-
-    if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
-        return;
-    }
-
-    const int i11 = i1 % ne11;
-    const int i12 = i2 % ne12;
-    const int i13 = i3 % ne13;
-
-    const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
-    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
-    const size_t i_dst  = i_src0;
-
-    const src0_t * src0_row = src0 + i_src0;
-    const src1_t * src1_row = src1 + i_src1;
-    dst_t * dst_row = dst + i_dst;
-
-    const int i10 = i0 % ne10;
-    dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
-}
-
-static __global__ void acc_f32(const float * x, const float * y, float * dst, const int ne,
-    const int ne10, const int ne11, const int ne12,
-    const int nb1, const int nb2, int offset) {
-    const int i = blockDim.x * blockIdx.x + threadIdx.x;
-    if (i >= ne) {
-        return;
-    }
-    int src1_idx = i - offset;
-    int oz = src1_idx / nb2;
-    int oy = (src1_idx - (oz * nb2)) / nb1;
-    int ox = src1_idx % nb1;
-    if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) {
-        dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11];
-    } else {
-        dst[i] = x[i];
-    }
-}
-
-static __global__ void gelu_f32(const float * x, float * dst, const int k) {
-    const float GELU_COEF_A    = 0.044715f;
-    const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (i >= k) {
-        return;
-    }
-
-    float xi = x[i];
-    dst[i] = 0.5f*xi*(1.0f + tanhf(SQRT_2_OVER_PI*xi*(1.0f + GELU_COEF_A*xi*xi)));
-}
-
-static __global__ void silu_f32(const float * x, float * dst, const int k) {
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (i >= k) {
-        return;
-    }
-    dst[i] = x[i] / (1.0f + expf(-x[i]));
-}
-
-static __global__ void gelu_quick_f32(const float * x, float * dst, int k) {
-    const float GELU_QUICK_COEF = -1.702f;
-    const int i  = blockDim.x*blockIdx.x + threadIdx.x;
-    if (i >= k) {
-        return;
-    }
-    dst[i] = x[i] * (1.0f / (1.0f + expf(GELU_QUICK_COEF * x[i])));
-}
-
-static __global__ void tanh_f32(const float * x, float * dst, int k) {
-    const int i  = blockDim.x*blockIdx.x + threadIdx.x;
-    if (i >= k) {
-        return;
-    }
-    dst[i] = tanhf(x[i]);
-}
-
-static __global__ void relu_f32(const float * x, float * dst, const int k) {
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (i >= k) {
-        return;
-    }
-    dst[i] = fmaxf(x[i], 0);
-}
-
-static __global__ void leaky_relu_f32(const float * x, float * dst, const int k, const float negative_slope) {
-    const int i  = blockDim.x*blockIdx.x + threadIdx.x;
-    if (i >= k) {
-        return;
-    }
-    dst[i] = fmaxf(x[i], 0) + fminf(x[i], 0.0f) * negative_slope;
-}
-
-static __global__ void sqr_f32(const float * x, float * dst, const int k) {
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (i >= k) {
-        return;
-    }
-    dst[i] = x[i] * x[i];
-}
-
-template <int block_size>
-static __global__ void norm_f32(const float * x, float * dst, const int ncols, const float eps) {
-    const int row = blockIdx.x*blockDim.y + threadIdx.y;
-    const int tid = threadIdx.x;
-
-    float2 mean_var = make_float2(0.f, 0.f);
-
-    for (int col = tid; col < ncols; col += block_size) {
-        const float xi = x[row*ncols + col];
-        mean_var.x += xi;
-        mean_var.y += xi * xi;
-    }
-
-    // sum up partial sums
-    mean_var = warp_reduce_sum(mean_var);
-    if (block_size > WARP_SIZE) {
-        __shared__ float2 s_sum[32];
-        int warp_id = threadIdx.x / WARP_SIZE;
-        int lane_id = threadIdx.x % WARP_SIZE;
-        if (lane_id == 0) {
-            s_sum[warp_id] = mean_var;
-        }
-        __syncthreads();
-        mean_var = s_sum[lane_id];
-        mean_var = warp_reduce_sum(mean_var);
-    }
-
-    const float mean = mean_var.x / ncols;
-    const float var = mean_var.y / ncols - mean * mean;
-    const float inv_std = rsqrtf(var + eps);
-
-    for (int col = tid; col < ncols; col += block_size) {
-        dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_std;
-    }
-}
-
-static __global__ void concat_f32(const float * x,const float * y, float * dst, const int ne0, const int ne02) {
-    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
-    if (nidx >= ne0) {
-        return;
-    }
-    // operation
-    int offset_dst =
-        nidx +
-        blockIdx.y * ne0 +
-        blockIdx.z * ne0 * gridDim.y;
-    if (blockIdx.z < ne02) { // src0
-        int offset_src =
-            nidx +
-            blockIdx.y * ne0 +
-            blockIdx.z * ne0 * gridDim.y;
-            dst[offset_dst] = x[offset_src];
-    } else {
-        int offset_src =
-            nidx +
-            blockIdx.y * ne0 +
-            (blockIdx.z - ne02) * ne0 *  gridDim.y;
-            dst[offset_dst] = y[offset_src];
-    }
-}
-
-static __global__ void upscale_f32(const float * x, float * dst, const int ne00, const int ne00xne01, const int scale_factor) {
-    // blockIdx.z: idx of ne02*ne03
-    // blockIdx.y: idx of ne01*scale_factor, aka ne1
-    // blockIDx.x: idx of ne00*scale_factor / BLOCK_SIZE
-    // ne00xne01: ne00 * ne01
-    int ne0 = ne00 * scale_factor;
-    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
-    if (nidx >= ne0) {
-        return;
-    }
-    // operation
-    int i00 = nidx / scale_factor;
-    int i01 = blockIdx.y / scale_factor;
-    int offset_src =
-        i00 +
-        i01 * ne00 +
-        blockIdx.z * ne00xne01;
-    int offset_dst =
-        nidx +
-        blockIdx.y * ne0 +
-        blockIdx.z * ne0 * gridDim.y;
-    dst[offset_dst] = x[offset_src];
-}
-
-static __global__ void pad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02, const int ne03) {
-    // blockIdx.z: idx of ne2*ne3, aka ne02*ne03
-    // blockIdx.y: idx of ne1
-    // blockIDx.x: idx of ne0 / BLOCK_SIZE
-    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
-    if (nidx >= ne0) {
-        return;
-    }
-
-    // operation
-    int offset_dst =
-        nidx +
-        blockIdx.y * ne0 +
-        blockIdx.z * ne0 * gridDim.y;
-    if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02*ne03) {
-        int offset_src =
-            nidx +
-            blockIdx.y * ne00 +
-            blockIdx.z * ne00 * ne01;
-            dst[offset_dst] = x[offset_src];
-    } else {
-        dst[offset_dst] = 0.0f;
-    }
-}
-
-template <int block_size>
-static __global__ void group_norm_f32(const float * x, float * dst, const int group_size, const int ne_elements, const float eps) {
-    // blockIdx.x: num_groups idx
-    // threadIdx.x: block_size idx
-    int start = blockIdx.x * group_size;
-    int end = start + group_size;
-
-    start += threadIdx.x;
-
-    if (end >= ne_elements) {
-        end = ne_elements;
-    }
-
-    float tmp = 0.0f; // partial sum for thread in warp
-
-    for (int j = start; j < end; j += block_size) {
-        tmp += x[j];
-    }
-
-    tmp = warp_reduce_sum(tmp);
-    if (block_size > WARP_SIZE) {
-        __shared__ float s_sum[32];
-        int warp_id = threadIdx.x / WARP_SIZE;
-        int lane_id = threadIdx.x % WARP_SIZE;
-        if (lane_id == 0) {
-            s_sum[warp_id] = tmp;
-        }
-        __syncthreads();
-        tmp = s_sum[lane_id];
-        tmp = warp_reduce_sum(tmp);
-    }
-
-    float mean = tmp / group_size;
-    tmp = 0.0f;
-
-    for (int j = start; j < end; j += block_size) {
-        float xi = x[j] - mean;
-        dst[j] = xi;
-        tmp += xi * xi;
-    }
-
-    tmp = warp_reduce_sum(tmp);
-    if (block_size > WARP_SIZE) {
-        __shared__ float s_sum[32];
-        int warp_id = threadIdx.x / WARP_SIZE;
-        int lane_id = threadIdx.x % WARP_SIZE;
-        if (lane_id == 0) {
-            s_sum[warp_id] = tmp;
-        }
-        __syncthreads();
-        tmp = s_sum[lane_id];
-        tmp = warp_reduce_sum(tmp);
-    }
-
-    float variance = tmp / group_size;
-    float scale = rsqrtf(variance + eps);
-    for (int j = start; j < end; j += block_size) {
-        dst[j] *= scale;
-    }
-}
-
-template <int block_size>
-static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
-    const int row = blockIdx.x*blockDim.y + threadIdx.y;
-    const int tid = threadIdx.x;
-
-    float tmp = 0.0f; // partial sum for thread in warp
-
-    for (int col = tid; col < ncols; col += block_size) {
-        const float xi = x[row*ncols + col];
-        tmp += xi * xi;
-    }
-
-    // sum up partial sums
-    tmp = warp_reduce_sum(tmp);
-    if (block_size > WARP_SIZE) {
-        __shared__ float s_sum[32];
-        int warp_id = threadIdx.x / WARP_SIZE;
-        int lane_id = threadIdx.x % WARP_SIZE;
-        if (lane_id == 0) {
-            s_sum[warp_id] = tmp;
-        }
-        __syncthreads();
-        tmp = s_sum[lane_id];
-        tmp = warp_reduce_sum(tmp);
-    }
-
-    const float mean = tmp / ncols;
-    const float scale = rsqrtf(mean + eps);
-
-    for (int col = tid; col < ncols; col += block_size) {
-        dst[row*ncols + col] = scale * x[row*ncols + col];
-    }
-}
-
-static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
-    const block_q4_0 * x = (const block_q4_0 *) vx;
-
-    const dfloat d = x[ib].d;
-
-    const int vui = x[ib].qs[iqs];
-
-    v.x = vui & 0xF;
-    v.y = vui >> 4;
-
-#ifdef GGML_CUDA_F16
-    v = __hsub2(v, {8.0f, 8.0f});
-    v = __hmul2(v, {d, d});
-#else
-    v.x = (v.x - 8.0f) * d;
-    v.y = (v.y - 8.0f) * d;
-#endif // GGML_CUDA_F16
-}
-
-static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
-    const block_q4_1 * x = (const block_q4_1 *) vx;
-
-    const dfloat d = __low2half(x[ib].dm);
-    const dfloat m = __high2half(x[ib].dm);
-
-    const int vui = x[ib].qs[iqs];
-
-    v.x = vui & 0xF;
-    v.y = vui >> 4;
-
-#ifdef GGML_CUDA_F16
-    v = __hmul2(v, {d, d});
-    v = __hadd2(v, {m, m});
-#else
-    v.x = (v.x * d) + m;
-    v.y = (v.y * d) + m;
-#endif // GGML_CUDA_F16
-}
-
-static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
-    const block_q5_0 * x = (const block_q5_0 *) vx;
-
-    const dfloat d = x[ib].d;
-
-    uint32_t qh;
-    memcpy(&qh, x[ib].qh, sizeof(qh));
-
-    const int xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
-    const int xh_1 = ((qh >> (iqs + 12))     ) & 0x10;
-
-    v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
-    v.y = ((x[ib].qs[iqs] >>  4) | xh_1);
-
-#ifdef GGML_CUDA_F16
-    v = __hsub2(v, {16.0f, 16.0f});
-    v = __hmul2(v, {d, d});
-#else
-    v.x = (v.x - 16.0f) * d;
-    v.y = (v.y - 16.0f) * d;
-#endif // GGML_CUDA_F16
-}
-
-static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
-    const block_q5_1 * x = (const block_q5_1 *) vx;
-
-    const dfloat d = __low2half(x[ib].dm);
-    const dfloat m = __high2half(x[ib].dm);
-
-    uint32_t qh;
-    memcpy(&qh, x[ib].qh, sizeof(qh));
-
-    const int xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
-    const int xh_1 = ((qh >> (iqs + 12))     ) & 0x10;
-
-    v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
-    v.y = ((x[ib].qs[iqs] >>  4) | xh_1);
-
-#ifdef GGML_CUDA_F16
-    v = __hmul2(v, {d, d});
-    v = __hadd2(v, {m, m});
-#else
-    v.x = (v.x * d) + m;
-    v.y = (v.y * d) + m;
-#endif // GGML_CUDA_F16
-}
-
-static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
-    const block_q8_0 * x = (const block_q8_0 *) vx;
-
-    const dfloat d = x[ib].d;
-
-    v.x = x[ib].qs[iqs + 0];
-    v.y = x[ib].qs[iqs + 1];
-
-#ifdef GGML_CUDA_F16
-    v = __hmul2(v, {d, d});
-#else
-    v.x *= d;
-    v.y *= d;
-#endif // GGML_CUDA_F16
-}
-
-//================================== k-quants
-
-template<typename dst_t>
-static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
-
-    const int i   = blockIdx.x;
-    const block_q2_K * x = (const block_q2_K *) vx;
-
-    const int tid = threadIdx.x;
-#if QK_K == 256
-    const int n   = tid/32;
-    const int l   = tid - 32*n;
-    const int is  = 8*n + l/16;
-
-    const uint8_t q = x[i].qs[32*n + l];
-    dst_t * y = yy + i*QK_K + 128*n;
-
-    float dall = __low2half(x[i].dm);
-    float dmin = __high2half(x[i].dm);
-    y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
-    y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
-    y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
-    y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
-#else
-    const int is = tid/16;  // 0 or 1
-    const int il = tid%16;  // 0...15
-    const uint8_t q = x[i].qs[il] >> (2*is);
-    dst_t * y = yy + i*QK_K + 16*is + il;
-    float dall = __low2half(x[i].dm);
-    float dmin = __high2half(x[i].dm);
-    y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
-    y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
-#endif
-
-}
-
-template<typename dst_t>
-static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
-
-    const int i = blockIdx.x;
-    const block_q3_K * x = (const block_q3_K *) vx;
-
-#if QK_K == 256
-    const int r = threadIdx.x/4;
-    const int tid = r/2;
-    const int is0 = r%2;
-    const int l0 = 16*is0 + 4*(threadIdx.x%4);
-    const int n = tid / 4;
-    const int j = tid - 4*n;
-
-    uint8_t m = 1 << (4*n + j);
-    int is = 8*n + 2*j + is0;
-    int shift = 2*j;
-
-    int8_t us = is <  4 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+8] >> 0) & 3) << 4) :
-                is <  8 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+4] >> 2) & 3) << 4) :
-                is < 12 ? (x[i].scales[is-8] >>  4) | (((x[i].scales[is+0] >> 4) & 3) << 4) :
-                          (x[i].scales[is-8] >>  4) | (((x[i].scales[is-4] >> 6) & 3) << 4);
-    float d_all = x[i].d;
-    float dl = d_all * (us - 32);
-
-    dst_t * y = yy + i*QK_K + 128*n + 32*j;
-    const uint8_t * q = x[i].qs + 32*n;
-    const uint8_t * hm = x[i].hmask;
-
-    for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
-#else
-    const int tid = threadIdx.x;
-    const int is  = tid/16;  // 0 or 1
-    const int il  = tid%16;  // 0...15
-    const int im  = il/8;    // 0...1
-    const int in  = il%8;    // 0...7
-
-    dst_t * y = yy + i*QK_K + 16*is + il;
-
-    const uint8_t q = x[i].qs[il] >> (2*is);
-    const uint8_t h = x[i].hmask[in] >> (2*is + im);
-    const float   d = (float)x[i].d;
-
-    if (is == 0) {
-        y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
-        y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
-    } else {
-        y[ 0] = d * ((x[i].scales[0] >>  4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
-        y[32] = d * ((x[i].scales[1] >>  4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
-    }
-#endif
-
-}
-
-#if QK_K == 256
-static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
-    if (j < 4) {
-        d = q[j] & 63; m = q[j + 4] & 63;
-    } else {
-        d = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4);
-        m = (q[j+4] >>  4) | ((q[j-0] >> 6) << 4);
-    }
-}
-#endif
-
-template<typename dst_t>
-static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
-    const block_q4_K * x = (const block_q4_K *) vx;
-
-    const int i = blockIdx.x;
-
-#if QK_K == 256
-    // assume 32 threads
-    const int tid = threadIdx.x;
-    const int il  = tid/8;
-    const int ir  = tid%8;
-    const int is  = 2*il;
-    const int n   = 4;
-
-    dst_t * y = yy + i*QK_K + 64*il + n*ir;
-
-    const float dall = __low2half(x[i].dm);
-    const float dmin = __high2half(x[i].dm);
-
-    const uint8_t * q = x[i].qs + 32*il + n*ir;
-
-    uint8_t sc, m;
-    get_scale_min_k4(is + 0, x[i].scales, sc, m);
-    const float d1 = dall * sc; const float m1 = dmin * m;
-    get_scale_min_k4(is + 1, x[i].scales, sc, m);
-    const float d2 = dall * sc; const float m2 = dmin * m;
-    for (int l = 0; l < n; ++l) {
-        y[l + 0] = d1 * (q[l] & 0xF) - m1;
-        y[l +32] = d2 * (q[l] >>  4) - m2;
-    }
-#else
-    const int tid = threadIdx.x;
-    const uint8_t * q = x[i].qs;
-    dst_t * y = yy + i*QK_K;
-    const float d = (float)x[i].dm[0];
-    const float m = (float)x[i].dm[1];
-    y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
-    y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >>  4) - m * (x[i].scales[1] >> 4);
-#endif
-}
-
-template<typename dst_t>
-static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
-    const block_q5_K * x = (const block_q5_K *) vx;
-
-    const int i = blockIdx.x;
-
-#if QK_K == 256
-    // assume 64 threads - this is very slightly better than the one below
-    const int tid = threadIdx.x;
-    const int il  = tid/16;   // il is in 0...3
-    const int ir  = tid%16;   // ir is in 0...15
-    const int is  = 2*il;     // is is in 0...6
-
-    dst_t * y = yy + i*QK_K + 64*il + 2*ir;
-
-    const float dall = __low2half(x[i].dm);
-    const float dmin = __high2half(x[i].dm);
-
-    const uint8_t * ql = x[i].qs + 32*il + 2*ir;
-    const uint8_t * qh = x[i].qh + 2*ir;
-
-    uint8_t sc, m;
-    get_scale_min_k4(is + 0, x[i].scales, sc, m);
-    const float d1 = dall * sc; const float m1 = dmin * m;
-    get_scale_min_k4(is + 1, x[i].scales, sc, m);
-    const float d2 = dall * sc; const float m2 = dmin * m;
-
-    uint8_t   hm  = 1 << (2*il);
-    y[ 0] = d1 * ((ql[ 0] & 0xF) + (qh[ 0] & hm ? 16 : 0)) - m1;
-    y[ 1] = d1 * ((ql[ 1] & 0xF) + (qh[ 1] & hm ? 16 : 0)) - m1;
-    hm <<= 1;
-    y[32] = d2 * ((ql[ 0] >>  4) + (qh[ 0] & hm ? 16 : 0)) - m2;
-    y[33] = d2 * ((ql[ 1] >>  4) + (qh[ 1] & hm ? 16 : 0)) - m2;
-#else
-    const int tid = threadIdx.x;
-    const uint8_t q = x[i].qs[tid];
-    const int im = tid/8;  // 0...3
-    const int in = tid%8;  // 0...7
-    const int is = tid/16; // 0 or 1
-    const uint8_t h = x[i].qh[in] >> im;
-    const float d = x[i].d;
-    dst_t * y = yy + i*QK_K + tid;
-    y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16));
-    y[32] = d * x[i].scales[is+2] * ((q >>  4) - ((h >> 4) & 1 ? 0 : 16));
-#endif
-}
-
-template<typename dst_t>
-static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
-    const block_q6_K * x = (const block_q6_K *) vx;
-
-    const int i = blockIdx.x;
-#if QK_K == 256
-
-    // assume 64 threads - this is very slightly better than the one below
-    const int tid = threadIdx.x;
-    const int ip  = tid/32;   // ip is 0 or 1
-    const int il  = tid - 32*ip; // 0...32
-    const int is  = 8*ip + il/16;
-
-    dst_t * y = yy + i*QK_K + 128*ip + il;
-
-    const float d = x[i].d;
-
-    const uint8_t * ql = x[i].ql + 64*ip + il;
-    const uint8_t   qh = x[i].qh[32*ip + il];
-    const int8_t  * sc = x[i].scales + is;
-
-    y[ 0] = d * sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
-    y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
-    y[64] = d * sc[4] * ((int8_t)((ql[ 0]  >> 4) | (((qh >> 4) & 3) << 4)) - 32);
-    y[96] = d * sc[6] * ((int8_t)((ql[32]  >> 4) | (((qh >> 6) & 3) << 4)) - 32);
-#else
-
-    // assume 32 threads
-    const int tid = threadIdx.x;
-    const int ip  = tid/16;         // 0 or 1
-    const int il  = tid - 16*ip;    // 0...15
-
-    dst_t * y = yy + i*QK_K + 16*ip + il;
-
-    const float d = x[i].d;
-
-    const uint8_t   ql = x[i].ql[16*ip + il];
-    const uint8_t   qh = x[i].qh[il] >> (2*ip);
-    const int8_t  * sc = x[i].scales;
-
-    y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
-    y[32] = d * sc[ip+2] * ((int8_t)((ql  >> 4) | (((qh >> 4) & 3) << 4)) - 32);
-#endif
-}
-
-static const __device__ uint64_t iq2xxs_grid[256] = {
-    0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
-    0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x08080808082b0808,
-    0x08080808082b082b, 0x08080808082b2b08, 0x08080808082b2b2b, 0x0808080819080819,
-    0x0808080819081908, 0x0808080819190808, 0x0808080819192b08, 0x08080808192b0819,
-    0x08080808192b1908, 0x080808082b080808, 0x080808082b08082b, 0x080808082b082b2b,
-    0x080808082b2b082b, 0x0808081908080819, 0x0808081908081908, 0x0808081908190808,
-    0x0808081908191919, 0x0808081919080808, 0x080808192b081908, 0x080808192b192b08,
-    0x0808082b08080808, 0x0808082b0808082b, 0x0808082b082b082b, 0x0808082b2b08082b,
-    0x0808190808080819, 0x0808190808081908, 0x0808190808190808, 0x08081908082b0819,
-    0x08081908082b1908, 0x0808190819080808, 0x080819081908082b, 0x0808190819082b08,
-    0x08081908192b0808, 0x080819082b080819, 0x080819082b081908, 0x080819082b190808,
-    0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b, 0x0808191908082b08,
-    0x08081919082b0808, 0x080819191908192b, 0x08081919192b2b19, 0x080819192b080808,
-    0x080819192b190819, 0x0808192b08082b19, 0x0808192b08190808, 0x0808192b19080808,
-    0x0808192b2b081908, 0x0808192b2b2b1908, 0x08082b0808080808, 0x08082b0808081919,
-    0x08082b0808082b08, 0x08082b0808191908, 0x08082b08082b2b08, 0x08082b0819080819,
-    0x08082b0819081908, 0x08082b0819190808, 0x08082b081919082b, 0x08082b082b082b08,
-    0x08082b1908081908, 0x08082b1919080808, 0x08082b2b0808082b, 0x08082b2b08191908,
-    0x0819080808080819, 0x0819080808081908, 0x0819080808190808, 0x08190808082b0819,
-    0x0819080819080808, 0x08190808192b0808, 0x081908082b081908, 0x081908082b190808,
-    0x081908082b191919, 0x0819081908080808, 0x0819081908082b08, 0x08190819082b0808,
-    0x0819081919190808, 0x0819081919192b2b, 0x081908192b080808, 0x0819082b082b1908,
-    0x0819082b19081919, 0x0819190808080808, 0x0819190808082b08, 0x08191908082b0808,
-    0x08191908082b1919, 0x0819190819082b19, 0x081919082b080808, 0x0819191908192b08,
-    0x08191919192b082b, 0x0819192b08080808, 0x0819192b0819192b, 0x08192b0808080819,
-    0x08192b0808081908, 0x08192b0808190808, 0x08192b0819080808, 0x08192b082b080819,
-    0x08192b1908080808, 0x08192b1908081919, 0x08192b192b2b0808, 0x08192b2b19190819,
-    0x082b080808080808, 0x082b08080808082b, 0x082b080808082b2b, 0x082b080819081908,
-    0x082b0808192b0819, 0x082b08082b080808, 0x082b08082b08082b, 0x082b0819082b2b19,
-    0x082b081919082b08, 0x082b082b08080808, 0x082b082b0808082b, 0x082b190808080819,
-    0x082b190808081908, 0x082b190808190808, 0x082b190819080808, 0x082b19081919192b,
-    0x082b191908080808, 0x082b191919080819, 0x082b1919192b1908, 0x082b192b2b190808,
-    0x082b2b0808082b08, 0x082b2b08082b0808, 0x082b2b082b191908, 0x082b2b2b19081908,
-    0x1908080808080819, 0x1908080808081908, 0x1908080808190808, 0x1908080808192b08,
-    0x19080808082b0819, 0x19080808082b1908, 0x1908080819080808, 0x1908080819082b08,
-    0x190808081919192b, 0x19080808192b0808, 0x190808082b080819, 0x190808082b081908,
-    0x190808082b190808, 0x1908081908080808, 0x19080819082b0808, 0x19080819192b0819,
-    0x190808192b080808, 0x190808192b081919, 0x1908082b08080819, 0x1908082b08190808,
-    0x1908082b19082b08, 0x1908082b1919192b, 0x1908082b192b2b08, 0x1908190808080808,
-    0x1908190808082b08, 0x19081908082b0808, 0x190819082b080808, 0x190819082b192b19,
-    0x190819190819082b, 0x19081919082b1908, 0x1908192b08080808, 0x19082b0808080819,
-    0x19082b0808081908, 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919,
-    0x19082b1908080808, 0x19082b1919192b08, 0x19082b19192b0819, 0x19082b192b08082b,
-    0x19082b2b19081919, 0x19082b2b2b190808, 0x1919080808080808, 0x1919080808082b08,
-    0x1919080808190819, 0x1919080808192b19, 0x19190808082b0808, 0x191908082b080808,
-    0x191908082b082b08, 0x1919081908081908, 0x191908191908082b, 0x191908192b2b1908,
-    0x1919082b2b190819, 0x191919082b190808, 0x191919082b19082b, 0x1919191908082b2b,
-    0x1919192b08080819, 0x1919192b19191908, 0x19192b0808080808, 0x19192b0808190819,
-    0x19192b0808192b19, 0x19192b08192b1908, 0x19192b1919080808, 0x19192b2b08082b08,
-    0x192b080808081908, 0x192b080808190808, 0x192b080819080808, 0x192b0808192b2b08,
-    0x192b081908080808, 0x192b081919191919, 0x192b082b08192b08, 0x192b082b192b0808,
-    0x192b190808080808, 0x192b190808081919, 0x192b191908190808, 0x192b19190819082b,
-    0x192b19192b081908, 0x192b2b081908082b, 0x2b08080808080808, 0x2b0808080808082b,
-    0x2b08080808082b2b, 0x2b08080819080819, 0x2b0808082b08082b, 0x2b08081908081908,
-    0x2b08081908192b08, 0x2b08081919080808, 0x2b08082b08190819, 0x2b08190808080819,
-    0x2b08190808081908, 0x2b08190808190808, 0x2b08190808191919, 0x2b08190819080808,
-    0x2b081908192b0808, 0x2b08191908080808, 0x2b0819191908192b, 0x2b0819192b191908,
-    0x2b08192b08082b19, 0x2b08192b19080808, 0x2b08192b192b0808, 0x2b082b080808082b,
-    0x2b082b1908081908, 0x2b082b2b08190819, 0x2b19080808081908, 0x2b19080808190808,
-    0x2b190808082b1908, 0x2b19080819080808, 0x2b1908082b2b0819, 0x2b1908190819192b,
-    0x2b1908192b080808, 0x2b19082b19081919, 0x2b19190808080808, 0x2b191908082b082b,
-    0x2b19190819081908, 0x2b19191919190819, 0x2b192b082b080819, 0x2b192b19082b0808,
-    0x2b2b08080808082b, 0x2b2b080819190808, 0x2b2b08082b081919, 0x2b2b081908082b19,
-    0x2b2b082b08080808, 0x2b2b190808192b08, 0x2b2b2b0819190808, 0x2b2b2b1908081908,
-};
-
-static const __device__ uint64_t iq2xs_grid[512] = {
-    0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
-    0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b,
-    0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
-    0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b,
-    0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919,
-    0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x080808082b080808,
-    0x080808082b08082b, 0x080808082b081919, 0x080808082b082b08, 0x080808082b190819,
-    0x080808082b191908, 0x080808082b192b19, 0x080808082b2b0808, 0x0808081908080819,
-    0x0808081908081908, 0x080808190808192b, 0x0808081908082b19, 0x0808081908190808,
-    0x080808190819082b, 0x0808081908191919, 0x0808081908192b08, 0x0808081908192b2b,
-    0x08080819082b0819, 0x08080819082b1908, 0x0808081919080808, 0x080808191908082b,
-    0x0808081919081919, 0x0808081919082b08, 0x0808081919190819, 0x0808081919191908,
-    0x08080819192b0808, 0x08080819192b2b08, 0x080808192b080819, 0x080808192b081908,
-    0x080808192b190808, 0x0808082b08080808, 0x0808082b0808082b, 0x0808082b08081919,
-    0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908, 0x0808082b082b0808,
-    0x0808082b19080819, 0x0808082b19081908, 0x0808082b19190808, 0x0808082b19191919,
-    0x0808082b2b080808, 0x0808082b2b082b2b, 0x0808190808080819, 0x0808190808081908,
-    0x080819080808192b, 0x0808190808082b19, 0x0808190808190808, 0x080819080819082b,
-    0x0808190808191919, 0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908,
-    0x0808190819080808, 0x080819081908082b, 0x0808190819081919, 0x0808190819082b08,
-    0x0808190819190819, 0x0808190819191908, 0x080819081919192b, 0x08081908192b0808,
-    0x080819082b080819, 0x080819082b081908, 0x080819082b190808, 0x0808191908080808,
-    0x080819190808082b, 0x0808191908081919, 0x0808191908082b08, 0x0808191908190819,
-    0x0808191908191908, 0x08081919082b0808, 0x0808191919080819, 0x0808191919081908,
-    0x0808191919190808, 0x08081919192b0819, 0x080819192b080808, 0x0808192b08080819,
-    0x0808192b08081908, 0x0808192b08190808, 0x0808192b082b192b, 0x0808192b19080808,
-    0x0808192b1908082b, 0x0808192b2b081908, 0x08082b0808080808, 0x08082b080808082b,
-    0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808082b2b, 0x08082b0808190819,
-    0x08082b0808191908, 0x08082b08082b0808, 0x08082b08082b1919, 0x08082b0819080819,
-    0x08082b0819081908, 0x08082b0819190808, 0x08082b0819192b08, 0x08082b082b080808,
-    0x08082b082b2b0808, 0x08082b082b2b2b2b, 0x08082b1908080819, 0x08082b1908081908,
-    0x08082b1908190808, 0x08082b1919080808, 0x08082b192b080819, 0x08082b192b082b19,
-    0x08082b2b08080808, 0x08082b2b082b0808, 0x08082b2b082b2b08, 0x08082b2b2b19192b,
-    0x08082b2b2b2b0808, 0x0819080808080819, 0x0819080808081908, 0x081908080808192b,
-    0x0819080808082b19, 0x0819080808190808, 0x081908080819082b, 0x0819080808191919,
-    0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908, 0x0819080819080808,
-    0x081908081908082b, 0x0819080819081919, 0x0819080819082b08, 0x0819080819190819,
-    0x0819080819191908, 0x08190808192b0808, 0x08190808192b2b2b, 0x081908082b080819,
-    0x081908082b081908, 0x081908082b190808, 0x0819081908080808, 0x081908190808082b,
-    0x0819081908081919, 0x0819081908082b08, 0x0819081908190819, 0x0819081908191908,
-    0x08190819082b0808, 0x0819081919080819, 0x0819081919081908, 0x0819081919190808,
-    0x081908192b080808, 0x081908192b191908, 0x081908192b19192b, 0x0819082b08080819,
-    0x0819082b08081908, 0x0819082b0808192b, 0x0819082b08190808, 0x0819082b19080808,
-    0x0819082b192b0808, 0x0819190808080808, 0x081919080808082b, 0x0819190808081919,
-    0x0819190808082b08, 0x0819190808190819, 0x0819190808191908, 0x08191908082b0808,
-    0x0819190819080819, 0x0819190819081908, 0x0819190819082b19, 0x0819190819190808,
-    0x08191908192b1908, 0x081919082b080808, 0x0819191908080819, 0x0819191908081908,
-    0x0819191908190808, 0x0819191919080808, 0x0819192b08080808, 0x0819192b08191908,
-    0x0819192b19082b19, 0x08192b0808080819, 0x08192b0808081908, 0x08192b0808190808,
-    0x08192b080819082b, 0x08192b0819080808, 0x08192b0819191908, 0x08192b082b08192b,
-    0x08192b1908080808, 0x08192b1908081919, 0x08192b19192b192b, 0x08192b2b19190819,
-    0x08192b2b2b2b2b19, 0x082b080808080808, 0x082b08080808082b, 0x082b080808081919,
-    0x082b080808082b08, 0x082b080808082b2b, 0x082b080808190819, 0x082b080808191908,
-    0x082b0808082b0808, 0x082b080819080819, 0x082b080819081908, 0x082b080819190808,
-    0x082b08082b080808, 0x082b08082b2b0808, 0x082b081908080819, 0x082b081908081908,
-    0x082b081908190808, 0x082b081919080808, 0x082b081919082b08, 0x082b0819192b1919,
-    0x082b082b08080808, 0x082b082b082b082b, 0x082b082b2b080808, 0x082b082b2b2b2b08,
-    0x082b190808080819, 0x082b190808081908, 0x082b190808190808, 0x082b1908082b2b19,
-    0x082b190819080808, 0x082b191908080808, 0x082b191919080819, 0x082b19191919082b,
-    0x082b19192b192b19, 0x082b192b08080819, 0x082b192b08192b2b, 0x082b192b2b2b192b,
-    0x082b2b0808080808, 0x082b2b0808082b08, 0x082b2b0808082b2b, 0x082b2b08082b0808,
-    0x082b2b0819191919, 0x082b2b082b082b08, 0x082b2b082b2b082b, 0x082b2b19192b2b08,
-    0x082b2b192b190808, 0x082b2b2b08082b08, 0x082b2b2b082b0808, 0x082b2b2b2b08082b,
-    0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819, 0x1908080808081908,
-    0x190808080808192b, 0x1908080808082b19, 0x1908080808190808, 0x190808080819082b,
-    0x1908080808191919, 0x1908080808192b08, 0x19080808082b0819, 0x19080808082b1908,
-    0x1908080819080808, 0x190808081908082b, 0x1908080819081919, 0x1908080819082b08,
-    0x1908080819082b2b, 0x1908080819190819, 0x1908080819191908, 0x19080808192b0808,
-    0x19080808192b1919, 0x190808082b080819, 0x190808082b081908, 0x190808082b190808,
-    0x1908081908080808, 0x190808190808082b, 0x1908081908081919, 0x1908081908082b08,
-    0x1908081908190819, 0x1908081908191908, 0x19080819082b0808, 0x1908081919080819,
-    0x1908081919081908, 0x1908081919190808, 0x190808192b080808, 0x190808192b081919,
-    0x190808192b2b082b, 0x1908082b08080819, 0x1908082b08081908, 0x1908082b08190808,
-    0x1908082b0819082b, 0x1908082b082b2b19, 0x1908082b19080808, 0x1908190808080808,
-    0x190819080808082b, 0x1908190808081919, 0x1908190808082b08, 0x1908190808190819,
-    0x1908190808191908, 0x1908190808192b19, 0x19081908082b0808, 0x1908190819080819,
-    0x1908190819081908, 0x1908190819190808, 0x190819082b080808, 0x190819082b191908,
-    0x1908191908080819, 0x1908191908081908, 0x1908191908190808, 0x19081919082b1908,
-    0x1908191919080808, 0x190819192b192b2b, 0x1908192b08080808, 0x1908192b08082b2b,
-    0x1908192b19081908, 0x1908192b19190808, 0x19082b0808080819, 0x19082b0808081908,
-    0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919, 0x19082b0819191908,
-    0x19082b08192b082b, 0x19082b1908080808, 0x19082b1908190819, 0x19082b1919081908,
-    0x19082b1919190808, 0x19082b19192b2b19, 0x19082b2b08081908, 0x1919080808080808,
-    0x191908080808082b, 0x1919080808081919, 0x1919080808082b08, 0x1919080808190819,
-    0x1919080808191908, 0x19190808082b0808, 0x19190808082b2b08, 0x1919080819080819,
-    0x1919080819081908, 0x1919080819190808, 0x191908082b080808, 0x1919081908080819,
-    0x1919081908081908, 0x1919081908190808, 0x1919081908191919, 0x1919081919080808,
-    0x191908191908082b, 0x1919082b08080808, 0x1919082b19081908, 0x1919082b2b2b2b2b,
-    0x1919190808080819, 0x1919190808081908, 0x1919190808190808, 0x19191908082b0819,
-    0x1919190819080808, 0x19191908192b0808, 0x191919082b080819, 0x191919082b2b0819,
-    0x1919191908080808, 0x1919191908082b08, 0x191919192b080808, 0x191919192b082b08,
-    0x1919192b082b0819, 0x1919192b192b2b08, 0x1919192b2b2b0819, 0x19192b0808080808,
-    0x19192b0808191908, 0x19192b0819080819, 0x19192b0819190808, 0x19192b082b192b19,
-    0x19192b1908192b2b, 0x19192b1919080808, 0x19192b191908082b, 0x19192b2b2b081919,
-    0x192b080808080819, 0x192b080808081908, 0x192b080808190808, 0x192b080819080808,
-    0x192b080819191908, 0x192b0808192b082b, 0x192b08082b08192b, 0x192b08082b2b2b19,
-    0x192b081908080808, 0x192b082b082b1908, 0x192b082b19082b2b, 0x192b082b2b19082b,
-    0x192b190808080808, 0x192b19080819192b, 0x192b191908190808, 0x192b191919080808,
-    0x192b191919081919, 0x192b19192b2b1908, 0x192b2b0808080819, 0x192b2b08192b2b2b,
-    0x192b2b19082b1919, 0x192b2b2b0808192b, 0x192b2b2b19191908, 0x192b2b2b192b082b,
-    0x2b08080808080808, 0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08,
-    0x2b08080808190819, 0x2b08080808191908, 0x2b080808082b0808, 0x2b080808082b2b2b,
-    0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808082b080808,
-    0x2b0808082b08082b, 0x2b0808082b2b2b08, 0x2b0808082b2b2b2b, 0x2b08081908080819,
-    0x2b08081908081908, 0x2b0808190808192b, 0x2b08081908190808, 0x2b08081919080808,
-    0x2b08081919190819, 0x2b08081919192b19, 0x2b08082b08080808, 0x2b08082b082b0808,
-    0x2b08082b2b080808, 0x2b08082b2b08082b, 0x2b08082b2b2b0808, 0x2b08082b2b2b2b08,
-    0x2b08190808080819, 0x2b08190808081908, 0x2b08190808190808, 0x2b0819080819082b,
-    0x2b08190808191919, 0x2b08190819080808, 0x2b081908192b0808, 0x2b0819082b082b19,
-    0x2b08191908080808, 0x2b08191919081908, 0x2b0819192b2b1919, 0x2b08192b08192b08,
-    0x2b08192b192b2b2b, 0x2b082b0808080808, 0x2b082b0808082b08, 0x2b082b08082b1919,
-    0x2b082b0819192b2b, 0x2b082b082b080808, 0x2b082b082b08082b, 0x2b082b082b2b2b08,
-    0x2b082b190808192b, 0x2b082b2b082b082b, 0x2b082b2b2b080808, 0x2b082b2b2b082b08,
-    0x2b082b2b2b19192b, 0x2b082b2b2b2b2b08, 0x2b19080808080819, 0x2b19080808081908,
-    0x2b19080808190808, 0x2b19080819080808, 0x2b1908081919192b, 0x2b1908082b081908,
-    0x2b19081908080808, 0x2b190819082b082b, 0x2b190819192b1908, 0x2b19082b1919192b,
-    0x2b19082b2b082b19, 0x2b19190808080808, 0x2b19190808081919, 0x2b19190819081908,
-    0x2b19190819190808, 0x2b19190819192b08, 0x2b191919082b2b19, 0x2b1919192b190808,
-    0x2b1919192b19082b, 0x2b19192b19080819, 0x2b192b0819190819, 0x2b192b082b2b192b,
-    0x2b192b1919082b19, 0x2b192b2b08191919, 0x2b192b2b192b0808, 0x2b2b080808080808,
-    0x2b2b08080808082b, 0x2b2b080808082b08, 0x2b2b080808082b2b, 0x2b2b0808082b0808,
-    0x2b2b0808082b2b2b, 0x2b2b08082b2b0808, 0x2b2b081919190819, 0x2b2b081919192b19,
-    0x2b2b08192b2b192b, 0x2b2b082b08080808, 0x2b2b082b0808082b, 0x2b2b082b08082b08,
-    0x2b2b082b082b2b2b, 0x2b2b082b2b080808, 0x2b2b082b2b2b0808, 0x2b2b190819080808,
-    0x2b2b19082b191919, 0x2b2b192b192b1919, 0x2b2b192b2b192b08, 0x2b2b2b0808082b2b,
-    0x2b2b2b08082b0808, 0x2b2b2b08082b082b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b0808,
-    0x2b2b2b082b2b2b08, 0x2b2b2b1908081908, 0x2b2b2b192b081908, 0x2b2b2b192b08192b,
-    0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
-};
-
-static const __device__ uint8_t ksigns_iq2xs[128] = {
-      0, 129, 130,   3, 132,   5,   6, 135, 136,   9,  10, 139,  12, 141, 142,  15,
-    144,  17,  18, 147,  20, 149, 150,  23,  24, 153, 154,  27, 156,  29,  30, 159,
-    160,  33,  34, 163,  36, 165, 166,  39,  40, 169, 170,  43, 172,  45,  46, 175,
-     48, 177, 178,  51, 180,  53,  54, 183, 184,  57,  58, 187,  60, 189, 190,  63,
-    192,  65,  66, 195,  68, 197, 198,  71,  72, 201, 202,  75, 204,  77,  78, 207,
-     80, 209, 210,  83, 212,  85,  86, 215, 216,  89,  90, 219,  92, 221, 222,  95,
-     96, 225, 226,  99, 228, 101, 102, 231, 232, 105, 106, 235, 108, 237, 238, 111,
-    240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255,
-};
-
-static const __device__ uint8_t kmask_iq2xs[8] = {1, 2, 4, 8, 16, 32, 64, 128};
-
-inline bool ggml_cuda_supports_mmq(enum ggml_type type) {
-    switch (type) {
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-        case GGML_TYPE_Q5_0:
-        case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q8_0:
-        case GGML_TYPE_Q2_K:
-        case GGML_TYPE_Q3_K:
-        case GGML_TYPE_Q4_K:
-        case GGML_TYPE_Q5_K:
-        case GGML_TYPE_Q6_K:
-            return true;
-        default:
-            return false;
-    }
-}
-
-template<typename dst_t>
-static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
-
-    const int i   = blockIdx.x;
-    const block_iq2_xxs * x = (const block_iq2_xxs  *) vx;
-
-    const int tid = threadIdx.x;
-#if QK_K == 256
-    const int il = tid/8; // 0...3
-    const int ib = tid%8; // 0...7
-    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
-    const uint16_t * q2 = x[i].qs + 4*ib;
-    const uint8_t  * aux8 = (const uint8_t *)q2;
-    const uint8_t  * grid = (const uint8_t *)(iq2xxs_grid + aux8[il]);
-    const uint32_t aux32 = q2[2] | (q2[3] << 16);
-    const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.25f;
-    const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127];
-    for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
-#else
-    assert(false);
-#endif
-
-}
-
-template<typename dst_t>
-static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
-
-    const int i   = blockIdx.x;
-    const block_iq2_xs * x = (const block_iq2_xs *) vx;
-
-    const int tid = threadIdx.x;
-#if QK_K == 256
-    const int il = tid/8; // 0...3
-    const int ib = tid%8; // 0...7
-    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
-    const uint16_t * q2 = x[i].qs + 4*ib;
-    const uint8_t  * grid = (const uint8_t *)(iq2xs_grid + (q2[il] & 511));
-    const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
-    const uint8_t signs = ksigns_iq2xs[q2[il] >> 9];
-    for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
-#else
-    assert(false);
-#endif
-
-}
-
-static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
-
-    static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
-
-    const int row = blockIdx.x*blockDim.y + threadIdx.y;
-    if (row > nrows) return;
-
-    const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
-
-    const block_q2_K * x = (const block_q2_K *)vx + ib0;
-
-    float tmp = 0; // partial sum for thread in warp
-
-#if QK_K == 256
-    const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...15
-    const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0,1
-
-    const int step = 16/K_QUANTS_PER_ITERATION;
-
-    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
-    const int in = tid - step*im;                        // 0...15 or 0...7
-
-    const int l0 = K_QUANTS_PER_ITERATION*in;            // 0...15 or 0...14 in steps of 2
-    const int q_offset = 32*im + l0;
-    const int s_offset = 8*im;
-    const int y_offset = 128*im + l0;
-
-    uint32_t aux[4];
-    const uint8_t * d = (const uint8_t *)aux;
-    const uint8_t * m = (const uint8_t *)(aux + 2);
-
-    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
-
-        const float   * y = yy + i * QK_K + y_offset;
-        const uint8_t * q = x[i].qs + q_offset;
-
-        const float dall = __low2half(x[i].dm);
-        const float dmin = __high2half(x[i].dm);
-
-        const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
-        aux[0] = a[0] & 0x0f0f0f0f;
-        aux[1] = a[1] & 0x0f0f0f0f;
-        aux[2] = (a[0] >> 4) & 0x0f0f0f0f;
-        aux[3] = (a[1] >> 4) & 0x0f0f0f0f;
-
-        float sum1 = 0, sum2 = 0;
-        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
-            sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3)
-                  + y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3)
-                  + y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3)
-                  + y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3)
-                  + y[l+16] * d[1] * ((q[l+16] >> 0) & 3)
-                  + y[l+48] * d[3] * ((q[l+16] >> 2) & 3)
-                  + y[l+80] * d[5] * ((q[l+16] >> 4) & 3)
-                  +y[l+112] * d[7] * ((q[l+16] >> 6) & 3);
-            sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6]
-                  + y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7];
-
-        }
-        tmp += dall * sum1 - dmin * sum2;
-
-    }
-#else
-    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15 or 0...7
-    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);  // 0....1 or 0...3
-    const int offset = tid * K_QUANTS_PER_ITERATION;
-
-    uint32_t uaux[2];
-    const uint8_t * d = (const uint8_t *)uaux;
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-
-        const float   * y = yy + i * QK_K + offset;
-        const uint8_t * q = x[i].qs + offset;
-        const uint32_t * s = (const uint32_t *)x[i].scales;
-
-        uaux[0] = s[0] & 0x0f0f0f0f;
-        uaux[1] = (s[0] >> 4) & 0x0f0f0f0f;
-
-        const float2 dall = __half22float2(x[i].dm);
-
-        float sum1 = 0, sum2 = 0;
-        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
-            const uint8_t ql = q[l];
-            sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3)
-                  + y[l+16] * d[1] * ((ql >> 2) & 3)
-                  + y[l+32] * d[2] * ((ql >> 4) & 3)
-                  + y[l+48] * d[3] * ((ql >> 6) & 3);
-            sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7];
-        }
-        tmp += dall.x * sum1 - dall.y * sum2;
-    }
-#endif
-
-    // sum up partial sums and write back result
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
-    }
-
-    if (threadIdx.x == 0) {
-        dst[row] = tmp;
-    }
-}
-
-static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
-
-    const int row = blockIdx.x*blockDim.y + threadIdx.y;
-    if (row > nrows) return;
-
-    const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
-
-    const block_q3_K * x = (const block_q3_K *)vx + ib0;
-
-    float tmp = 0; // partial sum for thread in warp
-
-#if QK_K == 256
-
-    const uint16_t kmask1 = 0x0303;
-    const uint16_t kmask2 = 0x0f0f;
-
-    const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...16
-    const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0,1
-
-    const int n  = K_QUANTS_PER_ITERATION;               // iterations in the inner loop
-    const int step = 16/K_QUANTS_PER_ITERATION;
-    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
-    const int in = tid - step*im;                        // 0....15 or 0...7
-
-    const uint8_t m = 1 << (4*im);
-
-    const int l0 = n*in;                                 // 0...15 or 0...14 in steps of 2
-    const int q_offset =  32*im + l0;
-    const int y_offset = 128*im + l0;
-
-    uint16_t utmp[4];
-    const int8_t * s = (const int8_t *)utmp;
-
-    const uint16_t s_shift = 4*im;
-
-    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
-
-        const float   * y  = yy + i * QK_K + y_offset;
-        const uint8_t * q = x[i].qs + q_offset;
-        const uint8_t * h = x[i].hmask + l0;
-
-        const uint16_t * a = (const uint16_t *)x[i].scales;
-        utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4);
-        utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4);
-        utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4);
-        utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4);
-
-        const float d = x[i].d;
-
-        float sum = 0;
-        for (int l = 0; l < n; ++l) {
-            sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4))
-                 + y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4))
-                 + y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4))
-                 + y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4));
-            sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4))
-                 + y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4))
-                 + y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4))
-                + y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4));
-        }
-        tmp += d * sum;
-
-    }
-#else
-
-    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15 or 0...7
-    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);  // 0....1 or 0...3
-    const int offset = tid * K_QUANTS_PER_ITERATION;         // 0...15 or 0...14
-    const int in = offset/8;                                 // 0 or 1
-    const int im = offset%8;                                 // 0...7
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-
-        const float   * y = yy + i * QK_K + offset;
-        const uint8_t * q = x[i].qs + offset;
-        const uint8_t * s = x[i].scales;
-
-        const float dall = (float)x[i].d;
-
-        float sum = 0;
-        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
-            const uint8_t hl = x[i].hmask[im+l] >> in;
-            const uint8_t ql = q[l];
-            sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4))
-                 + y[l+16] * dall * ((s[0] >>  4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4))
-                 + y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4))
-                 + y[l+48] * dall * ((s[1] >>  4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 0 : 4));
-        }
-        tmp += sum;
-    }
-#endif
-
-    // sum up partial sums and write back result
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
-    }
-
-    if (threadIdx.x == 0) {
-        dst[row] = tmp;
-    }
-}
-
-static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
-
-    const int row = blockIdx.x*blockDim.y + threadIdx.y;
-    if (row > nrows) return;
-    const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
-
-    const block_q4_K * x = (const block_q4_K *)vx + ib0;
-
-#if QK_K == 256
-    const uint16_t kmask1 = 0x3f3f;
-    const uint16_t kmask2 = 0x0f0f;
-    const uint16_t kmask3 = 0xc0c0;
-
-    const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...16
-    const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0,1
-
-    const int step = 8/K_QUANTS_PER_ITERATION;           // 8 or 4
-
-    const int il  = tid/step;                            // 0...3
-    const int ir  = tid - step*il;                       // 0...7 or 0...3
-    const int n   = 2 * K_QUANTS_PER_ITERATION;          // 2 or 4
-
-    const int im = il/2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
-    const int in = il%2;
-
-    const int l0 = n*(2*ir + in);
-    const int q_offset = 32*im + l0;
-    const int y_offset = 64*im + l0;
-
-    uint16_t aux[4];
-    const uint8_t * sc = (const uint8_t *)aux;
-
-#if K_QUANTS_PER_ITERATION == 2
-    uint32_t q32[4];
-    const uint8_t * q4 = (const uint8_t *)q32;
-#else
-    uint16_t q16[4];
-    const uint8_t * q4 = (const uint8_t *)q16;
-#endif
-
-    float tmp = 0; // partial sum for thread in warp
-
-    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
-
-        const float   * y1 = yy + i*QK_K + y_offset;
-        const float   * y2 = y1 + 128;
-
-        const float dall = __low2half(x[i].dm);
-        const float dmin = __high2half(x[i].dm);
-
-        const uint16_t * a = (const uint16_t *)x[i].scales;
-        aux[0] = a[im+0] & kmask1;
-        aux[1] = a[im+2] & kmask1;
-        aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
-        aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
-
-#if K_QUANTS_PER_ITERATION == 2
-        const uint32_t * q1 = (const uint32_t *)(x[i].qs + q_offset);
-        const uint32_t * q2 = q1 + 16;
-
-        q32[0] = q1[0] & 0x0f0f0f0f;
-        q32[1] = q1[0] & 0xf0f0f0f0;
-        q32[2] = q2[0] & 0x0f0f0f0f;
-        q32[3] = q2[0] & 0xf0f0f0f0;
-
-        float4 s = {0.f, 0.f, 0.f, 0.f};
-        float smin = 0;
-        for (int l = 0; l < 4; ++l) {
-            s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+ 4];
-            s.z += y2[l] * q4[l+8]; s.w += y2[l+32] * q4[l+12];
-            smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
-        }
-        tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
-#else
-        const uint16_t * q1 = (const uint16_t *)(x[i].qs + q_offset);
-        const uint16_t * q2 = q1 + 32;
-
-        q16[0] = q1[0] & 0x0f0f;
-        q16[1] = q1[0] & 0xf0f0;
-        q16[2] = q2[0] & 0x0f0f;
-        q16[3] = q2[0] & 0xf0f0;
-
-        float4 s = {0.f, 0.f, 0.f, 0.f};
-        float smin = 0;
-        for (int l = 0; l < 2; ++l) {
-            s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+2];
-            s.z += y2[l] * q4[l+4]; s.w += y2[l+32] * q4[l+6];
-            smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
-        }
-        tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
-#endif
-
-    }
-#else
-    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15
-    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);
-
-    const int step = tid * K_QUANTS_PER_ITERATION;
-
-    uint16_t aux16[2];
-    const uint8_t * s = (const uint8_t *)aux16;
-
-    float tmp = 0;
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-        const uint8_t * q = x[i].qs + step;
-        const float   * y = yy + i*QK_K + step;
-        const uint16_t * a = (const uint16_t *)x[i].scales;
-        aux16[0] = a[0] & 0x0f0f;
-        aux16[1] = (a[0] >> 4) & 0x0f0f;
-        const float d = (float)x[i].dm[0];
-        const float m = (float)x[i].dm[1];
-        float sum = 0.f;
-        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
-            sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
-                 + y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2])
-                 + y[j+32] * (d * s[1] * (q[j+ 0] >>  4) - m * s[3])
-                 + y[j+48] * (d * s[1] * (q[j+16] >>  4) - m * s[3]);
-        }
-        tmp += sum;
-    }
-
-#endif
-
-    // sum up partial sums and write back result
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
-    }
-
-    if (tid == 0) {
-        dst[row] = tmp;
-    }
-}
-
-static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols) {
-
-    const int row = blockIdx.x;
-    const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
-
-    const block_q5_K * x = (const block_q5_K *)vx + ib0;
-
-    float tmp = 0; // partial sum for thread in warp
-
-#if QK_K == 256
-    const uint16_t kmask1 = 0x3f3f;
-    const uint16_t kmask2 = 0x0f0f;
-    const uint16_t kmask3 = 0xc0c0;
-
-    const int tid = threadIdx.x/2;  // 0...15
-    const int ix  = threadIdx.x%2;
-
-    const int il  = tid/4;     // 0...3
-    const int ir  = tid - 4*il;// 0...3
-    const int n   = 2;
-
-    const int im = il/2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
-    const int in = il%2;
-
-    const int l0 = n*(2*ir + in);
-    const int q_offset = 32*im + l0;
-    const int y_offset = 64*im + l0;
-
-    const uint8_t hm1  = 1 << (2*im);
-    const uint8_t hm2  = hm1 << 4;
-
-    uint16_t aux[4];
-    const uint8_t * sc = (const uint8_t *)aux;
-
-    uint16_t q16[8];
-    const uint8_t * q4 = (const uint8_t *)q16;
-
-    for (int i = ix; i < num_blocks_per_row; i += 2) {
-
-        const uint8_t * ql1 = x[i].qs + q_offset;
-        const uint8_t * qh  = x[i].qh + l0;
-        const float   * y1  = yy + i*QK_K + y_offset;
-        const float   * y2  = y1 + 128;
-
-        const float dall = __low2half(x[i].dm);
-        const float dmin = __high2half(x[i].dm);
-
-        const uint16_t * a = (const uint16_t *)x[i].scales;
-        aux[0] = a[im+0] & kmask1;
-        aux[1] = a[im+2] & kmask1;
-        aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
-        aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
-
-        float4 sum = {0.f, 0.f, 0.f, 0.f};
-        float smin = 0;
-        const uint16_t * q1 = (const uint16_t *)ql1;
-        const uint16_t * q2 = q1 + 32;
-        q16[0] = q1[0] & 0x0f0f;
-        q16[1] = q1[8] & 0x0f0f;
-        q16[2] = (q1[0] >> 4) & 0x0f0f;
-        q16[3] = (q1[8] >> 4) & 0x0f0f;
-        q16[4] = q2[0] & 0x0f0f;
-        q16[5] = q2[8] & 0x0f0f;
-        q16[6] = (q2[0] >> 4) & 0x0f0f;
-        q16[7] = (q2[8] >> 4) & 0x0f0f;
-        for (int l = 0; l < n; ++l) {
-            sum.x += y1[l+ 0] * (q4[l +0] + (qh[l+ 0] & (hm1 << 0) ? 16 : 0))
-                   + y1[l+16] * (q4[l +2] + (qh[l+16] & (hm1 << 0) ? 16 : 0));
-            sum.y += y1[l+32] * (q4[l +4] + (qh[l+ 0] & (hm1 << 1) ? 16 : 0))
-                   + y1[l+48] * (q4[l +6] + (qh[l+16] & (hm1 << 1) ? 16 : 0));
-            sum.z += y2[l+ 0] * (q4[l +8] + (qh[l+ 0] & (hm2 << 0) ? 16 : 0))
-                   + y2[l+16] * (q4[l+10] + (qh[l+16] & (hm2 << 0) ? 16 : 0));
-            sum.w += y2[l+32] * (q4[l+12] + (qh[l+ 0] & (hm2 << 1) ? 16 : 0))
-                   + y2[l+48] * (q4[l+14] + (qh[l+16] & (hm2 << 1) ? 16 : 0));
-            smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3]
-                  + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
-        }
-        tmp += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin;
-    }
-
-#else
-    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15
-    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);
-    const int step = tid * K_QUANTS_PER_ITERATION;
-    const int im = step/8;
-    const int in = step%8;
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-        const uint8_t * q = x[i].qs + step;
-        const int8_t  * s = x[i].scales;
-        const float   * y = yy + i*QK_K + step;
-        const float     d = x[i].d;
-        float sum = 0.f;
-        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
-            const uint8_t h = x[i].qh[in+j] >> im;
-            sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16))
-                 + y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16))
-                 + y[j+32] * d * s[2] * ((q[j+ 0] >>  4) - ((h >> 4) & 1 ? 0 : 16))
-                 + y[j+48] * d * s[3] * ((q[j+16] >>  4) - ((h >> 6) & 1 ? 0 : 16));
-        }
-        tmp += sum;
-    }
-#endif
-
-    // sum up partial sums and write back result
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
-    }
-
-    if (threadIdx.x == 0) {
-        dst[row] = tmp;
-    }
-}
-
-static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
-
-    static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
-
-    const int row = blockIdx.x*blockDim.y + threadIdx.y;
-    if (row > nrows) return;
-
-    const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
-
-    const block_q6_K * x = (const block_q6_K *)vx + ib0;
-
-#if QK_K == 256
-
-    const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...16
-    const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0, 1
-
-    const int step = 16/K_QUANTS_PER_ITERATION;          // 16 or 8
-
-    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
-    const int in = tid - step*im;                        // 0...15 or 0...7
-
-#if K_QUANTS_PER_ITERATION == 1
-    const int l0 = K_QUANTS_PER_ITERATION*in;            // 0...15
-    const int is = 0;
-#else
-    const int l0 = 4 * in;                               // 0, 4, 8, ..., 28
-    const int is = in / 4;
-#endif
-    const int ql_offset = 64*im + l0;
-    const int qh_offset = 32*im + l0;
-    const int s_offset  =  8*im + is;
-    const int y_offset = 128*im + l0;
-
-    float tmp = 0; // partial sum for thread in warp
-
-    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
-
-        const float   * y  = yy + i * QK_K + y_offset;
-        const uint8_t * ql = x[i].ql + ql_offset;
-        const uint8_t * qh = x[i].qh + qh_offset;
-        const int8_t  * s  = x[i].scales + s_offset;
-
-        const float d = x[i].d;
-
-#if K_QUANTS_PER_ITERATION == 1
-        float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
-                  + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
-                  + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
-                  + y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32)
-                  + y[64] * s[4] * d * ((int8_t)((ql[ 0]  >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32)
-                  + y[80] * s[5] * d * ((int8_t)((ql[16]  >> 4) | ((qh[16] & 0x30) >> 0)) - 32)
-                  + y[96] * s[6] * d * ((int8_t)((ql[32]  >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
-                  +y[112] * s[7] * d * ((int8_t)((ql[48]  >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
-        tmp += sum;
-#else
-        float sum = 0;
-        for (int l = 0; l < 4; ++l) {
-            sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
-                 + y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32)
-                 + y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0]  >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32)
-                 + y[l+96] * s[6] * d * ((int8_t)((ql[l+32]  >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
-        }
-        tmp += sum;
-#endif
-
-    }
-
-#else
-
-    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...7
-    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);  // 0...3
-
-    const int step = tid * K_QUANTS_PER_ITERATION;
-
-    float tmp = 0; // partial sum for thread in warp
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-
-        const float   * y  = yy + i * QK_K + step;
-        const uint8_t * ql = x[i].ql + step;
-        const uint8_t * qh = x[i].qh + step;
-        const int8_t  * s  = x[i].scales;
-
-        const float d = x[i+0].d;
-
-        float sum = 0;
-        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
-            sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32)
-                 + y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32)
-                 + y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >>  4) | ((qh[j] & 0x30) >> 0)) - 32)
-                 + y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >>  4) | ((qh[j] & 0xc0) >> 2)) - 32);
-        }
-        tmp += sum;
-
-    }
-
-#endif
-
-    // sum up partial sums and write back result
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
-    }
-
-    if (tid == 0) {
-        dst[row] = tmp;
-    }
-}
-
-static __device__ void convert_f16(const void * vx, const int ib, const int iqs, dfloat2 & v){
-    const half * x = (const half *) vx;
-
-    // automatic half -> float type cast if dfloat == float
-    v.x = x[ib + iqs + 0];
-    v.y = x[ib + iqs + 1];
-}
-
-static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded) {
-    const int ix = blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (ix >= kx_padded) {
-        return;
-    }
-
-    const int iy = blockDim.y*blockIdx.y + threadIdx.y;
-
-    const int i_padded = iy*kx_padded + ix;
-
-    block_q8_1 * y = (block_q8_1 *) vy;
-
-    const int ib = i_padded / QK8_1; // block index
-    const int iqs = i_padded % QK8_1; // quant index
-
-    const float xi = ix < kx ? x[iy*kx + ix] : 0.0f;
-    float amax = fabsf(xi);
-    float sum = xi;
-
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        amax = fmaxf(amax, __shfl_xor_sync(0xffffffff, amax, mask, 32));
-        sum += __shfl_xor_sync(0xffffffff, sum, mask, 32);
-    }
-
-    const float d = amax / 127;
-    const int8_t q = amax == 0.0f ? 0 : roundf(xi / d);
-
-    y[ib].qs[iqs] = q;
-
-    if (iqs > 0) {
-        return;
-    }
-
-    reinterpret_cast<half&>(y[ib].ds.x) = d;
-    reinterpret_cast<half&>(y[ib].ds.y) = sum;
-}
-
-template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
-static __global__ void k_get_rows(
-            const void * src0, const int32_t * src1, dst_t * dst,
-            int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
-            /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
-            /*size_t s0,*/ size_t s1, size_t s2, size_t s3,
-            /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
-            size_t s10, size_t s11, size_t s12/*, size_t s13*/) {
-
-    const int i00 = (blockIdx.x*blockDim.x + threadIdx.x)*2;
-    const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
-    const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;
-    const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;
-
-    if (i00 >= ne00) {
-        return;
-    }
-
-    const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
-
-    dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
-    const void * src0_row = (const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03;
-
-    const int ib = i00/qk; // block index
-    const int iqs = (i00%qk)/qr; // quant index
-    const int iybs = i00 - i00%qk; // dst block start index
-    const int y_offset = qr == 1 ? 1 : qk/2;
-
-    // dequantize
-    dfloat2 v;
-    dequantize_kernel(src0_row, ib, iqs, v);
-
-    dst_row[iybs + iqs + 0]        = v.x;
-    dst_row[iybs + iqs + y_offset] = v.y;
-}
-
-template<typename src0_t, typename dst_t>
-static __global__ void k_get_rows_float(
-            const src0_t * src0, const int32_t * src1, dst_t * dst,
-            int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
-            /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
-            /*size_t s0,*/ size_t s1, size_t s2, size_t s3,
-            /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
-            size_t s10, size_t s11, size_t s12/*, size_t s13*/) {
-
-    const int i00 = blockIdx.x*blockDim.x + threadIdx.x;
-    const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
-    const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;
-    const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;
-
-    if (i00 >= ne00) {
-        return;
-    }
-
-    const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
-
-    dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
-    const src0_t * src0_row = (const src0_t *)((const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03);
-
-    dst_row[i00] = src0_row[i00];
-}
-
-template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
-static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
-    const int i = 2*(blockDim.x*blockIdx.x + threadIdx.x);
-
-    if (i >= k) {
-        return;
-    }
-
-    const int ib = i/qk; // block index
-    const int iqs = (i%qk)/qr; // quant index
-    const int iybs = i - i%qk; // y block start index
-    const int y_offset = qr == 1 ? 1 : qk/2;
-
-    // dequantize
-    dfloat2 v;
-    dequantize_kernel(vx, ib, iqs, v);
-
-    y[iybs + iqs + 0]        = v.x;
-    y[iybs + iqs + y_offset] = v.y;
-}
-
-template <typename src_t, typename dst_t>
-static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (i >= k) {
-        return;
-    }
-
-    const src_t * x = (src_t *) vx;
-
-    y[i] = x[i];
-}
-
-template <bool need_check>
-static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, half * __restrict__ y, const int k) {
-#if __CUDA_ARCH__ >= CC_PASCAL
-    constexpr int nint = CUDA_Q8_0_NE_ALIGN/sizeof(int) + WARP_SIZE;
-
-    const int   i0 = CUDA_Q8_0_NE_ALIGN*blockIdx.x;
-    const int * x0 = ((int *) vx) + blockIdx.x * nint;
-    half2 * y2 = (half2 *) (y + i0);
-
-    __shared__ int vals[nint];
-
-#pragma unroll
-    for (int ix0 = 0; ix0 < nint; ix0 += WARP_SIZE) {
-        if (need_check && i0*sizeof(block_q8_0)/QK8_0 + sizeof(int)*(ix0 + threadIdx.x) >= k*sizeof(block_q8_0)/QK8_0) {
-            break;
-        }
-
-        const int ix = ix0 + threadIdx.x;
-        vals[ix] = x0[ix];
-    }
-
-#pragma unroll
-    for (int iy = 0; iy < CUDA_Q8_0_NE_ALIGN; iy += 2*WARP_SIZE) {
-        if (need_check && i0 + iy + 2*threadIdx.x >= k) {
-            return;
-        }
-
-        const half * b0 = ((const half  *) vals) + (sizeof(block_q8_0)/sizeof(half)) * ((iy + 2*threadIdx.x)/QK8_0);
-        const half    d = *b0;
-        const char2  qs = ((const char2 *) (b0 + 1))[threadIdx.x % (QK8_0/2)];
-
-        y2[iy/2 + threadIdx.x] = __hmul2(make_half2(qs.x, qs.y), __half2half2(d));
-    }
-#else
-    (void) vx; (void) y; (void) k;
-    bad_arch();
-#endif // __CUDA_ARCH__ >= CC_PASCAL
-}
-
-// VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
-// MMVQ = mul_mat_vec_q, MMQ = mul_mat_q
-
-#define VDR_Q4_0_Q8_1_MMVQ 2
-#define VDR_Q4_0_Q8_1_MMQ  4
-
-template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_impl(
-    const int * v, const int * u, const float & d4, const half2 & ds8) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    int sumi = 0;
-
-#pragma unroll
-    for (int i = 0; i < vdr; ++i) {
-        const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
-        const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
-
-        // SIMD dot product of quantized values
-        sumi = __dp4a(vi0, u[2*i+0], sumi);
-        sumi = __dp4a(vi1, u[2*i+1], sumi);
-    }
-
-    const float2 ds8f = __half22float2(ds8);
-
-    // second part effectively subtracts 8 from each quant value
-    return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
-#else
-    bad_arch();
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-#define VDR_Q4_1_Q8_1_MMVQ 2
-#define VDR_Q4_1_Q8_1_MMQ  4
-
-template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_impl(
-    const int * v, const int * u, const half2 & dm4, const half2 & ds8) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    int sumi = 0;
-
-#pragma unroll
-    for (int i = 0; i < vdr; ++i) {
-        const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
-        const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
-
-        // SIMD dot product of quantized values
-        sumi = __dp4a(vi0, u[2*i+0], sumi);
-        sumi = __dp4a(vi1, u[2*i+1], sumi);
-    }
-
-#ifdef GGML_CUDA_F16
-    const float2 tmp = __half22float2(__hmul2(dm4, ds8));
-    const float d4d8 = tmp.x;
-    const float m4s8 = tmp.y;
-#else
-    const float2 dm4f = __half22float2(dm4);
-    const float2 ds8f = __half22float2(ds8);
-    const float d4d8 = dm4f.x * ds8f.x;
-    const float m4s8 = dm4f.y * ds8f.y;
-#endif // GGML_CUDA_F16
-
-    // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
-    return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
-#else
-    bad_arch();
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-#define VDR_Q5_0_Q8_1_MMVQ 2
-#define VDR_Q5_0_Q8_1_MMQ  4
-
-template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_impl(
-    const int * vl, const int * vh, const int * u, const float & d5, const half2 & ds8) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    int sumi = 0;
-
-#pragma unroll
-    for (int i = 0; i < vdr; ++i) {
-        int vi0 = (vl[i] >>  0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
-        vi0    |= (vh[i] <<  4) & 0x00000010; // 0 ->  4
-        vi0    |= (vh[i] << 11) & 0x00001000; // 1 -> 12
-        vi0    |= (vh[i] << 18) & 0x00100000; // 2 -> 20
-        vi0    |= (vh[i] << 25) & 0x10000000; // 3 -> 28
-        sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values
-
-        int vi1 = (vl[i] >>  4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
-        vi1    |= (vh[i] >> 12) & 0x00000010; // 16 ->  4
-        vi1    |= (vh[i] >>  5) & 0x00001000; // 17 -> 12
-        vi1    |= (vh[i] <<  2) & 0x00100000; // 18 -> 20
-        vi1    |= (vh[i] <<  9) & 0x10000000; // 19 -> 28
-        sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
-    }
-
-    const float2 ds8f = __half22float2(ds8);
-
-    // second part effectively subtracts 16 from each quant value
-    return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
-#else
-    bad_arch();
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-#define VDR_Q5_1_Q8_1_MMVQ 2
-#define VDR_Q5_1_Q8_1_MMQ  4
-
-template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_impl(
-    const int * vl, const int * vh, const int * u, const half2 & dm5, const half2 & ds8) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    int sumi = 0;
-
-#pragma unroll
-    for (int i = 0; i < vdr; ++i) {
-        int vi0 = (vl[i] >>  0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
-        vi0    |= (vh[i] <<  4) & 0x00000010; // 0 ->  4
-        vi0    |= (vh[i] << 11) & 0x00001000; // 1 -> 12
-        vi0    |= (vh[i] << 18) & 0x00100000; // 2 -> 20
-        vi0    |= (vh[i] << 25) & 0x10000000; // 3 -> 28
-        sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values
-
-        int vi1 = (vl[i] >>  4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
-        vi1    |= (vh[i] >> 12) & 0x00000010; // 16 ->  4
-        vi1    |= (vh[i] >>  5) & 0x00001000; // 17 -> 12
-        vi1    |= (vh[i] <<  2) & 0x00100000; // 18 -> 20
-        vi1    |= (vh[i] <<  9) & 0x10000000; // 19 -> 28
-        sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
-    }
-
-#ifdef GGML_CUDA_F16
-    const float2 tmp = __half22float2(__hmul2(dm5, ds8));
-    const float d5d8 = tmp.x;
-    const float m5s8 = tmp.y;
-#else
-    const float2 dm5f = __half22float2(dm5);
-    const float2 ds8f = __half22float2(ds8);
-    const float d5d8 = dm5f.x * ds8f.x;
-    const float m5s8 = dm5f.y * ds8f.y;
-#endif // GGML_CUDA_F16
-
-    // scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it
-    return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
-
-#else
-    bad_arch();
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-#define VDR_Q8_0_Q8_1_MMVQ 2
-#define VDR_Q8_0_Q8_1_MMQ 8
-
-template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_impl(
-    const int * v, const int * u, const float & d8_0, const float & d8_1) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    int sumi = 0;
-
-#pragma unroll
-    for (int i = 0; i < vdr; ++i) {
-        // SIMD dot product of quantized values
-        sumi = __dp4a(v[i], u[i], sumi);
-    }
-
-    return d8_0*d8_1 * sumi;
-#else
-    bad_arch();
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_impl(
-    const int * v, const int * u, const half2 & dm8, const half2 & ds8) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    int sumi = 0;
-
-#pragma unroll
-    for (int i = 0; i < vdr; ++i) {
-        // SIMD dot product of quantized values
-        sumi = __dp4a(v[i], u[i], sumi);
-    }
-
-#ifdef GGML_CUDA_F16
-    const float2 tmp = __half22float2(__hmul2(dm8, ds8));
-    const float d8d8 = tmp.x;
-    const float m8s8 = tmp.y;
-#else
-    const float2 dm8f = __half22float2(dm8);
-    const float2 ds8f = __half22float2(ds8);
-    const float d8d8 = dm8f.x * ds8f.x;
-    const float m8s8 = dm8f.y * ds8f.y;
-#endif // GGML_CUDA_F16
-
-    // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
-    return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
-#else
-    bad_arch();
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-#define VDR_Q2_K_Q8_1_MMVQ 1
-#define VDR_Q2_K_Q8_1_MMQ  2
-
-// contiguous v/x values
-static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(
-    const int & v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
-    const half2 & dm2, const float * __restrict__ d8) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    float sumf_d = 0.0f;
-    float sumf_m = 0.0f;
-
-#pragma unroll
-    for (int i = 0; i < QR2_K; ++i) {
-        const int sc = scales[2*i];
-
-        const int vi = (v >> (2*i)) & 0x03030303;
-
-        sumf_d += d8[i] * (__dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product
-
-        // fill int with 4x m
-        int m = sc >> 4;
-        m |= m <<  8;
-        m |= m << 16;
-        sumf_m += d8[i] * __dp4a(m, u[i], 0); // multiply constant q2_K part with sum of q8_1 values
-    }
-
-    const float2 dm2f = __half22float2(dm2);
-
-    return dm2f.x*sumf_d - dm2f.y*sumf_m;
-#else
-    bad_arch();
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-// contiguous u/y values
-static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(
-    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
-    const half2 & dm2, const float & d8) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    int sumi_d = 0;
-    int sumi_m = 0;
-
-#pragma unroll
-    for (int i0 = 0; i0 < QI8_1; i0 += QI8_1/2) {
-        int sumi_d_sc = 0;
-
-        const int sc = scales[i0 / (QI8_1/2)];
-
-        // fill int with 4x m
-        int m = sc >> 4;
-        m |= m <<  8;
-        m |= m << 16;
-
-#pragma unroll
-        for (int i = i0; i < i0 + QI8_1/2; ++i) {
-            sumi_d_sc = __dp4a(v[i], u[i], sumi_d_sc); // SIMD dot product
-            sumi_m    = __dp4a(m,    u[i], sumi_m); // multiply sum of q8_1 values with m
-        }
-
-        sumi_d += sumi_d_sc * (sc & 0xF);
-    }
-
-    const float2 dm2f = __half22float2(dm2);
-
-    return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
-#else
-    bad_arch();
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-#define VDR_Q3_K_Q8_1_MMVQ 1
-#define VDR_Q3_K_Q8_1_MMQ  2
-
-// contiguous v/x values
-static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(
-    const int & vl, const int & vh, const int * __restrict__ u, const uint8_t * __restrict__ scales,
-    const int & scale_offset, const float & d3, const float * __restrict__ d8) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    float sumf = 0.0f;
-
-#pragma unroll
-    for (int i = 0; i < QR3_K; ++i) {
-        const int isc = scale_offset + 2*i;
-
-        const int isc_low = isc % (QK_K/32);
-        const int sc_shift_low = 4 * (isc / (QK_K/32));
-        const int sc_low  = (scales[isc_low] >> sc_shift_low) & 0xF;
-
-        const int isc_high = isc % (QK_K/64);
-        const int sc_shift_high = 2 * (isc / (QK_K/64));
-        const int sc_high = ((scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;
-
-        const int sc = (sc_low | sc_high) - 32;
-
-        const int vil = (vl >> (2*i)) & 0x03030303;
-
-        const int vih = ((vh >> i) << 2) & 0x04040404;
-
-        const int vi = __vsubss4(vil, vih);
-
-        sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
-    }
-
-    return d3 * sumf;
-#else
-    bad_arch();
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-// contiguous u/y values
-static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(
-    const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ scales,
-    const float & d3, const float & d8) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    int sumi = 0;
-
-#pragma unroll
-    for (int i0 = 0; i0 < QR3_K*VDR_Q3_K_Q8_1_MMQ; i0 += QI8_1/2) {
-        int sumi_sc = 0;
-
-        for (int i = i0; i < i0 + QI8_1/2; ++i) {
-            sumi_sc = __dp4a(v[i], u[i], sumi_sc); // SIMD dot product
-        }
-
-        sumi += sumi_sc * scales[i0 / (QI8_1/2)];
-    }
-
-    return d3*d8 * sumi;
-#else
-    bad_arch();
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-#define VDR_Q4_K_Q8_1_MMVQ 2
-#define VDR_Q4_K_Q8_1_MMQ  8
-
-// contiguous v/x values
-static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
-    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
-    const uint8_t * __restrict__ m, const half2 & dm4, const float * __restrict__ d8) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    float sumf_d = 0.0f;
-    float sumf_m = 0.0f;
-
-#pragma unroll
-    for (int i = 0; i < QR4_K; ++i) {
-        const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F;
-        const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F;
-
-        const int dot1 = __dp4a(v1i, u[2*i+1], __dp4a(v0i, u[2*i+0], 0)); // SIMD dot product
-        const int dot2 = __dp4a(0x01010101, u[2*i+1], __dp4a(0x01010101, u[2*i+0], 0)); // sum of u
-
-        sumf_d += d8[i] * (dot1 * sc[i]);
-        sumf_m += d8[i] * (dot2 * m[i]);  // multiply constant part of q4_K with sum of q8_1 values
-    }
-
-    const float2 dm4f = __half22float2(dm4);
-
-    return dm4f.x*sumf_d - dm4f.y*sumf_m;
-
-#else
-    bad_arch();
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-// contiguous u/y values
-static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
-    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
-    const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    float sumf_d = 0.0f;
-    float sumf_m = 0.0f;
-
-#pragma unroll
-    for (int i = 0; i < QR4_K*VDR_Q4_K_Q8_1_MMQ/QI8_1; ++i) {
-        int sumi_d = 0;
-
-#pragma unroll
-        for (int j = 0; j < QI8_1; ++j) {
-            sumi_d = __dp4a((v[j] >> (4*i)) & 0x0F0F0F0F, u[i*QI8_1 + j], sumi_d); // SIMD dot product
-        }
-
-        const float2 ds8f = __half22float2(ds8[i]);
-
-        sumf_d += ds8f.x * (sc[i] * sumi_d);
-        sumf_m += ds8f.y *   m[i]; // sum of q8_1 block * q4_K min val
-    }
-
-    const float2 dm4f = __half22float2(dm4);
-
-    return dm4f.x*sumf_d - dm4f.y*sumf_m;
-
-#else
-    bad_arch();
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-#define VDR_Q5_K_Q8_1_MMVQ 2
-#define VDR_Q5_K_Q8_1_MMQ  8
-
-// contiguous v/x values
-static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
-    const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc,
-    const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    float sumf_d = 0.0f;
-    float sumf_m = 0.0f;
-
-#pragma unroll
-    for (int i = 0; i < QR5_K; ++i) {
-        const int vl0i = (vl[0] >> (4*i)) & 0x0F0F0F0F;
-        const int vl1i = (vl[1] >> (4*i)) & 0x0F0F0F0F;
-
-        const int vh0i = ((vh[0] >> i) << 4) & 0x10101010;
-        const int vh1i = ((vh[1] >> i) << 4) & 0x10101010;
-
-        const int v0i = vl0i | vh0i;
-        const int v1i = vl1i | vh1i;
-
-        const int dot1 = __dp4a(v0i, u[2*i+0], __dp4a(v1i, u[2*i+1], 0)); // SIMD dot product
-        const int dot2 = __dp4a(0x01010101, u[2*i+0], __dp4a(0x01010101, u[2*i+1], 0)); // sum of u
-
-        sumf_d += d8[i] * (dot1 * sc[i]);
-        sumf_m += d8[i] * (dot2 * m[i]);
-
-    }
-
-    const float2 dm5f = __half22float2(dm5);
-
-    return dm5f.x*sumf_d - dm5f.y*sumf_m;
-
-#else
-    bad_arch();
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-// contiguous u/y values
-static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
-    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
-    const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    float sumf_d = 0.0f;
-    float sumf_m = 0.0f;
-
-#pragma unroll
-    for (int i = 0; i < QR5_K*VDR_Q5_K_Q8_1_MMQ/QI8_1; ++i) {
-        int sumi_d = 0;
-
-#pragma unroll
-        for (int j = 0; j < QI8_1; ++j) {
-            sumi_d = __dp4a(v[i*QI8_1 + j], u[i*QI8_1 + j], sumi_d); // SIMD dot product
-        }
-
-        const float2 ds8f = __half22float2(ds8[i]);
-
-        sumf_d += ds8f.x * (sc[i] * sumi_d);
-        sumf_m += ds8f.y *   m[i]; // sum of q8_1 block * q4_K min val
-    }
-
-    const float2 dm4f = __half22float2(dm4);
-
-    return dm4f.x*sumf_d - dm4f.y*sumf_m;
-
-#else
-    bad_arch();
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-#define VDR_Q6_K_Q8_1_MMVQ 1
-#define VDR_Q6_K_Q8_1_MMQ  8
-
-// contiguous v/x values
-static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(
-    const int & vl, const int & vh, const int * __restrict__ u, const int8_t * __restrict__ scales,
-    const float & d, const float * __restrict__ d8) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    float sumf = 0.0f;
-
-#pragma unroll
-    for (int i = 0; i < QR6_K; ++i) {
-        const int sc = scales[4*i];
-
-        const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
-
-        const int vih = ((vh >> (4*i)) << 4) & 0x30303030;
-
-        const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
-
-        sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
-    }
-
-    return d*sumf;
-#else
-    bad_arch();
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-// contiguous u/y values
-static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
-    const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ sc,
-    const float & d6, const float * __restrict__ d8) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    float sumf_d = 0.0f;
-
-#pragma unroll
-    for (int i0 = 0; i0 < VDR_Q6_K_Q8_1_MMQ; i0 += 4) {
-        int2 sumi_d = {0, 0}; // 2 q6_K scales per q8_1 scale
-
-#pragma unroll
-        for (int i = i0; i < i0 + 2; ++i) {
-            sumi_d.x = __dp4a(v[2*i+0], u[2*i+0], sumi_d.x); // SIMD dot product
-            sumi_d.x = __dp4a(v[2*i+1], u[2*i+1], sumi_d.x); // SIMD dot product
-
-            sumi_d.y = __dp4a(v[2*i+4], u[2*i+4], sumi_d.y); // SIMD dot product
-            sumi_d.y = __dp4a(v[2*i+5], u[2*i+5], sumi_d.y); // SIMD dot product
-        }
-
-        sumf_d += d8[i0/4] * (sc[i0/2+0]*sumi_d.x + sc[i0/2+1]*sumi_d.y);
-    }
-
-    return d6 * sumf_d;
-
-#else
-    bad_arch();
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-
-    const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
-
-    int v[VDR_Q4_0_Q8_1_MMVQ];
-    int u[2*VDR_Q4_0_Q8_1_MMVQ];
-
-#pragma unroll
-    for (int i = 0; i < VDR_Q4_0_Q8_1_MMVQ; ++i) {
-        v[i]     = get_int_from_uint8(bq4_0->qs, iqs + i);
-        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
-        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_0);
-    }
-
-    return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMVQ>(v, u, bq4_0->d, bq8_1->ds);
-}
-
-template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
-    (void)x_qh; (void)x_sc;
-
-    __shared__ int  tile_x_qs[mmq_y * (WARP_SIZE)       + mmq_y];
-    __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0];
-
-    *x_ql = tile_x_qs;
-    *x_dm = (half2 *) tile_x_d;
-}
-
-template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
-    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
-    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
-    (void)x_qh; (void)x_sc;
-    GGML_CUDA_ASSUME(i_offset >= 0);
-    GGML_CUDA_ASSUME(i_offset <  nwarps);
-    GGML_CUDA_ASSUME(k >= 0);
-    GGML_CUDA_ASSUME(k <  WARP_SIZE);
-
-    const int kbx  = k / QI4_0;
-    const int kqsx = k % QI4_0;
-
-    const block_q4_0 * bx0 = (const block_q4_0 *) vx;
-
-    float * x_dmf = (float *) x_dm;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-        int i = i0 + i_offset;
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-
-        const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbx;
-
-        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
-        // x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d;
-    }
-
-    const int blocks_per_tile_x_row = WARP_SIZE / QI4_0;
-    const int kbxd = k % blocks_per_tile_x_row;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_0) {
-        int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row;
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-
-        const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd;
-
-        x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd] = bxi->d;
-    }
-}
-
-static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat(
-    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
-    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
-    (void)x_qh; (void)x_sc;
-
-    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
-    const float * x_dmf = (const float *) x_dm;
-
-    int u[2*VDR_Q4_0_Q8_1_MMQ];
-
-#pragma unroll
-    for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) {
-        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l)         % WARP_SIZE];
-        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_0) % WARP_SIZE];
-    }
-
-    return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMQ>
-        (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k/QI4_0],
-         y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
-}
-
-static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-
-    const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
-
-    int v[VDR_Q4_1_Q8_1_MMVQ];
-    int u[2*VDR_Q4_1_Q8_1_MMVQ];
-
-#pragma unroll
-    for (int i = 0; i < VDR_Q4_1_Q8_1_MMVQ; ++i) {
-        v[i]    = get_int_from_uint8_aligned(bq4_1->qs, iqs + i);
-        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
-        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_1);
-    }
-
-    return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMVQ>(v, u, bq4_1->dm, bq8_1->ds);
-}
-
-template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
-    (void)x_qh; (void)x_sc;
-
-    __shared__ int   tile_x_qs[mmq_y * (WARP_SIZE) +     + mmq_y];
-    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1];
-
-    *x_ql = tile_x_qs;
-    *x_dm = tile_x_dm;
-}
-
-template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
-    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
-    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
-    (void)x_qh; (void)x_sc;
-
-    GGML_CUDA_ASSUME(i_offset >= 0);
-    GGML_CUDA_ASSUME(i_offset <  nwarps);
-    GGML_CUDA_ASSUME(k >= 0);
-    GGML_CUDA_ASSUME(k <  WARP_SIZE);
-
-    const int kbx  = k / QI4_1;
-    const int kqsx = k % QI4_1;
-
-    const block_q4_1 * bx0 = (const block_q4_1 *) vx;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-        int i = i0 + i_offset;
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-
-        const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbx;
-
-        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
-    }
-
-    const int blocks_per_tile_x_row = WARP_SIZE / QI4_1;
-    const int kbxd = k % blocks_per_tile_x_row;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_1) {
-        int i = i0 + i_offset * QI4_1 + k / blocks_per_tile_x_row;
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-
-        const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbxd;
-
-        x_dm[i * (WARP_SIZE/QI4_1) + i / QI4_1 + kbxd] = bxi->dm;
-    }
-}
-
-static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat(
-    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
-    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
-    (void)x_qh; (void)x_sc;
-
-    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
-
-    int u[2*VDR_Q4_1_Q8_1_MMQ];
-
-#pragma unroll
-    for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) {
-        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l)         % WARP_SIZE];
-        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_1) % WARP_SIZE];
-    }
-
-    return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMQ>
-        (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k/QI4_1],
-         y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
-}
-
-static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-
-    const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
-
-    int vl[VDR_Q5_0_Q8_1_MMVQ];
-    int vh[VDR_Q5_0_Q8_1_MMVQ];
-    int  u[2*VDR_Q5_0_Q8_1_MMVQ];
-
-#pragma unroll
-    for (int i = 0; i < VDR_Q5_0_Q8_1_MMVQ; ++i) {
-        vl[i]    = get_int_from_uint8(bq5_0->qs, iqs + i);
-        vh[i]    = get_int_from_uint8(bq5_0->qh, 0) >> (4 * (iqs + i));
-        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
-        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_0);
-    }
-
-    return vec_dot_q5_0_q8_1_impl<VDR_Q5_0_Q8_1_MMVQ>(vl, vh, u, bq5_0->d, bq8_1->ds);
-}
-
-template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
-    (void)x_qh; (void)x_sc;
-
-    __shared__ int  tile_x_ql[mmq_y * (2*WARP_SIZE)     + mmq_y];
-    __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0];
-
-    *x_ql = tile_x_ql;
-    *x_dm = (half2 *) tile_x_d;
-}
-
-template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
-    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
-    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
-    (void)x_qh; (void)x_sc;
-
-    GGML_CUDA_ASSUME(i_offset >= 0);
-    GGML_CUDA_ASSUME(i_offset <  nwarps);
-    GGML_CUDA_ASSUME(k >= 0);
-    GGML_CUDA_ASSUME(k <  WARP_SIZE);
-
-    const int kbx  = k / QI5_0;
-    const int kqsx = k % QI5_0;
-
-    const block_q5_0 * bx0 = (const block_q5_0 *) vx;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-        int i = i0 + i_offset;
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-
-        const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbx;
-
-        const int ql = get_int_from_uint8(bxi->qs, kqsx);
-        const int qh = get_int_from_uint8(bxi->qh, 0) >> (4 * (k % QI5_0));
-
-        int qs0 = (ql >>  0)   & 0x0F0F0F0F;
-        qs0    |= (qh <<  4)   & 0x00000010;  // 0 ->  4
-        qs0    |= (qh << 11)   & 0x00001000;  // 1 -> 12
-        qs0    |= (qh << 18)   & 0x00100000;  // 2 -> 20
-        qs0    |= (qh << 25)   & 0x10000000;  // 3 -> 28
-        qs0     = __vsubss4(qs0, 0x10101010); // subtract 16
-
-        x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;
-
-        int qs1 = (ql >>  4)   & 0x0F0F0F0F;
-        qs1    |= (qh >> 12)   & 0x00000010;  // 16 ->  4
-        qs1    |= (qh >>  5)   & 0x00001000;  // 17 -> 12
-        qs1    |= (qh <<  2)   & 0x00100000;  // 18 -> 20
-        qs1    |= (qh <<  9)   & 0x10000000;  // 19 -> 28
-        qs1     = __vsubss4(qs1, 0x10101010); // subtract 16
-
-        x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
-    }
-
-    const int blocks_per_tile_x_row = WARP_SIZE / QI5_0;
-    const int kbxd = k % blocks_per_tile_x_row;
-    float * x_dmf = (float *) x_dm;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_0) {
-        int i = i0 + i_offset * QI5_0 + k / blocks_per_tile_x_row;
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-
-        const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbxd;
-
-        x_dmf[i * (WARP_SIZE/QI5_0) + i / QI5_0 + kbxd] = bxi->d;
-    }
-}
-
-static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat(
-    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
-    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
-    (void)x_qh; (void)x_sc;
-
-    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
-    const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0;
-    const float * x_dmf = (const float *) x_dm;
-    const float * y_df  = (const float *) y_ds;
-
-    int u[2*VDR_Q5_0_Q8_1_MMQ];
-
-#pragma unroll
-    for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) {
-        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l)         % WARP_SIZE];
-        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_0) % WARP_SIZE];
-    }
-
-    return vec_dot_q8_0_q8_1_impl<QR5_0*VDR_Q5_0_Q8_1_MMQ>
-        (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_df[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
-}
-
-static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-
-    const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
-
-    int vl[VDR_Q5_1_Q8_1_MMVQ];
-    int vh[VDR_Q5_1_Q8_1_MMVQ];
-    int  u[2*VDR_Q5_1_Q8_1_MMVQ];
-
-#pragma unroll
-    for (int i = 0; i < VDR_Q5_1_Q8_1_MMVQ; ++i) {
-        vl[i]   = get_int_from_uint8_aligned(bq5_1->qs, iqs + i);
-        vh[i]   = get_int_from_uint8_aligned(bq5_1->qh, 0) >> (4 * (iqs + i));
-        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
-        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_1);
-    }
-
-    return vec_dot_q5_1_q8_1_impl<VDR_Q5_1_Q8_1_MMVQ>(vl, vh, u, bq5_1->dm, bq8_1->ds);
-}
-
-template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
-    (void)x_qh; (void)x_sc;
-
-    __shared__ int   tile_x_ql[mmq_y * (2*WARP_SIZE)     + mmq_y];
-    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1];
-
-    *x_ql = tile_x_ql;
-    *x_dm = tile_x_dm;
-}
-
-template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
-    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
-    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
-    (void)x_qh; (void)x_sc;
-
-    GGML_CUDA_ASSUME(i_offset >= 0);
-    GGML_CUDA_ASSUME(i_offset < nwarps);
-    GGML_CUDA_ASSUME(k >= 0);
-    GGML_CUDA_ASSUME(k <  WARP_SIZE);
-
-    const int kbx  = k / QI5_1;
-    const int kqsx = k % QI5_1;
-
-    const block_q5_1 * bx0 = (const block_q5_1 *) vx;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-        int i = i0 + i_offset;
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-
-        const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbx;
-
-        const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
-        const int qh = get_int_from_uint8_aligned(bxi->qh, 0) >> (4 * (k % QI5_1));
-
-        int qs0 = (ql >>  0) & 0x0F0F0F0F;
-        qs0    |= (qh <<  4) & 0x00000010; // 0 ->  4
-        qs0    |= (qh << 11) & 0x00001000; // 1 -> 12
-        qs0    |= (qh << 18) & 0x00100000; // 2 -> 20
-        qs0    |= (qh << 25) & 0x10000000; // 3 -> 28
-
-        x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;
-
-        int qs1 = (ql >>  4) & 0x0F0F0F0F;
-        qs1    |= (qh >> 12) & 0x00000010; // 16 ->  4
-        qs1    |= (qh >>  5) & 0x00001000; // 17 -> 12
-        qs1    |= (qh <<  2) & 0x00100000; // 18 -> 20
-        qs1    |= (qh <<  9) & 0x10000000; // 19 -> 28
-
-        x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
-    }
-
-    const int blocks_per_tile_x_row = WARP_SIZE / QI5_1;
-    const int kbxd = k % blocks_per_tile_x_row;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_1) {
-        int i = i0 + i_offset * QI5_1 + k / blocks_per_tile_x_row;
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-
-        const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbxd;
-
-        x_dm[i * (WARP_SIZE/QI5_1) + i / QI5_1 + kbxd] = bxi->dm;
-    }
-}
-
-static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(
-    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
-    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
-    (void)x_qh; (void)x_sc;
-
-    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
-    const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1;
-
-    int u[2*VDR_Q5_1_Q8_1_MMQ];
-
-#pragma unroll
-    for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) {
-        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l)         % WARP_SIZE];
-        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_1) % WARP_SIZE];
-    }
-
-    return vec_dot_q8_1_q8_1_impl<QR5_1*VDR_Q5_1_Q8_1_MMQ>
-        (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
-}
-
-static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-
-    const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
-
-    int v[VDR_Q8_0_Q8_1_MMVQ];
-    int u[VDR_Q8_0_Q8_1_MMVQ];
-
-#pragma unroll
-    for (int i = 0; i < VDR_Q8_0_Q8_1_MMVQ; ++i) {
-        v[i] = get_int_from_int8(bq8_0->qs, iqs + i);
-        u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
-    }
-
-    return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, __low2half(bq8_1->ds));
-}
-
-template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
-    (void)x_qh; (void)x_sc;
-
-    __shared__ int  tile_x_qs[mmq_y * (WARP_SIZE)       + mmq_y];
-    __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0];
-
-    *x_ql = tile_x_qs;
-    *x_dm = (half2 *) tile_x_d;
-}
-
-template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
-    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
-    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
-    (void)x_qh; (void)x_sc;
-
-    GGML_CUDA_ASSUME(i_offset >= 0);
-    GGML_CUDA_ASSUME(i_offset <  nwarps);
-    GGML_CUDA_ASSUME(k >= 0);
-    GGML_CUDA_ASSUME(k <  WARP_SIZE);
-
-    const int kbx  = k / QI8_0;
-    const int kqsx = k % QI8_0;
-    float * x_dmf = (float *) x_dm;
-
-    const block_q8_0 * bx0 = (const block_q8_0 *) vx;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-        int i = i0 + i_offset;
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-
-        const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbx;
-
-        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_int8(bxi->qs, kqsx);
-    }
-
-    const int blocks_per_tile_x_row = WARP_SIZE / QI8_0;
-    const int kbxd = k % blocks_per_tile_x_row;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI8_0) {
-        int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row;
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-
-        const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd;
-
-        x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd] = bxi->d;
-    }
-}
-
-static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat(
-    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
-    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
-    (void)x_qh; (void)x_sc;
-
-    const float * x_dmf = (const float *) x_dm;
-    const float * y_df  = (const float *) y_ds;
-
-    return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMQ>
-        (&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[j * WARP_SIZE + k], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k/QI8_0],
-         y_df[j * (WARP_SIZE/QI8_1) + k/QI8_1]);
-}
-
-static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-
-    const block_q2_K * bq2_K = (const block_q2_K *) vbq;
-
-    const int bq8_offset = QR2_K * (iqs / QI8_1);
-    const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
-
-    const uint8_t * scales = bq2_K->scales + scale_offset;
-
-    const int v = get_int_from_uint8_aligned(bq2_K->qs, iqs);
-    int    u[QR2_K];
-    float d8[QR2_K];
-
-#pragma unroll
-    for (int i = 0; i < QR2_K; ++ i) {
-        u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
-        d8[i] = __low2half(bq8_1[bq8_offset + i].ds);
-    }
-
-    return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
-}
-
-template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
-    (void)x_qh;
-
-    __shared__ int   tile_x_ql[mmq_y * (WARP_SIZE)       + mmq_y];
-    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K];
-    __shared__ int   tile_x_sc[mmq_y * (WARP_SIZE/4)     + mmq_y/4];
-
-    *x_ql = tile_x_ql;
-    *x_dm = tile_x_dm;
-    *x_sc = tile_x_sc;
-}
-
-template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
-    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
-    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
-    (void)x_qh;
-
-    GGML_CUDA_ASSUME(i_offset >= 0);
-    GGML_CUDA_ASSUME(i_offset <  nwarps);
-    GGML_CUDA_ASSUME(k >= 0);
-    GGML_CUDA_ASSUME(k <  WARP_SIZE);
-
-    const int kbx  = k / QI2_K;
-    const int kqsx = k % QI2_K;
-
-    const block_q2_K * bx0 = (const block_q2_K *) vx;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-        int i = i0 + i_offset;
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-
-        const block_q2_K * bxi = bx0 + i*blocks_per_row + kbx;
-
-        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
-    }
-
-    const int blocks_per_tile_x_row = WARP_SIZE / QI2_K;
-    const int kbxd = k % blocks_per_tile_x_row;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI2_K) {
-        int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % mmq_y;
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-
-        const block_q2_K * bxi = bx0 + i*blocks_per_row + kbxd;
-
-        x_dm[i * (WARP_SIZE/QI2_K) + i / QI2_K + kbxd] = bxi->dm;
-    }
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
-        int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-
-        const block_q2_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI2_K/4);
-
-        x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8_aligned(bxi->scales, k % (QI2_K/4));
-    }
-}
-
-static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat(
-    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
-    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
-    (void)x_qh;
-
-    const int kbx = k / QI2_K;
-    const int ky  = (k % QI2_K) * QR2_K;
-    const float * y_df = (const float *) y_ds;
-
-    int v[QR2_K*VDR_Q2_K_Q8_1_MMQ];
-
-    const int kqsx = i * (WARP_SIZE + 1) + kbx*QI2_K + (QI2_K/2) * (ky/(2*QI2_K)) + ky % (QI2_K/2);
-    const int shift = 2 * ((ky % (2*QI2_K)) / (QI2_K/2));
-
-#pragma unroll
-    for (int l = 0; l < QR2_K*VDR_Q2_K_Q8_1_MMQ; ++l) {
-        v[l] = (x_ql[kqsx + l] >> shift) & 0x03030303;
-    }
-
-    const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE/4) + i/4 + kbx*4]) + ky/4;
-
-    const int index_y = j * WARP_SIZE + (QR2_K*k) % WARP_SIZE;
-    return vec_dot_q2_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[index_y/QI8_1]);
-}
-
-static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-
-    const block_q3_K * bq3_K = (const block_q3_K *) vbq;
-
-    const int bq8_offset = QR3_K * (iqs / (QI3_K/2));
-    const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
-
-    const float d = bq3_K->d;
-
-    const int vl = get_int_from_uint8(bq3_K->qs, iqs);
-
-    // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
-    const int vh = ~get_int_from_uint8(bq3_K->hmask, iqs % (QI3_K/2)) >> bq8_offset;
-
-    int    u[QR3_K];
-    float d8[QR3_K];
-
-#pragma unroll
-    for (int i = 0; i < QR3_K; ++i) {
-        u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
-        d8[i] = __low2half(bq8_1[bq8_offset + i].ds);
-    }
-
-    return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
-}
-
-template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
-
-    __shared__ int   tile_x_ql[mmq_y * (WARP_SIZE)       + mmq_y];
-    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI3_K) + mmq_y/QI3_K];
-    __shared__ int   tile_x_qh[mmq_y * (WARP_SIZE/2)     + mmq_y/2];
-    __shared__ int   tile_x_sc[mmq_y * (WARP_SIZE/4)     + mmq_y/4];
-
-    *x_ql = tile_x_ql;
-    *x_dm = tile_x_dm;
-    *x_qh = tile_x_qh;
-    *x_sc = tile_x_sc;
-}
-
-template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q3_K(
-    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
-    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
-
-    GGML_CUDA_ASSUME(i_offset >= 0);
-    GGML_CUDA_ASSUME(i_offset <  nwarps);
-    GGML_CUDA_ASSUME(k >= 0);
-    GGML_CUDA_ASSUME(k <  WARP_SIZE);
-
-    const int kbx  = k / QI3_K;
-    const int kqsx = k % QI3_K;
-
-    const block_q3_K * bx0 = (const block_q3_K *) vx;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-        int i = i0 + i_offset;
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-
-        const block_q3_K * bxi = bx0 + i*blocks_per_row + kbx;
-
-        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
-    }
-
-    const int blocks_per_tile_x_row = WARP_SIZE / QI3_K;
-    const int kbxd = k % blocks_per_tile_x_row;
-    float * x_dmf = (float *) x_dm;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI3_K) {
-        int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % mmq_y;
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-
-        const block_q3_K * bxi = bx0 + i*blocks_per_row + kbxd;
-
-        x_dmf[i * (WARP_SIZE/QI3_K) + i / QI3_K + kbxd] = bxi->d;
-    }
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 2) {
-        int i = i0 + i_offset * 2 + k / (WARP_SIZE/2);
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-
-        const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI3_K/2);
-
-        // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
-        x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = ~get_int_from_uint8(bxi->hmask, k % (QI3_K/2));
-    }
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
-        int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-
-        const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI3_K/4);
-
-        const int ksc = k % (QI3_K/4);
-
-        const int ksc_low = ksc % (QI3_K/8);
-        const int shift_low = 4 * (ksc / (QI3_K/8));
-        const int sc_low = (get_int_from_uint8(bxi->scales, ksc_low) >> shift_low) & 0x0F0F0F0F;
-
-        const int ksc_high = QI3_K/8;
-        const int shift_high = 2 * ksc;
-        const int sc_high = ((get_int_from_uint8(bxi->scales, ksc_high) >> shift_high) << 4) & 0x30303030;
-
-        const int sc = __vsubss4(sc_low | sc_high, 0x20202020);
-
-        x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = sc;
-    }
-}
-
-static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat(
-    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
-    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
-
-    const int kbx  = k / QI3_K;
-    const int ky  = (k % QI3_K) * QR3_K;
-    const float * x_dmf = (const float *) x_dm;
-    const float * y_df  = (const float *) y_ds;
-
-    const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;
-
-    int v[QR3_K*VDR_Q3_K_Q8_1_MMQ];
-
-#pragma unroll
-    for (int l = 0; l < QR3_K*VDR_Q3_K_Q8_1_MMQ; ++l) {
-        const int kqsx = i * (WARP_SIZE + 1) + kbx*QI3_K + (QI3_K/2) * (ky/(2*QI3_K)) + ky % (QI3_K/2);
-        const int shift = 2 * ((ky % 32) / 8);
-        const int vll = (x_ql[kqsx + l] >> shift) & 0x03030303;
-
-        const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + (ky+l)%8] >> ((ky+l) / 8);
-        const int vlh = (vh << 2) & 0x04040404;
-
-        v[l] = __vsubss4(vll, vlh);
-    }
-
-    const int index_y = j * WARP_SIZE + (k*QR3_K) % WARP_SIZE;
-    return vec_dot_q3_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dmf[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[index_y/QI8_1]);
-}
-
-static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-
-#ifndef GGML_QKK_64
-    const block_q4_K * bq4_K = (const block_q4_K *) vbq;
-
-    int    v[2];
-    int    u[2*QR4_K];
-    float d8[QR4_K];
-
-    // iqs is in 0,2..30. bq8_offset = iqs/4 -> bq8_offset = 0, 2, 4, 6
-    const int bq8_offset = QR4_K * ((iqs/2) / (QI8_1/2));
-
-    // iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12
-    // iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44
-    // iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76
-    // iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108
-
-    const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
-    v[0] = q4[0];
-    v[1] = q4[4];
-
-    const uint16_t * scales = (const uint16_t *)bq4_K->scales;
-    uint16_t aux[2];
-    const int j = bq8_offset/2;
-    if (j < 2) {
-        aux[0] = scales[j+0] & 0x3f3f;
-        aux[1] = scales[j+2] & 0x3f3f;
-    } else {
-        aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
-        aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
-    }
-    const uint8_t * sc = (const uint8_t *)aux;
-    const uint8_t * m  = sc + 2;
-
-    for (int i = 0; i < QR4_K; ++i) {
-        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
-        d8[i] = __low2half(bq8i->ds);
-
-        const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
-        u[2*i+0] = q8[0];
-        u[2*i+1] = q8[4];
-    }
-
-    return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);
-
-#else
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    const block_q4_K * bq4_K = (const block_q4_K *) vbq;
-
-    float sumf_d = 0.0f;
-    float sumf_m = 0.0f;
-
-    uint16_t aux16[2];
-    const uint8_t * s = (const uint8_t *)aux16;
-
-    const uint16_t * a = (const uint16_t *)bq4_K->scales;
-    aux16[0] = a[0] & 0x0f0f;
-    aux16[1] = (a[0] >> 4) & 0x0f0f;
-
-    const float dall = bq4_K->dm[0];
-    const float dmin = bq4_K->dm[1];
-
-    const float d8_1 = __low2float(bq8_1[0].ds);
-    const float d8_2 = __low2float(bq8_1[1].ds);
-
-    const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
-    const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
-    const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
-    const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
-
-    const int * q4 = (const int *)bq4_K->qs + (iqs/2);
-    const int v1 = q4[0];
-    const int v2 = q4[4];
-
-    const int dot1 = __dp4a(ui2, v2 & 0x0f0f0f0f, __dp4a(ui1, v1 & 0x0f0f0f0f, 0));
-    const int dot2 = __dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, __dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
-    const int dot3 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
-    const int dot4 = __dp4a(0x01010101, ui4, __dp4a(0x01010101, ui3, 0));
-
-    sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
-    sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);
-
-    return dall * sumf_d - dmin * sumf_m;
-
-#else
-    bad_arch();
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-
-#endif
-}
-
-template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
-    (void)x_qh;
-
-    __shared__ int   tile_x_ql[mmq_y * (WARP_SIZE)       + mmq_y];
-    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K];
-    __shared__ int   tile_x_sc[mmq_y * (WARP_SIZE/8)     + mmq_y/8];
-
-    *x_ql = tile_x_ql;
-    *x_dm = tile_x_dm;
-    *x_sc = tile_x_sc;
-}
-
-template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
-    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
-    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
-    (void)x_qh;
-
-    GGML_CUDA_ASSUME(i_offset >= 0);
-    GGML_CUDA_ASSUME(i_offset <  nwarps);
-    GGML_CUDA_ASSUME(k >= 0);
-    GGML_CUDA_ASSUME(k <  WARP_SIZE);
-
-    const int kbx  = k / QI4_K; // == 0 if QK_K == 256
-    const int kqsx = k % QI4_K; // == k if QK_K == 256
-
-    const block_q4_K * bx0 = (const block_q4_K *) vx;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-        int i = i0 + i_offset;
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-
-        const block_q4_K * bxi = bx0 + i*blocks_per_row + kbx;
-
-        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
-    }
-
-    const int blocks_per_tile_x_row = WARP_SIZE / QI4_K; // == 1 if QK_K == 256
-    const int kbxd = k % blocks_per_tile_x_row;          // == 0 if QK_K == 256
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_K) {
-        int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % mmq_y;
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-
-        const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
-
-#if QK_K == 256
-        x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
-#else
-        x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]};
-#endif
-    }
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
-        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-
-        const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8);
-
-        const int * scales = (const int *) bxi->scales;
-
-        const int ksc = k % (WARP_SIZE/8);
-
-        // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8
-        int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
-        scales8    |= (scales[ksc/2]              >> (2 * (ksc % 2)))       & 0x30303030; // upper 2 bits
-
-        x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
-    }
-}
-
-static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
-    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
-    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
-    (void)x_qh;
-
-    const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);
-
-    const int index_y = j * WARP_SIZE + (QR4_K*k) % WARP_SIZE;
-    return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[index_y], sc, sc+8,
-                                      x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]);
-}
-
-static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-
-#ifndef GGML_QKK_64
-    const block_q5_K * bq5_K = (const block_q5_K *) vbq;
-
-    int   vl[2];
-    int   vh[2];
-    int    u[2*QR5_K];
-    float d8[QR5_K];
-
-    const int bq8_offset = QR5_K * ((iqs/2) / (QI8_1/2));
-    const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
-    const int * qh = (const int *)(bq5_K->qh + 4 * ((iqs/2)%4));
-
-    vl[0] = ql[0];
-    vl[1] = ql[4];
-
-    vh[0] = qh[0] >> bq8_offset;
-    vh[1] = qh[4] >> bq8_offset;
-
-    const uint16_t * scales = (const uint16_t *)bq5_K->scales;
-    uint16_t aux[2];
-    const int j = bq8_offset/2;
-    if (j < 2) {
-        aux[0] = scales[j+0] & 0x3f3f;
-        aux[1] = scales[j+2] & 0x3f3f;
-    } else {
-        aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
-        aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
-    }
-    const uint8_t * sc = (const uint8_t *)aux;
-    const uint8_t * m  = sc + 2;
-
-#pragma unroll
-    for (int i = 0; i < QR5_K; ++i) {
-        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
-        d8[i] = __low2float(bq8i->ds);
-
-        const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
-        u[2*i+0] = q8[0];
-        u[2*i+1] = q8[4];
-    }
-
-    return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);
-
-#else
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    const block_q5_K * bq5_K = (const block_q5_K *) vbq;
-
-    const int8_t * s = bq5_K->scales;
-
-    const float d = bq5_K->d;
-
-    const float d8_1 = __low2half(bq8_1[0].ds);
-    const float d8_2 = __low2half(bq8_1[1].ds);
-
-    const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
-    const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
-    const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
-    const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
-
-    const int * ql = (const int *)bq5_K->qs + (iqs/2);
-    const int vl1 = ql[0];
-    const int vl2 = ql[4];
-
-    const int step = 4 * (iqs/2); // 0, 4, 8, 12
-    const int im = step/8; // = 0 for iqs = 0, 2, = 1 for iqs = 4, 6
-    const int in = step%8; // 0, 4, 0, 4
-    const int vh = (*((const int *)(bq5_K->qh + in))) >> im;
-
-    const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
-    const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
-    const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
-    const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);
-
-    const float sumf_d = d8_1 * (__dp4a(ui1, v1, 0) * s[0] + __dp4a(ui2, v2, 0) * s[1])
-                       + d8_2 * (__dp4a(ui3, v3, 0) * s[2] + __dp4a(ui4, v4, 0) * s[3]);
-
-    return d * sumf_d;
-
-#else
-    bad_arch();
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-
-#endif
-}
-
-template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
-    (void)x_qh;
-
-    __shared__ int   tile_x_ql[mmq_y * (2*WARP_SIZE)     + mmq_y];
-    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K];
-    __shared__ int   tile_x_sc[mmq_y * (WARP_SIZE/8)     + mmq_y/8];
-
-    *x_ql = tile_x_ql;
-    *x_dm = tile_x_dm;
-    *x_sc = tile_x_sc;
-}
-
-template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
-    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
-    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
-    (void)x_qh;
-
-    GGML_CUDA_ASSUME(i_offset >= 0);
-    GGML_CUDA_ASSUME(i_offset <  nwarps);
-    GGML_CUDA_ASSUME(k >= 0);
-    GGML_CUDA_ASSUME(k <  WARP_SIZE);
-
-    const int kbx  = k / QI5_K; // == 0 if QK_K == 256
-    const int kqsx = k % QI5_K; // == k if QK_K == 256
-
-    const block_q5_K * bx0 = (const block_q5_K *) vx;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-        int i = i0 + i_offset;
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-
-        const block_q5_K * bxi = bx0 + i*blocks_per_row + kbx;
-        const int ky = QR5_K*kqsx;
-
-        const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
-        const int ql0 = (ql >> 0) & 0x0F0F0F0F;
-        const int ql1 = (ql >> 4) & 0x0F0F0F0F;
-
-        const int qh = get_int_from_uint8_aligned(bxi->qh, kqsx % (QI5_K/4));
-        const int qh0 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 0)) << 4) & 0x10101010;
-        const int qh1 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 1)) << 4) & 0x10101010;
-
-        const int kq0 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + 0;
-        const int kq1 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + (QI5_K/4);
-
-        x_ql[i * (2*WARP_SIZE + 1) + kq0] = ql0 | qh0;
-        x_ql[i * (2*WARP_SIZE + 1) + kq1] = ql1 | qh1;
-    }
-
-    const int blocks_per_tile_x_row = WARP_SIZE / QI5_K; // == 1 if QK_K == 256
-    const int kbxd = k % blocks_per_tile_x_row;          // == 0 if QK_K == 256
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_K) {
-        int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % mmq_y;
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-
-        const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;
-
-#if QK_K == 256
-        x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
-#endif
-    }
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
-        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-
-        const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);
-
-        const int * scales = (const int *) bxi->scales;
-
-        const int ksc = k % (WARP_SIZE/8);
-
-        // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8
-        int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
-        scales8    |= (scales[ksc/2]              >> (2 * (ksc % 2)))       & 0x30303030; // upper 2 bits
-
-        x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
-    }
-}
-
-static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(
-    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
-    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
-    (void)x_qh;
-
-    const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8);
-
-    const int index_x = i * (QR5_K*WARP_SIZE + 1) +  QR5_K*k;
-    const int index_y = j * WARP_SIZE             + (QR5_K*k) % WARP_SIZE;
-    return vec_dot_q5_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8,
-                                      x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]);
-}
-
-static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-
-    const block_q6_K * bq6_K = (const block_q6_K *) vbq;
-
-    const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4);
-    const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8);
-    const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4));
-
-    const int vl = get_int_from_uint8(bq6_K->ql, iqs);
-    const int vh = get_int_from_uint8(bq6_K->qh, (QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4)) >> vh_shift;
-
-    const int8_t * scales = bq6_K->scales + scale_offset;
-
-    int    u[QR6_K];
-    float d8[QR6_K];
-
-#pragma unroll
-    for (int i = 0; i < QR6_K; ++i) {
-        u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
-        d8[i] = __low2half(bq8_1[bq8_offset + 2*i].ds);
-    }
-
-    return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
-}
-
-template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
-    (void)x_qh;
-
-    __shared__ int   tile_x_ql[mmq_y * (2*WARP_SIZE)     + mmq_y];
-    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K];
-    __shared__ int   tile_x_sc[mmq_y * (WARP_SIZE/8)     + mmq_y/8];
-
-    *x_ql = tile_x_ql;
-    *x_dm = tile_x_dm;
-    *x_sc = tile_x_sc;
-}
-
-template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
-    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
-    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
-    (void)x_qh;
-
-    GGML_CUDA_ASSUME(i_offset >= 0);
-    GGML_CUDA_ASSUME(i_offset <  nwarps);
-    GGML_CUDA_ASSUME(k >= 0);
-    GGML_CUDA_ASSUME(k <  WARP_SIZE);
-
-    const int kbx  = k / QI6_K; // == 0 if QK_K == 256
-    const int kqsx = k % QI6_K; // == k if QK_K == 256
-
-    const block_q6_K * bx0 = (const block_q6_K *) vx;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-        int i = i0 + i_offset;
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-
-        const block_q6_K * bxi = bx0 + i*blocks_per_row + kbx;
-        const int ky = QR6_K*kqsx;
-
-        const int ql = get_int_from_uint8(bxi->ql, kqsx);
-        const int ql0 = (ql >> 0) & 0x0F0F0F0F;
-        const int ql1 = (ql >> 4) & 0x0F0F0F0F;
-
-        const int qh = get_int_from_uint8(bxi->qh, (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4));
-        const int qh0 = ((qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) << 4) & 0x30303030;
-        const int qh1 =  (qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4))))       & 0x30303030;
-
-        const int kq0 = ky - ky % QI6_K + k % (QI6_K/2) + 0;
-        const int kq1 = ky - ky % QI6_K + k % (QI6_K/2) + (QI6_K/2);
-
-        x_ql[i * (2*WARP_SIZE + 1) + kq0] = __vsubss4(ql0 | qh0, 0x20202020);
-        x_ql[i * (2*WARP_SIZE + 1) + kq1] = __vsubss4(ql1 | qh1, 0x20202020);
-    }
-
-    const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256
-    const int kbxd = k % blocks_per_tile_x_row;          // == 0 if QK_K == 256
-    float * x_dmf = (float *) x_dm;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI6_K) {
-        int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % mmq_y;
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-
-        const block_q6_K * bxi = bx0 + i*blocks_per_row + kbxd;
-
-        x_dmf[i * (WARP_SIZE/QI6_K) + i / QI6_K + kbxd] = bxi->d;
-    }
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
-        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-
-        const block_q6_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / 4;
-
-        x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_int8(bxi->scales, k % (QI6_K/8));
-    }
-}
-
-static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(
-    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
-    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
-    (void)x_qh;
-
-    const float * x_dmf = (const float *) x_dm;
-    const float * y_df  = (const float *) y_ds;
-
-    const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/8]);
-
-    const int index_x = i * (QR6_K*WARP_SIZE + 1) +  QR6_K*k;
-    const int index_y = j * WARP_SIZE             + (QR6_K*k) % WARP_SIZE;
-    return vec_dot_q6_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]);
-}
-
-static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-#if QK_K == 256
-    const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq;
-
-#if QR2_XXS == 8
-    const int ib32 = iqs;
-    const uint16_t * q2 = bq2->qs + 4*ib32;
-    const uint8_t  * aux8 = (const uint8_t *)q2;
-    const int8_t   * q8 = bq8_1[ib32].qs;
-    uint32_t aux32 = q2[2] | (q2[3] << 16);
-    int sumi = 0;
-    for (int l = 0; l < 4; ++l) {
-        const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
-        const uint8_t  signs = ksigns_iq2xs[aux32 & 127];
-        for (int j = 0; j < 8; ++j) {
-            sumi += q8[j] * grid[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
-        }
-        q8 += 8;
-        aux32 >>= 7;
-    }
-    const float d = (float)bq2->d * (0.5f + aux32) * (float)bq8_1[ib32].ds.x * 0.25f;
-    return d * sumi;
-#else
-    // iqs is 0...15
-    const int ib32 = iqs/2;
-    const int il = iqs%2;
-    const uint16_t * q2 = bq2->qs + 4*ib32;
-    const uint8_t  * aux8 = (const uint8_t *)q2;
-    const uint8_t  * grid1 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+0]);
-    const uint8_t  * grid2 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+1]);
-    const uint32_t aux32 = q2[2] | (q2[3] << 16);
-    const float d = (float)bq2->d * (0.5f + (aux32 >> 28)) * (float)bq8_1[ib32].ds.x * 0.25f;
-    const uint8_t signs1 = ksigns_iq2xs[(aux32 >> 14*il) & 127];
-    const uint8_t signs2 = ksigns_iq2xs[(aux32 >> (14*il + 7)) & 127];
-    const int8_t * q8 = bq8_1[ib32].qs + 16*il;
-    int sumi1 = 0, sumi2 = 0;
-    for (int j = 0; j < 8; ++j) {
-        sumi1 += q8[j+0] * grid1[j] * (signs1 & kmask_iq2xs[j] ? -1 : 1);
-        sumi2 += q8[j+8] * grid2[j] * (signs2 & kmask_iq2xs[j] ? -1 : 1);
-    }
-    return d * (sumi1 + sumi2);
-#endif
-#else
-    assert(false);
-    return 0.f;
-#endif
-}
-
-static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-#if QK_K == 256
-    const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq;
-
-    const int ib32 = iqs;
-    const uint16_t * q2 = bq2->qs + 4*ib32;
-    const int8_t   * q8 = bq8_1[ib32].qs;
-    const uint8_t ls1 = bq2->scales[ib32] & 0xf;
-    const uint8_t ls2 = bq2->scales[ib32] >>  4;
-    int sumi1 = 0;
-    for (int l = 0; l < 2; ++l) {
-        const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
-        const uint8_t  signs = ksigns_iq2xs[q2[l] >> 9];
-        for (int j = 0; j < 8; ++j) {
-            sumi1 += q8[j] * grid[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
-        }
-        q8 += 8;
-    }
-    int sumi2 = 0;
-    for (int l = 2; l < 4; ++l) {
-        const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
-        const uint8_t  signs = ksigns_iq2xs[q2[l] >> 9];
-        for (int j = 0; j < 8; ++j) {
-            sumi2 += q8[j] * grid[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
-        }
-        q8 += 8;
-    }
-    const float d = (float)bq2->d * (float)bq8_1[ib32].ds.x * 0.25f;
-    return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
-#else
-    assert(false);
-    return 0.f;
-#endif
-}
-
-template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, int mmq_y, int nwarps,
-              allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
-static __device__ __forceinline__ void mul_mat_q(
-    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-
-    const block_q_t  * x = (const block_q_t  *) vx;
-    const block_q8_1 * y = (const block_q8_1 *) vy;
-
-    const int blocks_per_row_x = ncols_x / qk;
-    const int blocks_per_col_y = nrows_y / QK8_1;
-    const int blocks_per_warp = WARP_SIZE / qi;
-
-    const int & ncols_dst = ncols_y;
-
-    const int row_dst_0 = blockIdx.x*mmq_y;
-    const int & row_x_0 = row_dst_0;
-
-    const int col_dst_0 = blockIdx.y*mmq_x;
-    const int & col_y_0 = col_dst_0;
-
-    int   * tile_x_ql = nullptr;
-    half2 * tile_x_dm = nullptr;
-    int   * tile_x_qh = nullptr;
-    int   * tile_x_sc = nullptr;
-
-    allocate_tiles(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
-
-    __shared__ int    tile_y_qs[mmq_x * WARP_SIZE];
-    __shared__ half2  tile_y_ds[mmq_x * WARP_SIZE/QI8_1];
-
-    float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {{0.0f}};
-
-    for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {
-
-        load_tiles(x + row_x_0*blocks_per_row_x + ib0, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc,
-                   threadIdx.y, nrows_x-row_x_0-1, threadIdx.x, blocks_per_row_x);
-
-#pragma unroll
-        for (int ir = 0; ir < qr; ++ir) {
-            const int kqs = ir*WARP_SIZE + threadIdx.x;
-            const int kbxd = kqs / QI8_1;
-
-#pragma unroll
-            for (int i = 0; i < mmq_x; i += nwarps) {
-                const int col_y_eff = min(col_y_0 + threadIdx.y + i, ncols_y-1); // to prevent out-of-bounds memory accesses
-
-                const block_q8_1 * by0 = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kbxd];
-
-                const int index_y = (threadIdx.y + i) * WARP_SIZE + kqs % WARP_SIZE;
-                tile_y_qs[index_y] = get_int_from_int8_aligned(by0->qs, threadIdx.x % QI8_1);
-            }
-
-#pragma unroll
-            for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) {
-                const int ids = (ids0 + threadIdx.y * QI8_1 + threadIdx.x / (WARP_SIZE/QI8_1)) % mmq_x;
-                const int kby = threadIdx.x % (WARP_SIZE/QI8_1);
-                const int col_y_eff = min(col_y_0 + ids, ncols_y-1);
-
-                // if the sum is not needed it's faster to transform the scale to f32 ahead of time
-                const half2 * dsi_src = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + ir*(WARP_SIZE/QI8_1) + kby].ds;
-                half2       * dsi_dst = &tile_y_ds[ids * (WARP_SIZE/QI8_1) + kby];
-                if (need_sum) {
-                    *dsi_dst = *dsi_src;
-                } else {
-                    float * dfi_dst = (float *) dsi_dst;
-                    *dfi_dst = __low2half(*dsi_src);
-                }
-            }
-
-            __syncthreads();
-
-// #pragma unroll // unrolling this loop causes too much register pressure
-            for (int k = ir*WARP_SIZE/qr; k < (ir+1)*WARP_SIZE/qr; k += vdr) {
-#pragma unroll
-                for (int j = 0; j < mmq_x; j += nwarps) {
-#pragma unroll
-                    for (int i = 0; i < mmq_y; i += WARP_SIZE) {
-                        sum[i/WARP_SIZE][j/nwarps] += vec_dot(
-                            tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds,
-                            threadIdx.x + i, threadIdx.y + j, k);
-                    }
-                }
-            }
-
-            __syncthreads();
-        }
-    }
-
-#pragma unroll
-    for (int j = 0; j < mmq_x; j += nwarps) {
-        const int col_dst = col_dst_0 + j + threadIdx.y;
-
-        if (col_dst >= ncols_dst) {
-            return;
-        }
-
-#pragma unroll
-        for (int i = 0; i < mmq_y; i += WARP_SIZE) {
-            const int row_dst = row_dst_0 + threadIdx.x + i;
-
-            if (row_dst >= nrows_dst) {
-                continue;
-            }
-
-            dst[col_dst*nrows_dst + row_dst] = sum[i/WARP_SIZE][j/nwarps];
-        }
-    }
-}
-
-#define  MMQ_X_Q4_0_RDNA2  64
-#define  MMQ_Y_Q4_0_RDNA2  128
-#define NWARPS_Q4_0_RDNA2  8
-#define  MMQ_X_Q4_0_RDNA1  64
-#define  MMQ_Y_Q4_0_RDNA1  64
-#define NWARPS_Q4_0_RDNA1  8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define  MMQ_X_Q4_0_AMPERE 4
-#define  MMQ_Y_Q4_0_AMPERE 32
-#define NWARPS_Q4_0_AMPERE 4
-#else
-#define  MMQ_X_Q4_0_AMPERE 64
-#define  MMQ_Y_Q4_0_AMPERE 128
-#define NWARPS_Q4_0_AMPERE 4
-#endif
-#define  MMQ_X_Q4_0_PASCAL 64
-#define  MMQ_Y_Q4_0_PASCAL 64
-#define NWARPS_Q4_0_PASCAL 8
-
-template <bool need_check> static __global__ void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    __launch_bounds__(WARP_SIZE*NWARPS_Q4_0_RDNA2, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-    mul_mat_q4_0(
-    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    const int mmq_x  =  MMQ_X_Q4_0_RDNA2;
-    const int mmq_y  =  MMQ_Y_Q4_0_RDNA2;
-    const int nwarps = NWARPS_Q4_0_RDNA2;
-#else
-    const int mmq_x  =  MMQ_X_Q4_0_RDNA1;
-    const int mmq_y  =  MMQ_Y_Q4_0_RDNA1;
-    const int nwarps = NWARPS_Q4_0_RDNA1;
-#endif // defined(RDNA3) || defined(RDNA2)
-
-    mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
-        load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-
-#elif __CUDA_ARCH__ >= CC_VOLTA
-    const int mmq_x  =  MMQ_X_Q4_0_AMPERE;
-    const int mmq_y  =  MMQ_Y_Q4_0_AMPERE;
-    const int nwarps = NWARPS_Q4_0_AMPERE;
-
-    mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
-        load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-
-#elif __CUDA_ARCH__ >= MIN_CC_DP4A
-    const int mmq_x  =  MMQ_X_Q4_0_PASCAL;
-    const int mmq_y  =  MMQ_Y_Q4_0_PASCAL;
-    const int nwarps = NWARPS_Q4_0_PASCAL;
-
-    mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
-        load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-#else
-    (void) vec_dot_q4_0_q8_1_mul_mat;
-    bad_arch();
-#endif // __CUDA_ARCH__ >= CC_VOLTA
-}
-
-#define  MMQ_X_Q4_1_RDNA2  64
-#define  MMQ_Y_Q4_1_RDNA2  128
-#define NWARPS_Q4_1_RDNA2  8
-#define  MMQ_X_Q4_1_RDNA1  64
-#define  MMQ_Y_Q4_1_RDNA1  64
-#define NWARPS_Q4_1_RDNA1  8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define  MMQ_X_Q4_1_AMPERE 4
-#define  MMQ_Y_Q4_1_AMPERE 32
-#define NWARPS_Q4_1_AMPERE 4
-#else
-#define  MMQ_X_Q4_1_AMPERE 64
-#define  MMQ_Y_Q4_1_AMPERE 128
-#define NWARPS_Q4_1_AMPERE 4
-#endif
-#define  MMQ_X_Q4_1_PASCAL 64
-#define  MMQ_Y_Q4_1_PASCAL 64
-#define NWARPS_Q4_1_PASCAL 8
-
-template <bool need_check> static __global__ void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_RDNA2, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#elif __CUDA_ARCH__ < CC_VOLTA
-    __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_PASCAL, 2)
-#endif // __CUDA_ARCH__ < CC_VOLTA
-    mul_mat_q4_1(
-    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    const int mmq_x  =  MMQ_X_Q4_1_RDNA2;
-    const int mmq_y  =  MMQ_Y_Q4_1_RDNA2;
-    const int nwarps = NWARPS_Q4_1_RDNA2;
-#else
-    const int mmq_x  =  MMQ_X_Q4_1_RDNA1;
-    const int mmq_y  =  MMQ_Y_Q4_1_RDNA1;
-    const int nwarps = NWARPS_Q4_1_RDNA1;
-#endif // defined(RDNA3) || defined(RDNA2)
-
-    mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
-        load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-
-#elif __CUDA_ARCH__ >= CC_VOLTA
-    const int mmq_x  =  MMQ_X_Q4_1_AMPERE;
-    const int mmq_y  =  MMQ_Y_Q4_1_AMPERE;
-    const int nwarps = NWARPS_Q4_1_AMPERE;
-
-    mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
-        load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-
-#elif __CUDA_ARCH__ >= MIN_CC_DP4A
-    const int mmq_x  =  MMQ_X_Q4_1_PASCAL;
-    const int mmq_y  =  MMQ_Y_Q4_1_PASCAL;
-    const int nwarps = NWARPS_Q4_1_PASCAL;
-
-    mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
-        load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-#else
-    (void) vec_dot_q4_1_q8_1_mul_mat;
-    bad_arch();
-#endif // __CUDA_ARCH__ >= CC_VOLTA
-}
-
-#define  MMQ_X_Q5_0_RDNA2  64
-#define  MMQ_Y_Q5_0_RDNA2  128
-#define NWARPS_Q5_0_RDNA2  8
-#define  MMQ_X_Q5_0_RDNA1  64
-#define  MMQ_Y_Q5_0_RDNA1  64
-#define NWARPS_Q5_0_RDNA1  8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define  MMQ_X_Q5_0_AMPERE 4
-#define  MMQ_Y_Q5_0_AMPERE 32
-#define NWARPS_Q5_0_AMPERE 4
-#else
-#define  MMQ_X_Q5_0_AMPERE 128
-#define  MMQ_Y_Q5_0_AMPERE 64
-#define NWARPS_Q5_0_AMPERE 4
-#endif
-#define  MMQ_X_Q5_0_PASCAL 64
-#define  MMQ_Y_Q5_0_PASCAL 64
-#define NWARPS_Q5_0_PASCAL 8
-
-template <bool need_check> static __global__ void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    __launch_bounds__(WARP_SIZE*NWARPS_Q5_0_RDNA2, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-    mul_mat_q5_0(
-    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    const int mmq_x  =  MMQ_X_Q5_0_RDNA2;
-    const int mmq_y  =  MMQ_Y_Q5_0_RDNA2;
-    const int nwarps = NWARPS_Q5_0_RDNA2;
-#else
-    const int mmq_x  =  MMQ_X_Q5_0_RDNA1;
-    const int mmq_y  =  MMQ_Y_Q5_0_RDNA1;
-    const int nwarps = NWARPS_Q5_0_RDNA1;
-#endif // defined(RDNA3) || defined(RDNA2)
-
-    mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
-        load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-
-#elif __CUDA_ARCH__ >= CC_VOLTA
-    const int mmq_x  =  MMQ_X_Q5_0_AMPERE;
-    const int mmq_y  =  MMQ_Y_Q5_0_AMPERE;
-    const int nwarps = NWARPS_Q5_0_AMPERE;
-
-    mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
-        load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-
-#elif __CUDA_ARCH__ >= MIN_CC_DP4A
-    const int mmq_x  =  MMQ_X_Q5_0_PASCAL;
-    const int mmq_y  =  MMQ_Y_Q5_0_PASCAL;
-    const int nwarps = NWARPS_Q5_0_PASCAL;
-
-    mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
-        load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-#else
-    (void) vec_dot_q5_0_q8_1_mul_mat;
-    bad_arch();
-#endif // __CUDA_ARCH__ >= CC_VOLTA
-}
-
-#define  MMQ_X_Q5_1_RDNA2  64
-#define  MMQ_Y_Q5_1_RDNA2  128
-#define NWARPS_Q5_1_RDNA2  8
-#define  MMQ_X_Q5_1_RDNA1  64
-#define  MMQ_Y_Q5_1_RDNA1  64
-#define NWARPS_Q5_1_RDNA1  8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define  MMQ_X_Q5_1_AMPERE 4
-#define  MMQ_Y_Q5_1_AMPERE 32
-#define NWARPS_Q5_1_AMPERE 4
-#else
-#define  MMQ_X_Q5_1_AMPERE 128
-#define  MMQ_Y_Q5_1_AMPERE 64
-#define NWARPS_Q5_1_AMPERE 4
-#endif
-#define  MMQ_X_Q5_1_PASCAL 64
-#define  MMQ_Y_Q5_1_PASCAL 64
-#define NWARPS_Q5_1_PASCAL 8
-
-template <bool need_check> static __global__ void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    __launch_bounds__(WARP_SIZE*NWARPS_Q5_1_RDNA2, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-mul_mat_q5_1(
-    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    const int mmq_x  =  MMQ_X_Q5_1_RDNA2;
-    const int mmq_y  =  MMQ_Y_Q5_1_RDNA2;
-    const int nwarps = NWARPS_Q5_1_RDNA2;
-#else
-    const int mmq_x  =  MMQ_X_Q5_1_RDNA1;
-    const int mmq_y  =  MMQ_Y_Q5_1_RDNA1;
-    const int nwarps = NWARPS_Q5_1_RDNA1;
-#endif // defined(RDNA3) || defined(RDNA2)
-
-    mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
-        load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-
-#elif __CUDA_ARCH__ >= CC_VOLTA
-    const int mmq_x  =  MMQ_X_Q5_1_AMPERE;
-    const int mmq_y  =  MMQ_Y_Q5_1_AMPERE;
-    const int nwarps = NWARPS_Q5_1_AMPERE;
-
-    mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
-        load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-
-#elif __CUDA_ARCH__ >= MIN_CC_DP4A
-    const int mmq_x  =  MMQ_X_Q5_1_PASCAL;
-    const int mmq_y  =  MMQ_Y_Q5_1_PASCAL;
-    const int nwarps = NWARPS_Q5_1_PASCAL;
-
-    mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
-        load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-#else
-    (void) vec_dot_q5_1_q8_1_mul_mat;
-    bad_arch();
-#endif // __CUDA_ARCH__ >= CC_VOLTA
-}
-
-#define  MMQ_X_Q8_0_RDNA2  64
-#define  MMQ_Y_Q8_0_RDNA2  128
-#define NWARPS_Q8_0_RDNA2  8
-#define  MMQ_X_Q8_0_RDNA1  64
-#define  MMQ_Y_Q8_0_RDNA1  64
-#define NWARPS_Q8_0_RDNA1  8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define  MMQ_X_Q8_0_AMPERE 4
-#define  MMQ_Y_Q8_0_AMPERE 32
-#define NWARPS_Q8_0_AMPERE 4
-#else
-#define  MMQ_X_Q8_0_AMPERE 128
-#define  MMQ_Y_Q8_0_AMPERE 64
-#define NWARPS_Q8_0_AMPERE 4
-#endif
-#define  MMQ_X_Q8_0_PASCAL 64
-#define  MMQ_Y_Q8_0_PASCAL 64
-#define NWARPS_Q8_0_PASCAL 8
-
-template <bool need_check> static __global__ void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    __launch_bounds__(WARP_SIZE*NWARPS_Q8_0_RDNA2, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-    mul_mat_q8_0(
-    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    const int mmq_x  =  MMQ_X_Q8_0_RDNA2;
-    const int mmq_y  =  MMQ_Y_Q8_0_RDNA2;
-    const int nwarps = NWARPS_Q8_0_RDNA2;
-#else
-    const int mmq_x  =  MMQ_X_Q8_0_RDNA1;
-    const int mmq_y  =  MMQ_Y_Q8_0_RDNA1;
-    const int nwarps = NWARPS_Q8_0_RDNA1;
-#endif // defined(RDNA3) || defined(RDNA2)
-
-    mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
-        load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-
-#elif __CUDA_ARCH__ >= CC_VOLTA
-    const int mmq_x  =  MMQ_X_Q8_0_AMPERE;
-    const int mmq_y  =  MMQ_Y_Q8_0_AMPERE;
-    const int nwarps = NWARPS_Q8_0_AMPERE;
-
-    mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
-        load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-
-#elif __CUDA_ARCH__ >= MIN_CC_DP4A
-    const int mmq_x  =  MMQ_X_Q8_0_PASCAL;
-    const int mmq_y  =  MMQ_Y_Q8_0_PASCAL;
-    const int nwarps = NWARPS_Q8_0_PASCAL;
-
-    mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
-        load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-#else
-    (void) vec_dot_q8_0_q8_1_mul_mat;
-    bad_arch();
-#endif // __CUDA_ARCH__ >= CC_VOLTA
-}
-
-#define  MMQ_X_Q2_K_RDNA2  64
-#define  MMQ_Y_Q2_K_RDNA2  128
-#define NWARPS_Q2_K_RDNA2  8
-#define  MMQ_X_Q2_K_RDNA1  128
-#define  MMQ_Y_Q2_K_RDNA1  32
-#define NWARPS_Q2_K_RDNA1  8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define  MMQ_X_Q2_K_AMPERE 4
-#define  MMQ_Y_Q2_K_AMPERE 32
-#define NWARPS_Q2_K_AMPERE 4
-#else
-#define  MMQ_X_Q2_K_AMPERE 64
-#define  MMQ_Y_Q2_K_AMPERE 128
-#define NWARPS_Q2_K_AMPERE 4
-#endif
-#define  MMQ_X_Q2_K_PASCAL 64
-#define  MMQ_Y_Q2_K_PASCAL 64
-#define NWARPS_Q2_K_PASCAL 8
-
-template <bool need_check> static __global__ void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    __launch_bounds__(WARP_SIZE*NWARPS_Q2_K_RDNA2, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-mul_mat_q2_K(
-    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    const int mmq_x  =  MMQ_X_Q2_K_RDNA2;
-    const int mmq_y  =  MMQ_Y_Q2_K_RDNA2;
-    const int nwarps = NWARPS_Q2_K_RDNA2;
-#else
-    const int mmq_x  =  MMQ_X_Q2_K_RDNA1;
-    const int mmq_y  =  MMQ_Y_Q2_K_RDNA1;
-    const int nwarps = NWARPS_Q2_K_RDNA1;
-#endif // defined(RDNA3) || defined(RDNA2)
-
-    mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
-        load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-
-#elif __CUDA_ARCH__ >= CC_VOLTA
-    const int mmq_x  =  MMQ_X_Q2_K_AMPERE;
-    const int mmq_y  =  MMQ_Y_Q2_K_AMPERE;
-    const int nwarps = NWARPS_Q2_K_AMPERE;
-
-    mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
-        load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-
-#elif __CUDA_ARCH__ >= MIN_CC_DP4A
-    const int mmq_x  =  MMQ_X_Q2_K_PASCAL;
-    const int mmq_y  =  MMQ_Y_Q2_K_PASCAL;
-    const int nwarps = NWARPS_Q2_K_PASCAL;
-
-    mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
-        load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-#else
-    (void) vec_dot_q2_K_q8_1_mul_mat;
-    bad_arch();
-#endif // __CUDA_ARCH__ >= CC_VOLTA
-}
-
-#define  MMQ_X_Q3_K_RDNA2  128
-#define  MMQ_Y_Q3_K_RDNA2  64
-#define NWARPS_Q3_K_RDNA2  8
-#define  MMQ_X_Q3_K_RDNA1  32
-#define  MMQ_Y_Q3_K_RDNA1  128
-#define NWARPS_Q3_K_RDNA1  8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define  MMQ_X_Q3_K_AMPERE 4
-#define  MMQ_Y_Q3_K_AMPERE 32
-#define NWARPS_Q3_K_AMPERE 4
-#else
-#define  MMQ_X_Q3_K_AMPERE 128
-#define  MMQ_Y_Q3_K_AMPERE 128
-#define NWARPS_Q3_K_AMPERE 4
-#endif
-#define  MMQ_X_Q3_K_PASCAL 64
-#define  MMQ_Y_Q3_K_PASCAL 64
-#define NWARPS_Q3_K_PASCAL 8
-
-template <bool need_check> static __global__ void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_RDNA2, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#elif __CUDA_ARCH__ < CC_VOLTA
-    __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_PASCAL, 2)
-#endif // __CUDA_ARCH__ < CC_VOLTA
-    mul_mat_q3_K(
-    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    const int mmq_x  =  MMQ_X_Q3_K_RDNA2;
-    const int mmq_y  =  MMQ_Y_Q3_K_RDNA2;
-    const int nwarps = NWARPS_Q3_K_RDNA2;
-#else
-    const int mmq_x  =  MMQ_X_Q3_K_RDNA1;
-    const int mmq_y  =  MMQ_Y_Q3_K_RDNA1;
-    const int nwarps = NWARPS_Q3_K_RDNA1;
-#endif // defined(RDNA3) || defined(RDNA2)
-
-    mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
-        load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-
-#elif __CUDA_ARCH__ >= CC_VOLTA
-    const int mmq_x  =  MMQ_X_Q3_K_AMPERE;
-    const int mmq_y  =  MMQ_Y_Q3_K_AMPERE;
-    const int nwarps = NWARPS_Q3_K_AMPERE;
-
-    mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
-        load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-
-#elif __CUDA_ARCH__ >= MIN_CC_DP4A
-    const int mmq_x  =  MMQ_X_Q3_K_PASCAL;
-    const int mmq_y  =  MMQ_Y_Q3_K_PASCAL;
-    const int nwarps = NWARPS_Q3_K_PASCAL;
-
-    mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
-        load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-#else
-    (void) vec_dot_q3_K_q8_1_mul_mat;
-    bad_arch();
-#endif // __CUDA_ARCH__ >= CC_VOLTA
-}
-
-#define  MMQ_X_Q4_K_RDNA2  64
-#define  MMQ_Y_Q4_K_RDNA2  128
-#define NWARPS_Q4_K_RDNA2  8
-#define  MMQ_X_Q4_K_RDNA1  32
-#define  MMQ_Y_Q4_K_RDNA1  64
-#define NWARPS_Q4_K_RDNA1  8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define  MMQ_X_Q4_K_AMPERE 4
-#define  MMQ_Y_Q4_K_AMPERE 32
-#define NWARPS_Q4_K_AMPERE 4
-#else
-#define  MMQ_X_Q4_K_AMPERE 64
-#define  MMQ_Y_Q4_K_AMPERE 128
-#define NWARPS_Q4_K_AMPERE 4
-#endif
-#define  MMQ_X_Q4_K_PASCAL 64
-#define  MMQ_Y_Q4_K_PASCAL 64
-#define NWARPS_Q4_K_PASCAL 8
-
-template <bool need_check> static __global__ void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_RDNA2, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#elif __CUDA_ARCH__ < CC_VOLTA
-    __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_PASCAL, 2)
-#endif // __CUDA_ARCH__ < CC_VOLTA
-    mul_mat_q4_K(
-    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    const int mmq_x  =  MMQ_X_Q4_K_RDNA2;
-    const int mmq_y  =  MMQ_Y_Q4_K_RDNA2;
-    const int nwarps = NWARPS_Q4_K_RDNA2;
-#else
-    const int mmq_x  =  MMQ_X_Q4_K_RDNA1;
-    const int mmq_y  =  MMQ_Y_Q4_K_RDNA1;
-    const int nwarps = NWARPS_Q4_K_RDNA1;
-#endif // defined(RDNA3) || defined(RDNA2)
-
-    mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
-        load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-
-#elif __CUDA_ARCH__ >= CC_VOLTA
-    const int mmq_x  =  MMQ_X_Q4_K_AMPERE;
-    const int mmq_y  =  MMQ_Y_Q4_K_AMPERE;
-    const int nwarps = NWARPS_Q4_K_AMPERE;
-
-    mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
-        load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-
-#elif __CUDA_ARCH__ >= MIN_CC_DP4A
-    const int mmq_x  =  MMQ_X_Q4_K_PASCAL;
-    const int mmq_y  =  MMQ_Y_Q4_K_PASCAL;
-    const int nwarps = NWARPS_Q4_K_PASCAL;
-
-    mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
-        load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-#else
-    (void) vec_dot_q4_K_q8_1_mul_mat;
-    bad_arch();
-#endif // __CUDA_ARCH__ >= CC_VOLTA
-}
-
-#define  MMQ_X_Q5_K_RDNA2  64
-#define  MMQ_Y_Q5_K_RDNA2  128
-#define NWARPS_Q5_K_RDNA2  8
-#define  MMQ_X_Q5_K_RDNA1  32
-#define  MMQ_Y_Q5_K_RDNA1  64
-#define NWARPS_Q5_K_RDNA1  8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define  MMQ_X_Q5_K_AMPERE 4
-#define  MMQ_Y_Q5_K_AMPERE 32
-#define NWARPS_Q5_K_AMPERE 4
-#else
-#define  MMQ_X_Q5_K_AMPERE 64
-#define  MMQ_Y_Q5_K_AMPERE 128
-#define NWARPS_Q5_K_AMPERE 4
-#endif
-#define  MMQ_X_Q5_K_PASCAL 64
-#define  MMQ_Y_Q5_K_PASCAL 64
-#define NWARPS_Q5_K_PASCAL 8
-
-template <bool need_check> static __global__ void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    __launch_bounds__(WARP_SIZE*NWARPS_Q5_K_RDNA2, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-mul_mat_q5_K(
-    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    const int mmq_x  =  MMQ_X_Q5_K_RDNA2;
-    const int mmq_y  =  MMQ_Y_Q5_K_RDNA2;
-    const int nwarps = NWARPS_Q5_K_RDNA2;
-#else
-    const int mmq_x  =  MMQ_X_Q5_K_RDNA1;
-    const int mmq_y  =  MMQ_Y_Q5_K_RDNA1;
-    const int nwarps = NWARPS_Q5_K_RDNA1;
-#endif // defined(RDNA3) || defined(RDNA2)
-
-    mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
-        load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-
-#elif __CUDA_ARCH__ >= CC_VOLTA
-    const int mmq_x  =  MMQ_X_Q5_K_AMPERE;
-    const int mmq_y  =  MMQ_Y_Q5_K_AMPERE;
-    const int nwarps = NWARPS_Q5_K_AMPERE;
-
-    mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
-        load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-
-#elif __CUDA_ARCH__ >= MIN_CC_DP4A
-    const int mmq_x  =  MMQ_X_Q5_K_PASCAL;
-    const int mmq_y  =  MMQ_Y_Q5_K_PASCAL;
-    const int nwarps = NWARPS_Q5_K_PASCAL;
-
-    mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
-        load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-#else
-    (void) vec_dot_q5_K_q8_1_mul_mat;
-    bad_arch();
-#endif // __CUDA_ARCH__ >= CC_VOLTA
-}
-
-#define  MMQ_X_Q6_K_RDNA2  64
-#define  MMQ_Y_Q6_K_RDNA2  128
-#define NWARPS_Q6_K_RDNA2  8
-#define  MMQ_X_Q6_K_RDNA1  32
-#define  MMQ_Y_Q6_K_RDNA1  64
-#define NWARPS_Q6_K_RDNA1  8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define  MMQ_X_Q6_K_AMPERE 4
-#define  MMQ_Y_Q6_K_AMPERE 32
-#define NWARPS_Q6_K_AMPERE 4
-#else
-#define  MMQ_X_Q6_K_AMPERE 64
-#define  MMQ_Y_Q6_K_AMPERE 64
-#define NWARPS_Q6_K_AMPERE 4
-#endif
-#define  MMQ_X_Q6_K_PASCAL 64
-#define  MMQ_Y_Q6_K_PASCAL 64
-#define NWARPS_Q6_K_PASCAL 8
-
-template <bool need_check> static __global__ void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_RDNA2, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#elif __CUDA_ARCH__ < CC_VOLTA
-    __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_PASCAL, 2)
-#endif // __CUDA_ARCH__ < CC_VOLTA
-    mul_mat_q6_K(
-    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    const int mmq_x  =  MMQ_X_Q6_K_RDNA2;
-    const int mmq_y  =  MMQ_Y_Q6_K_RDNA2;
-    const int nwarps = NWARPS_Q6_K_RDNA2;
-#else
-    const int mmq_x  =  MMQ_X_Q6_K_RDNA1;
-    const int mmq_y  =  MMQ_Y_Q6_K_RDNA1;
-    const int nwarps = NWARPS_Q6_K_RDNA1;
-#endif // defined(RDNA3) || defined(RDNA2)
-
-    mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
-        load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-
-#elif __CUDA_ARCH__ >= CC_VOLTA
-    const int mmq_x  =  MMQ_X_Q6_K_AMPERE;
-    const int mmq_y  =  MMQ_Y_Q6_K_AMPERE;
-    const int nwarps = NWARPS_Q6_K_AMPERE;
-
-    mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
-        load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-
-#elif __CUDA_ARCH__ >= MIN_CC_DP4A
-    const int mmq_x  =  MMQ_X_Q6_K_PASCAL;
-    const int mmq_y  =  MMQ_Y_Q6_K_PASCAL;
-    const int nwarps = NWARPS_Q6_K_PASCAL;
-
-    mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
-        load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-#else
-    (void) vec_dot_q6_K_q8_1_mul_mat;
-    bad_arch();
-#endif // __CUDA_ARCH__ >= CC_VOLTA
-}
-
-template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
-static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) {
-    const int row = blockIdx.x*blockDim.y + threadIdx.y;
-
-    if (row >= nrows) {
-        return;
-    }
-
-    const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = vdr * WARP_SIZE / qi;
-
-// partial sum for each thread
-    float tmp = 0.0f;
-
-    const block_q_t  * x = (const block_q_t  *) vx;
-    const block_q8_1 * y = (const block_q8_1 *) vy;
-
-    for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
-        const int ibx = row*blocks_per_row + i + threadIdx.x / (qi/vdr); // x block index
-
-        const int iby = (i + threadIdx.x / (qi/vdr)) * (qk/QK8_1); // y block index that aligns with ibx
-
-        const int iqs  = vdr * (threadIdx.x % (qi/vdr)); // x block quant index when casting the quants to int
-
-        tmp += vec_dot_q_cuda(&x[ibx], &y[iby], iqs);
-    }
-
-    // sum up partial sums and write back result
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
-    }
-
-    if (threadIdx.x == 0) {
-        dst[row] = tmp;
-    }
-}
-
-template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
-static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
-    // qk = quantized weights per x block
-    // qr = number of quantized weights per data value in x block
-    const int row = blockIdx.x*blockDim.y + threadIdx.y;
-
-    if (row >= nrows) {
-        return;
-    }
-
-    const int tid = threadIdx.x;
-
-    const int iter_stride = 2*GGML_CUDA_DMMV_X;
-    const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter
-    const int y_offset = qr == 1 ? 1 : qk/2;
-
-// partial sum for each thread
-#ifdef GGML_CUDA_F16
-    half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics
-#else
-    float tmp = 0.0f;
-#endif // GGML_CUDA_F16
-
-    for (int i = 0; i < ncols; i += iter_stride) {
-        const int col = i + vals_per_iter*tid;
-        const int ib = (row*ncols + col)/qk; // x block index
-        const int iqs = (col%qk)/qr; // x quant index
-        const int iybs = col - col%qk; // y block start index
-
-// processing >2 values per i iter is faster for fast GPUs
-#pragma unroll
-        for (int j = 0; j < vals_per_iter; j += 2) {
-            // process 2 vals per j iter
-
-            // dequantize
-            // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val
-            dfloat2 v;
-            dequantize_kernel(vx, ib, iqs + j/qr, v);
-
-            // matrix multiplication
-            // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
-#ifdef GGML_CUDA_F16
-            tmp += __hmul2(v, {
-                y[iybs + iqs + j/qr + 0],
-                y[iybs + iqs + j/qr + y_offset]
-            });
-#else
-            tmp += v.x * y[iybs + iqs + j/qr + 0];
-            tmp += v.y * y[iybs + iqs + j/qr + y_offset];
-#endif // GGML_CUDA_F16
-        }
-    }
-
-    // sum up partial sums and write back result
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
-    }
-
-    if (tid == 0) {
-#ifdef GGML_CUDA_F16
-        dst[row] = tmp.x + tmp.y;
-#else
-        dst[row] = tmp;
-#endif // GGML_CUDA_F16
-    }
-}
-
-static __global__ void mul_mat_p021_f16_f32(
-    const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int nchannels_x, const int nchannels_y) {
-
-    const half * x = (const half *) vx;
-
-    const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
-    const int channel = blockDim.z*blockIdx.z + threadIdx.z;
-    const int channel_x = channel / (nchannels_y / nchannels_x);
-
-    const int nrows_y = ncols_x;
-    const int nrows_dst = nrows_x;
-    const int row_dst = row_x;
-
-    float tmp = 0.0f;
-
-    for (int col_x0 = 0; col_x0 < ncols_x; col_x0 += blockDim.x) {
-        const int col_x = col_x0 + threadIdx.x;
-
-        if (col_x >= ncols_x) {
-            break;
-        }
-
-        // x is transposed and permuted
-        const int ix = row_x*nchannels_x*ncols_x + channel_x*ncols_x + col_x;
-        const float xi = __half2float(x[ix]);
-
-        const int row_y = col_x;
-
-        // y is not transposed but permuted
-        const int iy = channel*nrows_y + row_y;
-
-        tmp += xi * y[iy];
-    }
-
-    // dst is not transposed and not permuted
-    const int idst = channel*nrows_dst + row_dst;
-
-    // sum up partial sums and write back result
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
-    }
-
-    if (threadIdx.x == 0) {
-        dst[idst] = tmp;
-    }
-}
-
-static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
-    const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x,
-    const int row_stride_x, const int channel_stride_x, const int channel_x_divisor) {
-
-    const half * x = (const half *) vx;
-
-    const int row_x     = blockDim.y*blockIdx.y + threadIdx.y;
-    const int channel   = blockDim.z*blockIdx.z + threadIdx.z;
-    const int channel_x = channel / channel_x_divisor;
-
-    const int nrows_y   = ncols_x;
-    const int nrows_dst = nrows_x;
-    const int row_dst   = row_x;
-
-    const int idst = channel*nrows_dst + row_dst;
-
-    float tmp = 0.0f;
-
-    for (int col_x0 = 0; col_x0 < ncols_x; col_x0 += blockDim.x) {
-        const int col_x = col_x0 + threadIdx.x;
-
-        if (col_x >= ncols_x) {
-            break;
-        }
-
-        const int row_y = col_x;
-
-        const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
-        const int iy = channel*nrows_y + row_y;
-
-        const float xi = __half2float(x[ix]);
-
-        tmp += xi * y[iy];
-    }
-
-    // sum up partial sums and write back result
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
-    }
-
-    if (threadIdx.x == 0) {
-        dst[idst] = tmp;
-    }
-}
-
-static __device__ void cpy_1_f32_f32(const char * cxi, char * cdsti) {
-    const float * xi = (const float *) cxi;
-    float * dsti = (float *) cdsti;
-
-    *dsti = *xi;
-}
-
-static __device__ void cpy_1_f32_f16(const char * cxi, char * cdsti) {
-    const float * xi = (const float *) cxi;
-    half * dsti = (half *) cdsti;
-
-    *dsti = __float2half(*xi);
-}
-
-static __device__ void cpy_1_f16_f16(const char * cxi, char * cdsti) {
-    const half * xi = (const half *) cxi;
-    half * dsti = (half *) cdsti;
-
-    *dsti = *xi;
-}
-
-template <cpy_kernel_t cpy_1>
-static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
-                                   const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
-                                   const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13) {
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (i >= ne) {
-        return;
-    }
-
-    // determine indices i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor
-    // then combine those indices with the corresponding byte offsets to get the total offsets
-    const int i03 = i / (ne00*ne01*ne02);
-    const int i02 = (i - i03*ne00*ne01*ne02) / (ne00*ne01);
-    const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne00*ne01) / ne00;
-    const int i00 = i - i03*ne00*ne01*ne02- i02*ne00*ne01 - i01*ne00;
-    const int x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03;
-
-    const int i13 = i / (ne10*ne11*ne12);
-    const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
-    const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
-    const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
-    const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13;
-
-    cpy_1(cx + x_offset, cdst + dst_offset);
-}
-
-static __device__ void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
-    const float * xi = (const float *) cxi;
-    block_q8_0 * dsti = (block_q8_0 *) cdsti;
-
-    float amax = 0.0f; // absolute max
-
-    for (int j = 0; j < QK8_0; j++) {
-        const float v = xi[j];
-        amax = fmaxf(amax, fabsf(v));
-    }
-
-    const float d = amax / ((1 << 7) - 1);
-    const float id = d ? 1.0f/d : 0.0f;
-
-    dsti->d = d;
-
-    for (int j = 0; j < QK8_0; ++j) {
-        const float x0 = xi[j]*id;
-
-        dsti->qs[j] = roundf(x0);
-    }
-}
-
-static __device__ void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) {
-    const float * xi = (const float *) cxi;
-    block_q4_0 * dsti = (block_q4_0 *) cdsti;
-
-    float amax = 0.0f;
-    float vmax = 0.0f;
-
-    for (int j = 0; j < QK4_0; ++j) {
-        const float v = xi[j];
-        if (amax < fabsf(v)) {
-            amax = fabsf(v);
-            vmax = v;
-        }
-    }
-
-    const float d  = vmax / -8;
-    const float id = d ? 1.0f/d : 0.0f;
-
-    dsti->d = d;
-
-    for (int j = 0; j < QK4_0/2; ++j) {
-        const float x0 = xi[0       + j]*id;
-        const float x1 = xi[QK4_0/2 + j]*id;
-
-        const uint8_t xi0 = min(15, (int8_t)(x0 + 8.5f));
-        const uint8_t xi1 = min(15, (int8_t)(x1 + 8.5f));
-
-        dsti->qs[j]  = xi0;
-        dsti->qs[j] |= xi1 << 4;
-    }
-}
-
-static __device__ void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) {
-    const float * xi = (const float *) cxi;
-    block_q4_1 * dsti = (block_q4_1 *) cdsti;
-
-    float vmin = FLT_MAX;
-    float vmax = -FLT_MAX;
-
-    for (int j = 0; j < QK4_1; ++j) {
-        const float v = xi[j];
-
-        if (v < vmin) vmin = v;
-        if (v > vmax) vmax = v;
-    }
-
-    const float d  = (vmax - vmin) / ((1 << 4) - 1);
-    const float id = d ? 1.0f/d : 0.0f;
-
-    dsti->dm.x = d;
-    dsti->dm.y = vmin;
-
-    for (int j = 0; j < QK4_1/2; ++j) {
-        const float x0 = (xi[0       + j] - vmin)*id;
-        const float x1 = (xi[QK4_1/2 + j] - vmin)*id;
-
-        const uint8_t xi0 = min(15, (int8_t)(x0 + 0.5f));
-        const uint8_t xi1 = min(15, (int8_t)(x1 + 0.5f));
-
-        dsti->qs[j]  = xi0;
-        dsti->qs[j] |= xi1 << 4;
-    }
-}
-
-template <cpy_kernel_t cpy_blck, int qk>
-static __global__ void cpy_f32_q(const char * cx, char * cdst, const int ne,
-                                 const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
-                                 const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13) {
-    const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk;
-
-    if (i >= ne) {
-        return;
-    }
-
-    const int i03 = i / (ne00*ne01*ne02);
-    const int i02 = (i - i03*ne00*ne01*ne02) / (ne00*ne01);
-    const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne00*ne01) / ne00;
-    const int i00 = i - i03*ne00*ne01*ne02- i02*ne00*ne01 - i01*ne00;
-    const int x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03;
-
-    const int i13 = i / (ne10*ne11*ne12);
-    const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
-    const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
-    const int i10 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10)/qk;
-    const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13;
-
-    cpy_blck(cx + x_offset, cdst + dst_offset);
-}
-
-static __device__ float rope_yarn_ramp(const float low, const float high, const int i0) {
-    const float y = (i0 / 2 - low) / max(0.001f, high - low);
-    return 1.0f - min(1.0f, max(0.0f, y));
-}
-
-struct rope_corr_dims {
-    float v[4];
-};
-
-// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
-// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
-static __device__ void rope_yarn(
-    float theta_extrap, float freq_scale, rope_corr_dims corr_dims, int64_t i0, float ext_factor, float mscale,
-    float * cos_theta, float * sin_theta
-) {
-    // Get n-d rotational scaling corrected for extrapolation
-    float theta_interp = freq_scale * theta_extrap;
-    float theta = theta_interp;
-    if (ext_factor != 0.0f) {
-        float ramp_mix = rope_yarn_ramp(corr_dims.v[0], corr_dims.v[1], i0) * ext_factor;
-        theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
-
-        // Get n-d magnitude scaling corrected for interpolation
-        mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale);
-    }
-    *cos_theta = cosf(theta) * mscale;
-    *sin_theta = sinf(theta) * mscale;
-}
-
-// rope == RoPE == rotary positional embedding
-template<typename T, bool has_pos>
-static __global__ void rope(
-    const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
-    float ext_factor, float attn_factor, rope_corr_dims corr_dims
-) {
-    const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
-
-    if (col >= ncols) {
-        return;
-    }
-
-    const int row = blockDim.x*blockIdx.x + threadIdx.x;
-    const int i = row*ncols + col;
-    const int i2 = row/p_delta_rows;
-
-    const int p = has_pos ? pos[i2] : 0;
-    const float theta_base = p*powf(freq_base, -float(col)/ncols);
-
-    float cos_theta, sin_theta;
-    rope_yarn(theta_base, freq_scale, corr_dims, col, ext_factor, attn_factor, &cos_theta, &sin_theta);
-
-    const float x0 = x[i + 0];
-    const float x1 = x[i + 1];
-
-    dst[i + 0] = x0*cos_theta - x1*sin_theta;
-    dst[i + 1] = x0*sin_theta + x1*cos_theta;
-}
-
-template<typename T, bool has_pos>
-static __global__ void rope_neox(
-    const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
-    float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims
-) {
-    const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
-
-    if (col >= ncols) {
-        return;
-    }
-
-    const int row = blockDim.x*blockIdx.x + threadIdx.x;
-    const int ib = col / n_dims;
-    const int ic = col % n_dims;
-
-    if (ib > 0) {
-        const int i = row*ncols + ib*n_dims + ic;
-
-        dst[i + 0] = x[i + 0];
-        dst[i + 1] = x[i + 1];
-
-        return;
-    }
-
-    const int i  = row*ncols + ib*n_dims + ic/2;
-    const int i2 = row/p_delta_rows;
-
-    float cur_rot = inv_ndims * ic - ib;
-
-    const int p = has_pos ? pos[i2] : 0;
-    const float theta_base = p*freq_scale*powf(theta_scale, col/2.0f);
-
-    float cos_theta, sin_theta;
-    rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
-
-    const float x0 = x[i + 0];
-    const float x1 = x[i + n_dims/2];
-
-    dst[i + 0]        = x0*cos_theta - x1*sin_theta;
-    dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
-}
-
-static __global__ void rope_glm_f32(
-    const float * x, float * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
-    int n_ctx
-) {
-    const int col = blockDim.x*blockIdx.x + threadIdx.x;
-    const int half_n_dims = ncols/4;
-
-    if (col >= half_n_dims) {
-        return;
-    }
-
-    const int row = blockDim.y*blockIdx.y + threadIdx.y;
-    const int i = row*ncols + col;
-    const int i2 = row/p_delta_rows;
-
-    const float col_theta_scale = powf(freq_base, -2.0f*col/ncols);
-     // FIXME: this is likely wrong
-    const int p = pos != nullptr ? pos[i2] : 0;
-
-    const float theta = min(p, n_ctx - 2)*freq_scale*col_theta_scale;
-    const float sin_theta = sinf(theta);
-    const float cos_theta = cosf(theta);
-
-    const float x0 = x[i + 0];
-    const float x1 = x[i + half_n_dims];
-
-    dst[i + 0]           = x0*cos_theta - x1*sin_theta;
-    dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;
-
-    const float block_theta = ((float)max(p - n_ctx - 2, 0))*col_theta_scale;
-    const float sin_block_theta = sinf(block_theta);
-    const float cos_block_theta = cosf(block_theta);
-
-    const float x2 = x[i + half_n_dims * 2];
-    const float x3 = x[i + half_n_dims * 3];
-
-    dst[i + half_n_dims * 2] = x2*cos_block_theta - x3*sin_block_theta;
-    dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
-}
-
-static __global__ void alibi_f32(const float * x, float * dst, const int ncols, const int k_rows,
-                                 const int n_heads_log2_floor, const float m0, const float m1) {
-    const int col = blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (col >= ncols) {
-        return;
-    }
-
-    const int row = blockDim.y*blockIdx.y + threadIdx.y;
-    const int i = row*ncols + col;
-
-    const int k = row/k_rows;
-
-    float m_k;
-    if (k < n_heads_log2_floor) {
-        m_k = powf(m0, k + 1);
-    } else {
-        m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
-    }
-
-    dst[i] = col * m_k + x[i];
-}
-
-static __global__ void k_sum_rows_f32(const float * x, float * dst, const int ncols) {
-    const int row = blockIdx.y;
-    const int col = threadIdx.x;
-
-    float sum = 0.0f;
-    for (int i = col; i < ncols; i += blockDim.x) {
-        sum += x[row * ncols + i];
-    }
-
-    sum = warp_reduce_sum(sum);
-
-    if (col == 0) {
-        dst[row] = sum;
-    }
-}
-
-template<typename T>
-static inline __device__ void swap(T & a, T & b) {
-    T tmp = a;
-    a = b;
-    b = tmp;
-}
-
-template<ggml_sort_order order>
-static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int ncols) {
-    // bitonic sort
-    int col = threadIdx.x;
-    int row = blockIdx.y;
-
-    if (col >= ncols) return;
-
-    const float * x_row = x + row * ncols;
-    int * dst_row = dst + row * ncols;
-
-    // initialize indices
-    if (col < ncols) {
-        dst_row[col] = col;
-    }
-    __syncthreads();
-
-    for (int k = 2; k <= ncols; k *= 2) {
-        for (int j = k / 2; j > 0; j /= 2) {
-            int ixj = col ^ j;
-            if (ixj > col) {
-                if ((col & k) == 0) {
-                    if (order == GGML_SORT_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) {
-                        swap(dst_row[col], dst_row[ixj]);
-                    }
-                } else {
-                    if (order == GGML_SORT_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) {
-                        swap(dst_row[col], dst_row[ixj]);
-                    }
-                }
-            }
-            __syncthreads();
-        }
-    }
-}
-
-static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
-    const int col = blockDim.y*blockIdx.y + threadIdx.y;
-    const int row = blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (col >= ncols) {
-        return;
-    }
-
-    const int i = row*ncols + col;
-    //dst[i] = col > (n_past + row % rows_per_channel) ? -INFINITY : x[i];
-    //dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU
-    dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX;
-}
-
-template <bool vals_smem, int ncols_template, int block_size_template, bool need_check>
-static __global__ void soft_max_f16(const float * x, const float * y, float * dst, const int ncols_par, const int nrows_y, const float scale) {
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
-    const int ncols_data = ncols_template == 0 ? ncols_par : ncols_template;
-    const int ncols_smem = GGML_PAD(ncols_data, 2*WARP_SIZE)/2;
-
-    const int tid  = threadIdx.x;
-    const int rowx = blockIdx.x;
-    const int rowy = rowx % nrows_y; // broadcast the mask (y) in the row dimension
-
-    const int block_size = block_size_template == 0 ? blockDim.x : block_size_template;
-
-    const int warp_id = threadIdx.x / WARP_SIZE;
-    const int lane_id = threadIdx.x % WARP_SIZE;
-
-    extern __shared__ half data_soft_max_f16[];
-    half  * buf_iw = data_soft_max_f16 + 0; // shared memory buffer for inter-warp communication
-    // (shared memory) buffer to cache values between iterations:
-    half2 * vals = vals_smem ? (half2 *) (buf_iw + WARP_SIZE) : (half2 *) (dst + rowx*ncols_data);
-    // if the buffer is larger than max. shared memory per block, use dst as temp. buffer instead
-    // in that case col_smem == col_data must be enforced to avoid race conditions
-
-    half2 max_val = make_half2(-INFINITY, -INFINITY);
-
-#pragma unroll
-    for (int col0 = 0; col0 < ncols_smem; col0 += block_size) {
-        const int col_data = 2*col0 + 2*WARP_SIZE*warp_id + lane_id;
-        const int col_smem = vals_smem ? col0 + tid : col_data;
-
-        const int ix = rowx*ncols_data + col_data;
-        const int iy = rowy*ncols_data + col_data;
-
-        half2 val;
-        if (need_check && col_data + 0 >= ncols_data) {
-            val.x = -INFINITY;
-        } else {
-            val.x = x[ix + 0]*scale + (y ? y[iy + 0] : 0.0f);
-        }
-        if (need_check && col_data + WARP_SIZE >= ncols_data) {
-            val.y = -INFINITY;
-        } else {
-            val.y = x[ix + WARP_SIZE]*scale + (y ? y[iy + WARP_SIZE] : 0.0f);
-        }
-        if (!need_check || col_smem < (vals_smem ? ncols_smem : ncols_data)) {
-            vals[col_smem] = val;
-        }
-        max_val = __hmax2(max_val, val);
-    }
-
-    // find the max value in the block
-    max_val = warp_reduce_max(max_val);
-    if (block_size > WARP_SIZE) {
-        if (warp_id == 0) {
-            buf_iw[lane_id] = -INFINITY;
-        }
-        __syncthreads();
-
-        if (lane_id == 0) {
-            buf_iw[warp_id] = __hmax(max_val.x, max_val.y);
-        }
-        __syncthreads();
-
-        max_val = __half2half2(buf_iw[lane_id]);
-        max_val = warp_reduce_max(max_val);
-    } else {
-        max_val = __half2half2(__hmax(max_val.x, max_val.y));
-    }
-
-    half2 tmp = make_half2(0.0f, 0.0f); // partial sums
-
-#pragma unroll
-    for (int col0 = 0; col0 < ncols_smem; col0 += block_size) {
-        const int col_smem = vals_smem ? col0 + tid : 2*col0 + 2*warp_id*WARP_SIZE + lane_id;
-
-        if (ncols_template == 0 && col_smem >= (vals_smem ? ncols_smem : ncols_data)) {
-            break;
-        }
-
-        const half2 val = h2exp(vals[col_smem] - max_val);
-
-        tmp += val;
-        vals[col_smem] = val;
-    }
-
-    // find the sum of exps in the block
-    tmp = warp_reduce_sum(tmp);
-    if (block_size > WARP_SIZE) {
-        if (warp_id == 0) {
-            buf_iw[lane_id] = 0.0f;
-        }
-        __syncthreads();
-
-        if (lane_id == 0) {
-            buf_iw[warp_id] = tmp.x + tmp.y;
-        }
-        __syncthreads();
-
-        tmp = __half2half2(buf_iw[lane_id]);
-        tmp = warp_reduce_sum(tmp);
-    } else {
-        tmp = __half2half2(tmp.x + tmp.y);
-    }
-
-    const half2 inv_sum = make_half2(1.0f, 1.0f) / tmp;
-
-#pragma unroll
-    for (int col0 = 0; col0 < ncols_smem; col0 += block_size) {
-        const int col_data = 2*col0 + 2*WARP_SIZE*warp_id + lane_id;
-        const int col_smem = vals_smem ? col0 + tid : col_data;
-
-        const int idst = rowx*ncols_data + col_data;
-        const half2 result = vals[col_smem] * inv_sum;
-
-        if (need_check && col_data + 0 >= ncols_data) {
-            return;
-        }
-        dst[idst] = result.x;
-
-        if (need_check && col_data + WARP_SIZE >= ncols_data) {
-            return;
-        }
-
-        dst[idst + WARP_SIZE] = result.y;
-    }
-#else
-    (void) x; (void) y; (void) dst; (void) ncols_par; (void) nrows_y; (void) scale;
-    bad_arch();
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
-}
-
-template <bool vals_smem, int ncols_template, int block_size_template>
-static __global__ void soft_max_f32(const float * x, const float * y, float * dst, const int ncols_par, const int nrows_y, const float scale) {
-    const int ncols = ncols_template == 0 ? ncols_par : ncols_template;
-
-    const int tid  = threadIdx.x;
-    const int rowx = blockIdx.x;
-    const int rowy = rowx % nrows_y; // broadcast the mask (y) in the row dimension
-
-    const int block_size = block_size_template == 0 ? blockDim.x : block_size_template;
-
-    const int warp_id = threadIdx.x / WARP_SIZE;
-    const int lane_id = threadIdx.x % WARP_SIZE;
-
-    extern __shared__ float data_soft_max_f32[];
-    float * buf_iw = data_soft_max_f32; // shared memory buffer for inter-warp communication
-    // shared memory buffer to cache values between iterations:
-    float * vals = vals_smem ? buf_iw + WARP_SIZE : dst + rowx*ncols;
-
-    float max_val = -INFINITY;
-
-#pragma unroll
-    for (int col0 = 0; col0 < ncols; col0 += block_size) {
-        const int col = col0 + tid;
-
-        if (ncols_template == 0 && col >= ncols) {
-            break;
-        }
-
-        const int ix = rowx*ncols + col;
-        const int iy = rowy*ncols + col;
-
-        const float val = x[ix]*scale + (y ? y[iy] : 0.0f);
-        vals[col] = val;
-        max_val = max(max_val, val);
-    }
-
-    // find the max value in the block
-    max_val = warp_reduce_max(max_val);
-    if (block_size > WARP_SIZE) {
-        if (warp_id == 0) {
-            buf_iw[lane_id] = -INFINITY;
-        }
-        __syncthreads();
-
-        if (lane_id == 0) {
-            buf_iw[warp_id] = max_val;
-        }
-        __syncthreads();
-
-        max_val = buf_iw[lane_id];
-        max_val = warp_reduce_max(max_val);
-    }
-
-    float tmp = 0.0f; // partial sum
-
-#pragma unroll
-    for (int col0 = 0; col0 < ncols; col0 += block_size) {
-        const int col = col0 + tid;
-
-        if (ncols_template == 0 && col >= ncols) {
-            break;
-        }
-
-        const float val = expf(vals[col] - max_val);
-        tmp += val;
-        vals[col] = val;
-    }
-
-    // find the sum of exps in the block
-    tmp = warp_reduce_sum(tmp);
-    if (block_size > WARP_SIZE) {
-        if (warp_id == 0) {
-            buf_iw[lane_id] = 0.0f;
-        }
-        __syncthreads();
-
-        if (lane_id == 0) {
-            buf_iw[warp_id] = tmp;
-        }
-        __syncthreads();
-
-        tmp = buf_iw[lane_id];
-        tmp = warp_reduce_sum(tmp);
-    }
-
-    const float inv_sum = 1.0f / tmp;
-
-#pragma unroll
-    for (int col0 = 0; col0 < ncols; col0 += block_size) {
-        const int col = col0 + tid;
-
-        if (ncols_template == 0 && col >= ncols) {
-            return;
-        }
-
-        const int idst = rowx*ncols + col;
-        dst[idst] = vals[col] * inv_sum;
-    }
-}
-
-static __global__ void scale_f32(const float * x, float * dst, const float scale, const int k) {
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (i >= k) {
-        return;
-    }
-
-    dst[i] = scale * x[i];
-}
-
-static __global__ void clamp_f32(const float * x, float * dst, const float min, const float max, const int k) {
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (i >= k) {
-        return;
-    }
-
-    dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
-}
-
-static  __global__ void im2col_f32_f16(
-        const float * x, half * dst,
-        int offset_delta, int64_t N, int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t pelements, int64_t CHW,
-        int s0, int s1, int p0, int p1, int d0, int d1) {
-    const int64_t i = threadIdx.x + blockIdx.x * blockDim.x;
-    if (i >= pelements) {
-        return;
-    }
-    // blockIdx.x: KH * KW * OW * N
-    // blockIdx.y: ioh
-    // blockIdx.z: iic
-    // CHW: IC * KH * KW
-    const int64_t ioh = blockIdx.y;
-    const int64_t iic = blockIdx.z;
-
-    const int64_t ikh = i / (KW * OW * N);
-    const int64_t ikw = (i - ikh*(KW * OW * N)) / (OW*N);
-    const int64_t iow = (i - ikh*(KW * OW * N) - ikw*(OW*N)) / N;
-    const int64_t in = i % N;
-
-    const int64_t iiw = iow * s0 + ikw * d0 - p0;
-    const int64_t iih = ioh * s1 + ikh * d1 - p1;
-
-    const int64_t offset_dst =
-        (in*OW*OH + ioh * OW + iow) * CHW +
-        (iic * (KW * KH) + ikh * KW + ikw);
-
-    if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
-        dst[offset_dst] = __float2half(0.0f);
-    } else {
-        const int64_t offset_src = in*CHW + iic * IH * IW + iih * IW + iiw;
-        dst[offset_dst] = __float2half(x[offset_src]);
-    }
-}
-
-template<int qk, int qr, dequantize_kernel_t dq>
-static void get_rows_cuda(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-                            const void * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
-    const int block_num_x = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
-    const dim3 block_nums(block_num_x, ne10, ne11*ne12);
-
-    // strides in elements
-    //const size_t s0 = nb0 / ggml_element_size(dst);
-    const size_t s1 = nb1 / ggml_element_size(dst);
-    const size_t s2 = nb2 / ggml_element_size(dst);
-    const size_t s3 = nb3 / ggml_element_size(dst);
-
-    const size_t s10 = nb10 / ggml_element_size(src1);
-    const size_t s11 = nb11 / ggml_element_size(src1);
-    const size_t s12 = nb12 / ggml_element_size(src1);
-    //const size_t s13 = nb13 / ggml_element_size(src1);
-
-    GGML_ASSERT(ne00 % 2 == 0);
-
-    k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(
-            src0_dd, src1_dd, dst_dd,
-            ne00, /*ne01, ne02, ne03,*/
-            /*ne10, ne11,*/ ne12, /*ne13,*/
-            /* s0,*/ s1, s2, s3,
-            /* nb00,*/ nb01, nb02, nb03,
-            s10, s11, s12/*, s13*/);
-
-    (void) dst;
-}
-
-template<typename src0_t>
-static void get_rows_cuda_float(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-                                const src0_t * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
-    const int block_num_x = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE;
-    const dim3 block_nums(block_num_x, ne10, ne11*ne12);
-
-    // strides in elements
-    //const size_t s0 = nb0 / ggml_element_size(dst);
-    const size_t s1 = nb1 / ggml_element_size(dst);
-    const size_t s2 = nb2 / ggml_element_size(dst);
-    const size_t s3 = nb3 / ggml_element_size(dst);
-
-    const size_t s10 = nb10 / ggml_element_size(src1);
-    const size_t s11 = nb11 / ggml_element_size(src1);
-    const size_t s12 = nb12 / ggml_element_size(src1);
-    //const size_t s13 = nb13 / ggml_element_size(src1);
-
-    k_get_rows_float<<<block_nums, block_dims, 0, stream>>>(
-            src0_dd, src1_dd, dst_dd,
-            ne00, /*ne01, ne02, ne03,*/
-            /*ne10, ne11,*/ ne12, /*ne13,*/
-            /* s0,*/ s1, s2, s3,
-            /* nb00,*/ nb01, nb02, nb03,
-            s10, s11, s12/*, s13*/);
-
-    (void) dst;
-}
-
-template<float (*bin_op)(const float, const float)>
-struct bin_bcast_cuda {
-    template<typename src0_t, typename src1_t, typename dst_t>
-    void operator()(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst,
-            const src0_t * src0_dd, const src1_t * src1_dd, dst_t * dst_dd,
-            cudaStream_t stream) {
-
-        GGML_TENSOR_BINARY_OP_LOCALS
-
-        int nr0 = ne10/ne0;
-        int nr1 = ne11/ne1;
-        int nr2 = ne12/ne2;
-        int nr3 = ne13/ne3;
-
-        int nr[4] = { nr0, nr1, nr2, nr3 };
-
-        // collapse dimensions until first broadcast dimension
-        int64_t cne0[] = {ne0, ne1, ne2, ne3};
-        int64_t cne1[] = {ne10, ne11, ne12, ne13};
-        size_t cnb0[] = {nb0, nb1, nb2, nb3};
-        size_t cnb1[] = {nb10, nb11, nb12, nb13};
-        auto collapse = [](int64_t cne[]) {
-            cne[0] *= cne[1];
-            cne[1] = cne[2];
-            cne[2] = cne[3];
-            cne[3] = 1;
-        };
-
-        auto collapse_nb = [](size_t cnb[], const int64_t cne[]) {
-            cnb[1] *= cne[1];
-            cnb[2] *= cne[2];
-            cnb[3] *= cne[3];
-        };
-
-        for (int i = 0; i < 4; i++) {
-            if (nr[i] != 1) {
-                break;
-            }
-            if (i > 0) {
-                collapse_nb(cnb0, cne0);
-                collapse_nb(cnb1, cne1);
-                collapse(cne0);
-                collapse(cne1);
-            }
-        }
-        {
-            int64_t ne0 = cne0[0];
-            int64_t ne1 = cne0[1];
-            int64_t ne2 = cne0[2];
-            int64_t ne3 = cne0[3];
-
-            int64_t ne10 = cne1[0];
-            int64_t ne11 = cne1[1];
-            int64_t ne12 = cne1[2];
-            int64_t ne13 = cne1[3];
-
-            size_t nb0 = cnb0[0];
-            size_t nb1 = cnb0[1];
-            size_t nb2 = cnb0[2];
-            size_t nb3 = cnb0[3];
-
-            size_t nb10 = cnb1[0];
-            size_t nb11 = cnb1[1];
-            size_t nb12 = cnb1[2];
-            size_t nb13 = cnb1[3];
-
-            size_t s0 = nb0 / sizeof(dst_t);
-            size_t s1 = nb1 / sizeof(dst_t);
-            size_t s2 = nb2 / sizeof(dst_t);
-            size_t s3 = nb3 / sizeof(dst_t);
-
-            size_t s10 = nb10 / sizeof(src1_t);
-            size_t s11 = nb11 / sizeof(src1_t);
-            size_t s12 = nb12 / sizeof(src1_t);
-            size_t s13 = nb13 / sizeof(src1_t);
-
-            GGML_ASSERT(s0 == 1);
-            GGML_ASSERT(s10 == 1);
-
-            const int block_size = 128;
-
-            int64_t hne0 = std::max(ne0/2LL, 1LL);
-
-            dim3 block_dims;
-            block_dims.x = std::min<unsigned int>(hne0, block_size);
-            block_dims.y = std::min<unsigned int>(ne1, block_size / block_dims.x);
-            block_dims.z = std::min(std::min<unsigned int>(ne2*ne3, block_size / block_dims.x / block_dims.y), 64U);
-
-            dim3 block_nums(
-                (hne0 + block_dims.x - 1) / block_dims.x,
-                (ne1 + block_dims.y - 1) / block_dims.y,
-                (ne2*ne3 + block_dims.z - 1) / block_dims.z
-            );
-
-            if (block_nums.z > 65535) {
-                // this is the maximum number of blocks in z direction, fallback to 1D grid kernel
-                int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size;
-                k_bin_bcast_unravel<bin_op><<<block_num, block_size, 0, stream>>>(
-                    src0_dd, src1_dd, dst_dd,
-                    ne0, ne1, ne2, ne3,
-                    ne10, ne11, ne12, ne13,
-                    /* s0, */ s1, s2, s3,
-                    /* s10, */ s11, s12, s13);
-            } else {
-                k_bin_bcast<bin_op><<<block_nums, block_dims, 0, stream>>>(
-                    src0_dd, src1_dd, dst_dd,
-                    ne0, ne1, ne2, ne3,
-                    ne10, ne11, ne12, ne13,
-                    /* s0, */ s1, s2, s3,
-                    /* s10, */ s11, s12, s13);
-            }
-        }
-    }
-};
-
-static void acc_f32_cuda(const float * x, const float * y, float * dst, const int n_elements,
-    const int ne10, const int ne11, const int ne12,
-    const int nb1, const int nb2, const int offset, cudaStream_t stream) {
-    int num_blocks = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE;
-    acc_f32<<<num_blocks, CUDA_ACC_BLOCK_SIZE, 0, stream>>>(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset);
-}
-
-static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
-    gelu_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
-}
-
-static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_SILU_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE;
-    silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
-}
-
-static void gelu_quick_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
-    gelu_quick_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
-}
-
-static void tanh_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_TANH_BLOCK_SIZE - 1) / CUDA_TANH_BLOCK_SIZE;
-    tanh_f32<<<num_blocks, CUDA_TANH_BLOCK_SIZE, 0, stream>>>(x, dst, k);
-}
-
-static void relu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
-    relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
-}
-
-static void leaky_relu_f32_cuda(const float * x, float * dst, const int k, const float negative_slope, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
-    leaky_relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k, negative_slope);
-}
-
-static void sqr_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_SQR_BLOCK_SIZE - 1) / CUDA_SQR_BLOCK_SIZE;
-    sqr_f32<<<num_blocks, CUDA_SQR_BLOCK_SIZE, 0, stream>>>(x, dst, k);
-}
-
-static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
-    GGML_ASSERT(ncols % WARP_SIZE == 0);
-    if (ncols < 1024) {
-        const dim3 block_dims(WARP_SIZE, 1, 1);
-        norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
-    } else {
-        const dim3 block_dims(1024, 1, 1);
-        norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
-    }
-}
-
-static void group_norm_f32_cuda(const float * x, float * dst, const int num_groups, const int group_size, const int ne_elements, cudaStream_t stream) {
-    static const float eps = 1e-6f;
-    if (group_size < 1024) {
-        const dim3 block_dims(WARP_SIZE, 1, 1);
-        group_norm_f32<WARP_SIZE><<<num_groups, block_dims, 0, stream>>>(x, dst, group_size, ne_elements, eps);
-    } else {
-        const dim3 block_dims(1024, 1, 1);
-        group_norm_f32<1024><<<num_groups, block_dims, 0, stream>>>(x, dst, group_size, ne_elements, eps);
-    }
-}
-
-static void concat_f32_cuda(const float * x, const float * y, float * dst, const int ne0, int ne1, int ne2, int ne02, cudaStream_t stream) {
-    int num_blocks = (ne0 + CUDA_CONCAT_BLOCK_SIZE - 1) / CUDA_CONCAT_BLOCK_SIZE;
-    dim3 gridDim(num_blocks, ne1, ne2);
-    concat_f32<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne02);
-}
-
-static void upscale_f32_cuda(const float * x, float * dst, const int ne00, const int ne01, const int ne02, const int ne03,
-                             const int scale_factor, cudaStream_t stream) {
-    int ne0 = (ne00 * scale_factor);
-    int num_blocks = (ne0 + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE;
-    dim3 gridDim(num_blocks, (ne01 * scale_factor), ne02*ne03);
-    upscale_f32<<<gridDim, CUDA_UPSCALE_BLOCK_SIZE, 0, stream>>>(x, dst, ne00, ne00 * ne01, scale_factor);
-}
-
-static void pad_f32_cuda(const float * x, float * dst,
-    const int ne00, const int ne01, const int ne02, const int ne03,
-    const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) {
-    int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
-    dim3 gridDim(num_blocks, ne1, ne2*ne3);
-    pad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02, ne03);
-}
-
-static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
-    GGML_ASSERT(ncols % WARP_SIZE == 0);
-    if (ncols < 1024) {
-        const dim3 block_dims(WARP_SIZE, 1, 1);
-        rms_norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
-    } else {
-        const dim3 block_dims(1024, 1, 1);
-        rms_norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
-    }
-}
-
-static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, const int ky, const int kx_padded, cudaStream_t stream) {
-    const int block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
-    const dim3 num_blocks(block_num_x, ky, 1);
-    const dim3 block_size(CUDA_DEQUANTIZE_BLOCK_SIZE, 1, 1);
-    quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded);
-}
-
-template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
-static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + 2*CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / (2*CUDA_DEQUANTIZE_BLOCK_SIZE);
-    dequantize_block<qk, qr, dequantize_kernel><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
-}
-
-static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half * __restrict__ y, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_Q8_0_NE_ALIGN - 1) / CUDA_Q8_0_NE_ALIGN;
-    if (k % CUDA_Q8_0_NE_ALIGN == 0) {
-        const bool need_check = false;
-        dequantize_block_q8_0_f16<need_check><<<num_blocks, WARP_SIZE, 0, stream>>>(vx, y, k);
-    } else {
-        const bool need_check = true;
-        dequantize_block_q8_0_f16<need_check><<<num_blocks, WARP_SIZE, 0, stream>>>(vx, y, k);
-    }
-}
-
-template<typename dst_t>
-static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
-    const int nb = k / QK_K;
-#if QK_K == 256
-    dequantize_block_q2_K<<<nb, 64, 0, stream>>>(vx, y);
-#else
-    dequantize_block_q2_K<<<nb, 32, 0, stream>>>(vx, y);
-#endif
-}
-
-template<typename dst_t>
-static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
-    const int nb = k / QK_K;
-#if QK_K == 256
-    dequantize_block_q3_K<<<nb, 64, 0, stream>>>(vx, y);
-#else
-    dequantize_block_q3_K<<<nb, 32, 0, stream>>>(vx, y);
-#endif
-}
-
-template<typename dst_t>
-static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
-    const int nb = k / QK_K;
-    dequantize_block_q4_K<<<nb, 32, 0, stream>>>(vx, y);
-}
-
-template<typename dst_t>
-static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
-    const int nb = k / QK_K;
-#if QK_K == 256
-    dequantize_block_q5_K<<<nb, 64, 0, stream>>>(vx, y);
-#else
-    dequantize_block_q5_K<<<nb, 32, 0, stream>>>(vx, y);
-#endif
-}
-
-template<typename dst_t>
-static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
-    const int nb = k / QK_K;
-#if QK_K == 256
-    dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
-#else
-    dequantize_block_q6_K<<<nb, 32, 0, stream>>>(vx, y);
-#endif
-}
-
-template<typename dst_t>
-static void dequantize_row_iq2_xxs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
-    const int nb = k / QK_K;
-    dequantize_block_iq2_xxs<<<nb, 32, 0, stream>>>(vx, y);
-}
-
-template<typename dst_t>
-static void dequantize_row_iq2_xs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
-    const int nb = k / QK_K;
-    dequantize_block_iq2_xs<<<nb, 32, 0, stream>>>(vx, y);
-}
-
-template <typename src_t, typename dst_t>
-static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
-    convert_unary<src_t><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
-}
-
-static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
-    int id;
-    switch (type) {
-        case GGML_TYPE_Q4_0:
-            return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
-        case GGML_TYPE_Q4_1:
-            return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
-        case GGML_TYPE_Q5_0:
-            return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
-        case GGML_TYPE_Q5_1:
-            return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
-        case GGML_TYPE_Q8_0:
-            CUDA_CHECK(cudaGetDevice(&id));
-            if (g_device_caps[id].cc >= CC_PASCAL) {
-                return dequantize_block_q8_0_f16_cuda;
-            }
-            return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
-        case GGML_TYPE_Q2_K:
-            return dequantize_row_q2_K_cuda;
-        case GGML_TYPE_Q3_K:
-            return dequantize_row_q3_K_cuda;
-        case GGML_TYPE_Q4_K:
-            return dequantize_row_q4_K_cuda;
-        case GGML_TYPE_Q5_K:
-            return dequantize_row_q5_K_cuda;
-        case GGML_TYPE_Q6_K:
-            return dequantize_row_q6_K_cuda;
-        case GGML_TYPE_IQ2_XXS:
-            return dequantize_row_iq2_xxs_cuda;
-        case GGML_TYPE_IQ2_XS:
-            return dequantize_row_iq2_xs_cuda;
-        case GGML_TYPE_F32:
-            return convert_unary_cuda<float>;
-        default:
-            return nullptr;
-    }
-}
-
-static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
-    switch (type) {
-        case GGML_TYPE_Q4_0:
-            return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
-        case GGML_TYPE_Q4_1:
-            return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
-        case GGML_TYPE_Q5_0:
-            return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
-        case GGML_TYPE_Q5_1:
-            return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
-        case GGML_TYPE_Q8_0:
-            return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
-        case GGML_TYPE_Q2_K:
-            return dequantize_row_q2_K_cuda;
-        case GGML_TYPE_Q3_K:
-            return dequantize_row_q3_K_cuda;
-        case GGML_TYPE_Q4_K:
-            return dequantize_row_q4_K_cuda;
-        case GGML_TYPE_Q5_K:
-            return dequantize_row_q5_K_cuda;
-        case GGML_TYPE_Q6_K:
-            return dequantize_row_q6_K_cuda;
-        case GGML_TYPE_IQ2_XXS:
-            return dequantize_row_iq2_xxs_cuda;
-        case GGML_TYPE_IQ2_XS:
-            return dequantize_row_iq2_xs_cuda;
-        case GGML_TYPE_F16:
-            return convert_unary_cuda<half>;
-        default:
-            return nullptr;
-    }
-}
-
-static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
-    const dim3 block_nums(block_num_y, 1, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>
-        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-}
-
-static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(block_num_y, 1, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>
-        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-}
-
-static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(block_num_y, 1, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>
-        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-}
-
-static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(block_num_y, 1, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>
-        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-}
-
-static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(block_num_y, 1, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>
-        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-}
-
-static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2
-    const int block_num_y = (nrows + ny - 1) / ny;
-    const dim3 block_nums(block_num_y, 1, 1);
-    const dim3 block_dims(32, ny, 1);
-    dequantize_mul_mat_vec_q2_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-}
-
-static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int ny = 2 / K_QUANTS_PER_ITERATION;
-    const int block_num_y = (nrows + ny - 1) / ny;
-    const dim3 block_nums(block_num_y, 1, 1);
-    const dim3 block_dims(32, ny, 1);
-    dequantize_mul_mat_vec_q3_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-}
-
-static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int ny = 2 / K_QUANTS_PER_ITERATION;
-    const int block_num_y = (nrows + ny - 1) / ny;
-    const dim3 block_nums(block_num_y, 1, 1);
-    const dim3 block_dims(32, ny, 1);
-    dequantize_mul_mat_vec_q4_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-}
-
-static void dequantize_mul_mat_vec_q5_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const dim3 block_dims(32, 1, 1);
-    dequantize_mul_mat_vec_q5_k<<<nrows, block_dims, 0, stream>>>(vx, y, dst, ncols);
-}
-
-static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int ny = 2 / K_QUANTS_PER_ITERATION;
-    const int block_num_y = (nrows + ny - 1) / ny;
-    const dim3 block_nums(block_num_y, 1, 1);
-    const dim3 block_dims(32, ny, 1);
-    dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-}
-
-static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(block_num_y, 1, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    dequantize_mul_mat_vec<1, 1, convert_f16>
-        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-}
-
-static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % QK4_0 == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(block_num_y, 1, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>
-        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
-}
-
-static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % QK4_1 == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(block_num_y, 1, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>
-        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
-}
-
-static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % QK5_0 == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(block_num_y, 1, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>
-        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
-}
-
-static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % QK5_1 == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(block_num_y, 1, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>
-        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
-}
-
-static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % QK8_0 == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(block_num_y, 1, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>
-        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
-}
-
-static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(block_num_y, 1, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>
-        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
-}
-
-static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(block_num_y, 1, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>
-        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
-}
-
-static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(block_num_y, 1, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>
-        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
-}
-
-static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(block_num_y, 1, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>
-        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
-}
-
-static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(block_num_y, 1, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>
-        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
-}
-
-static void mul_mat_vec_iq2_xxs_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(block_num_y, 1, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK_K, QI2_XXS, block_iq2_xxs, 1, vec_dot_iq2_xxs_q8_1>
-        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
-}
-
-static void mul_mat_vec_iq2_xs_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(block_num_y, 1, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK_K, QI2_XS, block_iq2_xs, 1, vec_dot_iq2_xs_q8_1>
-        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
-}
-
-static void ggml_mul_mat_q4_0_q8_1_cuda(
-    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
-    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
-
-    int id;
-    CUDA_CHECK(cudaGetDevice(&id));
-    const int compute_capability = g_device_caps[id].cc;
-
-    int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_RDNA2) {
-        mmq_x  =  MMQ_X_Q4_0_RDNA2;
-        mmq_y  =  MMQ_Y_Q4_0_RDNA2;
-        nwarps = NWARPS_Q4_0_RDNA2;
-    } else if (compute_capability >= CC_OFFSET_AMD) {
-        mmq_x  =  MMQ_X_Q4_0_RDNA1;
-        mmq_y  =  MMQ_Y_Q4_0_RDNA1;
-        nwarps = NWARPS_Q4_0_RDNA1;
-    } else if (compute_capability >= CC_VOLTA) {
-        mmq_x  =  MMQ_X_Q4_0_AMPERE;
-        mmq_y  =  MMQ_Y_Q4_0_AMPERE;
-        nwarps = NWARPS_Q4_0_AMPERE;
-    } else if (compute_capability >= MIN_CC_DP4A) {
-        mmq_x  =  MMQ_X_Q4_0_PASCAL;
-        mmq_y  =  MMQ_Y_Q4_0_PASCAL;
-        nwarps = NWARPS_Q4_0_PASCAL;
-    } else {
-        GGML_ASSERT(false);
-    }
-
-    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-    const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-    if (nrows_x % mmq_y == 0) {
-        const bool need_check = false;
-        mul_mat_q4_0<need_check><<<block_nums, block_dims, 0, stream>>>
-            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    } else {
-        const bool need_check = true;
-        mul_mat_q4_0<need_check><<<block_nums, block_dims, 0, stream>>>
-            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    }
-}
-
-static void ggml_mul_mat_q4_1_q8_1_cuda(
-    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
-    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
-
-    int id;
-    CUDA_CHECK(cudaGetDevice(&id));
-    const int compute_capability = g_device_caps[id].cc;
-
-    int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_RDNA2) {
-        mmq_x  =  MMQ_X_Q4_1_RDNA2;
-        mmq_y  =  MMQ_Y_Q4_1_RDNA2;
-        nwarps = NWARPS_Q4_1_RDNA2;
-    } else if (compute_capability >= CC_OFFSET_AMD) {
-        mmq_x  =  MMQ_X_Q4_1_RDNA1;
-        mmq_y  =  MMQ_Y_Q4_1_RDNA1;
-        nwarps = NWARPS_Q4_1_RDNA1;
-    } else if (compute_capability >= CC_VOLTA) {
-        mmq_x  =  MMQ_X_Q4_1_AMPERE;
-        mmq_y  =  MMQ_Y_Q4_1_AMPERE;
-        nwarps = NWARPS_Q4_1_AMPERE;
-    } else if (compute_capability >= MIN_CC_DP4A) {
-        mmq_x  =  MMQ_X_Q4_1_PASCAL;
-        mmq_y  =  MMQ_Y_Q4_1_PASCAL;
-        nwarps = NWARPS_Q4_1_PASCAL;
-    } else {
-        GGML_ASSERT(false);
-    }
-
-    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-    const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-    if (nrows_x % mmq_y == 0) {
-        const bool need_check = false;
-        mul_mat_q4_1<need_check><<<block_nums, block_dims, 0, stream>>>
-            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    } else {
-        const bool need_check = true;
-        mul_mat_q4_1<need_check><<<block_nums, block_dims, 0, stream>>>
-            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    }
-}
-
-static void ggml_mul_mat_q5_0_q8_1_cuda(
-    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
-    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
-
-    int id;
-    CUDA_CHECK(cudaGetDevice(&id));
-    const int compute_capability = g_device_caps[id].cc;
-
-    int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_RDNA2) {
-        mmq_x  =  MMQ_X_Q5_0_RDNA2;
-        mmq_y  =  MMQ_Y_Q5_0_RDNA2;
-        nwarps = NWARPS_Q5_0_RDNA2;
-    } else if (compute_capability >= CC_OFFSET_AMD) {
-        mmq_x  =  MMQ_X_Q5_0_RDNA1;
-        mmq_y  =  MMQ_Y_Q5_0_RDNA1;
-        nwarps = NWARPS_Q5_0_RDNA1;
-    } else if (compute_capability >= CC_VOLTA) {
-        mmq_x  =  MMQ_X_Q5_0_AMPERE;
-        mmq_y  =  MMQ_Y_Q5_0_AMPERE;
-        nwarps = NWARPS_Q5_0_AMPERE;
-    } else if (compute_capability >= MIN_CC_DP4A) {
-        mmq_x  =  MMQ_X_Q5_0_PASCAL;
-        mmq_y  =  MMQ_Y_Q5_0_PASCAL;
-        nwarps = NWARPS_Q5_0_PASCAL;
-    } else {
-        GGML_ASSERT(false);
-    }
-
-    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-    const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-    if (nrows_x % mmq_y == 0) {
-        const bool need_check = false;
-        mul_mat_q5_0<need_check><<<block_nums, block_dims, 0, stream>>>
-            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    } else {
-        const bool need_check = true;
-        mul_mat_q5_0<need_check><<<block_nums, block_dims, 0, stream>>>
-            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    }
-}
-
-static void ggml_mul_mat_q5_1_q8_1_cuda(
-    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
-    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
-
-    int id;
-    CUDA_CHECK(cudaGetDevice(&id));
-    const int compute_capability = g_device_caps[id].cc;
-
-    int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_RDNA2) {
-        mmq_x  =  MMQ_X_Q5_1_RDNA2;
-        mmq_y  =  MMQ_Y_Q5_1_RDNA2;
-        nwarps = NWARPS_Q5_1_RDNA2;
-    } else if (compute_capability >= CC_OFFSET_AMD) {
-        mmq_x  =  MMQ_X_Q5_1_RDNA1;
-        mmq_y  =  MMQ_Y_Q5_1_RDNA1;
-        nwarps = NWARPS_Q5_1_RDNA1;
-    } else if (compute_capability >= CC_VOLTA) {
-        mmq_x  =  MMQ_X_Q5_1_AMPERE;
-        mmq_y  =  MMQ_Y_Q5_1_AMPERE;
-        nwarps = NWARPS_Q5_1_AMPERE;
-    } else if (compute_capability >= MIN_CC_DP4A) {
-        mmq_x  =  MMQ_X_Q5_1_PASCAL;
-        mmq_y  =  MMQ_Y_Q5_1_PASCAL;
-        nwarps = NWARPS_Q5_1_PASCAL;
-    } else {
-        GGML_ASSERT(false);
-    }
-
-    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-    const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-    if (nrows_x % mmq_y == 0) {
-        const bool need_check = false;
-        mul_mat_q5_1<need_check><<<block_nums, block_dims, 0, stream>>>
-            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    } else {
-        const bool need_check = true;
-        mul_mat_q5_1<need_check><<<block_nums, block_dims, 0, stream>>>
-            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    }
-}
-
-static void ggml_mul_mat_q8_0_q8_1_cuda(
-    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
-    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
-
-    int id;
-    CUDA_CHECK(cudaGetDevice(&id));
-    const int compute_capability = g_device_caps[id].cc;
-
-    int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_RDNA2) {
-        mmq_x  =  MMQ_X_Q8_0_RDNA2;
-        mmq_y  =  MMQ_Y_Q8_0_RDNA2;
-        nwarps = NWARPS_Q8_0_RDNA2;
-    } else if (compute_capability >= CC_OFFSET_AMD) {
-        mmq_x  =  MMQ_X_Q8_0_RDNA1;
-        mmq_y  =  MMQ_Y_Q8_0_RDNA1;
-        nwarps = NWARPS_Q8_0_RDNA1;
-    } else if (compute_capability >= CC_VOLTA) {
-        mmq_x  =  MMQ_X_Q8_0_AMPERE;
-        mmq_y  =  MMQ_Y_Q8_0_AMPERE;
-        nwarps = NWARPS_Q8_0_AMPERE;
-    } else if (compute_capability >= MIN_CC_DP4A) {
-        mmq_x  =  MMQ_X_Q8_0_PASCAL;
-        mmq_y  =  MMQ_Y_Q8_0_PASCAL;
-        nwarps = NWARPS_Q8_0_PASCAL;
-    } else {
-        GGML_ASSERT(false);
-    }
-
-    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-    const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-    if (nrows_x % mmq_y == 0) {
-        const bool need_check = false;
-        mul_mat_q8_0<need_check><<<block_nums, block_dims, 0, stream>>>
-            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    } else {
-        const bool need_check = true;
-        mul_mat_q8_0<need_check><<<block_nums, block_dims, 0, stream>>>
-            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    }
-}
-
-static void ggml_mul_mat_q2_K_q8_1_cuda(
-    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
-    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
-
-    int id;
-    CUDA_CHECK(cudaGetDevice(&id));
-    const int compute_capability = g_device_caps[id].cc;
-
-    int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_RDNA2) {
-        mmq_x  =  MMQ_X_Q2_K_RDNA2;
-        mmq_y  =  MMQ_Y_Q2_K_RDNA2;
-        nwarps = NWARPS_Q2_K_RDNA2;
-    } else if (compute_capability >= CC_OFFSET_AMD) {
-        mmq_x  =  MMQ_X_Q2_K_RDNA1;
-        mmq_y  =  MMQ_Y_Q2_K_RDNA1;
-        nwarps = NWARPS_Q2_K_RDNA1;
-    } else if (compute_capability >= CC_VOLTA) {
-        mmq_x  =  MMQ_X_Q2_K_AMPERE;
-        mmq_y  =  MMQ_Y_Q2_K_AMPERE;
-        nwarps = NWARPS_Q2_K_AMPERE;
-    } else if (compute_capability >= MIN_CC_DP4A) {
-        mmq_x  =  MMQ_X_Q2_K_PASCAL;
-        mmq_y  =  MMQ_Y_Q2_K_PASCAL;
-        nwarps = NWARPS_Q2_K_PASCAL;
-    } else {
-        GGML_ASSERT(false);
-    }
-
-    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-    const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-    if (nrows_x % mmq_y == 0) {
-        const bool need_check = false;
-        mul_mat_q2_K<need_check><<<block_nums, block_dims, 0, stream>>>
-            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    } else {
-        const bool need_check = true;
-        mul_mat_q2_K<need_check><<<block_nums, block_dims, 0, stream>>>
-            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    }
-}
-
-static void ggml_mul_mat_q3_K_q8_1_cuda(
-    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
-    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
-
-#if QK_K == 256
-
-    int id;
-    CUDA_CHECK(cudaGetDevice(&id));
-    const int compute_capability = g_device_caps[id].cc;
-
-    int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_RDNA2) {
-        mmq_x  =  MMQ_X_Q3_K_RDNA2;
-        mmq_y  =  MMQ_Y_Q3_K_RDNA2;
-        nwarps = NWARPS_Q3_K_RDNA2;
-    } else if (compute_capability >= CC_OFFSET_AMD) {
-        mmq_x  =  MMQ_X_Q3_K_RDNA1;
-        mmq_y  =  MMQ_Y_Q3_K_RDNA1;
-        nwarps = NWARPS_Q3_K_RDNA1;
-    } else if (compute_capability >= CC_VOLTA) {
-        mmq_x  =  MMQ_X_Q3_K_AMPERE;
-        mmq_y  =  MMQ_Y_Q3_K_AMPERE;
-        nwarps = NWARPS_Q3_K_AMPERE;
-    } else if (compute_capability >= MIN_CC_DP4A) {
-        mmq_x  =  MMQ_X_Q3_K_PASCAL;
-        mmq_y  =  MMQ_Y_Q3_K_PASCAL;
-        nwarps = NWARPS_Q3_K_PASCAL;
-    } else {
-        GGML_ASSERT(false);
-    }
-
-    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-    const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-    if (nrows_x % mmq_y == 0) {
-        const bool need_check = false;
-        mul_mat_q3_K<need_check><<<block_nums, block_dims, 0, stream>>>
-            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    } else {
-        const bool need_check = true;
-        mul_mat_q3_K<need_check><<<block_nums, block_dims, 0, stream>>>
-            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    }
-#endif
-}
-
-static void ggml_mul_mat_q4_K_q8_1_cuda(
-    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
-    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
-
-    int id;
-    CUDA_CHECK(cudaGetDevice(&id));
-    const int compute_capability = g_device_caps[id].cc;
-
-    int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_RDNA2) {
-        mmq_x  =  MMQ_X_Q4_K_RDNA2;
-        mmq_y  =  MMQ_Y_Q4_K_RDNA2;
-        nwarps = NWARPS_Q4_K_RDNA2;
-    } else if (compute_capability >= CC_OFFSET_AMD) {
-        mmq_x  =  MMQ_X_Q4_K_RDNA1;
-        mmq_y  =  MMQ_Y_Q4_K_RDNA1;
-        nwarps = NWARPS_Q4_K_RDNA1;
-    } else if (compute_capability >= CC_VOLTA) {
-        mmq_x  =  MMQ_X_Q4_K_AMPERE;
-        mmq_y  =  MMQ_Y_Q4_K_AMPERE;
-        nwarps = NWARPS_Q4_K_AMPERE;
-    } else if (compute_capability >= MIN_CC_DP4A) {
-        mmq_x  =  MMQ_X_Q4_K_PASCAL;
-        mmq_y  =  MMQ_Y_Q4_K_PASCAL;
-        nwarps = NWARPS_Q4_K_PASCAL;
-    } else {
-        GGML_ASSERT(false);
-    }
-
-    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-    const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-    if (nrows_x % mmq_y == 0) {
-        const bool need_check = false;
-        mul_mat_q4_K<need_check><<<block_nums, block_dims, 0, stream>>>
-            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    } else {
-        const bool need_check = true;
-        mul_mat_q4_K<need_check><<<block_nums, block_dims, 0, stream>>>
-            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    }
-}
-
-static void ggml_mul_mat_q5_K_q8_1_cuda(
-    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
-    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
-
-    int id;
-    CUDA_CHECK(cudaGetDevice(&id));
-    const int compute_capability = g_device_caps[id].cc;
-
-    int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_RDNA2) {
-        mmq_x  =  MMQ_X_Q5_K_RDNA2;
-        mmq_y  =  MMQ_Y_Q5_K_RDNA2;
-        nwarps = NWARPS_Q5_K_RDNA2;
-    } else if (compute_capability >= CC_OFFSET_AMD) {
-        mmq_x  =  MMQ_X_Q5_K_RDNA1;
-        mmq_y  =  MMQ_Y_Q5_K_RDNA1;
-        nwarps = NWARPS_Q5_K_RDNA1;
-    } else if (compute_capability >= CC_VOLTA) {
-        mmq_x  =  MMQ_X_Q5_K_AMPERE;
-        mmq_y  =  MMQ_Y_Q5_K_AMPERE;
-        nwarps = NWARPS_Q5_K_AMPERE;
-    } else if (compute_capability >= MIN_CC_DP4A) {
-        mmq_x  =  MMQ_X_Q5_K_PASCAL;
-        mmq_y  =  MMQ_Y_Q5_K_PASCAL;
-        nwarps = NWARPS_Q5_K_PASCAL;
-    } else {
-        GGML_ASSERT(false);
-    }
-
-    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-    const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-    if (nrows_x % mmq_y == 0) {
-        const bool need_check = false;
-        mul_mat_q5_K<need_check><<<block_nums, block_dims, 0, stream>>>
-            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    } else {
-        const bool need_check = true;
-        mul_mat_q5_K<need_check><<<block_nums, block_dims, 0, stream>>>
-            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    }
-}
-
-static void ggml_mul_mat_q6_K_q8_1_cuda(
-    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
-    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
-
-    int id;
-    CUDA_CHECK(cudaGetDevice(&id));
-    const int compute_capability = g_device_caps[id].cc;
-
-    int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_RDNA2) {
-        mmq_x  =  MMQ_X_Q6_K_RDNA2;
-        mmq_y  =  MMQ_Y_Q6_K_RDNA2;
-        nwarps = NWARPS_Q6_K_RDNA2;
-    } else if (compute_capability >= CC_OFFSET_AMD) {
-        mmq_x  =  MMQ_X_Q6_K_RDNA1;
-        mmq_y  =  MMQ_Y_Q6_K_RDNA1;
-        nwarps = NWARPS_Q6_K_RDNA1;
-    } else if (compute_capability >= CC_VOLTA) {
-        mmq_x  =  MMQ_X_Q6_K_AMPERE;
-        mmq_y  =  MMQ_Y_Q6_K_AMPERE;
-        nwarps = NWARPS_Q6_K_AMPERE;
-    } else if (compute_capability >= MIN_CC_DP4A) {
-        mmq_x  =  MMQ_X_Q6_K_PASCAL;
-        mmq_y  =  MMQ_Y_Q6_K_PASCAL;
-        nwarps = NWARPS_Q6_K_PASCAL;
-    } else {
-        GGML_ASSERT(false);
-    }
-
-    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-    const dim3 block_nums(block_num_x, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, nwarps, 1);
-
-    if (nrows_x % mmq_y == 0) {
-        const bool need_check = false;
-        mul_mat_q6_K<need_check><<<block_nums, block_dims, 0, stream>>>
-            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    } else {
-        const bool need_check = true;
-        mul_mat_q6_K<need_check><<<block_nums, block_dims, 0, stream>>>
-            (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-    }
-}
-
-static void ggml_mul_mat_p021_f16_f32_cuda(
-    const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
-    const int nchannels_x, const int nchannels_y, cudaStream_t stream) {
-
-    const dim3 block_nums(1, nrows_x, nchannels_y);
-    const dim3 block_dims(WARP_SIZE, 1, 1);
-    mul_mat_p021_f16_f32<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols_x, nrows_x, nchannels_x, nchannels_y);
-}
-
-static void ggml_mul_mat_vec_nc_f16_f32_cuda(
-    const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int row_stride_x,
-    const int nchannels_x, const int nchannels_y, const int channel_stride_x, cudaStream_t stream) {
-
-    const dim3 block_nums(1, nrows_x, nchannels_y);
-    const dim3 block_dims(WARP_SIZE, 1, 1);
-    mul_mat_vec_nc_f16_f32<<<block_nums, block_dims, 0, stream>>>
-        (vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x, nchannels_y/nchannels_x);
-}
-
-static void ggml_cpy_f32_f32_cuda(
-    const char * cx, char * cdst, const int ne,
-    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
-    const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
-
-    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
-    cpy_f32_f16<cpy_1_f32_f32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
-}
-
-static void ggml_cpy_f32_f16_cuda(
-    const char * cx, char * cdst, const int ne,
-    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
-    const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
-
-    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
-    cpy_f32_f16<cpy_1_f32_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
-}
-
-static void ggml_cpy_f32_q8_0_cuda(
-    const char * cx, char * cdst, const int ne,
-    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
-    const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
-
-    GGML_ASSERT(ne % QK8_0 == 0);
-    const int num_blocks = ne / QK8_0;
-    cpy_f32_q<cpy_blck_f32_q8_0, QK8_0><<<num_blocks, 1, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
-}
-
-static void ggml_cpy_f32_q4_0_cuda(
-    const char * cx, char * cdst, const int ne,
-    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
-    const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
-
-    GGML_ASSERT(ne % QK4_0 == 0);
-    const int num_blocks = ne / QK4_0;
-    cpy_f32_q<cpy_blck_f32_q4_0, QK4_0><<<num_blocks, 1, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
-}
-
-static void ggml_cpy_f32_q4_1_cuda(
-    const char * cx, char * cdst, const int ne,
-    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
-    const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
-
-    GGML_ASSERT(ne % QK4_1 == 0);
-    const int num_blocks = ne / QK4_1;
-    cpy_f32_q<cpy_blck_f32_q4_1, QK4_1><<<num_blocks, 1, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
-}
-
-static void ggml_cpy_f16_f16_cuda(
-    const char * cx, char * cdst, const int ne,
-    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
-    const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
-
-    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
-    cpy_f32_f16<cpy_1_f16_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
-}
-
-static void scale_f32_cuda(const float * x, float * dst, const float scale, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
-    scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
-}
-
-static void clamp_f32_cuda(const float * x, float * dst, const float min, const float max, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_CLAMP_BLOCK_SIZE - 1) / CUDA_CLAMP_BLOCK_SIZE;
-    clamp_f32<<<num_blocks, CUDA_CLAMP_BLOCK_SIZE, 0, stream>>>(x, dst, min, max, k);
-}
-
-template<typename T>
-static void rope_cuda(
-    const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
-    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
-) {
-    GGML_ASSERT(ncols % 2 == 0);
-    const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
-    const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
-    const dim3 block_nums(nrows, num_blocks_x, 1);
-    if (pos == nullptr) {
-        rope<T, false><<<block_nums, block_dims, 0, stream>>>(
-            x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
-        );
-    } else {
-        rope<T, true><<<block_nums, block_dims, 0, stream>>>(
-            x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
-        );
-    }
-}
-
-template<typename T>
-static void rope_neox_cuda(
-    const T * x, T * dst, int ncols, int n_dims, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
-    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
-) {
-    GGML_ASSERT(ncols % 2 == 0);
-    const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
-    const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
-    const dim3 block_nums(nrows, num_blocks_x, 1);
-
-    const float theta_scale = powf(freq_base, -2.0f/n_dims);
-    const float inv_ndims = -1.0f / n_dims;
-
-    if (pos == nullptr) {
-        rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(
-            x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
-            theta_scale, inv_ndims
-        );
-    } else {
-        rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(
-            x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
-            theta_scale, inv_ndims
-        );
-    }
-}
-
-static void rope_glm_f32_cuda(
-    const float * x, float * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
-    float freq_base, int n_ctx, cudaStream_t stream
-) {
-    GGML_ASSERT(ncols % 4 == 0);
-    const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE/4, 1, 1);
-    const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE;
-    const dim3 block_nums(num_blocks_x, nrows, 1);
-    rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, n_ctx);
-}
-
-static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
-                           const int k_rows, const int n_heads_log2_floor, const float m0,
-                           const float m1, cudaStream_t stream) {
-    const dim3 block_dims(CUDA_ALIBI_BLOCK_SIZE, 1, 1);
-    const int num_blocks_x = (ncols + CUDA_ALIBI_BLOCK_SIZE - 1) / (CUDA_ALIBI_BLOCK_SIZE);
-    const dim3 block_nums(num_blocks_x, nrows, 1);
-    alibi_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, k_rows, n_heads_log2_floor, m0, m1);
-}
-
-static void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    const dim3 block_dims(WARP_SIZE, 1, 1);
-    const dim3 block_nums(1, nrows, 1);
-    k_sum_rows_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
-}
-
-static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) {
-    // bitonic sort requires ncols to be power of 2
-    GGML_ASSERT((ncols & (ncols - 1)) == 0);
-
-    const dim3 block_dims(ncols, 1, 1);
-    const dim3 block_nums(1, nrows, 1);
-    if (order == GGML_SORT_ASC) {
-        k_argsort_f32_i32<GGML_SORT_ASC><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
-    } else if (order == GGML_SORT_DESC) {
-        k_argsort_f32_i32<GGML_SORT_DESC><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
-    } else {
-        GGML_ASSERT(false);
-    }
-}
-
-static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
-    const dim3 block_dims(1, CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1);
-    const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
-    const dim3 block_nums(nrows_x, block_num_x, 1);
-    diag_mask_inf_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x, rows_per_channel, n_past);
-}
-
-static void soft_max_f16_cuda(const float * x, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, cudaStream_t stream) {
-    int nth = WARP_SIZE;
-    while (nth < ncols_x/2 && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2;
-    const dim3 block_dims(nth,     1, 1);
-    const dim3 block_nums(nrows_x, 1, 1);
-    const size_t shmem = (GGML_PAD(ncols_x, 2*WARP_SIZE) + WARP_SIZE)*sizeof(half);
-    static_assert(CUDA_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted.");
-    if (shmem <= g_device_caps[g_main_device].smpb) {
-        switch (ncols_x) {
-            case 32:
-                soft_max_f16<true, 32, 32, true><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
-                break;
-            case 64:
-                soft_max_f16<true, 64, 32, false><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
-                break;
-            case 128:
-                soft_max_f16<true, 128, 64, false><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
-                break;
-            case 256:
-                soft_max_f16<true, 256, 128, false><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
-                break;
-            case 512:
-                soft_max_f16<true, 512, 256, false><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
-                break;
-            case 1024:
-                soft_max_f16<true, 1024, 512, false><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
-                break;
-            case 2048:
-                soft_max_f16<true, 2048, 1024, false><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
-                break;
-            case 4096:
-                soft_max_f16<true, 4096, 1024, false><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
-                break;
-            default:
-                soft_max_f16<true, 0, 0, true><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
-                break;
-        }
-    } else {
-        const size_t shmem_low = WARP_SIZE*sizeof(half);
-        soft_max_f16<false, 0, 0, true><<<block_nums, block_dims, shmem_low, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
-    }
-}
-
-static void soft_max_f32_cuda(const float * x, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, cudaStream_t stream) {
-    int nth = WARP_SIZE;
-    while (nth < ncols_x && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2;
-    const dim3 block_dims(nth,     1, 1);
-    const dim3 block_nums(nrows_x, 1, 1);
-    const size_t shmem = (GGML_PAD(ncols_x, WARP_SIZE) + WARP_SIZE)*sizeof(float);
-    static_assert(CUDA_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted.");
-    if (shmem < g_device_caps[g_main_device].smpb) {
-        switch (ncols_x) {
-            case 32:
-                soft_max_f32<true, 32, 32><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
-                break;
-            case 64:
-                soft_max_f32<true, 64, 64><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
-                break;
-            case 128:
-                soft_max_f32<true, 128, 128><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
-                break;
-            case 256:
-                soft_max_f32<true, 256, 256><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
-                break;
-            case 512:
-                soft_max_f32<true, 512, 512><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
-                break;
-            case 1024:
-                soft_max_f32<true, 1024, 1024><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
-                break;
-            case 2048:
-                soft_max_f32<true, 2048, 1024><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
-                break;
-            case 4096:
-                soft_max_f32<true, 4096, 1024><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
-                break;
-            default:
-                soft_max_f32<true, 0, 0><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
-                break;
-        }
-    } else {
-        const size_t shmem_low = WARP_SIZE*sizeof(float);
-        soft_max_f32<false, 0, 0><<<block_nums, block_dims, shmem_low, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
-    }
-}
-
-static void im2col_f32_f16_cuda(const float* x, half* dst, int64_t N,
-    int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC,
-    int offset_delta,
-    int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
-    const int64_t parallel_elements = N * OW * KW * KH;
-    const int64_t num_blocks = (parallel_elements + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE;
-    dim3 block_nums(num_blocks, OH, IC);
-    im2col_f32_f16<<<block_nums, CUDA_IM2COL_BLOCK_SIZE, 0, stream>>>(x, dst, offset_delta, N, IW, IH, OW, OH, KW, KH, parallel_elements, (IC * KH * KW), s0, s1, p0, p1, d0, d1);
-}
-
-// buffer pool for cuda
-#define MAX_CUDA_BUFFERS 256
-
-struct scoped_spin_lock {
-    std::atomic_flag& lock;
-    scoped_spin_lock(std::atomic_flag& lock) : lock(lock) {
-        while (lock.test_and_set(std::memory_order_acquire)) {
-            ; // spin
-        }
-    }
-    ~scoped_spin_lock() {
-        lock.clear(std::memory_order_release);
-    }
-    scoped_spin_lock(const scoped_spin_lock&) = delete;
-    scoped_spin_lock& operator=(const scoped_spin_lock&) = delete;
-};
-
-static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;
-
-// #define DEBUG_CUDA_MALLOC
-struct ggml_cuda_buffer {
-    void * ptr = nullptr;
-    size_t size = 0;
-};
-
-static ggml_cuda_buffer g_cuda_buffer_pool[GGML_CUDA_MAX_DEVICES][MAX_CUDA_BUFFERS];
-static size_t g_cuda_pool_size[GGML_CUDA_MAX_DEVICES] = {0};
-
-static void * ggml_cuda_pool_malloc_leg(int device, size_t size, size_t * actual_size) {
-    scoped_spin_lock lock(g_cuda_pool_lock);
-#ifdef DEBUG_CUDA_MALLOC
-    int nnz = 0;
-    size_t max_size = 0;
-#endif
-    size_t best_diff = 1ull << 36;
-    int ibest = -1;
-    for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
-        ggml_cuda_buffer& b = g_cuda_buffer_pool[device][i];
-        if (b.ptr != nullptr) {
-#ifdef DEBUG_CUDA_MALLOC
-            ++nnz;
-            if (b.size > max_size) max_size = b.size;
-#endif
-            if (b.size >= size) {
-                size_t diff = b.size - size;
-                if (diff < best_diff) {
-                    best_diff = diff;
-                    ibest = i;
-                    if (!best_diff) {
-                        void * ptr = b.ptr;
-                        *actual_size = b.size;
-                        b.ptr = nullptr;
-                        b.size = 0;
-                        return ptr;
-                    }
-                }
-            }
-        }
-    }
-    if (ibest >= 0) {
-        ggml_cuda_buffer& b = g_cuda_buffer_pool[device][ibest];
-        void * ptr = b.ptr;
-        *actual_size = b.size;
-        b.ptr = nullptr;
-        b.size = 0;
-        return ptr;
-    }
-    void * ptr;
-    size_t look_ahead_size = (size_t) (1.05 * size);
-    look_ahead_size = 256 * ((look_ahead_size + 255)/256);
-    ggml_cuda_set_device(device);
-    CUDA_CHECK(cudaMalloc((void **) &ptr, look_ahead_size));
-    *actual_size = look_ahead_size;
-    g_cuda_pool_size[device] += look_ahead_size;
-#ifdef DEBUG_CUDA_MALLOC
-    fprintf(stderr, "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, id, nnz,
-            (uint32_t)(max_size/1024/1024), (uint32_t)(g_cuda_pool_size[id]/1024/1024), (uint32_t)(size/1024/1024));
-#endif
-    return ptr;
-}
-
-static void ggml_cuda_pool_free_leg(int device, void * ptr, size_t size) {
-    scoped_spin_lock lock(g_cuda_pool_lock);
-
-    for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
-        ggml_cuda_buffer& b = g_cuda_buffer_pool[device][i];
-        if (b.ptr == nullptr) {
-            b.ptr = ptr;
-            b.size = size;
-            return;
-        }
-    }
-    fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
-    ggml_cuda_set_device(device);
-    CUDA_CHECK(cudaFree(ptr));
-    g_cuda_pool_size[device] -= size;
-}
-
-#if !defined(GGML_USE_HIPBLAS)
-// pool with virtual memory
-static CUdeviceptr g_cuda_pool_addr[GGML_CUDA_MAX_DEVICES] = {0};
-static size_t g_cuda_pool_used[GGML_CUDA_MAX_DEVICES] = {0};
-static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
-
-static void * ggml_cuda_pool_malloc_vmm(int device, size_t size, size_t * actual_size) {
-    scoped_spin_lock lock(g_cuda_pool_lock);
-
-    // round up the allocation size to the alignment to ensure that all allocations are aligned for all data types
-    const size_t alignment = 128;
-    size = alignment * ((size + alignment - 1) / alignment);
-
-    size_t avail = g_cuda_pool_size[device] - g_cuda_pool_used[device];
-
-    if (size > avail) {
-        // round up to the next multiple of the granularity
-        size_t reserve_size = size - avail;
-        const size_t granularity = g_device_caps[device].vmm_granularity;
-        reserve_size = granularity * ((reserve_size + granularity - 1) / granularity);
-
-        GGML_ASSERT(g_cuda_pool_size[device] + reserve_size <= CUDA_POOL_VMM_MAX_SIZE);
-
-        // allocate more physical memory
-        CUmemAllocationProp prop = {};
-        prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
-        prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
-        prop.location.id = device;
-        CUmemGenericAllocationHandle handle;
-        CU_CHECK(cuMemCreate(&handle, reserve_size, &prop, 0));
-
-        // reserve virtual address space (if not already reserved)
-        if (g_cuda_pool_addr[device] == 0) {
-            CU_CHECK(cuMemAddressReserve(&g_cuda_pool_addr[device], CUDA_POOL_VMM_MAX_SIZE, 0, 0, 0));
-        }
-
-        // map at the end of the pool
-        CU_CHECK(cuMemMap(g_cuda_pool_addr[device] + g_cuda_pool_size[device], reserve_size, 0, handle, 0));
-
-        // the memory allocation handle is no longer needed after mapping
-        CU_CHECK(cuMemRelease(handle));
-
-        // set access
-        CUmemAccessDesc access = {};
-        access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
-        access.location.id = device;
-        access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
-        CU_CHECK(cuMemSetAccess(g_cuda_pool_addr[device] + g_cuda_pool_size[device], reserve_size, &access, 1));
-
-        // add to the pool
-        g_cuda_pool_size[device] += reserve_size;
-
-        //printf("cuda pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
-        //       id, (unsigned long long) (g_cuda_pool_size[id]/1024/1024),
-        //       (unsigned long long) (reserve_size/1024/1024));
-    }
-
-    GGML_ASSERT(g_cuda_pool_addr[device] != 0);
-
-    void * ptr = (void *) (g_cuda_pool_addr[device] + g_cuda_pool_used[device]);
-    *actual_size = size;
-    g_cuda_pool_used[device] += size;
-
-#ifdef DEBUG_CUDA_MALLOC
-    printf("cuda pool[%d]: allocated %llu bytes at %llx [%s]\n", id, (unsigned long long) size, ptr);
-#endif
-
-    return ptr;
-}
-
-static void ggml_cuda_pool_free_vmm(int device, void * ptr, size_t size) {
-    scoped_spin_lock lock(g_cuda_pool_lock);
-
-#ifdef DEBUG_CUDA_MALLOC
-    printf("cuda pool[%d]: freed %llu bytes at %llx\n", id, (unsigned long long) size, ptr);
-#endif
-
-    g_cuda_pool_used[device] -= size;
-
-    // all deallocations must be in reverse order of the allocations
-    GGML_ASSERT(ptr == (void *) (g_cuda_pool_addr[device] + g_cuda_pool_used[device]));
-}
-
-static void * ggml_cuda_pool_malloc(int device, size_t size, size_t * actual_size) {
-    if (g_device_caps[device].vmm) {
-        return ggml_cuda_pool_malloc_vmm(device, size, actual_size);
-    } else {
-        return ggml_cuda_pool_malloc_leg(device, size, actual_size);
-    }
-}
-
-static void ggml_cuda_pool_free(int device, void * ptr, size_t size) {
-    if (g_device_caps[device].vmm) {
-        ggml_cuda_pool_free_vmm(device, ptr, size);
-    } else {
-        ggml_cuda_pool_free_leg(device, ptr, size);
-    }
-}
-#else
-#define ggml_cuda_pool_malloc ggml_cuda_pool_malloc_leg
-#define ggml_cuda_pool_free ggml_cuda_pool_free_leg
-#endif // !defined(GGML_USE_HIPBLAS)
-
-template<typename T>
-struct cuda_pool_alloc {
-    int device = -1;
-    T * ptr = nullptr;
-    size_t actual_size = 0;
-
-    // size is in number of elements
-    T * alloc(size_t size) {
-        GGML_ASSERT(ptr == nullptr);
-        CUDA_CHECK(cudaGetDevice(&device));
-        ptr = (T *) ggml_cuda_pool_malloc(device, size * sizeof(T), &this->actual_size);
-        return ptr;
-    }
-
-    cuda_pool_alloc(size_t size) {
-        alloc(size);
-    }
-
-    ~cuda_pool_alloc() {
-        if (ptr != nullptr) {
-            ggml_cuda_pool_free(device, ptr, actual_size);
-        }
-    }
-
-    T * get() {
-        return ptr;
-    }
-
-    cuda_pool_alloc() = default;
-    cuda_pool_alloc(const cuda_pool_alloc &) = delete;
-    cuda_pool_alloc(cuda_pool_alloc &&) = delete;
-    cuda_pool_alloc& operator=(const cuda_pool_alloc &) = delete;
-    cuda_pool_alloc& operator=(cuda_pool_alloc &&) = delete;
-};
-
-static bool g_cublas_loaded = false;
-
-bool ggml_cublas_loaded(void) {
-    return g_cublas_loaded;
-}
-
-void ggml_init_cublas() {
-    static bool initialized = false;
-
-    if (!initialized) {
-
-#ifdef __HIP_PLATFORM_AMD__
-        // Workaround for a rocBLAS bug when using multiple graphics cards:
-        // https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346
-        rocblas_initialize();
-        CUDA_CHECK(cudaDeviceSynchronize());
-#endif
-
-        if (cudaGetDeviceCount(&g_device_count) != cudaSuccess) {
-            initialized = true;
-            g_cublas_loaded = false;
-            return;
-        }
-
-        GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
-        int64_t total_vram = 0;
-#if defined(GGML_CUDA_FORCE_MMQ)
-        fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ:   yes\n", __func__);
-#else
-        fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ:   no\n", __func__);
-#endif
-#if defined(CUDA_USE_TENSOR_CORES)
-        fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
-#else
-        fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
-#endif
-        fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
-        for (int id = 0; id < g_device_count; ++id) {
-            int device_vmm = 0;
-
-#if !defined(GGML_USE_HIPBLAS)
-            CUdevice device;
-            CU_CHECK(cuDeviceGet(&device, id));
-            CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
-
-            if (device_vmm) {
-                CUmemAllocationProp alloc_prop = {};
-                alloc_prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
-                alloc_prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
-                alloc_prop.location.id = id;
-                CU_CHECK(cuMemGetAllocationGranularity(&g_device_caps[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
-            }
-#endif // !defined(GGML_USE_HIPBLAS)
-            g_device_caps[id].vmm = !!device_vmm;
-
-            cudaDeviceProp prop;
-            CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
-            fprintf(stderr, "  Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
-
-            g_default_tensor_split[id] = total_vram;
-            total_vram += prop.totalGlobalMem;
-
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-            g_device_caps[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
-#else
-            g_device_caps[id].cc = 100*prop.major + 10*prop.minor;
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-            g_device_caps[id].smpb = prop.sharedMemPerBlock;
-        }
-        for (int id = 0; id < g_device_count; ++id) {
-            g_default_tensor_split[id] /= total_vram;
-        }
-
-        for (int id = 0; id < g_device_count; ++id) {
-            ggml_cuda_set_device(id);
-
-            // create cuda streams
-            for (int is = 0; is < MAX_STREAMS; ++is) {
-                CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams[id][is], cudaStreamNonBlocking));
-            }
-
-            // create cublas handle
-            CUBLAS_CHECK(cublasCreate(&g_cublas_handles[id]));
-            CUBLAS_CHECK(cublasSetMathMode(g_cublas_handles[id], CUBLAS_TF32_TENSOR_OP_MATH));
-        }
-
-        // configure logging to stdout
-        // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
-
-        initialized = true;
-        g_cublas_loaded = true;
-    }
-}
-
-void * ggml_cuda_host_malloc(size_t size) {
-    if (getenv("GGML_CUDA_NO_PINNED") != nullptr) {
-        return nullptr;
-    }
-
-    void * ptr = nullptr;
-    cudaError_t err = cudaMallocHost((void **) &ptr, size);
-    if (err != cudaSuccess) {
-        // clear the error
-        cudaGetLastError();
-        fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
-            size/1024.0/1024.0, cudaGetErrorString(err));
-        return nullptr;
-    }
-
-    return ptr;
-}
-
-void ggml_cuda_host_free(void * ptr) {
-    CUDA_CHECK(cudaFreeHost(ptr));
-}
-
-static cudaError_t ggml_cuda_cpy_tensor_2d(
-    void * dst, const struct ggml_tensor * src, int64_t i3, int64_t i2, int64_t i1_low, int64_t i1_high, cudaStream_t stream) {
-
-    cudaMemcpyKind kind;
-    char * src_ptr;
-    if (src->backend == GGML_BACKEND_CPU) {
-        kind = cudaMemcpyHostToDevice;
-        src_ptr = (char *) src->data;
-    } else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) {
-        GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
-        kind = cudaMemcpyDeviceToDevice;
-        ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
-        int id;
-        CUDA_CHECK(cudaGetDevice(&id));
-        src_ptr = (char *) extra->data_device[id];
-    } else {
-        GGML_ASSERT(false);
-    }
-    char * dst_ptr = (char *) dst;
-
-    const int64_t ne0 = src->ne[0];
-    const int64_t nb0 = src->nb[0];
-    const int64_t nb1 = src->nb[1];
-    const int64_t nb2 = src->nb[2];
-    const int64_t nb3 = src->nb[3];
-    const enum ggml_type type = src->type;
-    const int64_t ts = ggml_type_size(type);
-    const int64_t bs = ggml_blck_size(type);
-    int64_t i1_diff = i1_high - i1_low;
-
-    const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
-    if (nb0 == ts && nb1 == ts*ne0/bs) {
-        return cudaMemcpyAsync(dst_ptr, x, i1_diff*nb1, kind, stream);
-    } else if (nb0 == ts) {
-        return cudaMemcpy2DAsync(dst_ptr, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, kind, stream);
-    } else {
-        for (int64_t i1 = 0; i1 < i1_diff; i1++) {
-            const void * rx = (const void *) ((const char *) x + i1*nb1);
-            void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
-            // pretend the row is a matrix with cols=1
-            cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream);
-            if (r != cudaSuccess) return r;
-        }
-        return cudaSuccess;
-    }
-}
-
-static void ggml_cuda_op_get_rows(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_d, const float * src1_d, float * dst_d, cudaStream_t stream) {
-
-    GGML_ASSERT(src1->type == GGML_TYPE_I32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-
-    GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
-    GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type));
-    GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type));
-
-    const int32_t * src1_i32 = (const int32_t *) src1_d;
-
-    switch (src0->type) {
-        case GGML_TYPE_F16:
-            get_rows_cuda_float(src0, src1, dst, (const half *)src0_d, src1_i32, dst_d, stream);
-            break;
-        case GGML_TYPE_F32:
-            get_rows_cuda_float(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
-            break;
-        case GGML_TYPE_Q4_0:
-            get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
-            break;
-        case GGML_TYPE_Q4_1:
-            get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
-            break;
-        case GGML_TYPE_Q5_0:
-            get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
-            break;
-        case GGML_TYPE_Q5_1:
-            get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
-            break;
-        case GGML_TYPE_Q8_0:
-            get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
-            break;
-        default:
-            // TODO: k-quants
-            fprintf(stderr, "%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
-            GGML_ASSERT(false);
-            break;
-    }
-}
-
-template<class op>
-static void ggml_cuda_op_bin_bcast(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
-    if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-        op()(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
-    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
-        op()(src0, src1, dst, (const half *) src0_dd, src1_dd, (half *) dst_dd, main_stream);
-    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
-        op()(src0, src1, dst, (const half *) src0_dd, src1_dd, dst_dd, main_stream);
-    } else {
-        fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
-            ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
-        GGML_ASSERT(false);
-    }
-}
-
-static void ggml_cuda_op_repeat(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_d, const float * src1_d, float * dst_d, cudaStream_t main_stream) {
-
-    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_repeat>>(dst, src0, dst, nullptr, src0_d, dst_d, main_stream);
-
-    (void) src1;
-    (void) src1_d;
-}
-
-static void ggml_cuda_op_add(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-
-    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_add>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
-}
-
-static void ggml_cuda_op_acc(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported
-
-    int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
-    int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
-    // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused
-    int offset = dst->op_params[3] / 4; // offset in bytes
-
-    acc_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, main_stream);
-
-    (void) dst;
-}
-
-static void ggml_cuda_op_mul(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-
-    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_mul>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
-}
-
-static void ggml_cuda_op_div(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-
-    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_div>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
-}
-
-static void ggml_cuda_op_gelu(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    gelu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-static void ggml_cuda_op_silu(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    silu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-static void ggml_cuda_op_gelu_quick(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    gelu_quick_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-static void ggml_cuda_op_tanh(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    tanh_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-static void ggml_cuda_op_relu(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    relu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-static void ggml_cuda_op_leaky_relu(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    float negative_slope;
-    memcpy(&negative_slope, dst->op_params, sizeof(float));
-
-    leaky_relu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), negative_slope, main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-static void ggml_cuda_op_sqr(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    sqr_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-static void ggml_cuda_op_norm(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t nrows = ggml_nrows(src0);
-
-    float eps;
-    memcpy(&eps, dst->op_params, sizeof(float));
-
-    norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-static void ggml_cuda_op_group_norm(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    int num_groups = dst->op_params[0];
-    int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
-    group_norm_f32_cuda(src0_dd, dst_dd, num_groups * src0->ne[3], group_size, ggml_nelements(src0), main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-static void ggml_cuda_op_concat(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-
-    for (int i3 = 0; i3 < dst->ne[3]; i3++) {
-        concat_f32_cuda(src0_dd + i3 * (src0->nb[3] / 4), src1_dd + i3 * (src1->nb[3] / 4), dst_dd + i3 * (dst->nb[3] / 4), dst->ne[0], dst->ne[1], dst->ne[2], src0->ne[2], main_stream);
-    }
-
-    (void) src1;
-    (void) dst;
-}
-
-static void ggml_cuda_op_upscale(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-
-    const int scale_factor = dst->op_params[0];
-
-    upscale_f32_cuda(src0_dd, dst_dd, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], scale_factor, main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-static void ggml_cuda_op_pad(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-
-    pad_f32_cuda(src0_dd, dst_dd,
-        src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
-        dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-static void ggml_cuda_op_rms_norm(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t nrows = ggml_nrows(src0);
-
-    float eps;
-    memcpy(&eps, dst->op_params, sizeof(float));
-
-    rms_norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-static void ggml_cuda_op_mul_mat_q(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
-    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
-    const int64_t src1_padded_row_size, cudaStream_t stream) {
-
-    const int64_t ne00 = src0->ne[0];
-
-    const int64_t ne10 = src1->ne[0];
-    GGML_ASSERT(ne10 % QK8_1 == 0);
-
-    const int64_t ne0 = dst->ne[0];
-
-    const int64_t row_diff = row_high - row_low;
-
-    int id;
-    CUDA_CHECK(cudaGetDevice(&id));
-
-    // the main device has a larger memory buffer to hold the results from all GPUs
-    // nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into
-    const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
-
-    switch (src0->type) {
-        case GGML_TYPE_Q4_0:
-            ggml_mul_mat_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
-            break;
-        case GGML_TYPE_Q4_1:
-            ggml_mul_mat_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
-            break;
-        case GGML_TYPE_Q5_0:
-            ggml_mul_mat_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
-            break;
-        case GGML_TYPE_Q5_1:
-            ggml_mul_mat_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
-            break;
-        case GGML_TYPE_Q8_0:
-            ggml_mul_mat_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
-            break;
-        case GGML_TYPE_Q2_K:
-            ggml_mul_mat_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
-            break;
-        case GGML_TYPE_Q3_K:
-            ggml_mul_mat_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
-            break;
-        case GGML_TYPE_Q4_K:
-            ggml_mul_mat_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
-            break;
-        case GGML_TYPE_Q5_K:
-            ggml_mul_mat_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
-            break;
-        case GGML_TYPE_Q6_K:
-            ggml_mul_mat_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
-            break;
-        default:
-            GGML_ASSERT(false);
-            break;
-    }
-
-    (void) src1;
-    (void) dst;
-    (void) src1_ddf_i;
-}
-
-static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split) {
-    int64_t min_compute_capability = INT_MAX;
-    int64_t max_compute_capability = INT_MIN;
-    for (int id = 0; id < g_device_count; ++id) {
-        if (tensor_split[id] < (id + 1 < g_device_count ? tensor_split[id + 1] : 1.0f)) {
-            if (min_compute_capability > g_device_caps[id].cc) {
-                min_compute_capability = g_device_caps[id].cc;
-            }
-            if (max_compute_capability < g_device_caps[id].cc) {
-                max_compute_capability = g_device_caps[id].cc;
-            }
-        }
-    }
-
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-    switch(type) {
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-        case GGML_TYPE_Q5_0:
-        case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q8_0:
-            return max_compute_capability >= CC_RDNA2 ? 128 : 64;
-        case GGML_TYPE_F16:
-        case GGML_TYPE_F32:
-            return 1;
-        case GGML_TYPE_Q2_K:
-            return max_compute_capability >= CC_RDNA2 ? 128 : 32;
-        case GGML_TYPE_Q3_K:
-            return min_compute_capability < CC_RDNA2 ? 128 : 64;
-        case GGML_TYPE_Q4_K:
-        case GGML_TYPE_Q5_K:
-        case GGML_TYPE_Q6_K:
-        case GGML_TYPE_IQ2_XXS:
-        case GGML_TYPE_IQ2_XS:
-            return max_compute_capability >= CC_RDNA2 ? 128 : 64;
-        default:
-            GGML_ASSERT(false);
-    }
-#else
-    switch(type) {
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-            return max_compute_capability >= CC_VOLTA ? 128 : 64;
-        case GGML_TYPE_Q5_0:
-        case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q8_0:
-            return 64;
-        case GGML_TYPE_F16:
-        case GGML_TYPE_F32:
-            return 1;
-        case GGML_TYPE_Q2_K:
-        case GGML_TYPE_Q3_K:
-        case GGML_TYPE_Q4_K:
-        case GGML_TYPE_Q5_K:
-        case GGML_TYPE_IQ2_XXS:
-        case GGML_TYPE_IQ2_XS:
-            return max_compute_capability >= CC_VOLTA ? 128 : 64;
-        case GGML_TYPE_Q6_K:
-            return 64;
-        default:
-            GGML_ASSERT(false);
-    }
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-}
-
-static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tensor * tensor, const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split, int id) {
-    const int64_t nrows = ggml_nrows(tensor);
-    const int64_t rounding = get_row_rounding(tensor->type, tensor_split);
-
-    *row_low = id == 0 ? 0 : nrows*tensor_split[id];
-    *row_low -= *row_low % rounding;
-
-    if (id == g_device_count - 1) {
-        *row_high = nrows;
-    } else {
-        *row_high = nrows*tensor_split[id + 1];
-        *row_high -= *row_high % rounding;
-    }
-}
-
-static void ggml_cuda_op_mul_mat_vec_q(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
-    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
-    const int64_t src1_padded_row_size, cudaStream_t stream) {
-
-    GGML_ASSERT(ggml_nrows(src1) == 1);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t row_diff = row_high - row_low;
-
-    switch (src0->type) {
-        case GGML_TYPE_Q4_0:
-            mul_mat_vec_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q4_1:
-            mul_mat_vec_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q5_0:
-            mul_mat_vec_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q5_1:
-            mul_mat_vec_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q8_0:
-            mul_mat_vec_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q2_K:
-            mul_mat_vec_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q3_K:
-            mul_mat_vec_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q4_K:
-            mul_mat_vec_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q5_K:
-            mul_mat_vec_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q6_K:
-            mul_mat_vec_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_IQ2_XXS:
-            mul_mat_vec_iq2_xxs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_IQ2_XS:
-            mul_mat_vec_iq2_xs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        default:
-            GGML_ASSERT(false);
-            break;
-    }
-
-    (void) src1;
-    (void) dst;
-    (void) src1_ddf_i;
-    (void) src1_ncols;
-    (void) src1_padded_row_size;
-}
-
-static void ggml_cuda_op_dequantize_mul_mat_vec(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
-    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
-    const int64_t src1_padded_row_size, cudaStream_t stream) {
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t row_diff = row_high - row_low;
-
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
-    // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
-#ifdef GGML_CUDA_F16
-    cuda_pool_alloc<half> src1_dfloat_a;
-    half * src1_dfloat = nullptr; // dfloat == half
-
-    bool src1_convert_f16 =
-        src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
-        src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
-        src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
-
-    if (src1_convert_f16) {
-        src1_dfloat = src1_dfloat_a.alloc(ne00);
-        ggml_cpy_f32_f16_cuda((const char *) src1_ddf_i, (char *) src1_dfloat, ne00,
-                                ne00, 1, sizeof(float), 0, 0,
-                                ne00, 1, sizeof(half),  0, 0, stream);
-    }
-#else
-    const dfloat * src1_dfloat = (const dfloat *) src1_ddf_i; // dfloat == float, no conversion
-#endif // GGML_CUDA_F16
-
-    switch (src0->type) {
-        case GGML_TYPE_Q4_0:
-            dequantize_mul_mat_vec_q4_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q4_1:
-            dequantize_mul_mat_vec_q4_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q5_0:
-            dequantize_mul_mat_vec_q5_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q5_1:
-            dequantize_mul_mat_vec_q5_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q8_0:
-            dequantize_mul_mat_vec_q8_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q2_K:
-            dequantize_mul_mat_vec_q2_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q3_K:
-            dequantize_mul_mat_vec_q3_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q4_K:
-            dequantize_mul_mat_vec_q4_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q5_K:
-            dequantize_mul_mat_vec_q5_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q6_K:
-            dequantize_mul_mat_vec_q6_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_F16:
-            convert_mul_mat_vec_f16_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
-            break;
-        default:
-            GGML_ASSERT(false);
-            break;
-    }
-
-    (void) src1;
-    (void) dst;
-    (void) src1_ddq_i;
-    (void) src1_ncols;
-    (void) src1_padded_row_size;
-}
-
-static void ggml_cuda_op_mul_mat_cublas(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
-    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
-    const int64_t src1_padded_row_size, cudaStream_t stream) {
-
-    GGML_ASSERT(src0_dd_i  != nullptr);
-    GGML_ASSERT(src1_ddf_i != nullptr);
-    GGML_ASSERT(dst_dd_i   != nullptr);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne10 = src1->ne[0];
-
-    const int64_t ne0 = dst->ne[0];
-
-    const int64_t row_diff = row_high - row_low;
-
-    int id;
-    CUDA_CHECK(cudaGetDevice(&id));
-
-    // the main device has a larger memory buffer to hold the results from all GPUs
-    // ldc == nrows of the matrix that cuBLAS writes into
-    int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
-
-    const int compute_capability = g_device_caps[id].cc;
-
-    if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
-        //printf("this branch\n");
-        // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
-        cuda_pool_alloc<half> src0_as_f16;
-        if (src0->type != GGML_TYPE_F16) {
-            const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type);
-            GGML_ASSERT(to_fp16_cuda != nullptr);
-            size_t ne = row_diff*ne00;
-            src0_as_f16.alloc(ne);
-            to_fp16_cuda(src0_dd_i, src0_as_f16.get(), ne, stream);
-        }
-        const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16.get();
-
-        cuda_pool_alloc<half> src1_as_f16;
-        if (src1->type != GGML_TYPE_F16) {
-            const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
-            GGML_ASSERT(to_fp16_cuda != nullptr);
-            size_t ne = src1_ncols*ne10;
-            src1_as_f16.alloc(ne);
-            to_fp16_cuda(src1_ddf_i, src1_as_f16.get(), ne, stream);
-        }
-        const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddf_i : src1_as_f16.get();
-        cuda_pool_alloc<half> dst_f16(row_diff*src1_ncols);
-
-        const half alpha_f16 = 1.0f;
-        const half beta_f16 = 0.0f;
-
-        CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], stream));
-        CUBLAS_CHECK(
-            cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
-                    row_diff, src1_ncols, ne10,
-                    &alpha_f16, src0_ptr,       CUDA_R_16F, ne00,
-                                src1_ptr,       CUDA_R_16F, ne10,
-                    &beta_f16,   dst_f16.get(), CUDA_R_16F, ldc,
-                    CUBLAS_COMPUTE_16F,
-                    CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-
-        const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
-        to_fp32_cuda(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
-    } else {
-        cuda_pool_alloc<float> src0_ddq_as_f32;
-        cuda_pool_alloc<float> src1_ddq_as_f32;
-
-        if (src0->type != GGML_TYPE_F32) {
-            const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
-            GGML_ASSERT(to_fp32_cuda != nullptr);
-            src0_ddq_as_f32.alloc(row_diff*ne00);
-            to_fp32_cuda(src0_dd_i, src0_ddq_as_f32.get(), row_diff*ne00, stream);
-        }
-        if (src1->type != GGML_TYPE_F32) {
-            const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src1->type);
-            GGML_ASSERT(to_fp32_cuda != nullptr);
-            src1_ddq_as_f32.alloc(src1_ncols*ne10);
-            to_fp32_cuda(src1_ddf_i, src1_ddq_as_f32.get(), src1_ncols*ne10, stream);
-        }
-
-        const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32.get();
-        const float * src1_ddf1_i = src1->type == GGML_TYPE_F32 ? (const float *) src1_ddf_i : src1_ddq_as_f32.get();
-
-        const float alpha = 1.0f;
-        const float beta = 0.0f;
-
-        CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], stream));
-        CUBLAS_CHECK(
-            cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
-                    row_diff, src1_ncols, ne10,
-                    &alpha, src0_ddf_i,  ne00,
-                            src1_ddf1_i, ne10,
-                    &beta,  dst_dd_i,    ldc));
-    }
-
-    (void) dst;
-    (void) src1_ddq_i;
-    (void) src1_padded_row_size;
-}
-
-static void ggml_cuda_op_rope(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32 ||  dst->type == GGML_TYPE_F16);
-    GGML_ASSERT(src0->type == dst->type);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne2 = dst->ne[2];
-    const int64_t nrows = ggml_nrows(src0);
-
-    //const int n_past      = ((int32_t *) dst->op_params)[0];
-    const int n_dims      = ((int32_t *) dst->op_params)[1];
-    const int mode        = ((int32_t *) dst->op_params)[2];
-    const int n_ctx       = ((int32_t *) dst->op_params)[3];
-    const int n_orig_ctx  = ((int32_t *) dst->op_params)[4];
-
-    // RoPE alteration for extended context
-    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
-    memcpy(&freq_base,   (int32_t *) dst->op_params +  5, sizeof(float));
-    memcpy(&freq_scale,  (int32_t *) dst->op_params +  6, sizeof(float));
-    memcpy(&ext_factor,  (int32_t *) dst->op_params +  7, sizeof(float));
-    memcpy(&attn_factor, (int32_t *) dst->op_params +  8, sizeof(float));
-    memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
-    memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
-
-    const int32_t * pos = nullptr;
-    if ((mode & 1) == 0) {
-        GGML_ASSERT(src1->type == GGML_TYPE_I32);
-        GGML_ASSERT(src1->ne[0] == ne2);
-        pos = (const int32_t *) src1_dd;
-    }
-
-    const bool is_neox = mode & 2;
-    const bool is_glm  = mode & 4;
-
-    rope_corr_dims corr_dims;
-    ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims.v);
-
-    // compute
-    if (is_glm) {
-        GGML_ASSERT(false);
-        rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, n_ctx, main_stream);
-    } else if (is_neox) {
-        if (src0->type == GGML_TYPE_F32) {
-            rope_neox_cuda(
-                (const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
-                attn_factor, corr_dims, main_stream
-            );
-        } else if (src0->type == GGML_TYPE_F16) {
-            rope_neox_cuda(
-                (const half *)src0_dd, (half *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
-                attn_factor, corr_dims, main_stream
-            );
-        } else {
-            GGML_ASSERT(false);
-        }
-    } else {
-        if (src0->type == GGML_TYPE_F32) {
-            rope_cuda(
-                (const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
-                attn_factor, corr_dims, main_stream
-            );
-        } else if (src0->type == GGML_TYPE_F16) {
-            rope_cuda(
-                (const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
-                attn_factor, corr_dims, main_stream
-            );
-        } else {
-            GGML_ASSERT(false);
-        }
-    }
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-static void ggml_cuda_op_alibi(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    const int64_t nrows = ggml_nrows(src0);
-
-    //const int n_past = ((int32_t *) dst->op_params)[0];
-    const int n_head = ((int32_t *) dst->op_params)[1];
-    float max_bias;
-    memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
-
-    //GGML_ASSERT(ne01 + n_past == ne00);
-    GGML_ASSERT(n_head == ne02);
-
-    const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
-
-    const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
-    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
-
-    alibi_f32_cuda(src0_dd, dst_dd, ne00, nrows, ne01, n_heads_log2_floor, m0, m1, main_stream);
-
-    (void) src1;
-    (void) src1_dd;
-}
-
-static void ggml_cuda_op_im2col(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F16);
-
-    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
-    const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
-    const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
-    const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
-    const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
-    const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
-
-    const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
-
-    const int64_t N  = src1->ne[is_2D ? 3 : 2];
-    const int64_t IC = src1->ne[is_2D ? 2 : 1];
-    const int64_t IH = is_2D ? src1->ne[1] : 1;
-    const int64_t IW =         src1->ne[0];
-
-    const int64_t KH = is_2D ? src0->ne[1] : 1;
-    const int64_t KW =         src0->ne[0];
-
-    const int64_t OH = is_2D ? dst->ne[2] : 1;
-    const int64_t OW =         dst->ne[1];
-
-    const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
-
-    im2col_f32_f16_cuda(src1_dd, (half*) dst_dd, N, IW, IH, OW, OH, KW, KH, IC, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
-
-    (void) src0;
-    (void) src0_dd;
-}
-
-static void ggml_cuda_op_sum_rows(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    const int64_t ncols = src0->ne[0];
-    const int64_t nrows = ggml_nrows(src0);
-
-    sum_rows_f32_cuda(src0_dd, dst_dd, ncols, nrows, main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-static void ggml_cuda_op_argsort(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_I32);
-
-    const int64_t ncols = src0->ne[0];
-    const int64_t nrows = ggml_nrows(src0);
-
-    enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
-
-    argsort_f32_i32_cuda(src0_dd, (int *)dst_dd, ncols, nrows, order, main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-static void ggml_cuda_op_diag_mask_inf(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int nrows0 = ggml_nrows(src0);
-
-    const int n_past = ((int32_t *) dst->op_params)[0];
-
-    diag_mask_inf_f32_cuda(src0_dd, dst_dd, ne00, nrows0, ne01, n_past, main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-static void ggml_cuda_op_soft_max(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t nrows_x = ggml_nrows(src0);
-    const int64_t nrows_y = src1 ? ggml_nrows(src1) : 1;
-
-    float scale = 1.0f;
-    memcpy(&scale, dst->op_params, sizeof(float));
-
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION >= CUDART_HMAX
-#ifdef GGML_CUDA_F16
-    const bool use_f16_soft_max = true;
-#else
-    const bool use_f16_soft_max = false;
-#endif // GGML_CUDA_F16
-#else
-    const bool use_f16_soft_max = false;
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && CUDART_VERSION >= CUDART_HMAX
-
-    if (use_f16_soft_max) {
-        soft_max_f16_cuda(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream);
-    } else {
-        soft_max_f32_cuda(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream);
-    }
-
-    (void) dst;
-}
-
-static void ggml_cuda_op_scale(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    float scale;
-    memcpy(&scale, dst->op_params, sizeof(float));
-
-    scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
-    CUDA_CHECK(cudaGetLastError());
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-static void ggml_cuda_op_clamp(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    float min;
-    float max;
-    memcpy(&min, dst->op_params, sizeof(float));
-    memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
-
-    clamp_f32_cuda(src0_dd, dst_dd, min, max, ggml_nelements(src0), main_stream);
-    CUDA_CHECK(cudaGetLastError());
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const ggml_cuda_op_flatten_t op) {
-    const int64_t nrows0 = ggml_nrows(src0);
-
-    const bool use_src1 = src1 != nullptr;
-    const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
-
-    GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
-    GGML_ASSERT(              dst->backend != GGML_BACKEND_GPU_SPLIT);
-
-    ggml_tensor_extra_gpu * src0_extra =            (ggml_tensor_extra_gpu *) src0->extra;
-    ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
-    ggml_tensor_extra_gpu * dst_extra  =            (ggml_tensor_extra_gpu *)  dst->extra;
-
-    const bool src0_on_device =             src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
-    const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU;
-    const bool  dst_on_device =              dst->backend == GGML_BACKEND_GPU;
-
-    // dd = data device
-    float * src0_ddf = nullptr;
-    float * src1_ddf = nullptr;
-    float *  dst_ddf = nullptr;
-
-    cuda_pool_alloc<float> src0_f;
-    cuda_pool_alloc<float> src1_f;
-    cuda_pool_alloc<float>  dst_f;
-
-    ggml_cuda_set_device(g_main_device);
-    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
-
-    if (src0_on_device) {
-        src0_ddf = (float *) src0_extra->data_device[g_main_device];
-    } else {
-        src0_ddf = src0_f.alloc(ggml_nelements(src0));
-        CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream));
-    }
-
-    if (use_src1) {
-        if (src1_on_device) {
-            src1_ddf = (float *) src1_extra->data_device[g_main_device];
-        } else {
-            src1_ddf = src1_f.alloc(ggml_nelements(src1));
-            CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf, src1, 0, 0, 0, nrows1, main_stream));
-        }
-    }
-    if (dst_on_device) {
-        dst_ddf = (float *) dst_extra->data_device[g_main_device];
-    } else {
-        dst_ddf = dst_f.alloc(ggml_nelements(dst));
-    }
-
-    // do the computation
-    op(src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream);
-    CUDA_CHECK(cudaGetLastError());
-
-    // copy dst to host if necessary
-    if (!dst_on_device) {
-        CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream));
-    }
-
-    if (dst->backend == GGML_BACKEND_CPU) {
-        CUDA_CHECK(cudaDeviceSynchronize());
-    }
-}
-
-static void ggml_cuda_set_peer_access(const int n_tokens) {
-    static bool peer_access_enabled = false;
-
-    const bool enable_peer_access = n_tokens <= GGML_CUDA_PEER_MAX_BATCH_SIZE;
-
-    if (peer_access_enabled == enable_peer_access) {
-        return;
-    }
-
-#ifdef NDEBUG
-    for (int id = 0; id < g_device_count; ++id) {
-        ggml_cuda_set_device(id);
-        CUDA_CHECK(cudaDeviceSynchronize());
-    }
-
-    for (int id = 0; id < g_device_count; ++id) {
-        ggml_cuda_set_device(id);
-
-        for (int id_other = 0; id_other < g_device_count; ++id_other) {
-            if (id == id_other) {
-                continue;
-            }
-            if (id != g_main_device && id_other != g_main_device) {
-                continue;
-            }
-
-            int can_access_peer;
-            CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other));
-            if (can_access_peer) {
-                if (enable_peer_access) {
-                    CUDA_CHECK(cudaDeviceEnablePeerAccess(id_other, 0));
-                } else {
-                    CUDA_CHECK(cudaDeviceDisablePeerAccess(id_other));
-                }
-            }
-        }
-    }
-#endif // NDEBUG
-
-    peer_access_enabled = enable_peer_access;
-}
-
-// FIXME: move this somewhere else
-struct ggml_backend_cuda_split_buffer_type_context {
-    std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split;
-};
-
-static void ggml_cuda_op_mul_mat(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op,
-    const bool convert_src1_to_q8_1) {
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    const int64_t ne03 = src0->ne[3];
-
-    const int64_t ne10 = src1->ne[0];
-    const int64_t ne11 = src1->ne[1];
-    const int64_t ne12 = src1->ne[2];
-    const int64_t ne13 = src1->ne[3];
-    const int64_t nrows1 = ggml_nrows(src1);
-
-    GGML_ASSERT(ne03 == ne13);
-
-    const int64_t ne0 = dst->ne[0];
-    const int64_t ne1 = dst->ne[1];
-
-    const int nb2 = dst->nb[2];
-    const int nb3 = dst->nb[3];
-
-    GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
-    GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32 || (src1->ne[2] == 1 && src1->ne[3] == 1));
-
-    GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);
-
-    const int64_t i02_divisor = ne12 / ne02;
-
-    const size_t src0_ts = ggml_type_size(src0->type);
-    const size_t src0_bs = ggml_blck_size(src0->type);
-    const size_t q8_1_ts = sizeof(block_q8_1);
-    const size_t q8_1_bs = QK8_1;
-
-    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
-    ggml_tensor_extra_gpu *  dst_extra = (ggml_tensor_extra_gpu *)  dst->extra;
-
-    const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
-    const bool src0_is_contiguous = ggml_is_contiguous(src0);
-    const bool src1_is_contiguous = ggml_is_contiguous(src1);
-
-    const int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);
-
-    const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
-    GGML_ASSERT(!(split && ne02 > 1));
-    GGML_ASSERT(!(split && ne03 > 1));
-    GGML_ASSERT(!(split && ne02 < ne12));
-
-    std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split;
-    if (split) {
-        // TODO: check that src0->buffer->buft is a split buffer type, replace GGML_BACKEND_GPU_SPLIT check
-        // GGML_ASSERT(src0->buffer != nullptr && src0->buffer->buft == ...);
-        ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
-        tensor_split = buft_ctx->tensor_split;
-    }
-
-    struct dev_data {
-        cuda_pool_alloc<char>  src0_dd_alloc;
-        cuda_pool_alloc<float> src1_ddf_alloc;
-        cuda_pool_alloc<char>  src1_ddq_alloc;
-        cuda_pool_alloc<float>   dst_dd_alloc;
-
-        char  *  src0_dd = nullptr;
-        float * src1_ddf = nullptr; // float
-        char  * src1_ddq = nullptr; // q8_1
-        float *   dst_dd = nullptr;
-
-        int64_t  row_low;
-        int64_t row_high;
-    };
-
-    dev_data dev[GGML_CUDA_MAX_DEVICES];
-
-    int used_devices = 0;
-
-    for (int id = 0; id < g_device_count; ++id) {
-        // by default, use all rows
-        dev[id].row_low  = 0;
-        dev[id].row_high = ne01;
-
-        // for multi GPU, get the row boundaries from tensor split
-        // and round to mul_mat_q tile sizes
-        if (split) {
-            const int64_t rounding = get_row_rounding(src0->type, tensor_split);
-
-            if (id != 0) {
-                dev[id].row_low  = ne01*tensor_split[id];
-                if (dev[id].row_low < ne01) {
-                    dev[id].row_low -= dev[id].row_low % rounding;
-                }
-            }
-
-            if (id != g_device_count - 1) {
-                dev[id].row_high  = ne01*tensor_split[id + 1];
-                if (dev[id].row_high < ne01) {
-                    dev[id].row_high -= dev[id].row_high % rounding;
-                }
-            }
-        }
-    }
-
-    for (int id = 0; id < g_device_count; ++id) {
-        if ((!split && id != g_main_device) || dev[id].row_low == dev[id].row_high) {
-            continue;
-        }
-
-        used_devices++;
-
-        const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
-        const bool  dst_on_device =  dst->backend == GGML_BACKEND_GPU && id == g_main_device;
-
-        ggml_cuda_set_device(id);
-        cudaStream_t stream = g_cudaStreams[id][0];
-
-        if (src0_on_device && src0_is_contiguous) {
-            dev[id].src0_dd = (char *) src0_extra->data_device[id];
-        } else {
-            dev[id].src0_dd = dev[id].src0_dd_alloc.alloc(ggml_nbytes(src0));
-        }
-
-        if (src1_on_device && src1_is_contiguous) {
-            dev[id].src1_ddf = (float *) src1_extra->data_device[id];
-        } else {
-            dev[id].src1_ddf = dev[id].src1_ddf_alloc.alloc(ggml_nelements(src1));
-        }
-
-        if (convert_src1_to_q8_1) {
-            dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs);
-
-            if (src1_on_device && src1_is_contiguous) {
-                quantize_row_q8_1_cuda(dev[id].src1_ddf, dev[id].src1_ddq, ne10, nrows1, src1_padded_col_size, stream);
-                CUDA_CHECK(cudaGetLastError());
-            }
-        }
-
-        if (dst_on_device) {
-            dev[id].dst_dd = (float *) dst_extra->data_device[id];
-        } else {
-            const size_t size_dst_ddf = split ? (dev[id].row_high - dev[id].row_low)*ne1 : ggml_nelements(dst);
-            dev[id].dst_dd = dev[id].dst_dd_alloc.alloc(size_dst_ddf);
-        }
-    }
-
-    // if multiple devices are used they need to wait for the main device
-    // here an event is recorded that signals that the main device has finished calculating the input data
-    if (split && used_devices > 1) {
-        ggml_cuda_set_device(g_main_device);
-        CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device][0], g_cudaStreams[g_main_device][0]));
-    }
-
-    const int64_t src1_col_stride = split && used_devices > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
-    for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
-        const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0;
-        const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
-
-        for (int id = 0; id < g_device_count; ++id) {
-            if ((!split && id != g_main_device) || dev[id].row_low == dev[id].row_high) {
-                continue;
-            }
-
-            const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
-            const bool  dst_on_device =  dst->backend == GGML_BACKEND_GPU && id == g_main_device;
-            const int64_t row_diff = dev[id].row_high - dev[id].row_low;
-
-            ggml_cuda_set_device(id);
-            cudaStream_t stream = g_cudaStreams[id][is];
-
-            // wait for main GPU data if necessary
-            if (split && (id != g_main_device || is != 0)) {
-                CUDA_CHECK(cudaStreamWaitEvent(stream, src0_extra->events[g_main_device][0], 0));
-            }
-
-            for (int64_t i0 = 0; i0 < ne13*ne12; ++i0) {
-                const int64_t i03 = i0 / ne12;
-                const int64_t i02 = i0 % ne12;
-
-                const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs;
-
-                // for split tensors the data begins at i0 == i0_offset_low
-                char  *  src0_dd_i =  dev[id].src0_dd + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs;
-                float * src1_ddf_i = dev[id].src1_ddf + (i0*ne11 + src1_col_0) * ne10;
-                char  * src1_ddq_i = dev[id].src1_ddq +  src1_ddq_i_offset;
-                float *   dst_dd_i =   dev[id].dst_dd + (i0*ne1  + src1_col_0) * (dst_on_device ? ne0 : row_diff);
-
-                // the main device memory buffer can be on VRAM scratch, with space for all partial results
-                // in that case an offset on dst_ddf_i is needed
-                if (dst->backend == GGML_BACKEND_GPU && id == g_main_device) {
-                    dst_dd_i += dev[id].row_low; // offset is 0 if no tensor split
-                }
-
-                // copy src0, src1 to device if necessary
-                if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
-                    if (id != g_main_device) {
-                        if (convert_src1_to_q8_1) {
-                            char * src1_ddq_i_source = dev[g_main_device].src1_ddq + src1_ddq_i_offset;
-                            CUDA_CHECK(cudaMemcpyPeerAsync(src1_ddq_i, id, src1_ddq_i_source, g_main_device,
-                                                            src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs, stream));
-                        } else {
-                            float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device];
-                            src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10;
-                            CUDA_CHECK(cudaMemcpyPeerAsync(src1_ddf_i, id, src1_ddf_i_source, g_main_device,
-                                                            src1_ncols*ne10*sizeof(float), stream));
-                        }
-                    }
-                } else if (src1->backend == GGML_BACKEND_CPU || (src1_on_device && !src1_is_contiguous)) {
-                    CUDA_CHECK(ggml_cuda_cpy_tensor_2d(
-                                src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
-                } else {
-                    GGML_ASSERT(false);
-                }
-
-                if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_CPU || !src1_is_contiguous)) {
-                    quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
-                    CUDA_CHECK(cudaGetLastError());
-                }
-
-                if (src1_col_0 == 0 && (!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) {
-                    CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, dev[id].row_low, dev[id].row_high, stream));
-                }
-
-                // do the computation
-                op(src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i,
-                    dev[id].row_low, dev[id].row_high, src1_ncols, src1_padded_col_size, stream);
-                CUDA_CHECK(cudaGetLastError());
-
-                // copy dst to host or other device if necessary
-                if (!dst_on_device) {
-                    void * dst_off_device;
-                    cudaMemcpyKind kind;
-                    if (dst->backend == GGML_BACKEND_CPU) {
-                        dst_off_device = dst->data;
-                        kind = cudaMemcpyDeviceToHost;
-                    } else if (dst->backend == GGML_BACKEND_GPU) {
-                        dst_off_device = dst_extra->data_device[g_main_device];
-                        kind = cudaMemcpyDeviceToDevice;
-                    } else {
-                        GGML_ASSERT(false);
-                    }
-                    if (split) {
-                        // src0 = weight matrix is saved as a transposed matrix for better memory layout.
-                        // dst is NOT transposed.
-                        // The outputs of matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU.
-                        // Instead they need to be copied to the correct slice in ne0 = dst row index.
-                        // If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results.
-                        float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
-                        GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
-                        dhf_dst_i += src1_col_0*ne0 + dev[id].row_low;
-#if !defined(GGML_USE_HIPBLAS)
-                        if (kind == cudaMemcpyDeviceToDevice) {
-                            // cudaMemcpy2DAsync may fail with copies between vmm pools of different devices
-                            cudaMemcpy3DPeerParms p = {};
-                            p.dstDevice = g_main_device;
-                            p.dstPtr = make_cudaPitchedPtr(dhf_dst_i, ne0*sizeof(float), row_diff, src1_ncols);
-                            p.srcDevice = id;
-                            p.srcPtr = make_cudaPitchedPtr(dst_dd_i, row_diff*sizeof(float), row_diff, src1_ncols);
-                            p.extent = make_cudaExtent(row_diff*sizeof(float), src1_ncols, 1);
-                            CUDA_CHECK(cudaMemcpy3DPeerAsync(&p, stream));
-                        } else
-#endif
-                        {
-                            CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float),
-                                                            dst_dd_i, row_diff*sizeof(float),
-                                                            row_diff*sizeof(float), src1_ncols,
-                                                            kind, stream));
-                        }
-                    } else {
-                        float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
-                        GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
-                        dhf_dst_i += src1_col_0*ne0;
-                        CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_dd_i, src1_ncols*ne0*sizeof(float), kind, stream));
-                    }
-                }
-
-                // add event for the main device to wait on until other device is done
-                if (split && (id != g_main_device || is != 0)) {
-                    CUDA_CHECK(cudaEventRecord(src0_extra->events[id][is], stream));
-                }
-            }
-        }
-    }
-
-    // main device waits for all other devices to be finished
-    if (split && g_device_count > 1) {
-        int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE;
-        is_max = is_max <= MAX_STREAMS ? is_max : MAX_STREAMS;
-
-        ggml_cuda_set_device(g_main_device);
-        for (int id = 0; id < g_device_count; ++id) {
-            if (dev[id].row_low == dev[id].row_high) {
-                continue;
-            }
-            for (int64_t is = 0; is < is_max; ++is) {
-                CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is], 0));
-            }
-        }
-    }
-
-    if (dst->backend == GGML_BACKEND_CPU) {
-        ggml_cuda_set_device(g_main_device);
-        CUDA_CHECK(cudaDeviceSynchronize());
-    }
-}
-
-static void ggml_cuda_repeat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_repeat);
-}
-
-static void ggml_cuda_get_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_get_rows);
-}
-
-static void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add);
-}
-
-static void ggml_cuda_acc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_acc);
-}
-
-static void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_mul);
-}
-
-static void ggml_cuda_div(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_div);
-}
-
-static void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu);
-}
-
-static void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu);
-}
-
-static void ggml_cuda_gelu_quick(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu_quick);
-}
-
-static void ggml_cuda_tanh(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_tanh);
-}
-
-static void ggml_cuda_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_relu);
-}
-
-static void ggml_cuda_leaky_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_leaky_relu);
-}
-
-static void ggml_cuda_sqr(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_sqr);
-}
-
-static void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_norm);
-}
-
-static void ggml_cuda_group_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_group_norm);
-}
-
-static void ggml_cuda_concat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_concat);
-}
-
-static void ggml_cuda_upscale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_upscale);
-}
-
-static void ggml_cuda_pad(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_pad);
-}
-
-static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
-}
-
-bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
-    if (!g_cublas_loaded) return false;
-
-    const int64_t ne10 = src1->ne[0];
-
-    const int64_t ne0 = dst->ne[0];
-    const int64_t ne1 = dst->ne[1];
-
-    // TODO: find the optimal values for these
-    return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
-            src1->type == GGML_TYPE_F32 &&
-             dst->type == GGML_TYPE_F32 &&
-            (ne0 >= 32 && ne1 >= 32 && ne10 >= 32);
-}
-
-static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
-    GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
-    GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
-    GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
-    GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-
-    const int64_t ne12 = src1->ne[2];
-
-    ggml_cuda_set_device(g_main_device);
-    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
-
-    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    void * src0_ddq = src0_extra->data_device[g_main_device];
-
-    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
-    float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
-
-    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
-    float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
-
-    ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
-}
-
-static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
-    GGML_ASSERT(!ggml_is_transposed(src0));
-    GGML_ASSERT(!ggml_is_transposed(src1));
-    GGML_ASSERT(!ggml_is_permuted(src0));
-    GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-
-    const int64_t nb01 = src0->nb[1];
-    const int64_t nb02 = src0->nb[2];
-
-    const int64_t ne12 = src1->ne[2];
-
-    ggml_cuda_set_device(g_main_device);
-    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
-
-    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    void * src0_ddq = src0_extra->data_device[g_main_device];
-
-    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
-    float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
-
-    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
-    float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
-
-    const int64_t row_stride_x = nb01 / sizeof(half);
-    const int64_t channel_stride_x = nb02 / sizeof(half);
-
-    ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
-}
-
-static __global__ void k_compute_batched_ptrs(
-        const half * src0_as_f16, const half * src1_as_f16, char * dst,
-        const void ** ptrs_src, void ** ptrs_dst,
-        int64_t ne12, int64_t ne13,
-        int64_t ne23,
-        size_t  nb02, size_t  nb03,
-        size_t  nb12, size_t  nb13,
-        size_t  nbd2, size_t  nbd3,
-        int64_t r2,   int64_t r3) {
-    int64_t i13 = blockIdx.x * blockDim.x + threadIdx.x;
-    int64_t i12 = blockIdx.y * blockDim.y + threadIdx.y;
-
-    if (i13 >= ne13 || i12 >= ne12) {
-        return;
-    }
-
-    int64_t i03 = i13 / r3;
-    int64_t i02 = i12 / r2;
-
-    ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
-    ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12 + i13*nb13;
-    ptrs_dst[0*ne23 + i12 + i13*ne12] = (      char *)         dst + i12*nbd2 + i13*nbd3;
-}
-
-static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_ASSERT(!ggml_is_transposed(src0));
-    GGML_ASSERT(!ggml_is_transposed(src1));
-
-    GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    const int64_t ne_dst = ggml_nelements(dst);
-
-    ggml_cuda_set_device(g_main_device);
-    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
-
-    CUBLAS_CHECK(cublasSetStream(g_cublas_handles[g_main_device], main_stream));
-
-    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    void * src0_ddq = src0_extra->data_device[g_main_device];
-    half * src0_f16 = (half *) src0_ddq;
-
-    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
-    float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
-
-    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
-    float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
-
-    // convert src1 to fp16
-    cuda_pool_alloc<half> src1_f16_alloc;
-    if (src1->type != GGML_TYPE_F16) {
-        const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
-        const int64_t ne_src1 = ggml_nelements(src1);
-        src1_f16_alloc.alloc(ne_src1);
-        GGML_ASSERT(to_fp16_cuda != nullptr);
-        to_fp16_cuda(src1_ddf, src1_f16_alloc.get(), ne_src1, main_stream);
-    }
-    half * src1_f16 = src1->type == GGML_TYPE_F16 ? (half *) src1_ddf : src1_f16_alloc.get();
-
-    cuda_pool_alloc<half> dst_f16;
-    char * dst_t;
-
-    cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
-    cudaDataType_t      cu_data_type    = CUDA_R_16F;
-
-    // dst strides
-    size_t nbd2 = dst->nb[2];
-    size_t nbd3 = dst->nb[3];
-
-    const half  alpha_f16 = 1.0f;
-    const half  beta_f16  = 0.0f;
-
-    const float alpha_f32 = 1.0f;
-    const float beta_f32  = 0.0f;
-
-    const void * alpha = &alpha_f16;
-    const void * beta  = &beta_f16;
-
-    if (dst->op_params[0] == GGML_PREC_DEFAULT) {
-        dst_t = (char *) dst_f16.alloc(ne_dst);
-
-        nbd2 /= sizeof(float) / sizeof(half);
-        nbd3 /= sizeof(float) / sizeof(half);
-    } else {
-        dst_t = (char *) dst_ddf;
-
-        cu_compute_type = CUBLAS_COMPUTE_32F;
-        cu_data_type    = CUDA_R_32F;
-
-        alpha = &alpha_f32;
-        beta  = &beta_f32;
-    }
-
-    GGML_ASSERT(ne12 % ne02 == 0);
-    GGML_ASSERT(ne13 % ne03 == 0);
-
-    // broadcast factors
-    const int64_t r2 = ne12/ne02;
-    const int64_t r3 = ne13/ne03;
-
-#if 0
-    // use cublasGemmEx
-    {
-        for (int i13 = 0; i13 < ne13; ++i13) {
-            for (int i12 = 0; i12 < ne12; ++i12) {
-                int i03 = i13 / r3;
-                int i02 = i12 / r2;
-
-                CUBLAS_CHECK(
-                        cublasGemmEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
-                            ne01, ne11, ne10,
-                            alpha, (const char *) src0_as_f16 + i02*src0->nb[2]   + i03*src0->nb[3]  , CUDA_R_16F,   nb01/sizeof(half),
-                                   (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F,   nb11/sizeof(float),
-                            beta,  (      char *)       dst_t + i12*nbd2          + i13*nbd3,          cu_data_type, ne01,
-                            cu_compute_type,
-                            CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-            }
-        }
-    }
-#else
-    if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) {
-        // there is no broadcast and src0, src1 are contiguous across dims 2, 3
-        // use cublasGemmStridedBatchedEx
-        CUBLAS_CHECK(
-        cublasGemmStridedBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
-                ne01, ne11, ne10,
-                alpha, (const char *) src0_f16, CUDA_R_16F,   nb01/nb00, nb02/nb00,  // strideA
-                       (const char *) src1_f16, CUDA_R_16F,   nb11/nb10, nb12/nb10,  // strideB
-                beta,  (      char *)    dst_t, cu_data_type, ne01,       nb2/nb0,   // strideC
-                ne12*ne13,
-                cu_compute_type,
-                CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-    } else {
-        // use cublasGemmBatchedEx
-        const int ne23 = ne12*ne13;
-
-        cuda_pool_alloc<const void *> ptrs_src(2*ne23);
-        cuda_pool_alloc<      void *> ptrs_dst(1*ne23);
-
-        dim3 block_dims(ne13, ne12);
-        k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
-                src0_f16, src1_f16, dst_t,
-                ptrs_src.get(), ptrs_dst.get(),
-                ne12, ne13,
-                ne23,
-                nb02, nb03,
-                src1->type == GGML_TYPE_F16 ? nb12 : nb12/2,
-                src1->type == GGML_TYPE_F16 ? nb13 : nb13/2,
-                nbd2, nbd3,
-                r2, r3);
-        CUDA_CHECK(cudaGetLastError());
-
-        CUBLAS_CHECK(
-        cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
-                ne01, ne11, ne10,
-                alpha, (const void **) (ptrs_src.get() + 0*ne23), CUDA_R_16F,   nb01/nb00,
-                       (const void **) (ptrs_src.get() + 1*ne23), CUDA_R_16F,   nb11/nb10,
-                beta,  (      void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne01,
-                ne23,
-                cu_compute_type,
-                CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-    }
-#endif
-
-    if (dst->op_params[0] == GGML_PREC_DEFAULT) {
-        const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
-        to_fp32_cuda(dst_f16.get(), dst_ddf, ne_dst, main_stream);
-    }
-}
-
-static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    const bool all_on_device =
-        (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
-        (src1->backend == GGML_BACKEND_GPU) &&
-        ( dst->backend == GGML_BACKEND_GPU);
-
-    const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
-
-    int64_t min_compute_capability = INT_MAX;
-
-    if (split) {
-        ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
-        auto & tensor_split = buft_ctx->tensor_split;
-        for (int id = 0; id < g_device_count; ++id) {
-            if (min_compute_capability > g_device_caps[id].cc && tensor_split[id] < (id + 1 < g_device_count ? tensor_split[id + 1] : 1.0f)) {
-                min_compute_capability = g_device_caps[id].cc;
-            }
-        }
-    } else {
-        min_compute_capability = g_device_caps[g_main_device].cc;
-    }
-
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-
-    const bool fp16_performance_good = min_compute_capability >= CC_RDNA1;
-    bool               use_mul_mat_q = ggml_is_quantized(src0->type);
-#ifdef CUDA_USE_TENSOR_CORES
-    use_mul_mat_q = use_mul_mat_q && min_compute_capability < CC_RDNA3;
-#endif // CUDA_USE_TENSOR_CORES
-
-#else
-
-    const bool fp16_performance_good = min_compute_capability >= CC_VOLTA;
-    bool               use_mul_mat_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
-#ifdef CUDA_USE_TENSOR_CORES
-    // when tensor cores are available, use them for large batch size
-    // ref: https://github.com/ggerganov/llama.cpp/pull/3776
-    use_mul_mat_q = use_mul_mat_q && !(fp16_performance_good && src1->ne[1] > MMQ_MAX_BATCH_SIZE);
-#endif // CUDA_USE_TENSOR_CORES
-
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-
-    use_mul_mat_q = use_mul_mat_q && ggml_cuda_supports_mmq(src0->type);
-
-    // debug helpers
-    //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
-    //printf("      %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
-    //printf("src1: %8d %8d %8d %8d\n", src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3]);
-    //printf("      %8d %8d %8d %8d\n", src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3]);
-    //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
-    //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
-
-    if (!split && all_on_device && !fp16_performance_good && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
-        // KQ single-batch
-        ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
-    } else if (!split && all_on_device && !fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
-        // KQV single-batch
-        ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
-    } else if (!split && all_on_device && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
-        // KQ + KQV multi-batch
-        ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
-    } else if (src0->type == GGML_TYPE_F32) {
-        ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
-    } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
-        if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src1->type == GGML_TYPE_F32) {
-#ifdef GGML_CUDA_FORCE_DMMV
-            const bool use_mul_mat_vec_q = false;
-#else
-            const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type) && ggml_nrows(src1) == 1;
-#endif // GGML_CUDA_FORCE_DMMV
-
-            if (use_mul_mat_vec_q) {
-                // NOTE: this kernel does not support ggml_nrows(src1) > 1
-                ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
-            } else {
-                ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
-            }
-        } else {
-            if (use_mul_mat_q) {
-                ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
-            } else {
-                ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
-            }
-        }
-    } else {
-        GGML_ASSERT(false);
-    }
-}
-
-#if 0
-template<typename ... Srcs>
-static __global__ void k_compute_batched_ptrs_id(
-        const void ** ptrs_src, void ** ptrs_dst,
-        int ne12, int ne13,
-        int ne23,
-        int nb02, int nb03,
-        int nb12, int nb13,
-        int nb2, int nb3,
-        int r2, int r3,
-        ggml_type src0_type, half * src0_as_f16, int64_t src0_ne,
-        const half * src1_f16, half * dst_f16,
-        const int32_t * ids, const int id,
-        Srcs... src0s) {
-
-    int i = ids[id];
-
-    half * src0_f16;
-    const void * srcs_ar[] = { (const half *) src0s... };
-    if (src0_type == GGML_TYPE_F16) {
-        src0_f16 = (half *) srcs_ar[i];
-    } else {
-        src0_f16 = src0_as_f16;
-        if (threadIdx.x == 0 && threadIdx.y == 0) {
-            const to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(src0_type);
-            to_fp16(srcs_ar[i], src0_f16, src0_ne, cudaStreamFireAndForget);
-        }
-    }
-
-    int i13 = blockIdx.x * blockDim.x + threadIdx.x;
-    int i12 = blockIdx.y * blockDim.y + threadIdx.y;
-
-    if (i13 >= ne13 || i12 >= ne12) {
-        return;
-    }
-
-    int i03 = i13 / r3;
-    int i02 = i12 / r2;
-
-    ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_f16 + i02*nb02   + i03*nb03;
-    ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_f16 + i12*nb12/2 + i13*nb13/2;
-    ptrs_dst[0*ne23 + i12 + i13*ne12] = (      char *)  dst_f16 + i12* nb2/2 + i13* nb3/2;
-}
-
-static void ggml_cuda_mul_mat_id_cublas(ggml_tensor * dst) {
-    const struct ggml_tensor * ids = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-    const struct ggml_tensor * src00 = dst->src[2];
-
-    const int id = dst->op_params[0];
-
-    GGML_ASSERT(!ggml_is_transposed(src00));
-    GGML_ASSERT(!ggml_is_transposed(src1));
-
-    GGML_ASSERT(src00->backend != GGML_BACKEND_GPU_SPLIT);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
-    const int64_t ne00 = src00->ne[0]; GGML_UNUSED(ne00);
-    const int64_t ne01 = src00->ne[1];
-    const int64_t ne02 = src00->ne[2];
-    const int64_t ne03 = src00->ne[3];
-
-    //const int64_t nb01 = src00->nb[1];
-    const int64_t nb02 = src00->nb[2]; GGML_UNUSED(nb02);
-    const int64_t nb03 = src00->nb[3]; GGML_UNUSED(nb03);
-
-    const int64_t ne10 = src1->ne[0];
-    const int64_t ne11 = src1->ne[1];
-    const int64_t ne12 = src1->ne[2];
-    const int64_t ne13 = src1->ne[3];
-
-    //const int64_t nb11 = src1->nb[1];
-    const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
-    const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
-
-    const int64_t ne1 = ggml_nelements(src1);
-    const int64_t ne  = ggml_nelements(dst);
-
-    ggml_cuda_set_device(g_main_device);
-    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
-
-    CUBLAS_CHECK(cublasSetStream(g_cublas_handles[g_main_device], main_stream));
-
-    //ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    //void * src0_ddq = src0_extra->data_device[g_main_device];
-    //half * src0_as_f16 = (half *) src0_ddq;
-
-    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
-    float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
-
-    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
-    float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
-
-    // convert src1 to fp16
-    const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
-    GGML_ASSERT(to_fp16_cuda != nullptr);
-
-    size_t src1_as = 0;
-    half * src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne1 * sizeof(half), &src1_as);
-    to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);
-
-    size_t dst_as = 0;
-    half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
-
-    GGML_ASSERT(ne12 % ne02 == 0);
-    GGML_ASSERT(ne13 % ne03 == 0);
-
-    // broadcast factors
-    const int64_t r2 = ne12/ne02;
-    const int64_t r3 = ne13/ne03;
-
-    const half alpha_f16 = 1.0f;
-    const half beta_f16  = 0.0f;
-
-    // use cublasGemmBatchedEx
-    const int ne23 = ne12*ne13;
-
-    const void ** ptrs_src = nullptr;
-          void ** ptrs_dst = nullptr;
-
-    size_t ptrs_src_s = 0;
-    size_t ptrs_dst_s = 0;
-
-    ptrs_src = (const void **) ggml_cuda_pool_malloc(2*ne23*sizeof(void *), &ptrs_src_s);
-    ptrs_dst = (      void **) ggml_cuda_pool_malloc(1*ne23*sizeof(void *), &ptrs_dst_s);
-
-    int64_t src0_ne = ggml_nelements(src00);
-    half * src0_as_f16 = nullptr;
-    size_t src0_as = 0;
-    if (src00->type != GGML_TYPE_F16) {
-        src0_as_f16 = (half *) ggml_cuda_pool_malloc(src0_ne * sizeof(half), &src0_as);
-    }
-
-    static_assert(GGML_MAX_SRC == 6, "GGML_MAX_SRC == 6");
-    dim3 block_dims(ne13, ne12);
-    k_compute_batched_ptrs_id<<<1, block_dims, 0, main_stream>>>(
-            ptrs_src, ptrs_dst,
-            ne12, ne13,
-            ne23,
-            ne00*ne01*sizeof(half), ne00*ne01*ne02*sizeof(half),
-            nb12, nb13,
-            dst->nb[2], dst->nb[3],
-            r2, r3,
-            src00->type, src0_as_f16, src0_ne,
-            src1_as_f16, dst_f16,
-            (const int *)((ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device], id,
-            dst->src[2] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[2]->extra)->data_device[g_main_device] : nullptr,
-            dst->src[3] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[3]->extra)->data_device[g_main_device] : nullptr,
-            dst->src[4] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[4]->extra)->data_device[g_main_device] : nullptr,
-            dst->src[5] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[5]->extra)->data_device[g_main_device] : nullptr
-    );
-    CUDA_CHECK(cudaGetLastError());
-
-    CUBLAS_CHECK(
-    cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
-            ne01, ne11, ne10,
-            &alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, ne00,
-                        (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, ne10,
-            &beta_f16,  (      void **) (ptrs_dst + 0*ne23), CUDA_R_16F, ne01,
-            ne23,
-            CUBLAS_COMPUTE_16F,
-            CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-
-    if (src0_as != 0) {
-        ggml_cuda_pool_free(src0_as_f16, src0_as);
-    }
-    if (ptrs_src_s != 0) {
-        ggml_cuda_pool_free(ptrs_src, ptrs_src_s);
-    }
-    if (ptrs_dst_s != 0) {
-        ggml_cuda_pool_free(ptrs_dst, ptrs_dst_s);
-    }
-
-    const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
-    to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
-
-    ggml_cuda_pool_free(src1_as_f16, src1_as);
-    ggml_cuda_pool_free(dst_f16, dst_as);
-}
-#endif
-
-static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-#if 0
-    ggml_cuda_mul_mat_id_cublas(dst);
-    // TODO: mmq/mmv support
-#endif
-
-    const int64_t nb11 = src1->nb[1];
-    const int64_t nb1  =  dst->nb[1];
-
-    const struct ggml_tensor * ids = src0;
-    const int32_t id = ((int32_t *) dst->op_params)[0];
-    const int32_t n_as = ((int32_t *) dst->op_params)[1];
-
-    std::vector<char> ids_host(ggml_nbytes(ids));
-
-    cudaStream_t stream = g_cudaStreams[g_main_device][0];
-
-    if (ids->backend == GGML_BACKEND_GPU) {
-        const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
-        CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream));
-        CUDA_CHECK(cudaStreamSynchronize(stream));
-    } else {
-        memcpy(ids_host.data(), ids->data, ggml_nbytes(ids));
-    }
-
-    const ggml_tensor_extra_gpu * src1_extra = (const ggml_tensor_extra_gpu *) src1->extra;
-    const ggml_tensor_extra_gpu * dst_extra = (const ggml_tensor_extra_gpu *) dst->extra;
-
-    ggml_tensor_extra_gpu src1_row_extra;
-    ggml_tensor_extra_gpu dst_row_extra;
-
-    ggml_tensor src1_row = *src1;
-    ggml_tensor dst_row = *dst;
-
-    src1_row.backend = GGML_BACKEND_GPU;
-    dst_row.backend  = GGML_BACKEND_GPU;
-
-    src1_row.extra = &src1_row_extra;
-    dst_row.extra = &dst_row_extra;
-
-    char * src1_original = src1->backend == GGML_BACKEND_CPU ?
-        (char *) src1->data : (char *) src1_extra->data_device[g_main_device];
-    char * dst_original  =  dst->backend == GGML_BACKEND_CPU ?
-        (char *)  dst->data : (char *)  dst_extra->data_device[g_main_device];
-
-    if (src1->ne[1] == 1) {
-        GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
-        GGML_ASSERT(dst->backend  == GGML_BACKEND_GPU);
-
-        for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
-            //int32_t row_id;
-            //CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
-            //CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
-
-            const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
-
-            GGML_ASSERT(row_id >= 0 && row_id < n_as);
-
-            const struct ggml_tensor * src0_row = dst->src[row_id + 2];
-
-            src1_row_extra.data_device[g_main_device] = src1_original + i01*src1->nb[1];
-            src1_row.data = (char *) src1->data + i01*src1->nb[1]; // TODO why is this set?
-
-            dst_row_extra.data_device[g_main_device] = dst_original + i01*dst->nb[1];
-            dst_row.data = (char *) dst->data + i01*dst->nb[1]; // TODO why is this set?
-
-            ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
-        }
-    } else {
-        cuda_pool_alloc<char> src1_contiguous(sizeof(float)*ggml_nelements(src1));
-        cuda_pool_alloc<char>  dst_contiguous(sizeof(float)*ggml_nelements(dst));
-
-        src1_row_extra.data_device[g_main_device] = src1_contiguous.get();
-        dst_row_extra.data_device[g_main_device]  =  dst_contiguous.get();
-
-        const cudaMemcpyKind src1_kind = src1->backend == GGML_BACKEND_CPU ?
-            cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice;
-        const cudaMemcpyKind dst_kind  =  dst->backend == GGML_BACKEND_CPU ?
-            cudaMemcpyDeviceToHost : cudaMemcpyDeviceToDevice;
-
-        for (int32_t row_id = 0; row_id < n_as; ++row_id) {
-            const struct ggml_tensor * src0_row = dst->src[row_id + 2];
-
-            int64_t num_src1_rows = 0;
-            for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
-                const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
-
-                if (row_id_i != row_id) {
-                    continue;
-                }
-
-                GGML_ASSERT(row_id >= 0 && row_id < n_as);
-
-                CUDA_CHECK(cudaMemcpyAsync(src1_contiguous.get() + num_src1_rows*nb11, src1_original + i01*nb11,
-                                        nb11, src1_kind, stream));
-                num_src1_rows++;
-            }
-
-            if (num_src1_rows == 0) {
-                continue;
-            }
-
-            src1_row.ne[1] = num_src1_rows;
-            dst_row.ne[1] = num_src1_rows;
-
-            src1_row.nb[1] = nb11;
-            src1_row.nb[2] = num_src1_rows*nb11;
-            src1_row.nb[3] = num_src1_rows*nb11;
-
-            dst_row.nb[1] = nb1;
-            dst_row.nb[2] = num_src1_rows*nb1;
-            dst_row.nb[3] = num_src1_rows*nb1;
-
-            ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
-
-            num_src1_rows = 0;
-            for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
-                const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
-
-                if (row_id_i != row_id) {
-                    continue;
-                }
-
-                GGML_ASSERT(row_id >= 0 && row_id < n_as);
-
-                CUDA_CHECK(cudaMemcpyAsync(dst_original + i01*nb1, dst_contiguous.get() + num_src1_rows*nb1,
-                                        nb1, dst_kind, stream));
-                num_src1_rows++;
-            }
-        }
-    }
-
-    if (dst->backend == GGML_BACKEND_CPU) {
-        CUDA_CHECK(cudaStreamSynchronize(stream));
-    }
-}
-
-static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale);
-}
-
-static void ggml_cuda_clamp(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_clamp);
-}
-
-static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    const int64_t ne = ggml_nelements(src0);
-    GGML_ASSERT(ne == ggml_nelements(src1));
-
-    GGML_ASSERT(src0->backend == GGML_BACKEND_GPU);
-    GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
-
-    GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
-    GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-
-    const int64_t nb00 = src0->nb[0];
-    const int64_t nb01 = src0->nb[1];
-    const int64_t nb02 = src0->nb[2];
-    const int64_t nb03 = src0->nb[3];
-
-    const int64_t ne10 = src1->ne[0];
-    const int64_t ne11 = src1->ne[1];
-    const int64_t ne12 = src1->ne[2];
-
-    const int64_t nb10 = src1->nb[0];
-    const int64_t nb11 = src1->nb[1];
-    const int64_t nb12 = src1->nb[2];
-    const int64_t nb13 = src1->nb[3];
-
-    ggml_cuda_set_device(g_main_device);
-    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
-
-    const ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    const ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
-
-    char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
-    char * src1_ddc = (char *) src1_extra->data_device[g_main_device];
-
-    if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
-        ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
-    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
-        ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
-    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
-        ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
-    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
-        ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
-    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
-        ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
-    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
-        ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
-    } else {
-        fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
-                ggml_type_name(src0->type), ggml_type_name(src1->type));
-        GGML_ASSERT(false);
-    }
-
-    (void) dst;
-}
-
-static void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    // TODO: why do we pass dst as src1 here?
-    ggml_cuda_cpy(src0, dst, nullptr);
-    (void) src1;
-}
-
-static void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_diag_mask_inf);
-}
-
-static void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_soft_max);
-}
-
-static void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rope);
-}
-
-static void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi);
-}
-
-static void ggml_cuda_im2col(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_im2col);
-}
-
-static void ggml_cuda_sum_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_ASSERT(ggml_is_contiguous(src0));
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_sum_rows);
-}
-
-static void ggml_cuda_argsort(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_ASSERT(ggml_is_contiguous(src0));
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_argsort);
-}
-
-static void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    (void) src0;
-    (void) src1;
-    (void) dst;
-}
-
-static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
-    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
-    return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]);
-}
-
-static void ggml_cuda_set_main_device(const int main_device) {
-    if (main_device >= g_device_count) {
-        fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
-                main_device, g_device_count, g_main_device);
-        return;
-    }
-
-    if (g_main_device != main_device && g_device_count > 1) {
-        g_main_device = main_device;
-        //cudaDeviceProp prop;
-        //CUDA_CHECK(cudaGetDeviceProperties(&prop, g_main_device));
-        //fprintf(stderr, "%s: using device %d (%s) as main device\n", __func__, g_main_device, prop.name);
-    }
-}
-
-bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
-    if (!g_cublas_loaded) return false;
-
-    ggml_cuda_func_t func;
-    const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
-        || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
-        || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
-
-    if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) {
-        return false;
-    }
-
-    if (tensor->op == GGML_OP_MUL_MAT) {
-        if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
-#ifndef NDEBUG
-            fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
-#endif
-            return false;
-        }
-    }
-
-    switch (tensor->op) {
-        case GGML_OP_REPEAT:
-            func = ggml_cuda_repeat;
-            break;
-        case GGML_OP_GET_ROWS:
-            func = ggml_cuda_get_rows;
-            break;
-        case GGML_OP_DUP:
-            func = ggml_cuda_dup;
-            break;
-        case GGML_OP_ADD:
-            func = ggml_cuda_add;
-            break;
-        case GGML_OP_ACC:
-            func = ggml_cuda_acc;
-            break;
-        case GGML_OP_MUL:
-            func = ggml_cuda_mul;
-            break;
-        case GGML_OP_DIV:
-            func = ggml_cuda_div;
-            break;
-        case GGML_OP_UNARY:
-            switch (ggml_get_unary_op(tensor)) {
-                case GGML_UNARY_OP_GELU:
-                    func = ggml_cuda_gelu;
-                    break;
-                case GGML_UNARY_OP_SILU:
-                    func = ggml_cuda_silu;
-                    break;
-                case GGML_UNARY_OP_GELU_QUICK:
-                    func = ggml_cuda_gelu_quick;
-                    break;
-                case GGML_UNARY_OP_TANH:
-                    func = ggml_cuda_tanh;
-                    break;
-                case GGML_UNARY_OP_RELU:
-                    func = ggml_cuda_relu;
-                    break;
-                default:
-                    return false;
-            }
-            break;
-        case GGML_OP_NORM:
-            func = ggml_cuda_norm;
-            break;
-        case GGML_OP_GROUP_NORM:
-            func = ggml_cuda_group_norm;
-            break;
-        case GGML_OP_CONCAT:
-            func = ggml_cuda_concat;
-            break;
-        case GGML_OP_UPSCALE:
-            func = ggml_cuda_upscale;
-            break;
-        case GGML_OP_PAD:
-            func = ggml_cuda_pad;
-            break;
-        case GGML_OP_LEAKY_RELU:
-            func = ggml_cuda_leaky_relu;
-            break;
-        case GGML_OP_RMS_NORM:
-            func = ggml_cuda_rms_norm;
-            break;
-        case GGML_OP_MUL_MAT:
-            if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
-                return false;
-            }
-            func = ggml_cuda_mul_mat;
-            break;
-        case GGML_OP_MUL_MAT_ID:
-            if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[2], tensor->src[1], tensor)) {
-                return false;
-            }
-            func = ggml_cuda_mul_mat_id;
-            break;
-        case GGML_OP_SCALE:
-            func = ggml_cuda_scale;
-            break;
-        case GGML_OP_SQR:
-            func = ggml_cuda_sqr;
-            break;
-        case GGML_OP_CLAMP:
-            func = ggml_cuda_clamp;
-            break;
-        case GGML_OP_CPY:
-            func = ggml_cuda_cpy;
-            break;
-        case GGML_OP_CONT:
-            func = ggml_cuda_dup;
-            break;
-        case GGML_OP_NONE:
-        case GGML_OP_RESHAPE:
-        case GGML_OP_VIEW:
-        case GGML_OP_PERMUTE:
-        case GGML_OP_TRANSPOSE:
-            func = ggml_cuda_nop;
-            break;
-        case GGML_OP_DIAG_MASK_INF:
-            func = ggml_cuda_diag_mask_inf;
-            break;
-        case GGML_OP_SOFT_MAX:
-            func = ggml_cuda_soft_max;
-            break;
-        case GGML_OP_ROPE:
-            func = ggml_cuda_rope;
-            break;
-        case GGML_OP_ALIBI:
-            func = ggml_cuda_alibi;
-            break;
-        case GGML_OP_IM2COL:
-            func = ggml_cuda_im2col;
-            break;
-        case GGML_OP_SUM_ROWS:
-            func = ggml_cuda_sum_rows;
-            break;
-        case GGML_OP_ARGSORT:
-            func = ggml_cuda_argsort;
-            break;
-        default:
-            return false;
-    }
-
-    if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT) {
-        ggml_cuda_set_peer_access(tensor->src[1]->ne[1]);
-    }
-
-    if (params->ith != 0) {
-        return true;
-    }
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return true;
-    }
-    func(tensor->src[0], tensor->src[1], tensor);
-    return true;
-}
-
-int ggml_cuda_get_device_count() {
-    int device_count;
-    if (cudaGetDeviceCount(&device_count) != cudaSuccess) {
-        return 0;
-    }
-    return device_count;
-}
-
-void ggml_cuda_get_device_description(int device, char * description, size_t description_size) {
-    cudaDeviceProp prop;
-    CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
-    snprintf(description, description_size, "%s", prop.name);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-
-// backend interface
-
-#define UNUSED GGML_UNUSED
-
-struct ggml_backend_cuda_context {
-    int device;
-    std::string name;
-};
-
-// cuda buffer
-
-struct ggml_backend_cuda_buffer_context {
-    int device;
-    void * dev_ptr = nullptr;
-    ggml_tensor_extra_gpu * temp_tensor_extras = nullptr;
-    size_t temp_tensor_extra_index = 0;
-    std::string name;
-
-    ggml_backend_cuda_buffer_context(int device, void * dev_ptr) :
-        device(device), dev_ptr(dev_ptr),
-        name(GGML_CUDA_NAME + std::to_string(device)) {
-    }
-
-    ~ggml_backend_cuda_buffer_context() {
-        delete[] temp_tensor_extras;
-    }
-
-    ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
-        // TODO: remove GGML_CUDA_MAX_NODES, allocate dynamically and reuse in backend_buffer_reset
-        if (temp_tensor_extras == nullptr) {
-            temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
-        }
-
-        size_t alloc_index = temp_tensor_extra_index;
-        temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES;
-        ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index];
-        memset(extra, 0, sizeof(*extra));
-
-        return extra;
-    }
-};
-
-static const char * ggml_backend_cuda_buffer_get_name(ggml_backend_buffer_t buffer) {
-    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
-    return ctx->name.c_str();
-}
-
-static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
-    return buffer->iface.get_name == ggml_backend_cuda_buffer_get_name;
-}
-
-static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
-    CUDA_CHECK(cudaFree(ctx->dev_ptr));
-    delete ctx;
-}
-
-static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
-    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
-    return ctx->dev_ptr;
-}
-
-static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
-    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
-
-    if (tensor->view_src != NULL && tensor->view_offs == 0) {
-        assert(tensor->view_src->buffer->buft == buffer->buft);
-        tensor->backend = tensor->view_src->backend;
-        tensor->extra = tensor->view_src->extra;
-        return;
-    }
-
-    ggml_tensor_extra_gpu * extra = ctx->ggml_cuda_alloc_temp_tensor_extra();
-
-    extra->data_device[ctx->device] = tensor->data;
-
-    tensor->backend = GGML_BACKEND_GPU;
-    tensor->extra = extra;
-
-    if (ggml_is_quantized(tensor->type)) {
-        // initialize padding to 0 to avoid possible NaN values
-        int64_t row_low = 0;
-        int64_t row_high = ggml_nrows(tensor);
-        int64_t nrows_split = row_high - row_low;
-
-        size_t original_size = ggml_nbytes_split(tensor, nrows_split);
-        size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
-
-        if (padded_size > original_size && tensor->view_src == nullptr) {
-            CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[ctx->device][0]));
-        }
-    }
-}
-
-static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
-
-    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
-
-    ggml_cuda_set_device(ctx->device);
-    CUDA_CHECK(cudaDeviceSynchronize());
-    CUDA_CHECK(cudaMemcpy((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice));
-    CUDA_CHECK(cudaDeviceSynchronize());
-}
-
-static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
-
-    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
-
-    ggml_cuda_set_device(ctx->device);
-    CUDA_CHECK(cudaDeviceSynchronize());
-    CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost));
-    CUDA_CHECK(cudaDeviceSynchronize());
-}
-
-static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
-    if (ggml_backend_buffer_is_cuda(src->buffer)) {
-        ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context;
-        ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
-
-        ggml_cuda_set_device(src_ctx->device);
-        CUDA_CHECK(cudaDeviceSynchronize());
-        ggml_cuda_set_device(dst_ctx->device);
-        CUDA_CHECK(cudaDeviceSynchronize());
-        CUDA_CHECK(cudaMemcpy((char *)dst->data, (const char *)src->data, ggml_nbytes(src), cudaMemcpyDeviceToDevice));
-        CUDA_CHECK(cudaDeviceSynchronize());
-
-        return true;
-    }
-    return false;
-}
-
-static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
-    ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
-
-    ggml_cuda_set_device(ctx->device);
-    CUDA_CHECK(cudaDeviceSynchronize());
-    CUDA_CHECK(cudaMemset(ctx->dev_ptr, value, buffer->size));
-    CUDA_CHECK(cudaDeviceSynchronize());
-}
-
-static ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = {
-    /* .get_name        = */ ggml_backend_cuda_buffer_get_name,
-    /* .free_buffer     = */ ggml_backend_cuda_buffer_free_buffer,
-    /* .get_base        = */ ggml_backend_cuda_buffer_get_base,
-    /* .init_tensor     = */ ggml_backend_cuda_buffer_init_tensor,
-    /* .set_tensor      = */ ggml_backend_cuda_buffer_set_tensor,
-    /* .get_tensor      = */ ggml_backend_cuda_buffer_get_tensor,
-    /* .cpy_tensor      = */ ggml_backend_cuda_buffer_cpy_tensor,
-    /* .clear           = */ ggml_backend_cuda_buffer_clear,
-    /* .reset           = */ NULL,
-};
-
-// cuda buffer type
-
-struct ggml_backend_cuda_buffer_type_context {
-    int device;
-    std::string name;
-};
-
-static const char * ggml_backend_cuda_buffer_type_name(ggml_backend_buffer_type_t buft) {
-    ggml_backend_cuda_buffer_type_context * ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
-
-    return ctx->name.c_str();
-}
-
-static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-    ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
-
-    ggml_cuda_set_device(buft_ctx->device);
-
-    size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
-
-    void * dev_ptr;
-    cudaError_t err = cudaMalloc(&dev_ptr, size);
-    if (err != cudaSuccess) {
-        fprintf(stderr, "%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size/1024.0/1024.0, buft_ctx->device, cudaGetErrorString(err));
-        return nullptr;
-    }
-
-    ggml_backend_cuda_buffer_context * ctx = new ggml_backend_cuda_buffer_context(buft_ctx->device, dev_ptr);
-
-    return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size);
-}
-
-static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
-    return 128;
-
-    UNUSED(buft);
-}
-
-static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
-    int64_t row_low = 0;
-    int64_t row_high = ggml_nrows(tensor);
-    int64_t nrows_split = row_high - row_low;
-
-    size_t size = ggml_nbytes_split(tensor, nrows_split);
-
-    int64_t ne0 = tensor->ne[0];
-
-    if (ggml_is_quantized(tensor->type)) {
-        if (ne0 % MATRIX_ROW_PADDING != 0) {
-            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
-        }
-    }
-
-    return size;
-
-    UNUSED(buft);
-}
-
-static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    if (!ggml_backend_is_cuda(backend)) {
-        return false;
-    }
-
-    ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
-    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
-
-    return buft_ctx->device == cuda_ctx->device;
-}
-
-static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
-    /* .get_name         = */ ggml_backend_cuda_buffer_type_name,
-    /* .alloc_buffer     = */ ggml_backend_cuda_buffer_type_alloc_buffer,
-    /* .get_alignment    = */ ggml_backend_cuda_buffer_type_get_alignment,
-    /* .get_alloc_size   = */ ggml_backend_cuda_buffer_type_get_alloc_size,
-    /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
-    /* .is_host          = */ NULL,
-};
-
-ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
-    // FIXME: this is not thread safe
-    if (device >= ggml_backend_cuda_get_device_count()) {
-        return nullptr;
-    }
-
-    static ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES];
-
-    static bool ggml_backend_cuda_buffer_type_initialized = false;
-
-    if (!ggml_backend_cuda_buffer_type_initialized) {
-        for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) {
-            ggml_backend_cuda_buffer_types[i] = {
-                /* .iface    = */ ggml_backend_cuda_buffer_type_interface,
-                /* .context  = */ new ggml_backend_cuda_buffer_type_context{i, GGML_CUDA_NAME + std::to_string(i)},
-            };
-        }
-        ggml_backend_cuda_buffer_type_initialized = true;
-    }
-
-    return &ggml_backend_cuda_buffer_types[device];
-}
-
-// cuda split buffer
-
-struct ggml_backend_cuda_split_buffer_context {
-    ~ggml_backend_cuda_split_buffer_context() {
-        for (ggml_tensor_extra_gpu * extra : tensor_extras) {
-            for (int id = 0; id < g_device_count; ++id) {
-                for (int64_t is = 0; is < MAX_STREAMS; ++is) {
-                    if (extra->events[id][is] != nullptr) {
-                        CUDA_CHECK(cudaEventDestroy(extra->events[id][is]));
-                    }
-                }
-                if (extra->data_device[id] != nullptr) {
-                    CUDA_CHECK(cudaFree(extra->data_device[id]));
-                }
-            }
-            delete extra;
-        }
-    }
-
-    std::vector<ggml_tensor_extra_gpu *> tensor_extras;
-};
-
-static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_t buffer) {
-    return GGML_CUDA_NAME "_Split";
-
-    UNUSED(buffer);
-}
-
-// unused at the moment
-//static bool ggml_backend_buffer_is_cuda_split(ggml_backend_buffer_t buffer) {
-//    return buffer->iface.get_name == ggml_backend_cuda_split_buffer_get_name;
-//}
-
-static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-    ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
-    delete ctx;
-}
-
-static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
-    // the pointers are stored in the tensor extras, this is just a dummy address and never dereferenced
-    return (void *)0x1000;
-
-    UNUSED(buffer);
-}
-
-static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
-    GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
-
-    ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
-    ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
-
-    const int64_t ne0 = tensor->ne[0];
-
-    ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
-
-    ctx->tensor_extras.push_back(extra);
-
-    for (int id = 0; id < g_device_count; ++id) {
-        int64_t row_low, row_high;
-        get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
-
-        int64_t nrows_split = row_high - row_low;
-        if (nrows_split == 0) {
-            continue;
-        }
-
-        size_t size = ggml_nbytes_split(tensor, nrows_split);
-        const size_t original_size = size;
-
-        // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
-        if (ne0 % MATRIX_ROW_PADDING != 0) {
-            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
-        }
-
-        // FIXME: do not crash if cudaMalloc fails
-        // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
-        ggml_cuda_set_device(id);
-        char * buf;
-        CUDA_CHECK(cudaMalloc(&buf, size));
-
-        // set padding to 0 to avoid possible NaN values
-        if (size > original_size) {
-            CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
-        }
-
-        extra->data_device[id] = buf;
-
-        for (int64_t is = 0; is < MAX_STREAMS; ++is) {
-            CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming));
-        }
-    }
-    tensor->backend = GGML_BACKEND_GPU_SPLIT;
-    tensor->extra = extra;
-}
-
-static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-    // split tensors must always be set in their entirety at once
-    GGML_ASSERT(offset == 0);
-    GGML_ASSERT(size == ggml_nbytes(tensor));
-
-    ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
-
-    const int64_t ne0 = tensor->ne[0];
-    const size_t nb1 = tensor->nb[1];
-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;
-
-    for (int id = 0; id < g_device_count; ++id) {
-        int64_t row_low, row_high;
-        get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
-
-        int64_t nrows_split = row_high - row_low;
-        if (nrows_split == 0) {
-            continue;
-        }
-
-        const size_t offset_split = row_low*nb1;
-        size_t size = ggml_nbytes_split(tensor, nrows_split);
-        const size_t original_size = size;
-
-        // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
-        if (ne0 % MATRIX_ROW_PADDING != 0) {
-            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
-        }
-
-        const char * buf_host = (const char *)data + offset_split;
-        CUDA_CHECK(cudaMemcpy(extra->data_device[id], buf_host, original_size, cudaMemcpyHostToDevice));
-    }
-}
-
-static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-    // split tensors must always be set in their entirety at once
-    GGML_ASSERT(offset == 0);
-    GGML_ASSERT(size == ggml_nbytes(tensor));
-
-    ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
-
-    const int64_t ne0 = tensor->ne[0];
-    const size_t nb1 = tensor->nb[1];
-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;
-
-    for (int id = 0; id < g_device_count; ++id) {
-        int64_t row_low, row_high;
-        get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
-
-        int64_t nrows_split = row_high - row_low;
-        if (nrows_split == 0) {
-            continue;
-        }
-
-        const size_t offset_split = row_low*nb1;
-        size_t size = ggml_nbytes_split(tensor, nrows_split);
-        const size_t original_size = size;
-
-        // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
-        if (ne0 % MATRIX_ROW_PADDING != 0) {
-            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
-        }
-
-        char * buf_host = (char *)data + offset_split;
-        CUDA_CHECK(cudaMemcpy(buf_host, extra->data_device[id], original_size, cudaMemcpyDeviceToHost));
-    }
-}
-
-static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
-    UNUSED(buffer);
-    UNUSED(value);
-}
-
-static struct ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = {
-    /* .get_name        = */ ggml_backend_cuda_split_buffer_get_name,
-    /* .free_buffer     = */ ggml_backend_cuda_split_buffer_free_buffer,
-    /* .get_base        = */ ggml_backend_cuda_split_buffer_get_base,
-    /* .init_tensor     = */ ggml_backend_cuda_split_buffer_init_tensor,
-    /* .set_tensor      = */ ggml_backend_cuda_split_buffer_set_tensor,
-    /* .get_tensor      = */ ggml_backend_cuda_split_buffer_get_tensor,
-    /* .cpy_tensor      = */ NULL,
-    /* .clear           = */ ggml_backend_cuda_split_buffer_clear,
-    /* .reset           = */ NULL,
-};
-
-// cuda split buffer type
-
-static const char * ggml_backend_cuda_split_buffer_type_name(ggml_backend_buffer_type_t buft) {
-    return GGML_CUDA_NAME "_Split";
-
-    UNUSED(buft);
-}
-
-static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-    // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
-    // instead, we allocate them for each tensor separately in init_tensor
-    // however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated,
-    // as returned by get_alloc_size. this limit is enforced during tensor allocation by ggml-alloc, so it must be correct.
-    ggml_backend_cuda_split_buffer_context * ctx = new ggml_backend_cuda_split_buffer_context();
-
-    return ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size);
-}
-
-static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
-    return 128;
-
-    UNUSED(buft);
-}
-
-static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
-    ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context;
-
-    size_t total_size = 0;
-
-    const int64_t ne0 = tensor->ne[0];
-
-    for (int id = 0; id < g_device_count; ++id) {
-        int64_t row_low, row_high;
-        get_row_split(&row_low, &row_high, tensor, ctx->tensor_split, id);
-
-        int64_t nrows_split = row_high - row_low;
-        if (nrows_split == 0) {
-            continue;
-        }
-
-        total_size += ggml_nbytes_split(tensor, nrows_split);
-
-        // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
-        if (ne0 % MATRIX_ROW_PADDING != 0) {
-            total_size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
-        }
-    }
-
-    return total_size;
-}
-
-static bool ggml_backend_cuda_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    return ggml_backend_is_cuda(backend);
-
-    UNUSED(buft);
-}
-
-static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
-    return false;
-
-    UNUSED(buft);
-}
-
-static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface = {
-    /* .get_name         = */ ggml_backend_cuda_split_buffer_type_name,
-    /* .alloc_buffer     = */ ggml_backend_cuda_split_buffer_type_alloc_buffer,
-    /* .get_alignment    = */ ggml_backend_cuda_split_buffer_type_get_alignment,
-    /* .get_alloc_size   = */ ggml_backend_cuda_split_buffer_type_get_alloc_size,
-    /* .supports_backend = */ ggml_backend_cuda_split_buffer_type_supports_backend,
-    /* .is_host          = */ ggml_backend_cuda_split_buffer_type_is_host,
-};
-
-ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) {
-    // FIXME: this is not thread safe
-    static std::map<std::array<float, GGML_CUDA_MAX_DEVICES>, struct ggml_backend_buffer_type> buft_map;
-
-    std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split_arr = {};
-
-    bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + GGML_CUDA_MAX_DEVICES, [](float x) { return x == 0.0f; });
-    if (all_zero) {
-        tensor_split_arr = g_default_tensor_split;
-    } else {
-        float split_sum = 0.0f;
-        for (int i = 0; i < g_device_count; ++i) {
-            tensor_split_arr[i] = split_sum;
-            split_sum += tensor_split[i];
-        }
-        for (int i = 0; i < g_device_count; ++i) {
-            tensor_split_arr[i] /= split_sum;
-        }
-    }
-
-    auto it = buft_map.find(tensor_split_arr);
-    if (it != buft_map.end()) {
-        return &it->second;
-    }
-
-    struct ggml_backend_buffer_type buft {
-        /* .iface   = */ ggml_backend_cuda_split_buffer_type_interface,
-        /* .context = */ new ggml_backend_cuda_split_buffer_type_context{tensor_split_arr},
-    };
-
-    auto result = buft_map.emplace(tensor_split_arr, buft);
-    return &result.first->second;
-}
-
-// host buffer type
-
-static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
-    return GGML_CUDA_NAME "_Host";
-
-    UNUSED(buft);
-}
-
-static const char * ggml_backend_cuda_host_buffer_name(ggml_backend_buffer_t buffer) {
-    return GGML_CUDA_NAME "_Host";
-
-    UNUSED(buffer);
-}
-
-static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-    ggml_cuda_host_free(buffer->context);
-}
-
-static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-    void * ptr = ggml_cuda_host_malloc(size);
-
-    if (ptr == nullptr) {
-        // fallback to cpu buffer
-        return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
-    }
-
-    ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
-    buffer->buft = buft;
-    buffer->iface.get_name = ggml_backend_cuda_host_buffer_name;
-    buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
-
-    return buffer;
-}
-
-ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
-    static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = {
-        /* .iface    = */ {
-            /* .get_name         = */ ggml_backend_cuda_host_buffer_type_name,
-            /* .alloc_buffer     = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
-            /* .get_alignment    = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
-            /* .get_alloc_size   = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
-            /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
-            /* .is_host          = */ ggml_backend_cpu_buffer_type()->iface.is_host,
-        },
-        /* .context  = */ nullptr,
-    };
-
-    return &ggml_backend_cuda_buffer_type_host;
-}
-
-// backend
-
-static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
-    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
-
-    return cuda_ctx->name.c_str();
-}
-
-static void ggml_backend_cuda_free(ggml_backend_t backend) {
-    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
-
-    delete cuda_ctx;
-    delete backend;
-}
-
-static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
-    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
-
-    return ggml_backend_cuda_buffer_type(cuda_ctx->device);
-}
-
-static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
-
-    GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
-
-    CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0]));
-}
-
-static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
-
-    GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
-
-    CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));
-}
-
-static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
-    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
-
-    if (dst->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && ggml_backend_buffer_is_cuda(src->buffer)) {
-        CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, g_cudaStreams[cuda_ctx->device][0]));
-        return true;
-    }
-
-    return false;
-}
-
-static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
-    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
-
-    CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[cuda_ctx->device][0]));
-
-    UNUSED(backend);
-}
-
-static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
-    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
-
-    ggml_cuda_set_main_device(cuda_ctx->device);
-
-    ggml_compute_params params = {};
-    params.type = GGML_TASK_COMPUTE;
-    params.ith = 0;
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_tensor * node = cgraph->nodes[i];
-
-        if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
-            continue;
-        }
-
-#ifndef NDEBUG
-        assert(node->backend == GGML_BACKEND_GPU || node->backend == GGML_BACKEND_GPU_SPLIT);
-        assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
-        assert(node->extra != nullptr);
-
-        for (int j = 0; j < GGML_MAX_SRC; j++) {
-            if (node->src[j] != nullptr) {
-                assert(node->src[j]->backend == GGML_BACKEND_GPU || node->src[j]->backend == GGML_BACKEND_GPU_SPLIT);
-                assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
-                assert(node->src[j]->extra != nullptr);
-            }
-        }
-#endif
-
-        bool ok = ggml_cuda_compute_forward(&params, node);
-        if (!ok) {
-            fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
-        }
-        GGML_ASSERT(ok);
-    }
-
-    return true;
-}
-
-static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
-    switch (op->op) {
-        case GGML_OP_UNARY:
-            switch (ggml_get_unary_op(op)) {
-                case GGML_UNARY_OP_GELU:
-                case GGML_UNARY_OP_SILU:
-                case GGML_UNARY_OP_RELU:
-                case GGML_UNARY_OP_GELU_QUICK:
-                case GGML_UNARY_OP_TANH:
-                    return true;
-                default:
-                    return false;
-            }
-            break;
-        case GGML_OP_MUL_MAT:
-        case GGML_OP_MUL_MAT_ID:
-            {
-                struct ggml_tensor * a;
-                struct ggml_tensor * b;
-                if (op->op == GGML_OP_MUL_MAT) {
-                    a = op->src[0];
-                    b = op->src[1];
-                } else {
-                    a = op->src[2];
-                    b = op->src[1];
-                }
-                if (a->ne[3] != b->ne[3]) {
-                    return false;
-                }
-                return true;
-            } break;
-        case GGML_OP_GET_ROWS:
-            {
-                switch (op->src[0]->type) {
-                    case GGML_TYPE_F16:
-                    case GGML_TYPE_F32:
-                    case GGML_TYPE_Q4_0:
-                    case GGML_TYPE_Q4_1:
-                    case GGML_TYPE_Q5_0:
-                    case GGML_TYPE_Q5_1:
-                    case GGML_TYPE_Q8_0:
-                        return true;
-                    default:
-                        return false;
-                }
-            } break;
-        case GGML_OP_CPY:
-            {
-                ggml_type src0_type = op->src[0]->type;
-                ggml_type src1_type = op->src[1]->type;
-                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
-                    return true;
-                }
-                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) {
-                    return true;
-                }
-                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q8_0) {
-                    return true;
-                }
-                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_0) {
-                    return true;
-                }
-                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_1) {
-                    return true;
-                }
-                if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) {
-                    return true;
-                }
-                return false;
-            } break;
-        case GGML_OP_DUP:
-        case GGML_OP_REPEAT:
-        case GGML_OP_CONCAT:
-            {
-                ggml_type src0_type = op->src[0]->type;
-                return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
-            } break;
-        case GGML_OP_NONE:
-        case GGML_OP_RESHAPE:
-        case GGML_OP_VIEW:
-        case GGML_OP_PERMUTE:
-        case GGML_OP_TRANSPOSE:
-        case GGML_OP_NORM:
-        case GGML_OP_ADD:
-        case GGML_OP_MUL:
-        case GGML_OP_DIV:
-        case GGML_OP_RMS_NORM:
-        case GGML_OP_SCALE:
-        case GGML_OP_SQR:
-        case GGML_OP_CLAMP:
-        case GGML_OP_CONT:
-        case GGML_OP_DIAG_MASK_INF:
-        case GGML_OP_SOFT_MAX:
-        case GGML_OP_ROPE:
-        case GGML_OP_ALIBI:
-        case GGML_OP_IM2COL:
-        case GGML_OP_SUM_ROWS:
-        case GGML_OP_ARGSORT:
-        case GGML_OP_ACC:
-        case GGML_OP_GROUP_NORM:
-        case GGML_OP_UPSCALE:
-        case GGML_OP_PAD:
-        case GGML_OP_LEAKY_RELU:
-            return true;
-        default:
-            return false;
-    }
-
-    UNUSED(backend);
-}
-
-static ggml_backend_i ggml_backend_cuda_interface = {
-    /* .get_name                = */ ggml_backend_cuda_name,
-    /* .free                    = */ ggml_backend_cuda_free,
-    /* .get_default_buffer_type = */ ggml_backend_cuda_get_default_buffer_type,
-    /* .set_tensor_async        = */ ggml_backend_cuda_set_tensor_async,
-    /* .get_tensor_async        = */ ggml_backend_cuda_get_tensor_async,
-    /* .cpy_tensor_async        = */ ggml_backend_cuda_cpy_tensor_async,
-    /* .synchronize             = */ ggml_backend_cuda_synchronize,
-    /* .graph_plan_create       = */ NULL,
-    /* .graph_plan_free         = */ NULL,
-    /* .graph_plan_compute      = */ NULL,
-    /* .graph_compute           = */ ggml_backend_cuda_graph_compute,
-    /* .supports_op             = */ ggml_backend_cuda_supports_op,
-};
-
-ggml_backend_t ggml_backend_cuda_init(int device) {
-    ggml_init_cublas(); // TODO: remove from ggml.c
-
-    if (device < 0 || device >= ggml_cuda_get_device_count()) {
-        fprintf(stderr, "%s: error: invalid device %d\n", __func__, device);
-        return nullptr;
-    }
-
-    // not strictly necessary, but it may reduce the overhead of the first graph_compute
-    ggml_cuda_set_main_device(device);
-
-    ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context {
-        /* .device = */ device,
-        /* .name   = */ GGML_CUDA_NAME + std::to_string(device),
-    };
-
-    ggml_backend_t cuda_backend = new ggml_backend {
-        /* .interface = */ ggml_backend_cuda_interface,
-        /* .context   = */ ctx
-    };
-
-    return cuda_backend;
-}
-
-bool ggml_backend_is_cuda(ggml_backend_t backend) {
-    return backend && backend->iface.get_name == ggml_backend_cuda_name;
-}
-
-int ggml_backend_cuda_get_device_count() {
-    return ggml_cuda_get_device_count();
-}
-
-void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size) {
-    ggml_cuda_get_device_description(device, description, description_size);
-}
-
-void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total) {
-    ggml_cuda_set_device(device);
-
-    CUDA_CHECK(cudaMemGetInfo(free, total));
-}
-
-// backend registry
-static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) {
-    ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data);
-    return cuda_backend;
-
-    UNUSED(params);
-}
-
-extern "C" int ggml_backend_cuda_reg_devices();
-
-int ggml_backend_cuda_reg_devices() {
-    int device_count = ggml_cuda_get_device_count();
-    //int device_count = 1; // DEBUG: some tools require delaying CUDA initialization
-    for (int i = 0; i < device_count; i++) {
-        char name[128];
-        snprintf(name, sizeof(name), "%s%d", GGML_CUDA_NAME, i);
-        ggml_backend_register(name, ggml_backend_reg_cuda_init, ggml_backend_cuda_buffer_type(i), (void *) (intptr_t) i);
-    }
-    return device_count;
-}
diff --git a/ggml/src/ggml-cuda.h b/ggml/src/ggml-cuda.h
deleted file mode 100644
index d19cbf3..0000000
--- a/ggml/src/ggml-cuda.h
+++ /dev/null
@@ -1,52 +0,0 @@
-#pragma once
-
-#include "ggml.h"
-#include "ggml-backend.h"
-
-#ifdef GGML_USE_HIPBLAS
-#define GGML_CUDA_NAME "ROCm"
-#define GGML_CUBLAS_NAME "hipBLAS"
-#else
-#define GGML_CUDA_NAME "CUDA"
-#define GGML_CUBLAS_NAME "cuBLAS"
-#endif
-
-#ifdef  __cplusplus
-extern "C" {
-#endif
-
-#define GGML_CUDA_MAX_DEVICES       16
-
-// Always success. To check if CUDA is actually loaded, use `ggml_cublas_loaded`.
-GGML_API void   ggml_init_cublas(void);
-
-// Returns `true` if there are available CUDA devices and cublas loads successfully; otherwise, it returns `false`.
-GGML_API bool   ggml_cublas_loaded(void);
-
-GGML_API void * ggml_cuda_host_malloc(size_t size);
-GGML_API void   ggml_cuda_host_free(void * ptr);
-
-GGML_API bool   ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
-GGML_API bool   ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
-
-GGML_API int    ggml_cuda_get_device_count(void);
-GGML_API void   ggml_cuda_get_device_description(int device, char * description, size_t description_size);
-
-// backend API
-GGML_API ggml_backend_t ggml_backend_cuda_init(int device);
-
-GGML_API bool ggml_backend_is_cuda(ggml_backend_t backend);
-
-GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
-// split tensor buffer that splits matrices by rows across multiple devices
-GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);
-// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
-GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
-
-GGML_API int  ggml_backend_cuda_get_device_count(void);
-GGML_API void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
-GGML_API void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
-
-#ifdef  __cplusplus
-}
-#endif
diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
deleted file mode 100644
index 2c58075..0000000
--- a/ggml/src/ggml-impl.h
+++ /dev/null
@@ -1,246 +0,0 @@
-#pragma once
-
-#include "ggml.h"
-
-// GGML internal header
-
-#include <assert.h>
-#include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
-#include <stddef.h>
-#include <stdbool.h>
-#include <string.h> // memcpy
-#include <math.h>   // fabsf
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// static_assert should be a #define, but if it's not,
-// fall back to the _Static_assert C11 keyword.
-// if C99 - static_assert is noop
-// ref: https://stackoverflow.com/a/53923785/4039976
-#ifndef static_assert
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
-#define static_assert(cond, msg) _Static_assert(cond, msg)
-#else
-#define static_assert(cond, msg) struct global_scope_noop_trick
-#endif
-#endif
-
-// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
-#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
-#ifndef __FMA__
-#define __FMA__
-#endif
-#ifndef __F16C__
-#define __F16C__
-#endif
-#ifndef __SSE3__
-#define __SSE3__
-#endif
-#endif
-
-// 16-bit float
-// on Arm, we use __fp16
-// on x86, we use uint16_t
-#if defined(__ARM_NEON) && !defined(_MSC_VER)
-
-// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
-//
-//   $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
-//
-#include <arm_neon.h>
-
-#define GGML_COMPUTE_FP16_TO_FP32(x) ((float) (x))
-#define GGML_COMPUTE_FP32_TO_FP16(x) (x)
-
-#define GGML_FP16_TO_FP32(x) ((float) (x))
-#define GGML_FP32_TO_FP16(x) (x)
-
-#else
-
-#ifdef __wasm_simd128__
-#include <wasm_simd128.h>
-#else
-#ifdef __POWER9_VECTOR__
-#include <altivec.h>
-#undef bool
-#define bool _Bool
-#else
-#if defined(_MSC_VER) || defined(__MINGW32__)
-#include <intrin.h>
-#else
-#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__)
-#if !defined(__riscv)
-#include <immintrin.h>
-#endif
-#endif
-#endif
-#endif
-#endif
-
-#ifdef __riscv_v_intrinsic
-#include <riscv_vector.h>
-#endif
-
-#ifdef __F16C__
-
-#ifdef _MSC_VER
-#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
-#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
-#else
-#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
-#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
-#endif
-
-#elif defined(__POWER9_VECTOR__)
-
-#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
-#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
-/* the inline asm below is about 12% faster than the lookup method */
-#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
-#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
-
-static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
-    register float f;
-    register double d;
-    __asm__(
-        "mtfprd %0,%2\n"
-        "xscvhpdp %0,%0\n"
-        "frsp %1,%0\n" :
-        /* temp */ "=d"(d),
-        /* out */  "=f"(f):
-        /* in */   "r"(h));
-    return f;
-}
-
-static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
-    register double d;
-    register ggml_fp16_t r;
-    __asm__( /* xscvdphp can work on double or single precision */
-        "xscvdphp %0,%2\n"
-        "mffprd %1,%0\n" :
-        /* temp */ "=d"(d),
-        /* out */  "=r"(r):
-        /* in */   "f"(f));
-    return r;
-}
-
-#else
-
-// FP16 <-> FP32
-// ref: https://github.com/Maratyszcza/FP16
-
-static inline float fp32_from_bits(uint32_t w) {
-    union {
-        uint32_t as_bits;
-        float as_value;
-    } fp32;
-    fp32.as_bits = w;
-    return fp32.as_value;
-}
-
-static inline uint32_t fp32_to_bits(float f) {
-    union {
-        float as_value;
-        uint32_t as_bits;
-    } fp32;
-    fp32.as_value = f;
-    return fp32.as_bits;
-}
-
-static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
-    const uint32_t w = (uint32_t) h << 16;
-    const uint32_t sign = w & UINT32_C(0x80000000);
-    const uint32_t two_w = w + w;
-
-    const uint32_t exp_offset = UINT32_C(0xE0) << 23;
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
-    const float exp_scale = 0x1.0p-112f;
-#else
-    const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
-#endif
-    const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
-
-    const uint32_t magic_mask = UINT32_C(126) << 23;
-    const float magic_bias = 0.5f;
-    const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
-
-    const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
-    const uint32_t result = sign |
-        (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
-    return fp32_from_bits(result);
-}
-
-static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
-    const float scale_to_inf = 0x1.0p+112f;
-    const float scale_to_zero = 0x1.0p-110f;
-#else
-    const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
-    const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
-#endif
-    float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
-
-    const uint32_t w = fp32_to_bits(f);
-    const uint32_t shl1_w = w + w;
-    const uint32_t sign = w & UINT32_C(0x80000000);
-    uint32_t bias = shl1_w & UINT32_C(0xFF000000);
-    if (bias < UINT32_C(0x71000000)) {
-        bias = UINT32_C(0x71000000);
-    }
-
-    base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
-    const uint32_t bits = fp32_to_bits(base);
-    const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
-    const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
-    const uint32_t nonsign = exp_bits + mantissa_bits;
-    return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
-}
-
-#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
-#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
-
-#endif // __F16C__
-
-#endif // __ARM_NEON
-
-// precomputed f32 table for f16 (256 KB)
-// defined in ggml.c, initialized in ggml_init()
-extern float ggml_table_f32_f16[1 << 16];
-
-// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
-// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
-// This is also true for POWER9.
-#if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16)
-
-inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
-    uint16_t s;
-    memcpy(&s, &f, sizeof(uint16_t));
-    return ggml_table_f32_f16[s];
-}
-
-#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
-#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
-
-#endif
-
-#define GGML_HASHTABLE_FULL ((size_t)-1)
-#define GGML_HASHTABLE_ALREADY_EXISTS ((size_t)-2)
-
-struct ggml_hash_set ggml_hash_set_new(size_t size);
-
-bool   ggml_hash_contains      (const struct ggml_hash_set hash_set, struct ggml_tensor * key);
-
-// returns GGML_HASHTABLE_FULL if table is full, otherwise the current index of the key or where it should be inserted
-size_t ggml_hash_find          (const struct ggml_hash_set hash_set, struct ggml_tensor * key);
-
-// returns GGML_HASHTABLE_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full
-size_t ggml_hash_insert        (      struct ggml_hash_set hash_set, struct ggml_tensor * key);
-
-// return index, asserts if table is full
-size_t ggml_hash_find_or_insert(      struct ggml_hash_set hash_set, struct ggml_tensor * key);
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/ggml/src/ggml-metal.h b/ggml/src/ggml-metal.h
deleted file mode 100644
index cd5e299..0000000
--- a/ggml/src/ggml-metal.h
+++ /dev/null
@@ -1,64 +0,0 @@
-// An interface allowing to compute ggml_cgraph with Metal
-//
-// This is a fully functional interface that extends ggml with GPU support for Apple devices.
-// A similar interface can be created for other GPU backends (e.g. Vulkan, CUDA, OpenCL, etc.)
-//
-// How it works?
-//
-// As long as your program can create and evaluate a ggml_cgraph on the CPU, you can use this
-// interface to evaluate the same graph on the GPU. Instead of using ggml_graph_compute(), you
-// use ggml_metal_graph_compute() (or ggml_vulkan_graph_compute(), etc.)
-//
-// You only need to make sure that all memory buffers that you used during the graph creation
-// are mapped to the device memory with the ggml_metal_add_buffer() function. This mapping is
-// used during the graph evaluation to determine the arguments of the compute kernels.
-//
-// Synchronization between device and host memory (for example for input and output tensors)
-// is done with the ggml_metal_set_tensor() and ggml_metal_get_tensor() functions.
-//
-
-#pragma once
-
-#include "ggml.h"
-#include "ggml-backend.h"
-
-#include <stddef.h>
-#include <stdbool.h>
-
-// max memory buffers that can be mapped to the device
-#define GGML_METAL_MAX_BUFFERS 64
-#define GGML_METAL_MAX_COMMAND_BUFFERS 32
-
-struct ggml_tensor;
-struct ggml_cgraph;
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-//
-// backend API
-// user-code should use only these functions
-//
-
-GGML_API void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
-
-GGML_API ggml_backend_t ggml_backend_metal_init(void);
-
-GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
-
-GGML_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);
-
-GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
-
-GGML_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
-
-// helper to check if the device supports a specific family
-// ideally, the user code should be doing these checks
-// ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
-GGML_API bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family);
-
-#ifdef __cplusplus
-}
-#endif
-
diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m
deleted file mode 100644
index cae52c9..0000000
--- a/ggml/src/ggml-metal.m
+++ /dev/null
@@ -1,2640 +0,0 @@
-#import "ggml-metal.h"
-
-#import "ggml-backend-impl.h"
-#import "ggml.h"
-
-#import <Foundation/Foundation.h>
-
-#import <Metal/Metal.h>
-
-#undef MIN
-#undef MAX
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
-#define MAX(a, b) ((a) > (b) ? (a) : (b))
-
-#ifdef GGML_METAL_NDEBUG
-#define GGML_METAL_LOG_INFO(...)
-#define GGML_METAL_LOG_WARN(...)
-#define GGML_METAL_LOG_ERROR(...)
-#else
-#define GGML_METAL_LOG_INFO(...)  ggml_metal_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
-#define GGML_METAL_LOG_WARN(...)  ggml_metal_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
-#define GGML_METAL_LOG_ERROR(...) ggml_metal_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
-#endif
-
-#define UNUSED(x) (void)(x)
-
-#define GGML_METAL_MAX_KERNELS 256
-
-struct ggml_metal_buffer {
-    const char * name;
-
-    void   * data;
-    size_t   size;
-
-    id<MTLBuffer> metal;
-};
-
-struct ggml_metal_kernel {
-    id<MTLFunction>             function;
-    id<MTLComputePipelineState> pipeline;
-};
-
-enum ggml_metal_kernel_type {
-    GGML_METAL_KERNEL_TYPE_ADD,
-    GGML_METAL_KERNEL_TYPE_ADD_ROW,
-    GGML_METAL_KERNEL_TYPE_MUL,
-    GGML_METAL_KERNEL_TYPE_MUL_ROW,
-    GGML_METAL_KERNEL_TYPE_DIV,
-    GGML_METAL_KERNEL_TYPE_DIV_ROW,
-    GGML_METAL_KERNEL_TYPE_SCALE,
-    GGML_METAL_KERNEL_TYPE_SCALE_4,
-    GGML_METAL_KERNEL_TYPE_TANH,
-    GGML_METAL_KERNEL_TYPE_RELU,
-    GGML_METAL_KERNEL_TYPE_GELU,
-    GGML_METAL_KERNEL_TYPE_GELU_QUICK,
-    GGML_METAL_KERNEL_TYPE_SILU,
-    GGML_METAL_KERNEL_TYPE_SOFT_MAX,
-    GGML_METAL_KERNEL_TYPE_SOFT_MAX_4,
-    GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF,
-    GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF_8,
-    GGML_METAL_KERNEL_TYPE_GET_ROWS_F32,
-    GGML_METAL_KERNEL_TYPE_GET_ROWS_F16,
-    GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_0,
-    GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_1,
-    GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_0,
-    GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_1,
-    GGML_METAL_KERNEL_TYPE_GET_ROWS_Q8_0,
-    GGML_METAL_KERNEL_TYPE_GET_ROWS_Q2_K,
-    GGML_METAL_KERNEL_TYPE_GET_ROWS_Q3_K,
-    GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_K,
-    GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_K,
-    GGML_METAL_KERNEL_TYPE_GET_ROWS_Q6_K,
-    GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS,
-    GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS,
-    GGML_METAL_KERNEL_TYPE_GET_ROWS_I32,
-    GGML_METAL_KERNEL_TYPE_RMS_NORM,
-    GGML_METAL_KERNEL_TYPE_GROUP_NORM,
-    GGML_METAL_KERNEL_TYPE_NORM,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_0_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_1_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_1_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_Q8_0_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_Q2_K_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_Q3_K_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_K_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_K_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_Q6_K_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32,
-  //GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F16,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32,
-  //GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32_1ROW,
-  //GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32_L4,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_0_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_1_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_0_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_1_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q8_0_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q2_K_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q3_K_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_K_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_K_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q6_K_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_1_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_0_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_1_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_Q8_0_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_Q2_K_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_Q3_K_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_K_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_K_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_Q6_K_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32,
-    GGML_METAL_KERNEL_TYPE_ROPE_F32,
-    GGML_METAL_KERNEL_TYPE_ROPE_F16,
-    GGML_METAL_KERNEL_TYPE_ALIBI_F32,
-    GGML_METAL_KERNEL_TYPE_IM2COL_F16,
-    GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
-    GGML_METAL_KERNEL_TYPE_PAD_F32,
-    GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
-    GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC,
-    GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32,
-    GGML_METAL_KERNEL_TYPE_CPY_F32_F16,
-    GGML_METAL_KERNEL_TYPE_CPY_F32_F32,
-    GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0,
-    GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_0,
-    GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_1,
-  //GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_0,
-  //GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_1,
-    GGML_METAL_KERNEL_TYPE_CPY_F16_F16,
-    GGML_METAL_KERNEL_TYPE_CPY_F16_F32,
-    GGML_METAL_KERNEL_TYPE_CONCAT,
-    GGML_METAL_KERNEL_TYPE_SQR,
-    GGML_METAL_KERNEL_TYPE_SUM_ROWS,
-
-    GGML_METAL_KERNEL_TYPE_COUNT
-};
-
-struct ggml_metal_context {
-    int n_cb;
-
-    id<MTLDevice>       device;
-    id<MTLCommandQueue> queue;
-    id<MTLLibrary>      library;
-
-    id<MTLCommandBuffer>         command_buffers [GGML_METAL_MAX_COMMAND_BUFFERS];
-    id<MTLComputeCommandEncoder> command_encoders[GGML_METAL_MAX_COMMAND_BUFFERS];
-
-    dispatch_queue_t d_queue;
-
-    int n_buffers;
-    struct ggml_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
-
-    struct ggml_metal_kernel kernels[GGML_METAL_MAX_KERNELS];
-
-    bool support_simdgroup_reduction;
-    bool support_simdgroup_mm;
-};
-
-// MSL code
-// TODO: move the contents here when ready
-//       for now it is easier to work in a separate file
-//static NSString * const msl_library_source = @"see metal.metal";
-
-// Here to assist with NSBundle Path Hack
-@interface GGMLMetalClass : NSObject
-@end
-@implementation GGMLMetalClass
-@end
-
-static void ggml_metal_default_log_callback(enum ggml_log_level level, const char * msg, void * user_data) {
-    fprintf(stderr, "%s", msg);
-
-    UNUSED(level);
-    UNUSED(user_data);
-}
-
-ggml_log_callback ggml_metal_log_callback = ggml_metal_default_log_callback;
-void * ggml_metal_log_user_data = NULL;
-
-GGML_ATTRIBUTE_FORMAT(2, 3)
-static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){
-    if (ggml_metal_log_callback != NULL) {
-        va_list args;
-        va_start(args, format);
-        char buffer[128];
-        int len = vsnprintf(buffer, 128, format, args);
-        if (len < 128) {
-            ggml_metal_log_callback(level, buffer, ggml_metal_log_user_data);
-        } else {
-            char* buffer2 = malloc(len+1);
-            va_end(args);
-            va_start(args, format);
-            vsnprintf(buffer2, len+1, format, args);
-            buffer2[len] = 0;
-            ggml_metal_log_callback(level, buffer2, ggml_metal_log_user_data);
-            free(buffer2);
-        }
-        va_end(args);
-    }
-}
-
-static void * ggml_metal_host_malloc(size_t n) {
-    void * data = NULL;
-    const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n);
-    if (result != 0) {
-        GGML_METAL_LOG_ERROR("%s: error: posix_memalign failed\n", __func__);
-        return NULL;
-    }
-
-    return data;
-}
-
-static struct ggml_metal_context * ggml_metal_init(int n_cb) {
-    GGML_METAL_LOG_INFO("%s: allocating\n", __func__);
-
-    id<MTLDevice> device;
-    NSString * s;
-
-#if TARGET_OS_OSX
-    // Show all the Metal device instances in the system
-    NSArray * devices = MTLCopyAllDevices();
-    for (device in devices) {
-        s = [device name];
-        GGML_METAL_LOG_INFO("%s: found device: %s\n", __func__, [s UTF8String]);
-    }
-#endif
-
-    // Pick and show default Metal device
-    device = MTLCreateSystemDefaultDevice();
-    s = [device name];
-    GGML_METAL_LOG_INFO("%s: picking default device: %s\n", __func__, [s UTF8String]);
-
-    // Configure context
-    struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
-    ctx->device = device;
-    ctx->n_cb   = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
-    ctx->queue  = [ctx->device newCommandQueue];
-    ctx->n_buffers = 0;
-
-    ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
-
-    // load library
-    {
-        NSBundle * bundle = nil;
-#ifdef SWIFT_PACKAGE
-        bundle = SWIFTPM_MODULE_BUNDLE;
-#else
-        bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
-#endif
-        NSError * error = nil;
-        NSString * libPath = [bundle pathForResource:@"default" ofType:@"metallib"];
-        if (libPath != nil) {
-            // pre-compiled library found
-            NSURL * libURL = [NSURL fileURLWithPath:libPath];
-            GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [libPath UTF8String]);
-            ctx->library = [ctx->device newLibraryWithURL:libURL error:&error];
-        } else {
-            GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
-
-            NSString * sourcePath;
-            NSString * ggmlMetalPathResources = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"];
-
-            GGML_METAL_LOG_INFO("%s: GGML_METAL_PATH_RESOURCES = %s\n", __func__, ggmlMetalPathResources ? [ggmlMetalPathResources UTF8String] : "nil");
-
-            if (ggmlMetalPathResources) {
-                sourcePath = [ggmlMetalPathResources stringByAppendingPathComponent:@"ggml-metal.metal"];
-            } else {
-                sourcePath = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
-            }
-            if (sourcePath == nil) {
-                GGML_METAL_LOG_WARN("%s: error: could not use bundle path to find ggml-metal.metal, falling back to trying cwd\n", __func__);
-                sourcePath = @"ggml-metal.metal";
-            }
-            GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [sourcePath UTF8String]);
-            NSString * src = [NSString stringWithContentsOfFile:sourcePath encoding:NSUTF8StringEncoding error:&error];
-            if (error) {
-                GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
-                return NULL;
-            }
-
-            // dictionary of preprocessor macros
-            NSMutableDictionary * prep = [NSMutableDictionary dictionary];
-
-#ifdef GGML_QKK_64
-            prep[@"QK_K"] = @(64);
-#endif
-
-            MTLCompileOptions* options = [MTLCompileOptions new];
-            options.preprocessorMacros = prep;
-
-            //[options setFastMathEnabled:false];
-
-            ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error];
-
-            [options release];
-            [prep release];
-        }
-
-        if (error) {
-            GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
-            return NULL;
-        }
-    }
-
-#if TARGET_OS_OSX
-    // print MTL GPU family:
-    GGML_METAL_LOG_INFO("%s: GPU name:   %s\n", __func__, [[ctx->device name] UTF8String]);
-
-    const NSInteger MTLGPUFamilyMetal3 = 5001;
-
-    // determine max supported GPU family
-    // https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf
-    // https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
-    {
-        for (int i = MTLGPUFamilyApple1 + 20; i >= MTLGPUFamilyApple1; --i) {
-            if ([ctx->device supportsFamily:i]) {
-                GGML_METAL_LOG_INFO("%s: GPU family: MTLGPUFamilyApple%d  (%d)\n", __func__, i - (int) MTLGPUFamilyApple1 + 1, i);
-                break;
-            }
-        }
-
-        for (int i = MTLGPUFamilyCommon1 + 5; i >= MTLGPUFamilyCommon1; --i) {
-            if ([ctx->device supportsFamily:i]) {
-                GGML_METAL_LOG_INFO("%s: GPU family: MTLGPUFamilyCommon%d (%d)\n", __func__, i - (int) MTLGPUFamilyCommon1 + 1, i);
-                break;
-            }
-        }
-
-        for (int i = MTLGPUFamilyMetal3 + 5; i >= MTLGPUFamilyMetal3; --i) {
-            if ([ctx->device supportsFamily:i]) {
-                GGML_METAL_LOG_INFO("%s: GPU family: MTLGPUFamilyMetal%d  (%d)\n", __func__, i - (int) MTLGPUFamilyMetal3 + 3, i);
-                break;
-            }
-        }
-    }
-
-    ctx->support_simdgroup_reduction  = [ctx->device supportsFamily:MTLGPUFamilyApple7];
-    ctx->support_simdgroup_reduction |= [ctx->device supportsFamily:MTLGPUFamilyMetal3];
-
-    ctx->support_simdgroup_mm = [ctx->device supportsFamily:MTLGPUFamilyApple7];
-
-    GGML_METAL_LOG_INFO("%s: simdgroup reduction support   = %s\n",       __func__, ctx->support_simdgroup_reduction ? "true" : "false");
-    GGML_METAL_LOG_INFO("%s: simdgroup matrix mul. support = %s\n",       __func__, ctx->support_simdgroup_mm ? "true" : "false");
-    GGML_METAL_LOG_INFO("%s: hasUnifiedMemory              = %s\n",       __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
-    GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize  = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1e6);
-    if (ctx->device.maxTransferRate != 0) {
-        GGML_METAL_LOG_INFO("%s: maxTransferRate               = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1e6);
-    } else {
-        GGML_METAL_LOG_INFO("%s: maxTransferRate               = built-in GPU\n", __func__);
-    }
-#endif
-
-    // load kernels
-    {
-        NSError * error = nil;
-
-        for (int i = 0; i < GGML_METAL_MAX_KERNELS; ++i) {
-            ctx->kernels[i].function = nil;
-            ctx->kernels[i].pipeline = nil;
-        }
-
-        /*
-            GGML_METAL_LOG_INFO("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) kernel->pipeline, \
-                    (int) kernel->pipeline.maxTotalThreadsPerThreadgroup, \
-                    (int) kernel->pipeline.threadExecutionWidth); \
-        */
-#define GGML_METAL_ADD_KERNEL(e, name, supported) \
-        if (supported) { \
-            struct ggml_metal_kernel * kernel = &ctx->kernels[e]; \
-            kernel->function = [ctx->library newFunctionWithName:@"kernel_"#name]; \
-            kernel->pipeline = [ctx->device newComputePipelineStateWithFunction:kernel->function error:&error]; \
-            if (error) { \
-                GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
-                return NULL; \
-            } \
-        } else { \
-            GGML_METAL_LOG_WARN("%s: skipping %-32s (not supported)\n", __func__, "kernel_"#name); \
-        }
-
-        // simd_sum and simd_max requires MTLGPUFamilyApple7
-
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD,                       add,                    true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_ROW,                   add_row,                true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL,                       mul,                    true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_ROW,                   mul_row,                true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIV,                       div,                    true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIV_ROW,                   div_row,                true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SCALE,                     scale,                  true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SCALE_4,                   scale_4,                true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TANH,                      tanh,                   true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RELU,                      relu,                   true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU,                      gelu,                   true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_QUICK,                gelu_quick,             true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SILU,                      silu,                   true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX,                  soft_max,               ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_4,                soft_max_4,             ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF,             diag_mask_inf,          true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF_8,           diag_mask_inf_8,        true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_F32,              get_rows_f32,           true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_F16,              get_rows_f16,           true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_0,             get_rows_q4_0,          true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_1,             get_rows_q4_1,          true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_0,             get_rows_q5_0,          true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_1,             get_rows_q5_1,          true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q8_0,             get_rows_q8_0,          true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q2_K,             get_rows_q2_K,          true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q3_K,             get_rows_q3_K,          true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_K,             get_rows_q4_K,          true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_K,             get_rows_q5_K,          true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q6_K,             get_rows_q6_K,          true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS,          get_rows_iq2_xxs,       true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS,           get_rows_iq2_xs,        true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_I32,              get_rows_i32,           true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RMS_NORM,                  rms_norm,               ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GROUP_NORM,                group_norm,             ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_NORM,                      norm,                   true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32,            mul_mv_f32_f32,         ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16,            mul_mv_f16_f16,         ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32,            mul_mv_f16_f32,         ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW,       mul_mv_f16_f32_1row,    ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4,         mul_mv_f16_f32_l4,      ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_0_F32,           mul_mv_q4_0_f32,        ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_1_F32,           mul_mv_q4_1_f32,        ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32,           mul_mv_q5_0_f32,        ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_1_F32,           mul_mv_q5_1_f32,        ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q8_0_F32,           mul_mv_q8_0_f32,        ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q2_K_F32,           mul_mv_q2_K_f32,        ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q3_K_F32,           mul_mv_q3_K_f32,        ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_K_F32,           mul_mv_q4_K_f32,        ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_K_F32,           mul_mv_q5_K_f32,        ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q6_K_F32,           mul_mv_q6_K_f32,        ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32,        mul_mv_iq2_xxs_f32,     ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32,         mul_mv_iq2_xs_f32,      ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32,         mul_mv_id_f32_f32,      ctx->support_simdgroup_reduction);
-      //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F16,         mul_mv_id_f16_f16,      ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32,         mul_mv_id_f16_f32,      ctx->support_simdgroup_reduction);
-      //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32_1ROW,    mul_mv_id_f16_f32_1row, ctx->support_simdgroup_reduction);
-      //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32_L4,      mul_mv_id_f16_f32_l4,   ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_0_F32,        mul_mv_id_q4_0_f32,     ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_1_F32,        mul_mv_id_q4_1_f32,     ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_0_F32,        mul_mv_id_q5_0_f32,     ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_1_F32,        mul_mv_id_q5_1_f32,     ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q8_0_F32,        mul_mv_id_q8_0_f32,     ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q2_K_F32,        mul_mv_id_q2_K_f32,     ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q3_K_F32,        mul_mv_id_q3_K_f32,     ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_K_F32,        mul_mv_id_q4_K_f32,     ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_K_F32,        mul_mv_id_q5_K_f32,     ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q6_K_F32,        mul_mv_id_q6_K_f32,     ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32,     mul_mv_id_iq2_xxs_f32,  ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32,      mul_mv_id_iq2_xs_f32,   ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32,            mul_mm_f32_f32,         ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32,            mul_mm_f16_f32,         ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32,           mul_mm_q4_0_f32,        ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_1_F32,           mul_mm_q4_1_f32,        ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_0_F32,           mul_mm_q5_0_f32,        ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_1_F32,           mul_mm_q5_1_f32,        ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q8_0_F32,           mul_mm_q8_0_f32,        ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q2_K_F32,           mul_mm_q2_K_f32,        ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q3_K_F32,           mul_mm_q3_K_f32,        ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_K_F32,           mul_mm_q4_K_f32,        ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_K_F32,           mul_mm_q5_K_f32,        ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q6_K_F32,           mul_mm_q6_K_f32,        ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32,        mul_mm_iq2_xxs_f32,     ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32,         mul_mm_iq2_xs_f32,      ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32,         mul_mm_id_f32_f32,      ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32,         mul_mm_id_f16_f32,      ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32,        mul_mm_id_q4_0_f32,     ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32,        mul_mm_id_q4_1_f32,     ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32,        mul_mm_id_q5_0_f32,     ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F32,        mul_mm_id_q5_1_f32,     ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F32,        mul_mm_id_q8_0_f32,     ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F32,        mul_mm_id_q2_K_f32,     ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F32,        mul_mm_id_q3_K_f32,     ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F32,        mul_mm_id_q4_K_f32,     ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F32,        mul_mm_id_q5_K_f32,     ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32,        mul_mm_id_q6_K_f32,     ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32,     mul_mm_id_iq2_xxs_f32,  ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32,      mul_mm_id_iq2_xs_f32,   ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F32,                  rope_f32,               true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F16,                  rope_f16,               true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ALIBI_F32,                 alibi_f32,              true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F16,                im2col_f16,             true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32,               upscale_f32,            true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32,                   pad_f32,                true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,       argsort_f32_i32_asc,    true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC,      argsort_f32_i32_desc,   true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32,            leaky_relu_f32,         true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F16,               cpy_f32_f16,            true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F32,               cpy_f32_f32,            true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0,              cpy_f32_q8_0,           true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_0,              cpy_f32_q4_0,           true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_1,              cpy_f32_q4_1,           true);
-      //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_0,              cpy_f32_q5_0,           true);
-      //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_1,              cpy_f32_q5_1,           true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F16_F16,               cpy_f16_f16,            true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F16_F32,               cpy_f16_f32,            true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CONCAT,                    concat,                 true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SQR,                       sqr,                    true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS,                  sum_rows,               true);
-    }
-
-    return ctx;
-}
-
-static void ggml_metal_free(struct ggml_metal_context * ctx) {
-    GGML_METAL_LOG_INFO("%s: deallocating\n", __func__);
-
-    for (int i = 0; i < ctx->n_buffers; ++i) {
-        [ctx->buffers[i].metal release];
-    }
-
-    for (int i = 0; i < GGML_METAL_MAX_KERNELS; ++i) {
-        if (ctx->kernels[i].pipeline) {
-            [ctx->kernels[i].pipeline release];
-        }
-
-        if (ctx->kernels[i].function) {
-            [ctx->kernels[i].function release];
-        }
-    }
-
-    [ctx->library release];
-    [ctx->queue release];
-    [ctx->device release];
-
-    dispatch_release(ctx->d_queue);
-
-    free(ctx);
-}
-
-// temporarily defined here for compatibility between ggml-backend and the old API
-
-struct ggml_backend_metal_buffer {
-    void   * data;
-    size_t   size;
-
-    id<MTLBuffer> metal;
-};
-
-struct ggml_backend_metal_buffer_context {
-    void * all_data;
-    size_t all_size;
-    bool owned;
-
-    // multiple buffers are used only to avoid the maximum buffer size limitation when using mmap
-    int n_buffers;
-    struct ggml_backend_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
-};
-
-// finds the Metal buffer that contains the tensor data on the GPU device
-// the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
-// Metal buffer based on the host memory pointer
-//
-static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, struct ggml_tensor * t, size_t * offs) {
-    //GGML_METAL_LOG_INFO("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);
-
-    const int64_t tsize = ggml_nbytes(t);
-
-    ggml_backend_buffer_t buffer = t->view_src ? t->view_src->buffer : t->buffer;
-
-    // compatibility with ggml-backend
-    if (buffer && buffer->buft == ggml_backend_metal_buffer_type()) {
-        struct ggml_backend_metal_buffer_context * buf_ctx = (struct ggml_backend_metal_buffer_context *) buffer->context;
-
-        // find the view that contains the tensor fully
-        for (int i = 0; i < buf_ctx->n_buffers; ++i) {
-            const int64_t ioffs = (int64_t) t->data - (int64_t) buf_ctx->buffers[i].data;
-
-            //GGML_METAL_LOG_INFO("ioffs = %10ld, tsize = %10ld, sum = %10ld, buf_ctx->buffers[%d].size = %10ld\n", ioffs, tsize, ioffs + tsize, i, buf_ctx->buffers[i].size);
-            if (ioffs >= 0 && ioffs + tsize <= (int64_t) buf_ctx->buffers[i].size) {
-                *offs = (size_t) ioffs;
-
-                //GGML_METAL_LOG_INFO("%s: tensor '%16s', offs = %8ld\n", __func__, t->name, *offs);
-
-                return buf_ctx->buffers[i].metal;
-            }
-        }
-
-        GGML_METAL_LOG_ERROR("%s: error: tensor '%s' buffer is nil\n", __func__, t->name);
-
-        return nil;
-    }
-
-    // find the view that contains the tensor fully
-    for (int i = 0; i < ctx->n_buffers; ++i) {
-        const int64_t ioffs = (int64_t) t->data - (int64_t) ctx->buffers[i].data;
-
-        //GGML_METAL_LOG_INFO("ioffs = %10ld, tsize = %10ld, sum = %10ld, ctx->buffers[%d].size = %10ld, name = %s\n", ioffs, tsize, ioffs + tsize, i, ctx->buffers[i].size, ctx->buffers[i].name);
-        if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) {
-            *offs = (size_t) ioffs;
-
-            //GGML_METAL_LOG_INFO("%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs);
-
-            return ctx->buffers[i].metal;
-        }
-    }
-
-    GGML_METAL_LOG_ERROR("%s: error: buffer is nil\n", __func__);
-
-    return nil;
-}
-
-static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const struct ggml_tensor * op) {
-    switch (op->op) {
-        case GGML_OP_UNARY:
-            switch (ggml_get_unary_op(op)) {
-                case GGML_UNARY_OP_TANH:
-                case GGML_UNARY_OP_RELU:
-                case GGML_UNARY_OP_GELU:
-                case GGML_UNARY_OP_GELU_QUICK:
-                case GGML_UNARY_OP_SILU:
-                    return true;
-                default:
-                    return false;
-            }
-        case GGML_OP_NONE:
-        case GGML_OP_RESHAPE:
-        case GGML_OP_VIEW:
-        case GGML_OP_TRANSPOSE:
-        case GGML_OP_PERMUTE:
-        case GGML_OP_CONCAT:
-        case GGML_OP_ADD:
-        case GGML_OP_ACC:
-        case GGML_OP_MUL:
-        case GGML_OP_DIV:
-        case GGML_OP_SCALE:
-        case GGML_OP_SQR:
-        case GGML_OP_SUM_ROWS:
-            return true;
-        case GGML_OP_SOFT_MAX:
-        case GGML_OP_RMS_NORM:
-        case GGML_OP_GROUP_NORM:
-            return ctx->support_simdgroup_reduction;
-        case GGML_OP_NORM:
-        case GGML_OP_ALIBI:
-        case GGML_OP_ROPE:
-        case GGML_OP_IM2COL:
-        case GGML_OP_UPSCALE:
-        case GGML_OP_PAD:
-        case GGML_OP_ARGSORT:
-        case GGML_OP_LEAKY_RELU:
-            return true;
-        case GGML_OP_MUL_MAT:
-        case GGML_OP_MUL_MAT_ID:
-            return ctx->support_simdgroup_reduction;
-        case GGML_OP_CPY:
-        case GGML_OP_DUP:
-        case GGML_OP_CONT:
-            {
-                switch (op->src[0]->type) {
-                    case GGML_TYPE_F32:
-                        switch (op->type) {
-                           case GGML_TYPE_F16:
-                           case GGML_TYPE_F32:
-                           case GGML_TYPE_Q8_0:
-                           case GGML_TYPE_Q4_0:
-                           case GGML_TYPE_Q4_1:
-                                return true;
-                           default:
-                                return false;
-                        }
-                    case GGML_TYPE_F16:
-                        switch (op->type) {
-                           case GGML_TYPE_F16:
-                           case GGML_TYPE_F32:
-                                return true;
-                           default:
-                                return false;
-                        }
-                    default:
-                        return false;
-                };
-            }
-        case GGML_OP_DIAG_MASK_INF:
-        case GGML_OP_GET_ROWS:
-            {
-                return op->ne[3] == 1;
-            }
-        default:
-            return false;
-    }
-}
-
-static bool ggml_metal_graph_compute(
-        struct ggml_metal_context * ctx,
-               struct ggml_cgraph * gf) {
-    @autoreleasepool {
-
-    MTLComputePassDescriptor * edesc = MTLComputePassDescriptor.computePassDescriptor;
-
-    const int n_nodes  = gf->n_nodes;
-    edesc.dispatchType = MTLDispatchTypeSerial;
-
-    // create multiple command buffers and enqueue them
-    // then, we encode the graph into the command buffers in parallel
-
-    const int n_cb = ctx->n_cb;
-
-    for (int i = 0; i < n_cb; ++i) {
-        ctx->command_buffers[i] = [ctx->queue commandBuffer];
-
-        // enqueue the command buffers in order to specify their execution order
-        [ctx->command_buffers[i] enqueue];
-
-        ctx->command_encoders[i] = [ctx->command_buffers[i] computeCommandEncoderWithDescriptor: edesc];
-    }
-
-    for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
-        const int n_nodes_per_cb = (n_nodes + n_cb - 1) / n_cb;
-
-        dispatch_async(ctx->d_queue, ^{
-            size_t offs_src0 = 0;
-            size_t offs_src1 = 0;
-            size_t offs_dst  = 0;
-
-            id<MTLCommandBuffer> command_buffer  = ctx->command_buffers[cb_idx];
-            id<MTLComputeCommandEncoder> encoder = ctx->command_encoders[cb_idx];
-
-            const int node_start =                                      (cb_idx + 0) * n_nodes_per_cb;
-            const int node_end   = MIN((cb_idx == n_cb - 1) ? n_nodes : (cb_idx + 1) * n_nodes_per_cb, n_nodes);
-
-            for (int ind = node_start; ind < node_end; ++ind) {
-                const int i = ind;
-
-                if (i == -1) {
-                    [encoder memoryBarrierWithScope:MTLBarrierScopeBuffers];
-                    continue;
-                }
-
-                //GGML_METAL_LOG_INFO("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
-
-                struct ggml_tensor * src0 = gf->nodes[i]->src[0];
-                struct ggml_tensor * src1 = gf->nodes[i]->src[1];
-                struct ggml_tensor * dst  = gf->nodes[i];
-
-                switch (dst->op) {
-                    case GGML_OP_NONE:
-                    case GGML_OP_RESHAPE:
-                    case GGML_OP_VIEW:
-                    case GGML_OP_TRANSPOSE:
-                    case GGML_OP_PERMUTE:
-                        {
-                            // noop -> next node
-                        } continue;
-                    default:
-                        {
-                        } break;
-                }
-
-                if (!ggml_metal_supports_op(ctx, dst)) {
-                    GGML_METAL_LOG_ERROR("%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst));
-                    GGML_ASSERT(!"unsupported op");
-                }
-
-#ifndef GGML_METAL_NDEBUG
-                [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(dst) encoding:NSUTF8StringEncoding]];
-#endif
-
-                const int64_t  ne00 = src0 ? src0->ne[0] : 0;
-                const int64_t  ne01 = src0 ? src0->ne[1] : 0;
-                const int64_t  ne02 = src0 ? src0->ne[2] : 0;
-                const int64_t  ne03 = src0 ? src0->ne[3] : 0;
-
-                const uint64_t nb00 = src0 ? src0->nb[0] : 0;
-                const uint64_t nb01 = src0 ? src0->nb[1] : 0;
-                const uint64_t nb02 = src0 ? src0->nb[2] : 0;
-                const uint64_t nb03 = src0 ? src0->nb[3] : 0;
-
-                const int64_t  ne10 = src1 ? src1->ne[0] : 0;
-                const int64_t  ne11 = src1 ? src1->ne[1] : 0;
-                const int64_t  ne12 = src1 ? src1->ne[2] : 0;
-                const int64_t  ne13 = src1 ? src1->ne[3] : 0; UNUSED(ne13);
-
-                const uint64_t nb10 = src1 ? src1->nb[0] : 0;
-                const uint64_t nb11 = src1 ? src1->nb[1] : 0;
-                const uint64_t nb12 = src1 ? src1->nb[2] : 0;
-                const uint64_t nb13 = src1 ? src1->nb[3] : 0; UNUSED(nb13);
-
-                const int64_t  ne0  = dst ? dst->ne[0] : 0;
-                const int64_t  ne1  = dst ? dst->ne[1] : 0;
-                const int64_t  ne2  = dst ? dst->ne[2] : 0;
-                const int64_t  ne3  = dst ? dst->ne[3] : 0;
-
-                const uint64_t nb0  = dst ? dst->nb[0] : 0;
-                const uint64_t nb1  = dst ? dst->nb[1] : 0;
-                const uint64_t nb2  = dst ? dst->nb[2] : 0;
-                const uint64_t nb3  = dst ? dst->nb[3] : 0;
-
-                const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT;
-                const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
-                const enum ggml_type dstt  = dst  ? dst->type  : GGML_TYPE_COUNT;
-
-                id<MTLBuffer> id_src0 = src0 ? ggml_metal_get_buffer(ctx, src0, &offs_src0) : nil;
-                id<MTLBuffer> id_src1 = src1 ? ggml_metal_get_buffer(ctx, src1, &offs_src1) : nil;
-                id<MTLBuffer> id_dst  = dst  ? ggml_metal_get_buffer(ctx, dst,  &offs_dst)  : nil;
-
-                //GGML_METAL_LOG_INFO("%s: op - %s\n", __func__, ggml_op_name(dst->op));
-                //if (src0) {
-                //    GGML_METAL_LOG_INFO("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02,
-                //            ggml_is_contiguous(src0), src0->name);
-                //}
-                //if (src1) {
-                //    GGML_METAL_LOG_INFO("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12,
-                //            ggml_is_contiguous(src1), src1->name);
-                //}
-                //if (dst) {
-                //    GGML_METAL_LOG_INFO("%s: dst  - %4s [%5lld, %5lld, %5lld], 1, %s\n",  __func__, ggml_type_name(dstt),  ne0,  ne1,  ne2,
-                //            dst->name);
-                //}
-
-                switch (dst->op) {
-                    case GGML_OP_CONCAT:
-                        {
-                            const int64_t nb = ne00;
-
-                            id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CONCAT].pipeline;
-
-                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                            [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
-                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
-                            [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
-                            [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
-                            [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
-                            [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:6];
-                            [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7];
-                            [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:8];
-                            [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:9];
-                            [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:10];
-                            [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11];
-                            [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:12];
-                            [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13];
-                            [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14];
-                            [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15];
-                            [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16];
-                            [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17];
-                            [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:18];
-                            [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:19];
-                            [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:20];
-                            [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:21];
-                            [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:22];
-                            [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:23];
-                            [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:24];
-                            [encoder setBytes:&nb2  length:sizeof(nb2)  atIndex:25];
-                            [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:26];
-                            [encoder setBytes:&nb   length:sizeof(nb)   atIndex:27];
-
-                            const int nth = MIN(1024, ne0);
-
-                            [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                        } break;
-                    case GGML_OP_ADD:
-                    case GGML_OP_MUL:
-                    case GGML_OP_DIV:
-                        {
-                            const size_t offs = 0;
-
-                            bool bcast_row = false;
-
-                            int64_t nb = ne00;
-
-                            id<MTLComputePipelineState> pipeline = nil;
-
-                            if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) {
-                                GGML_ASSERT(ggml_is_contiguous(src0));
-
-                                // src1 is a row
-                                GGML_ASSERT(ne11 == 1);
-
-                                nb = ne00 / 4;
-                                switch (dst->op) {
-                                    case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW].pipeline; break;
-                                    case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_ROW].pipeline; break;
-                                    case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV_ROW].pipeline; break;
-                                    default: GGML_ASSERT(false);
-                                }
-
-                                bcast_row = true;
-                            } else {
-                                switch (dst->op) {
-                                    case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD].pipeline; break;
-                                    case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL].pipeline; break;
-                                    case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV].pipeline; break;
-                                    default: GGML_ASSERT(false);
-                                }
-                            }
-
-                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                            [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
-                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
-                            [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
-                            [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
-                            [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
-                            [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:6];
-                            [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7];
-                            [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:8];
-                            [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:9];
-                            [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:10];
-                            [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11];
-                            [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:12];
-                            [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13];
-                            [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14];
-                            [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15];
-                            [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16];
-                            [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17];
-                            [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:18];
-                            [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:19];
-                            [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:20];
-                            [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:21];
-                            [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:22];
-                            [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:23];
-                            [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:24];
-                            [encoder setBytes:&nb2  length:sizeof(nb2)  atIndex:25];
-                            [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:26];
-                            [encoder setBytes:&offs length:sizeof(offs) atIndex:27];
-                            [encoder setBytes:&nb   length:sizeof(nb)   atIndex:28];
-
-                            if (bcast_row) {
-                                const int64_t n = ggml_nelements(dst)/4;
-
-                                [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                            } else {
-                                const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0);
-
-                                [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                            }
-                        } break;
-                    case GGML_OP_ACC:
-                        {
-                            GGML_ASSERT(src0t == GGML_TYPE_F32);
-                            GGML_ASSERT(src1t == GGML_TYPE_F32);
-                            GGML_ASSERT(dstt  == GGML_TYPE_F32);
-
-                            GGML_ASSERT(ggml_is_contiguous(src0));
-                            GGML_ASSERT(ggml_is_contiguous(src1));
-
-                            const size_t pnb1 = ((int32_t *) dst->op_params)[0];
-                            const size_t pnb2 = ((int32_t *) dst->op_params)[1];
-                            const size_t pnb3 = ((int32_t *) dst->op_params)[2];
-                            const size_t offs = ((int32_t *) dst->op_params)[3];
-
-                            const bool inplace = (bool) ((int32_t *) dst->op_params)[4];
-
-                            if (!inplace) {
-                                // run a separete kernel to cpy src->dst
-                                // not sure how to avoid this
-                                // TODO: make a simpler cpy_bytes kernel
-
-                                const id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F32].pipeline;
-
-                                [encoder setComputePipelineState:pipeline];
-                                [encoder setBuffer:id_src0 offset:offs_src0        atIndex:0];
-                                [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
-                                [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:2];
-                                [encoder setBytes:&ne01    length:sizeof( int64_t) atIndex:3];
-                                [encoder setBytes:&ne02    length:sizeof( int64_t) atIndex:4];
-                                [encoder setBytes:&ne03    length:sizeof( int64_t) atIndex:5];
-                                [encoder setBytes:&nb00    length:sizeof(uint64_t) atIndex:6];
-                                [encoder setBytes:&nb01    length:sizeof(uint64_t) atIndex:7];
-                                [encoder setBytes:&nb02    length:sizeof(uint64_t) atIndex:8];
-                                [encoder setBytes:&nb03    length:sizeof(uint64_t) atIndex:9];
-                                [encoder setBytes:&ne0     length:sizeof( int64_t) atIndex:10];
-                                [encoder setBytes:&ne1     length:sizeof( int64_t) atIndex:11];
-                                [encoder setBytes:&ne2     length:sizeof( int64_t) atIndex:12];
-                                [encoder setBytes:&ne3     length:sizeof( int64_t) atIndex:13];
-                                [encoder setBytes:&nb0     length:sizeof(uint64_t) atIndex:14];
-                                [encoder setBytes:&nb1     length:sizeof(uint64_t) atIndex:15];
-                                [encoder setBytes:&nb2     length:sizeof(uint64_t) atIndex:16];
-                                [encoder setBytes:&nb3     length:sizeof(uint64_t) atIndex:17];
-
-                                const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne00);
-
-                                [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                            }
-
-                            const id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD].pipeline;
-
-                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                            [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
-                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
-                            [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
-                            [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
-                            [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
-                            [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:6];
-                            [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7];
-                            [encoder setBytes:&pnb1 length:sizeof(pnb1) atIndex:8];
-                            [encoder setBytes:&pnb2 length:sizeof(pnb2) atIndex:9];
-                            [encoder setBytes:&pnb3 length:sizeof(pnb3) atIndex:10];
-                            [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11];
-                            [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:12];
-                            [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13];
-                            [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14];
-                            [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15];
-                            [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16];
-                            [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17];
-                            [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:18];
-                            [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:19];
-                            [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:20];
-                            [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:21];
-                            [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:22];
-                            [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:23];
-                            [encoder setBytes:&pnb1 length:sizeof(pnb1) atIndex:24];
-                            [encoder setBytes:&pnb2 length:sizeof(pnb2) atIndex:25];
-                            [encoder setBytes:&pnb3 length:sizeof(pnb3) atIndex:26];
-                            [encoder setBytes:&offs length:sizeof(offs) atIndex:27];
-
-                            const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne00);
-
-                            [encoder dispatchThreadgroups:MTLSizeMake(ne11, ne12, ne13) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                        } break;
-                    case GGML_OP_SCALE:
-                        {
-                            GGML_ASSERT(ggml_is_contiguous(src0));
-
-                            const float scale = *(const float *) dst->op_params;
-
-                            int64_t n = ggml_nelements(dst);
-
-                            id<MTLComputePipelineState> pipeline = nil;
-
-                            if (n % 4 == 0) {
-                                n /= 4;
-                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SCALE_4].pipeline;
-                            } else {
-                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SCALE].pipeline;
-                            }
-
-                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src0   offset:offs_src0 atIndex:0];
-                            [encoder setBuffer:id_dst    offset:offs_dst  atIndex:1];
-                            [encoder setBytes:&scale length:sizeof(scale) atIndex:2];
-
-                            [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                        } break;
-                    case GGML_OP_UNARY:
-                        switch (ggml_get_unary_op(gf->nodes[i])) {
-                            case GGML_UNARY_OP_TANH:
-                                {
-                                    id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_TANH].pipeline;
-
-                                    [encoder setComputePipelineState:pipeline];
-                                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-
-                                    const int64_t n = ggml_nelements(dst);
-
-                                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                                } break;
-                            case GGML_UNARY_OP_RELU:
-                                {
-                                    id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_RELU].pipeline;
-
-                                    [encoder setComputePipelineState:pipeline];
-                                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-
-                                    const int64_t n = ggml_nelements(dst);
-
-                                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                                } break;
-                            case GGML_UNARY_OP_GELU:
-                                {
-                                    id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU].pipeline;
-
-                                    [encoder setComputePipelineState:pipeline];
-                                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-
-                                    const int64_t n = ggml_nelements(dst);
-                                    GGML_ASSERT(n % 4 == 0);
-
-                                    [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                                } break;
-                            case GGML_UNARY_OP_GELU_QUICK:
-                                {
-                                    id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU_QUICK].pipeline;
-
-                                    [encoder setComputePipelineState:pipeline];
-                                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-
-                                    const int64_t n = ggml_nelements(dst);
-                                    GGML_ASSERT(n % 4 == 0);
-
-                                    [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                                } break;
-                            case GGML_UNARY_OP_SILU:
-                                {
-                                    id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SILU].pipeline;
-
-                                    [encoder setComputePipelineState:pipeline];
-                                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-
-                                    const int64_t n = ggml_nelements(dst);
-                                    GGML_ASSERT(n % 4 == 0);
-
-                                    [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                                } break;
-                            default:
-                                {
-                                    GGML_METAL_LOG_WARN("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
-                                    GGML_ASSERT(false);
-                                }
-                        } break;
-                    case GGML_OP_SQR:
-                        {
-                            GGML_ASSERT(ggml_is_contiguous(src0));
-
-                            id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SQR].pipeline;
-
-                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                            [encoder setBuffer:id_dst  offset:offs_dst atIndex:1];
-
-                            const int64_t n = ggml_nelements(dst);
-
-                            [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                        } break;
-                    case GGML_OP_SUM_ROWS:
-                        {
-                            GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
-
-                            id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUM_ROWS].pipeline;
-
-                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-                            [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
-                            [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
-                            [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
-                            [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
-                            [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
-                            [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
-                            [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
-                            [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9];
-                            [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:10];
-                            [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:11];
-                            [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:12];
-                            [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:13];
-                            [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:14];
-                            [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:15];
-                            [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:16];
-                            [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:17];
-                            [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:18];
-                            [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:19];
-                            [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:20];
-                            [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:21];
-                            [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:22];
-                            [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:23];
-                            [encoder setBytes:&nb2  length:sizeof(nb2)  atIndex:24];
-                            [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:25];
-
-                            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                        } break;
-                    case GGML_OP_SOFT_MAX:
-                        {
-                            int nth = 32; // SIMD width
-
-                            id<MTLComputePipelineState> pipeline = nil;
-
-                            if (ne00%4 == 0) {
-                                while (nth < ne00/4 && nth < 256) {
-                                    nth *= 2;
-                                }
-                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX_4].pipeline;
-                            } else {
-                                while (nth < ne00 && nth < 1024) {
-                                    nth *= 2;
-                                }
-                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX].pipeline;
-                            }
-
-                            const float scale = ((float *) dst->op_params)[0];
-
-                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src0 offset:offs_src0   atIndex:0];
-                            if (id_src1) {
-                                [encoder setBuffer:id_src1 offset:offs_src1   atIndex:1];
-                            } else {
-                                [encoder setBuffer:id_src0 offset:offs_src0   atIndex:1];
-                            }
-                            [encoder setBuffer:id_dst  offset:offs_dst    atIndex:2];
-                            [encoder setBytes:&ne00  length:sizeof(ne00)  atIndex:3];
-                            [encoder setBytes:&ne01  length:sizeof(ne01)  atIndex:4];
-                            [encoder setBytes:&ne02  length:sizeof(ne02)  atIndex:5];
-                            [encoder setBytes:&scale length:sizeof(scale) atIndex:6];
-                            [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
-
-                            [encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                        } break;
-                    case GGML_OP_DIAG_MASK_INF:
-                        {
-                            const int n_past = ((int32_t *)(dst->op_params))[0];
-
-                            id<MTLComputePipelineState> pipeline = nil;
-
-                            if (ne00%8 == 0) {
-                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF_8].pipeline;
-                            } else {
-                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF].pipeline;
-                            }
-
-                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-                            [encoder setBytes:&ne00   length:sizeof(ne00) atIndex:2];
-                            [encoder setBytes:&ne01   length:sizeof(ne01) atIndex:3];
-                            [encoder setBytes:&n_past length:sizeof(int)  atIndex:4];
-
-                            if (ne00%8 == 0) {
-                                [encoder dispatchThreadgroups:MTLSizeMake(ne00*ne01*ne02/8, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                            }
-                            else {
-                                [encoder dispatchThreadgroups:MTLSizeMake(ne00, ne01, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                            }
-                        } break;
-                    case GGML_OP_MUL_MAT:
-                        {
-                            GGML_ASSERT(ne00 == ne10);
-
-                            // TODO: assert that dim2 and dim3 are contiguous
-                            GGML_ASSERT(ne12 % ne02 == 0);
-                            GGML_ASSERT(ne13 % ne03 == 0);
-
-                            const uint r2 = ne12/ne02;
-                            const uint r3 = ne13/ne03;
-
-                            // find the break-even point where the matrix-matrix kernel becomes more efficient compared
-                            // to the matrix-vector kernel
-                            int ne11_mm_min = 1;
-
-#if 0
-                            // the numbers below are measured on M2 Ultra for 7B and 13B models
-                            // these numbers do not translate to other devices or model sizes
-                            // TODO: need to find a better approach
-                            if ([ctx->device.name isEqualToString:@"Apple M2 Ultra"]) {
-                                switch (src0t) {
-                                    case GGML_TYPE_F16:  ne11_mm_min = 2;  break;
-                                    case GGML_TYPE_Q8_0: ne11_mm_min = 7;  break;
-                                    case GGML_TYPE_Q2_K: ne11_mm_min = 15; break;
-                                    case GGML_TYPE_Q3_K: ne11_mm_min = 7;  break;
-                                    case GGML_TYPE_Q4_0:
-                                    case GGML_TYPE_Q4_1: ne11_mm_min = 15; break;
-                                    case GGML_TYPE_Q4_K: ne11_mm_min = 11; break;
-                                    case GGML_TYPE_Q5_0:                          // not tested yet
-                                    case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet
-                                    case GGML_TYPE_Q5_K: ne11_mm_min = 7;  break;
-                                    case GGML_TYPE_Q6_K: ne11_mm_min = 7;  break;
-                                    default:             ne11_mm_min = 1;  break;
-                                }
-                            }
-#endif
-
-                            // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
-                            // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
-                            if ([ctx->device supportsFamily:MTLGPUFamilyApple7] &&
-                                !ggml_is_transposed(src0) &&
-                                !ggml_is_transposed(src1) &&
-                                src1t == GGML_TYPE_F32 &&
-                                ne00 % 32 == 0 && ne00 >= 64 &&
-                                (ne11 > ne11_mm_min || (ggml_is_quantized(src0t) && ne12 > 1))) {
-                                //printf("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
-
-                                id<MTLComputePipelineState> pipeline = nil;
-
-                                switch (src0->type) {
-                                    case GGML_TYPE_F32:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32    ].pipeline; break;
-                                    case GGML_TYPE_F16:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32    ].pipeline; break;
-                                    case GGML_TYPE_Q4_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32   ].pipeline; break;
-                                    case GGML_TYPE_Q4_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_1_F32   ].pipeline; break;
-                                    case GGML_TYPE_Q5_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_0_F32   ].pipeline; break;
-                                    case GGML_TYPE_Q5_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_1_F32   ].pipeline; break;
-                                    case GGML_TYPE_Q8_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q8_0_F32   ].pipeline; break;
-                                    case GGML_TYPE_Q2_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q2_K_F32   ].pipeline; break;
-                                    case GGML_TYPE_Q3_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q3_K_F32   ].pipeline; break;
-                                    case GGML_TYPE_Q4_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_K_F32   ].pipeline; break;
-                                    case GGML_TYPE_Q5_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_K_F32   ].pipeline; break;
-                                    case GGML_TYPE_Q6_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q6_K_F32   ].pipeline; break;
-                                    case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32].pipeline; break;
-                                    case GGML_TYPE_IQ2_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32 ].pipeline; break;
-                                    default: GGML_ASSERT(false && "MUL MAT-MAT not implemented");
-                                }
-
-                                [encoder setComputePipelineState:pipeline];
-                                [encoder setBuffer:id_src0 offset:offs_src0    atIndex:0];
-                                [encoder setBuffer:id_src1 offset:offs_src1    atIndex:1];
-                                [encoder setBuffer:id_dst  offset:offs_dst     atIndex:2];
-                                [encoder setBytes:&ne00    length:sizeof(ne00) atIndex:3];
-                                [encoder setBytes:&ne02    length:sizeof(ne02) atIndex:4];
-                                [encoder setBytes:&nb01    length:sizeof(nb01) atIndex:5];
-                                [encoder setBytes:&nb02    length:sizeof(nb02) atIndex:6];
-                                [encoder setBytes:&ne12    length:sizeof(ne12) atIndex:7];
-                                [encoder setBytes:&nb10    length:sizeof(nb10) atIndex:8];
-                                [encoder setBytes:&nb11    length:sizeof(nb11) atIndex:9];
-                                [encoder setBytes:&nb12    length:sizeof(nb12) atIndex:10];
-                                [encoder setBytes:&ne0     length:sizeof(ne0)  atIndex:11];
-                                [encoder setBytes:&ne1     length:sizeof(ne1)  atIndex:12];
-                                [encoder setBytes:&r2      length:sizeof(r2)   atIndex:13];
-                                [encoder setBytes:&r3      length:sizeof(r3)   atIndex:14];
-                                [encoder setThreadgroupMemoryLength:8192 atIndex:0];
-                                [encoder dispatchThreadgroups:MTLSizeMake( (ne11 + 31)/32, (ne01 + 63)/64, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
-                            } else {
-                                int nth0 = 32;
-                                int nth1 = 1;
-                                int nrows = 1;
-                                //printf("vector: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
-
-                                id<MTLComputePipelineState> pipeline = nil;
-
-                                // use custom matrix x vector kernel
-                                switch (src0t) {
-                                    case GGML_TYPE_F32:
-                                        {
-                                            GGML_ASSERT(src1t == GGML_TYPE_F32);
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32].pipeline;
-                                            nrows = 4;
-                                        } break;
-                                    case GGML_TYPE_F16:
-                                        {
-                                            nth0 = 32;
-                                            nth1 = 1;
-                                            if (src1t == GGML_TYPE_F32) {
-                                                if (ne11 * ne12 < 4) {
-                                                    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW].pipeline;
-                                                } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
-                                                    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4].pipeline;
-                                                    nrows = ne11;
-                                                } else {
-                                                    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32].pipeline;
-                                                    nrows = 4;
-                                                }
-                                            } else {
-                                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16].pipeline;
-                                                nrows = 4;
-                                            }
-                                        } break;
-                                    case GGML_TYPE_Q4_0:
-                                        {
-                                            nth0 = 8;
-                                            nth1 = 8;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_0_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_Q4_1:
-                                        {
-                                            nth0 = 8;
-                                            nth1 = 8;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_1_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_Q5_0:
-                                        {
-                                            nth0 = 8;
-                                            nth1 = 8;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_Q5_1:
-                                        {
-                                            nth0 = 8;
-                                            nth1 = 8;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_1_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_Q8_0:
-                                        {
-                                            nth0 = 8;
-                                            nth1 = 8;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q8_0_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_Q2_K:
-                                        {
-                                            nth0 = 2;
-                                            nth1 = 32;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q2_K_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_Q3_K:
-                                        {
-                                            nth0 = 2;
-                                            nth1 = 32;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q3_K_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_Q4_K:
-                                        {
-                                            nth0 = 4; //1;
-                                            nth1 = 8; //32;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_K_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_Q5_K:
-                                        {
-                                            nth0 = 2;
-                                            nth1 = 32;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_K_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_Q6_K:
-                                        {
-                                            nth0 = 2;
-                                            nth1 = 32;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q6_K_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_IQ2_XXS:
-                                        {
-                                            nth0 = 4;
-                                            nth1 = 16;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_IQ2_XS:
-                                        {
-                                            nth0 = 4;
-                                            nth1 = 16;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32].pipeline;
-                                        } break;
-                                    default:
-                                        {
-                                            GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t);
-                                            GGML_ASSERT(false && "not implemented");
-                                        }
-                                };
-
-                                if (ggml_is_quantized(src0t)) {
-                                    GGML_ASSERT(ne00 >= nth0*nth1);
-                                }
-
-                                [encoder setComputePipelineState:pipeline];
-                                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                                [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
-                                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
-                                [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
-                                [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
-                                [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
-                                [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
-                                [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
-                                [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
-                                [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:9];
-                                [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:10];
-                                [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:11];
-                                [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:12];
-                                [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:13];
-                                [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:14];
-                                [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:15];
-                                [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:16];
-                                [encoder setBytes:&r2   length:sizeof(r2)   atIndex:17];
-                                [encoder setBytes:&r3   length:sizeof(r3)   atIndex:18];
-
-                                if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
-                                    src0t == GGML_TYPE_Q5_0 || src0t == GGML_TYPE_Q5_1 || src0t == GGML_TYPE_Q8_0 ||
-                                    src0t == GGML_TYPE_Q2_K) { // || src0t == GGML_TYPE_Q4_K) {
-                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                                }
-                                else if (src0t == GGML_TYPE_IQ2_XXS || src0t == GGML_TYPE_IQ2_XS) {
-                                    const int mem_size = src0t == GGML_TYPE_IQ2_XXS ? 256*8+128 : 512*8+128;
-                                    [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
-                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                                }
-                                else if (src0t == GGML_TYPE_Q4_K) {
-                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                                }
-                                else if (src0t == GGML_TYPE_Q3_K) {
-#ifdef GGML_QKK_64
-                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-#else
-                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-#endif
-                                }
-                                else if (src0t == GGML_TYPE_Q5_K) {
-                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                                }
-                                else if (src0t == GGML_TYPE_Q6_K) {
-                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                                } else {
-                                    const int64_t ny = (ne11 + nrows - 1)/nrows;
-                                    [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                                }
-                            }
-                        } break;
-                    case GGML_OP_MUL_MAT_ID:
-                        {
-                            //GGML_ASSERT(ne00 == ne10);
-                            //GGML_ASSERT(ne03 == ne13);
-
-                            GGML_ASSERT(src0t == GGML_TYPE_I32);
-
-                            const int n_as = ((int32_t *) dst->op_params)[1];
-
-                            // TODO: make this more general
-                            GGML_ASSERT(n_as <= 8);
-
-                            // max size of the src1ids array in the kernel stack
-                            GGML_ASSERT(ne11 <= 512);
-
-                            struct ggml_tensor * src2 = gf->nodes[i]->src[2];
-
-                            const int64_t  ne20 = src2 ? src2->ne[0] : 0;
-                            const int64_t  ne21 = src2 ? src2->ne[1] : 0;
-                            const int64_t  ne22 = src2 ? src2->ne[2] : 0;
-                            const int64_t  ne23 = src2 ? src2->ne[3] : 0; GGML_UNUSED(ne23);
-
-                            const uint64_t nb20 = src2 ? src2->nb[0] : 0; GGML_UNUSED(nb20);
-                            const uint64_t nb21 = src2 ? src2->nb[1] : 0;
-                            const uint64_t nb22 = src2 ? src2->nb[2] : 0;
-                            const uint64_t nb23 = src2 ? src2->nb[3] : 0; GGML_UNUSED(nb23);
-
-                            const enum ggml_type src2t = src2 ? src2->type : GGML_TYPE_COUNT; GGML_UNUSED(src2t);
-
-                            GGML_ASSERT(!ggml_is_transposed(src2));
-                            GGML_ASSERT(!ggml_is_transposed(src1));
-
-                            GGML_ASSERT(src1t == GGML_TYPE_F32);
-
-                            const uint r2 = ne12/ne22;
-                            const uint r3 = ne13/ne23;
-
-                            // find the break-even point where the matrix-matrix kernel becomes more efficient compared
-                            // to the matrix-vector kernel
-                            int ne11_mm_min = n_as;
-
-                            const int idx = ((int32_t *) dst->op_params)[0];
-
-                            // batch size
-                            GGML_ASSERT(ne01 == ne11);
-
-                            // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
-                            // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
-                            // !!!
-                            // TODO: for now, always use mat-vec kernels until we figure out how to improve the
-                            //       indirect matrix multiplication
-                            // !!!
-                            if ([ctx->device supportsFamily:MTLGPUFamilyApple7] &&
-                                ne20 % 32 == 0 && ne20 >= 64 &&
-                                ne11 > ne11_mm_min) {
-
-                                id<MTLComputePipelineState> pipeline = nil;
-
-                                switch (src2->type) {
-                                    case GGML_TYPE_F32:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32    ].pipeline; break;
-                                    case GGML_TYPE_F16:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32    ].pipeline; break;
-                                    case GGML_TYPE_Q4_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32   ].pipeline; break;
-                                    case GGML_TYPE_Q4_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32   ].pipeline; break;
-                                    case GGML_TYPE_Q5_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32   ].pipeline; break;
-                                    case GGML_TYPE_Q5_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F32   ].pipeline; break;
-                                    case GGML_TYPE_Q8_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F32   ].pipeline; break;
-                                    case GGML_TYPE_Q2_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F32   ].pipeline; break;
-                                    case GGML_TYPE_Q3_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F32   ].pipeline; break;
-                                    case GGML_TYPE_Q4_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F32   ].pipeline; break;
-                                    case GGML_TYPE_Q5_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F32   ].pipeline; break;
-                                    case GGML_TYPE_Q6_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32   ].pipeline; break;
-                                    case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32].pipeline; break;
-                                    case GGML_TYPE_IQ2_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32 ].pipeline; break;
-                                    default: GGML_ASSERT(false && "MUL_MAT_ID not implemented");
-                                }
-
-                                [encoder setComputePipelineState:pipeline];
-                                [encoder setBuffer:id_src0 offset:offs_src0    atIndex:0];
-                                [encoder setBuffer:id_src1 offset:offs_src1    atIndex:1];
-                                [encoder setBuffer:id_dst  offset:offs_dst     atIndex:2];
-                                [encoder setBytes:&nb01    length:sizeof(nb01) atIndex:3];
-                                [encoder setBytes:&ne20    length:sizeof(ne20) atIndex:4];
-                                [encoder setBytes:&ne22    length:sizeof(ne22) atIndex:5];
-                                [encoder setBytes:&nb21    length:sizeof(nb21) atIndex:6];
-                                [encoder setBytes:&nb22    length:sizeof(nb22) atIndex:7];
-                                [encoder setBytes:&ne12    length:sizeof(ne12) atIndex:8];
-                                [encoder setBytes:&ne13    length:sizeof(ne13) atIndex:9];
-                                [encoder setBytes:&nb10    length:sizeof(nb10) atIndex:10];
-                                [encoder setBytes:&nb11    length:sizeof(nb11) atIndex:11];
-                                [encoder setBytes:&nb12    length:sizeof(nb12) atIndex:12];
-                                [encoder setBytes:&ne0     length:sizeof(ne0)  atIndex:13];
-                                [encoder setBytes:&ne1     length:sizeof(ne1)  atIndex:14];
-                                [encoder setBytes:&nb1     length:sizeof(nb1)  atIndex:15];
-                                [encoder setBytes:&r2      length:sizeof(r2)   atIndex:16];
-                                [encoder setBytes:&r3      length:sizeof(r3)   atIndex:17];
-                                [encoder setBytes:&idx     length:sizeof(idx)  atIndex:18];
-                                // TODO: how to make this an array? read Metal docs
-                                for (int j = 0; j < 8; ++j) {
-                                    // NOTE: this is done like this to avoid uninitialized kernel arguments when n_as < 8
-                                    struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)];
-
-                                    size_t offs_src_cur = 0;
-                                    id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur);
-
-                                    [encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:19 + j];
-                                }
-
-                                [encoder setThreadgroupMemoryLength:8192 atIndex:0];
-
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne11 + 31)/32, (ne21 + 63)/64, n_as*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
-                            } else {
-                                int nth0 = 32;
-                                int nth1 = 1;
-                                int nrows = 1;
-                                //printf("vector: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
-
-                                id<MTLComputePipelineState> pipeline = nil;
-
-                                // use custom matrix x vector kernel
-                                switch (src2t) {
-                                    case GGML_TYPE_F32:
-                                        {
-                                            GGML_ASSERT(src1t == GGML_TYPE_F32);
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_F16:
-                                        {
-                                            GGML_ASSERT(src1t == GGML_TYPE_F32);
-                                            nth0 = 32;
-                                            nth1 = 1;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_Q4_0:
-                                        {
-                                            nth0 = 8;
-                                            nth1 = 8;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_0_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_Q4_1:
-                                        {
-                                            nth0 = 8;
-                                            nth1 = 8;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_1_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_Q5_0:
-                                        {
-                                            nth0 = 8;
-                                            nth1 = 8;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_0_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_Q5_1:
-                                        {
-                                            nth0 = 8;
-                                            nth1 = 8;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_1_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_Q8_0:
-                                        {
-                                            nth0 = 8;
-                                            nth1 = 8;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q8_0_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_Q2_K:
-                                        {
-                                            nth0 = 2;
-                                            nth1 = 32;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q2_K_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_Q3_K:
-                                        {
-                                            nth0 = 2;
-                                            nth1 = 32;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q3_K_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_Q4_K:
-                                        {
-                                            nth0 = 4; //1;
-                                            nth1 = 8; //32;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_K_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_Q5_K:
-                                        {
-                                            nth0 = 2;
-                                            nth1 = 32;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_K_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_Q6_K:
-                                        {
-                                            nth0 = 2;
-                                            nth1 = 32;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q6_K_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_IQ2_XXS:
-                                        {
-                                            nth0 = 4;
-                                            nth1 = 16;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_IQ2_XS:
-                                        {
-                                            nth0 = 4;
-                                            nth1 = 16;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32].pipeline;
-                                        } break;
-                                    default:
-                                        {
-                                            GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src2t);
-                                            GGML_ASSERT(false && "not implemented");
-                                        }
-                                };
-
-                                if (ggml_is_quantized(src2t)) {
-                                    GGML_ASSERT(ne20 >= nth0*nth1);
-                                }
-
-                                const int64_t _ne1 = 1; // kernels needs a reference in constant memory
-
-                                [encoder setComputePipelineState:pipeline];
-                                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                                [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
-                                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
-                                [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:3];
-                                [encoder setBytes:&ne20 length:sizeof(ne20) atIndex:4];
-                                [encoder setBytes:&ne21 length:sizeof(ne21) atIndex:5];
-                                [encoder setBytes:&ne22 length:sizeof(ne22) atIndex:6];
-                                [encoder setBytes:&nb20 length:sizeof(nb20) atIndex:7];
-                                [encoder setBytes:&nb21 length:sizeof(nb21) atIndex:8];
-                                [encoder setBytes:&nb22 length:sizeof(nb22) atIndex:9];
-                                [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:10];
-                                [encoder setBytes:&_ne1 length:sizeof(_ne1) atIndex:11];
-                                [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:12];
-                                [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:13];
-                                [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:14];
-                                [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:15];
-                                [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:16];
-                                [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:17];
-                                [encoder setBytes:&_ne1 length:sizeof(_ne1) atIndex:18];
-                                [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:19];
-                                [encoder setBytes:&r2   length:sizeof(r2)   atIndex:20];
-                                [encoder setBytes:&r3   length:sizeof(r3)   atIndex:21];
-                                [encoder setBytes:&idx  length:sizeof(idx)  atIndex:22];
-                                // TODO: how to make this an array? read Metal docs
-                                for (int j = 0; j < 8; ++j) {
-                                    // NOTE: this is done like this to avoid uninitialized kernel arguments when n_as < 8
-                                    struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)];
-
-                                    size_t offs_src_cur = 0;
-                                    id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur);
-
-                                    [encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:23 + j];
-                                }
-
-                                if (src2t == GGML_TYPE_Q4_0 || src2t == GGML_TYPE_Q4_1 ||
-                                    src2t == GGML_TYPE_Q5_0 || src2t == GGML_TYPE_Q5_1 || src2t == GGML_TYPE_Q8_0 ||
-                                    src2t == GGML_TYPE_Q2_K) { // || src2t == GGML_TYPE_Q4_K) {
-                                    [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                                }
-                                else if (src2t == GGML_TYPE_IQ2_XXS || src2t == GGML_TYPE_IQ2_XS) {
-                                    const int mem_size = src2t == GGML_TYPE_IQ2_XXS ? 256*8+128 : 512*8+128;
-                                    [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
-                                    [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                                }
-                                else if (src2t == GGML_TYPE_Q4_K) {
-                                    [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                                }
-                                else if (src2t == GGML_TYPE_Q3_K) {
-#ifdef GGML_QKK_64
-                                    [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 1)/2, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-#else
-                                    [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-#endif
-                                }
-                                else if (src2t == GGML_TYPE_Q5_K) {
-                                    [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                                }
-                                else if (src2t == GGML_TYPE_Q6_K) {
-                                    [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 1)/2, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                                } else {
-                                    const int64_t ny = (_ne1 + nrows - 1)/nrows;
-                                    [encoder dispatchThreadgroups:MTLSizeMake(ne21, ny, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                                }
-                            }
-                        } break;
-                    case GGML_OP_GET_ROWS:
-                        {
-                            id<MTLComputePipelineState> pipeline = nil;
-
-                            switch (src0->type) {
-                                case GGML_TYPE_F32:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_F32    ].pipeline; break;
-                                case GGML_TYPE_F16:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_F16    ].pipeline; break;
-                                case GGML_TYPE_Q4_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_0   ].pipeline; break;
-                                case GGML_TYPE_Q4_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_1   ].pipeline; break;
-                                case GGML_TYPE_Q5_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_0   ].pipeline; break;
-                                case GGML_TYPE_Q5_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_1   ].pipeline; break;
-                                case GGML_TYPE_Q8_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q8_0   ].pipeline; break;
-                                case GGML_TYPE_Q2_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q2_K   ].pipeline; break;
-                                case GGML_TYPE_Q3_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q3_K   ].pipeline; break;
-                                case GGML_TYPE_Q4_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_K   ].pipeline; break;
-                                case GGML_TYPE_Q5_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_K   ].pipeline; break;
-                                case GGML_TYPE_Q6_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q6_K   ].pipeline; break;
-                                case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS].pipeline; break;
-                                case GGML_TYPE_IQ2_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS ].pipeline; break;
-                                case GGML_TYPE_I32:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_I32    ].pipeline; break;
-                                default: GGML_ASSERT(false && "not implemented");
-                            }
-
-                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src0     offset:offs_src0 atIndex:0];
-                            [encoder setBuffer:id_src1     offset:offs_src1 atIndex:1];
-                            [encoder setBuffer:id_dst      offset:offs_dst  atIndex:2];
-                            [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:3];
-                            [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:4];
-                            [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:5];
-                            [encoder setBytes:&ne10 length:sizeof( int64_t) atIndex:6];
-                            [encoder setBytes:&nb10 length:sizeof( int64_t) atIndex:7];
-                            [encoder setBytes:&nb11 length:sizeof( int64_t) atIndex:8];
-                            [encoder setBytes:&nb1  length:sizeof(uint64_t) atIndex:9];
-                            [encoder setBytes:&nb2  length:sizeof(uint64_t) atIndex:10];
-
-                            [encoder dispatchThreadgroups:MTLSizeMake(ne10, ne11, 1) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)];
-                        } break;
-                    case GGML_OP_RMS_NORM:
-                        {
-                            GGML_ASSERT(ne00 % 4 == 0);
-
-                            float eps;
-                            memcpy(&eps, dst->op_params, sizeof(float));
-
-                            int nth = 32; // SIMD width
-
-                            while (nth < ne00/4 && nth < 1024) {
-                                nth *= 2;
-                            }
-
-                            id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_RMS_NORM].pipeline;
-
-                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src0 offset:offs_src0        atIndex:0];
-                            [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
-                            [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:2];
-                            [encoder setBytes:&nb01    length:sizeof(uint64_t) atIndex:3];
-                            [encoder setBytes:&eps     length:sizeof(   float) atIndex:4];
-                            [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
-
-                            const int64_t nrows = ggml_nrows(src0);
-
-                            [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                        } break;
-                    case GGML_OP_GROUP_NORM:
-                        {
-                            GGML_ASSERT(ne00 % 4 == 0);
-
-                            //float eps;
-                            //memcpy(&eps, dst->op_params, sizeof(float));
-
-                            const float eps = 1e-6f; // TODO: temporarily hardcoded
-
-                            const int32_t n_groups = ((int32_t *) dst->op_params)[0];
-
-                            int nth = 32; // SIMD width
-
-                            //while (nth < ne00/4 && nth < 1024) {
-                            //    nth *= 2;
-                            //}
-
-                            id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GROUP_NORM].pipeline;
-
-                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src0  offset:offs_src0        atIndex:0];
-                            [encoder setBuffer:id_dst   offset:offs_dst         atIndex:1];
-                            [encoder setBytes:&ne00     length:sizeof( int64_t) atIndex:2];
-                            [encoder setBytes:&ne01     length:sizeof( int64_t) atIndex:3];
-                            [encoder setBytes:&ne02     length:sizeof( int64_t) atIndex:4];
-                            [encoder setBytes:&nb00     length:sizeof(uint64_t) atIndex:5];
-                            [encoder setBytes:&nb01     length:sizeof(uint64_t) atIndex:6];
-                            [encoder setBytes:&nb02     length:sizeof(uint64_t) atIndex:7];
-                            [encoder setBytes:&n_groups length:sizeof( int32_t) atIndex:8];
-                            [encoder setBytes:&eps      length:sizeof(   float) atIndex:9];
-                            [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
-
-                            [encoder dispatchThreadgroups:MTLSizeMake(n_groups, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                        } break;
-                    case GGML_OP_NORM:
-                        {
-                            float eps;
-                            memcpy(&eps, dst->op_params, sizeof(float));
-
-                            const int nth = MIN(256, ne00);
-
-                            id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_NORM].pipeline;
-
-                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src0 offset:offs_src0        atIndex:0];
-                            [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
-                            [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:2];
-                            [encoder setBytes:&nb01    length:sizeof(uint64_t) atIndex:3];
-                            [encoder setBytes:&eps     length:sizeof(   float) atIndex:4];
-                            [encoder setThreadgroupMemoryLength:GGML_PAD(nth*sizeof(float), 16) atIndex:0];
-
-                            const int64_t nrows = ggml_nrows(src0);
-
-                            [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                        } break;
-                    case GGML_OP_ALIBI:
-                        {
-                            GGML_ASSERT((src0t == GGML_TYPE_F32));
-
-                            const int nth = MIN(1024, ne00);
-
-                            //const int n_past = ((int32_t *) dst->op_params)[0];
-                            const int n_head = ((int32_t *) dst->op_params)[1];
-                            float max_bias;
-                            memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
-
-                            const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
-                            const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
-                            const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
-
-                            id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ALIBI_F32].pipeline;
-
-                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-                            [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
-                            [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3];
-                            [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4];
-                            [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5];
-                            [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6];
-                            [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7];
-                            [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8];
-                            [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9];
-                            [encoder setBytes:&ne0  length:sizeof( int64_t) atIndex:10];
-                            [encoder setBytes:&ne1  length:sizeof( int64_t) atIndex:11];
-                            [encoder setBytes:&ne2  length:sizeof( int64_t) atIndex:12];
-                            [encoder setBytes:&ne3  length:sizeof( int64_t) atIndex:13];
-                            [encoder setBytes:&nb0  length:sizeof(uint64_t) atIndex:14];
-                            [encoder setBytes:&nb1  length:sizeof(uint64_t) atIndex:15];
-                            [encoder setBytes:&nb2  length:sizeof(uint64_t) atIndex:16];
-                            [encoder setBytes:&nb3  length:sizeof(uint64_t) atIndex:17];
-                            [encoder setBytes:&m0   length:sizeof(   float) atIndex:18];
-                            [encoder setBytes:&m1   length:sizeof(   float) atIndex:19];
-                            [encoder setBytes:&n_heads_log2_floor   length:sizeof(int) atIndex:20];
-
-                            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                        } break;
-                    case GGML_OP_ROPE:
-                        {
-                            GGML_ASSERT(ne10 == ne02);
-
-                            const int nth = MIN(1024, ne00);
-
-                            const int n_past     = ((int32_t *) dst->op_params)[0];
-                            const int n_dims     = ((int32_t *) dst->op_params)[1];
-                            const int mode       = ((int32_t *) dst->op_params)[2];
-                            // skip 3, n_ctx, used in GLM RoPE, unimplemented in metal
-                            const int n_orig_ctx = ((int32_t *) dst->op_params)[4];
-
-                            float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
-                            memcpy(&freq_base,   (int32_t *) dst->op_params +  5, sizeof(float));
-                            memcpy(&freq_scale,  (int32_t *) dst->op_params +  6, sizeof(float));
-                            memcpy(&ext_factor,  (int32_t *) dst->op_params +  7, sizeof(float));
-                            memcpy(&attn_factor, (int32_t *) dst->op_params +  8, sizeof(float));
-                            memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
-                            memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
-
-                            id<MTLComputePipelineState> pipeline = nil;
-
-                            switch (src0->type) {
-                                case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_F32].pipeline; break;
-                                case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_F16].pipeline; break;
-                                default: GGML_ASSERT(false);
-                            };
-
-                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src0     offset:offs_src0        atIndex:0];
-                            [encoder setBuffer:id_src1     offset:offs_src1        atIndex:1];
-                            [encoder setBuffer:id_dst      offset:offs_dst         atIndex:2];
-                            [encoder setBytes:&ne00        length:sizeof( int64_t) atIndex:3];
-                            [encoder setBytes:&ne01        length:sizeof( int64_t) atIndex:4];
-                            [encoder setBytes:&ne02        length:sizeof( int64_t) atIndex:5];
-                            [encoder setBytes:&ne03        length:sizeof( int64_t) atIndex:6];
-                            [encoder setBytes:&nb00        length:sizeof(uint64_t) atIndex:7];
-                            [encoder setBytes:&nb01        length:sizeof(uint64_t) atIndex:8];
-                            [encoder setBytes:&nb02        length:sizeof(uint64_t) atIndex:9];
-                            [encoder setBytes:&nb03        length:sizeof(uint64_t) atIndex:10];
-                            [encoder setBytes:&ne0         length:sizeof( int64_t) atIndex:11];
-                            [encoder setBytes:&ne1         length:sizeof( int64_t) atIndex:12];
-                            [encoder setBytes:&ne2         length:sizeof( int64_t) atIndex:13];
-                            [encoder setBytes:&ne3         length:sizeof( int64_t) atIndex:14];
-                            [encoder setBytes:&nb0         length:sizeof(uint64_t) atIndex:15];
-                            [encoder setBytes:&nb1         length:sizeof(uint64_t) atIndex:16];
-                            [encoder setBytes:&nb2         length:sizeof(uint64_t) atIndex:17];
-                            [encoder setBytes:&nb3         length:sizeof(uint64_t) atIndex:18];
-                            [encoder setBytes:&n_past      length:sizeof(     int) atIndex:19];
-                            [encoder setBytes:&n_dims      length:sizeof(     int) atIndex:20];
-                            [encoder setBytes:&mode        length:sizeof(     int) atIndex:21];
-                            [encoder setBytes:&n_orig_ctx  length:sizeof(     int) atIndex:22];
-                            [encoder setBytes:&freq_base   length:sizeof(   float) atIndex:23];
-                            [encoder setBytes:&freq_scale  length:sizeof(   float) atIndex:24];
-                            [encoder setBytes:&ext_factor  length:sizeof(   float) atIndex:25];
-                            [encoder setBytes:&attn_factor length:sizeof(   float) atIndex:26];
-                            [encoder setBytes:&beta_fast   length:sizeof(   float) atIndex:27];
-                            [encoder setBytes:&beta_slow   length:sizeof(   float) atIndex:28];
-
-                            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                        } break;
-                    case GGML_OP_IM2COL:
-                        {
-                            GGML_ASSERT(src0->type == GGML_TYPE_F16);
-                            GGML_ASSERT(src1->type == GGML_TYPE_F32);
-                            GGML_ASSERT( dst->type == GGML_TYPE_F16);
-
-                            const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
-                            const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
-                            const int32_t p0 = ((const int32_t *)(dst->op_params))[2];
-                            const int32_t p1 = ((const int32_t *)(dst->op_params))[3];
-                            const int32_t d0 = ((const int32_t *)(dst->op_params))[4];
-                            const int32_t d1 = ((const int32_t *)(dst->op_params))[5];
-                            const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1;
-
-                            const int32_t N  = src1->ne[is_2D ? 3 : 2];
-                            const int32_t IC = src1->ne[is_2D ? 2 : 1];
-                            const int32_t IH = is_2D ? src1->ne[1] : 1;
-                            const int32_t IW =         src1->ne[0];
-
-                            const int32_t KH = is_2D ? src0->ne[1] : 1;
-                            const int32_t KW =         src0->ne[0];
-
-                            const int32_t OH = is_2D ? dst->ne[2] : 1;
-                            const int32_t OW =         dst->ne[1];
-
-                            const int32_t CHW = IC * KH * KW;
-
-                            const int32_t ofs0 = src1->nb[is_2D ? 3 : 2] / 4;
-                            const int32_t ofs1 = src1->nb[is_2D ? 2 : 1] / 4;
-
-                            id<MTLComputePipelineState> pipeline = nil;
-
-                            switch (src0->type) {
-                                case GGML_TYPE_F32: GGML_ASSERT(false && "not implemented"); break;
-                                case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_IM2COL_F16].pipeline; break;
-                                default: GGML_ASSERT(false);
-                            };
-
-                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src1 offset:offs_src1        atIndex:0];
-                            [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
-                            [encoder setBytes:&ofs0    length:sizeof( int32_t) atIndex:2];
-                            [encoder setBytes:&ofs1    length:sizeof( int32_t) atIndex:3];
-                            [encoder setBytes:&IW      length:sizeof( int32_t) atIndex:4];
-                            [encoder setBytes:&IH      length:sizeof( int32_t) atIndex:5];
-                            [encoder setBytes:&CHW     length:sizeof( int32_t) atIndex:6];
-                            [encoder setBytes:&s0      length:sizeof( int32_t) atIndex:7];
-                            [encoder setBytes:&s1      length:sizeof( int32_t) atIndex:8];
-                            [encoder setBytes:&p0      length:sizeof( int32_t) atIndex:9];
-                            [encoder setBytes:&p1      length:sizeof( int32_t) atIndex:10];
-                            [encoder setBytes:&d0      length:sizeof( int32_t) atIndex:11];
-                            [encoder setBytes:&d1      length:sizeof( int32_t) atIndex:12];
-
-                            [encoder dispatchThreadgroups:MTLSizeMake(IC, OH, OW) threadsPerThreadgroup:MTLSizeMake(N, KH, KW)];
-                        } break;
-                    case GGML_OP_UPSCALE:
-                        {
-                            GGML_ASSERT(src0->type == GGML_TYPE_F32);
-
-                            const int sf = dst->op_params[0];
-
-                            const id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_UPSCALE_F32].pipeline;
-
-                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-                            [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
-                            [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
-                            [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
-                            [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
-                            [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
-                            [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
-                            [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
-                            [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9];
-                            [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:10];
-                            [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:11];
-                            [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:12];
-                            [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:13];
-                            [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:14];
-                            [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:15];
-                            [encoder setBytes:&nb2  length:sizeof(nb2)  atIndex:16];
-                            [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:17];
-                            [encoder setBytes:&sf   length:sizeof(sf)   atIndex:18];
-
-                            const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0);
-
-                            [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                        } break;
-                    case GGML_OP_PAD:
-                        {
-                            GGML_ASSERT(src0->type == GGML_TYPE_F32);
-
-                            id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_PAD_F32].pipeline;
-
-                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-                            [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
-                            [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
-                            [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
-                            [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
-                            [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
-                            [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
-                            [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
-                            [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9];
-                            [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:10];
-                            [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:11];
-                            [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:12];
-                            [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:13];
-                            [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:14];
-                            [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:15];
-                            [encoder setBytes:&nb2  length:sizeof(nb2)  atIndex:16];
-                            [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:17];
-
-                            const int nth = MIN(1024, ne0);
-
-                            [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                        } break;
-                    case GGML_OP_ARGSORT:
-                        {
-                            GGML_ASSERT(src0->type == GGML_TYPE_F32);
-                            GGML_ASSERT( dst->type == GGML_TYPE_I32);
-
-                            const int nrows = ggml_nrows(src0);
-
-                            enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
-
-                            id<MTLComputePipelineState> pipeline = nil;
-
-                            switch (order) {
-                                case GGML_SORT_ASC:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC].pipeline;  break;
-                                case GGML_SORT_DESC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC].pipeline; break;
-                                default: GGML_ASSERT(false);
-                            };
-
-                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src0 offset:offs_src0        atIndex:0];
-                            [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
-                            [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:2];
-
-                            [encoder dispatchThreadgroups:MTLSizeMake(1, nrows, 1) threadsPerThreadgroup:MTLSizeMake(ne00, 1, 1)];
-                        } break;
-                    case GGML_OP_LEAKY_RELU:
-                        {
-                            GGML_ASSERT(src0->type == GGML_TYPE_F32);
-
-                            float slope;
-                            memcpy(&slope, dst->op_params, sizeof(float));
-
-                            id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32].pipeline;
-
-                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src0 offset:offs_src0   atIndex:0];
-                            [encoder setBuffer:id_dst  offset:offs_dst    atIndex:1];
-                            [encoder setBytes:&slope length:sizeof(slope) atIndex:2];
-
-                            const int64_t n = ggml_nelements(dst);
-
-                            [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                        } break;
-                    case GGML_OP_DUP:
-                    case GGML_OP_CPY:
-                    case GGML_OP_CONT:
-                        {
-                            GGML_ASSERT(ne00 % ggml_blck_size(src0->type) == 0);
-
-                            int nth = MIN(1024, ne00/ggml_blck_size(src0->type));
-
-                            id<MTLComputePipelineState> pipeline = nil;
-
-                            switch (src0t) {
-                                case GGML_TYPE_F32:
-                                    {
-                                        GGML_ASSERT(ne0 % ggml_blck_size(dst->type) == 0);
-
-                                        switch (dstt) {
-                                            case GGML_TYPE_F16:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F16].pipeline;  break;
-                                            case GGML_TYPE_F32:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F32].pipeline;  break;
-                                            case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0].pipeline; break;
-                                            case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_0].pipeline; break;
-                                            case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_1].pipeline; break;
-                                          //case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_0].pipeline; break;
-                                          //case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_1].pipeline; break;
-                                            default: GGML_ASSERT(false && "not implemented");
-                                        };
-                                    } break;
-                                case GGML_TYPE_F16:
-                                    {
-                                        switch (dstt) {
-                                            case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F16].pipeline; break;
-                                            case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F32].pipeline; break;
-                                            default: GGML_ASSERT(false && "not implemented");
-                                        };
-                                    } break;
-                                default: GGML_ASSERT(false && "not implemented");
-                            }
-
-                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src0 offset:offs_src0        atIndex:0];
-                            [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
-                            [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:2];
-                            [encoder setBytes:&ne01    length:sizeof( int64_t) atIndex:3];
-                            [encoder setBytes:&ne02    length:sizeof( int64_t) atIndex:4];
-                            [encoder setBytes:&ne03    length:sizeof( int64_t) atIndex:5];
-                            [encoder setBytes:&nb00    length:sizeof(uint64_t) atIndex:6];
-                            [encoder setBytes:&nb01    length:sizeof(uint64_t) atIndex:7];
-                            [encoder setBytes:&nb02    length:sizeof(uint64_t) atIndex:8];
-                            [encoder setBytes:&nb03    length:sizeof(uint64_t) atIndex:9];
-                            [encoder setBytes:&ne0     length:sizeof( int64_t) atIndex:10];
-                            [encoder setBytes:&ne1     length:sizeof( int64_t) atIndex:11];
-                            [encoder setBytes:&ne2     length:sizeof( int64_t) atIndex:12];
-                            [encoder setBytes:&ne3     length:sizeof( int64_t) atIndex:13];
-                            [encoder setBytes:&nb0     length:sizeof(uint64_t) atIndex:14];
-                            [encoder setBytes:&nb1     length:sizeof(uint64_t) atIndex:15];
-                            [encoder setBytes:&nb2     length:sizeof(uint64_t) atIndex:16];
-                            [encoder setBytes:&nb3     length:sizeof(uint64_t) atIndex:17];
-
-                            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                        } break;
-                    default:
-                        {
-                            GGML_METAL_LOG_ERROR("%s: error: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
-                            GGML_ASSERT(false);
-                        }
-                }
-
-#ifndef GGML_METAL_NDEBUG
-                [encoder popDebugGroup];
-#endif
-            }
-
-            if (encoder != nil) {
-                [encoder endEncoding];
-                encoder = nil;
-            }
-
-            [command_buffer commit];
-        });
-    }
-
-    // wait for all threads to finish
-    dispatch_barrier_sync(ctx->d_queue, ^{});
-
-    // check status of command buffers
-    // needed to detect if the device ran out-of-memory for example (#1881)
-    for (int i = 0; i < n_cb; i++) {
-        [ctx->command_buffers[i] waitUntilCompleted];
-
-        MTLCommandBufferStatus status = (MTLCommandBufferStatus) [ctx->command_buffers[i] status];
-        if (status != MTLCommandBufferStatusCompleted) {
-            GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
-            return false;
-        }
-    }
-
-    return true;
-    }
-}
-
-////////////////////////////////////////////////////////////////////////////////
-
-// backend interface
-
-// default buffer
-static id<MTLDevice> g_backend_device = nil;
-static int g_backend_device_ref_count = 0;
-
-static id<MTLDevice> ggml_backend_metal_get_device(void) {
-    if (g_backend_device == nil) {
-        g_backend_device = MTLCreateSystemDefaultDevice();
-    }
-
-    g_backend_device_ref_count++;
-
-    return g_backend_device;
-}
-
-static void ggml_backend_metal_free_device(void) {
-    assert(g_backend_device_ref_count > 0);
-
-    g_backend_device_ref_count--;
-
-    if (g_backend_device_ref_count == 0) {
-        [g_backend_device release];
-        g_backend_device = nil;
-    }
-}
-
-static const char * ggml_backend_metal_buffer_get_name(ggml_backend_buffer_t buffer) {
-    return "Metal";
-
-    UNUSED(buffer);
-}
-
-static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-    struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
-
-    for (int i = 0; i < ctx->n_buffers; i++) {
-        [ctx->buffers[i].metal release];
-    }
-    ggml_backend_metal_free_device();
-
-    if (ctx->owned) {
-        free(ctx->all_data);
-    }
-
-    free(ctx);
-}
-
-static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
-    struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
-
-    return ctx->all_data;
-}
-
-static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-    memcpy((char *)tensor->data + offset, data, size);
-
-    UNUSED(buffer);
-}
-
-static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-    memcpy(data, (const char *)tensor->data + offset, size);
-
-    UNUSED(buffer);
-}
-
-static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
-    if (ggml_backend_buffer_is_host(src->buffer)) {
-        memcpy(dst->data, src->data, ggml_nbytes(src));
-        return true;
-    }
-    return false;
-
-    UNUSED(buffer);
-}
-
-static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
-    struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
-
-    memset(ctx->all_data, value, ctx->all_size);
-}
-
-static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = {
-    /* .get_name        = */ ggml_backend_metal_buffer_get_name,
-    /* .free_buffer     = */ ggml_backend_metal_buffer_free_buffer,
-    /* .get_base        = */ ggml_backend_metal_buffer_get_base,
-    /* .init_tensor     = */ NULL,
-    /* .set_tensor      = */ ggml_backend_metal_buffer_set_tensor,
-    /* .get_tensor      = */ ggml_backend_metal_buffer_get_tensor,
-    /* .cpy_tensor      = */ ggml_backend_metal_buffer_cpy_tensor,
-    /* .clear           = */ ggml_backend_metal_buffer_clear,
-    /* .reset           = */ NULL,
-};
-
-// default buffer type
-
-static const char * ggml_backend_metal_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
-    return "Metal";
-
-    UNUSED(buft);
-}
-
-static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-    struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
-
-    const size_t size_page = sysconf(_SC_PAGESIZE);
-
-    size_t size_aligned = size;
-    if ((size_aligned % size_page) != 0) {
-        size_aligned += (size_page - (size_aligned % size_page));
-    }
-
-    id<MTLDevice> device = ggml_backend_metal_get_device();
-
-    ctx->all_data = ggml_metal_host_malloc(size_aligned);
-    ctx->all_size = size_aligned;
-    ctx->owned = true;
-    ctx->n_buffers = 1;
-
-    ctx->buffers[0].data = ctx->all_data;
-    ctx->buffers[0].size = size;
-    ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data
-                    length:size_aligned
-                    options:MTLResourceStorageModeShared
-                    deallocator:nil];
-
-    if (ctx->buffers[0].metal == nil) {
-        GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
-        free(ctx);
-        ggml_backend_metal_free_device();
-        return NULL;
-    }
-
-    GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB", __func__, size_aligned / 1024.0 / 1024.0);
-
-
-#if TARGET_OS_OSX
-    GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)",
-            device.currentAllocatedSize / 1024.0 / 1024.0,
-            device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
-
-    if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) {
-        GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
-    } else {
-        GGML_METAL_LOG_INFO("\n");
-    }
-#else
-    GGML_METAL_LOG_INFO(", (%8.2f)\n", device.currentAllocatedSize / 1024.0 / 1024.0);
-#endif
-
-
-    return ggml_backend_buffer_init(buft, ggml_backend_metal_buffer_i, ctx, size);
-}
-
-static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
-    return 32;
-    UNUSED(buft);
-}
-
-static bool ggml_backend_metal_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    return ggml_backend_is_metal(backend) || ggml_backend_is_cpu(backend);
-
-    UNUSED(buft);
-}
-
-static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
-    return true;
-
-    UNUSED(buft);
-}
-
-ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
-    static struct ggml_backend_buffer_type ggml_backend_buffer_type_metal = {
-        /* .iface = */ {
-            /* .get_name         = */ ggml_backend_metal_buffer_type_get_name,
-            /* .alloc_buffer     = */ ggml_backend_metal_buffer_type_alloc_buffer,
-            /* .get_alignment    = */ ggml_backend_metal_buffer_type_get_alignment,
-            /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
-            /* .supports_backend = */ ggml_backend_metal_buffer_type_supports_backend,
-            /* .is_host          = */ ggml_backend_metal_buffer_type_is_host,
-        },
-        /* .context = */ NULL,
-    };
-
-    return &ggml_backend_buffer_type_metal;
-}
-
-// buffer from ptr
-
-ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size) {
-    struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
-
-    ctx->all_data = data;
-    ctx->all_size = size;
-    ctx->owned = false;
-    ctx->n_buffers = 0;
-
-    const size_t size_page = sysconf(_SC_PAGESIZE);
-
-    // page-align the data ptr
-    {
-        const uintptr_t offs = (uintptr_t) data % size_page;
-        data  = (void *) ((char *) data - offs);
-        size += offs;
-    }
-
-    size_t size_aligned = size;
-    if ((size_aligned % size_page) != 0) {
-        size_aligned += (size_page - (size_aligned % size_page));
-    }
-
-    id<MTLDevice> device = ggml_backend_metal_get_device();
-
-    // the buffer fits into the max buffer size allowed by the device
-    if (size_aligned <= device.maxBufferLength) {
-        ctx->buffers[ctx->n_buffers].data = data;
-        ctx->buffers[ctx->n_buffers].size = size;
-
-        ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
-
-        if (ctx->buffers[ctx->n_buffers].metal == nil) {
-            GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
-            return false;
-        }
-
-        GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB", __func__, size_aligned / 1024.0 / 1024.0);
-
-        ++ctx->n_buffers;
-    } else {
-        // this overlap between the views will guarantee that the tensor with the maximum size will fully fit into
-        // one of the views
-        const size_t size_ovlp = ((max_size + size_page - 1) / size_page + 1) * size_page; // round-up 2 pages just in case
-        const size_t size_step = device.maxBufferLength - size_ovlp;
-        const size_t size_view = device.maxBufferLength;
-
-        for (size_t i = 0; i < size; i += size_step) {
-            const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i);
-
-            ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i);
-            ctx->buffers[ctx->n_buffers].size = size_step_aligned;
-
-            ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
-
-            if (ctx->buffers[ctx->n_buffers].metal == nil) {
-                GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0);
-                return false;
-            }
-
-            GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, offs = %12ld", __func__, size_step_aligned / 1024.0 / 1024.0, i);
-            if (i + size_step < size) {
-                GGML_METAL_LOG_INFO("\n");
-            }
-
-            ++ctx->n_buffers;
-        }
-    }
-
-#if TARGET_OS_OSX
-    GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)",
-            device.currentAllocatedSize / 1024.0 / 1024.0,
-            device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
-
-    if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) {
-        GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
-    } else {
-        GGML_METAL_LOG_INFO("\n");
-    }
-#else
-    GGML_METAL_LOG_INFO(", (%8.2f)\n", device.currentAllocatedSize / 1024.0 / 1024.0);
-#endif
-
-    return ggml_backend_buffer_init(ggml_backend_metal_buffer_type(), ggml_backend_metal_buffer_i, ctx, size);
-}
-
-// backend
-
-static const char * ggml_backend_metal_name(ggml_backend_t backend) {
-    return "Metal";
-
-    UNUSED(backend);
-}
-
-static void ggml_backend_metal_free(ggml_backend_t backend) {
-    struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
-    ggml_metal_free(ctx);
-    free(backend);
-}
-
-static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffer_type(ggml_backend_t backend) {
-    return ggml_backend_metal_buffer_type();
-
-    UNUSED(backend);
-}
-
-static bool ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
-    struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;
-
-    return ggml_metal_graph_compute(metal_ctx, cgraph);
-}
-
-static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
-    struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;
-
-    return ggml_metal_supports_op(metal_ctx, op);
-}
-
-static struct ggml_backend_i ggml_backend_metal_i = {
-    /* .get_name                = */ ggml_backend_metal_name,
-    /* .free                    = */ ggml_backend_metal_free,
-    /* .get_default_buffer_type = */ ggml_backend_metal_get_default_buffer_type,
-    /* .set_tensor_async        = */ NULL,
-    /* .get_tensor_async        = */ NULL,
-    /* .cpy_tensor_async        = */ NULL,
-    /* .synchronize             = */ NULL,
-    /* .graph_plan_create       = */ NULL,
-    /* .graph_plan_free         = */ NULL,
-    /* .graph_plan_compute      = */ NULL,
-    /* .graph_compute           = */ ggml_backend_metal_graph_compute,
-    /* .supports_op             = */ ggml_backend_metal_supports_op,
-};
-
-void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data) {
-    ggml_metal_log_callback  = log_callback;
-    ggml_metal_log_user_data = user_data;
-}
-
-ggml_backend_t ggml_backend_metal_init(void) {
-    struct ggml_metal_context * ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS);
-
-    if (ctx == NULL) {
-        return NULL;
-    }
-
-    ggml_backend_t metal_backend = malloc(sizeof(struct ggml_backend));
-
-    *metal_backend = (struct ggml_backend) {
-        /* .interface = */ ggml_backend_metal_i,
-        /* .context   = */ ctx,
-    };
-
-    return metal_backend;
-}
-
-bool ggml_backend_is_metal(ggml_backend_t backend) {
-    return backend && backend->iface.get_name == ggml_backend_metal_name;
-}
-
-void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
-    GGML_ASSERT(ggml_backend_is_metal(backend));
-
-    struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
-
-    ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
-}
-
-bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family) {
-    GGML_ASSERT(ggml_backend_is_metal(backend));
-
-    struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
-
-    return [ctx->device supportsFamily:(MTLGPUFamilyApple1 + family - 1)];
-}
-
-ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data); // silence warning
-
-ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data) {
-    return ggml_backend_metal_init();
-
-    GGML_UNUSED(params);
-    GGML_UNUSED(user_data);
-}
diff --git a/ggml/src/ggml-metal.metal b/ggml/src/ggml-metal.metal
deleted file mode 100644
index 029578d..0000000
--- a/ggml/src/ggml-metal.metal
+++ /dev/null
@@ -1,5820 +0,0 @@
-#include <metal_stdlib>
-
-using namespace metal;
-
-#define MAX(x, y) ((x) > (y) ? (x) : (y))
-#define MIN(x, y) ((x) < (y) ? (x) : (y))
-#define SWAP(x, y) { auto tmp = (x); (x) = (y); (y) = tmp; }
-
-#define QK4_0 32
-#define QR4_0 2
-typedef struct {
-    half    d;             // delta
-    uint8_t qs[QK4_0 / 2]; // nibbles / quants
-} block_q4_0;
-
-#define QK4_1 32
-typedef struct {
-    half d;                 // delta
-    half m;                 // min
-    uint8_t qs[QK4_1 / 2];  // nibbles / quants
-} block_q4_1;
-
-#define QK5_0 32
-typedef struct {
-    half d;                // delta
-    uint8_t qh[4];         // 5-th bit of quants
-    uint8_t qs[QK5_0 / 2]; // nibbles / quants
-} block_q5_0;
-
-#define QK5_1 32
-typedef struct {
-    half d;                 // delta
-    half m;                 // min
-    uint8_t qh[4];          // 5-th bit of quants
-    uint8_t qs[QK5_1 / 2];  // nibbles / quants
-} block_q5_1;
-
-#define QK8_0 32
-typedef struct {
-    half    d;         // delta
-    int8_t  qs[QK8_0]; // quants
-} block_q8_0;
-
-#define N_SIMDWIDTH 32 // assuming SIMD group size is 32
-
-enum ggml_sort_order {
-    GGML_SORT_ASC,
-    GGML_SORT_DESC,
-};
-
-// general-purpose kernel for addition, multiplication and division of two tensors
-// pros: works for non-contiguous tensors, supports broadcast across all dims
-// cons: not very efficient
-kernel void kernel_add(
-        device const char * src0,
-        device const char * src1,
-        device       char * dst,
-        constant  int64_t & ne00,
-        constant  int64_t & ne01,
-        constant  int64_t & ne02,
-        constant  int64_t & ne03,
-        constant uint64_t & nb00,
-        constant uint64_t & nb01,
-        constant uint64_t & nb02,
-        constant uint64_t & nb03,
-        constant  int64_t & ne10,
-        constant  int64_t & ne11,
-        constant  int64_t & ne12,
-        constant  int64_t & ne13,
-        constant uint64_t & nb10,
-        constant uint64_t & nb11,
-        constant uint64_t & nb12,
-        constant uint64_t & nb13,
-        constant  int64_t & ne0,
-        constant  int64_t & ne1,
-        constant  int64_t & ne2,
-        constant  int64_t & ne3,
-        constant uint64_t & nb0,
-        constant uint64_t & nb1,
-        constant uint64_t & nb2,
-        constant uint64_t & nb3,
-        constant  int64_t & offs,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint3 tpitg[[thread_position_in_threadgroup]],
-        uint3   ntg[[threads_per_threadgroup]]) {
-    const int64_t i03 = tgpig.z;
-    const int64_t i02 = tgpig.y;
-    const int64_t i01 = tgpig.x;
-
-    const int64_t i13 = i03 % ne13;
-    const int64_t i12 = i02 % ne12;
-    const int64_t i11 = i01 % ne11;
-
-    device const char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01 + offs;
-    device const char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11;
-    device       char * dst_ptr  = dst  + i03*nb3  + i02*nb2  + i01*nb1  + offs;
-
-    for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
-        const int i10 = i0 % ne10;
-        *((device float *)(dst_ptr + i0*nb0)) = *((device float *)(src0_ptr + i0*nb00)) + *((device float *)(src1_ptr + i10*nb10));
-    }
-}
-
-kernel void kernel_mul(
-        device const char * src0,
-        device const char * src1,
-        device       char * dst,
-        constant  int64_t & ne00,
-        constant  int64_t & ne01,
-        constant  int64_t & ne02,
-        constant  int64_t & ne03,
-        constant uint64_t & nb00,
-        constant uint64_t & nb01,
-        constant uint64_t & nb02,
-        constant uint64_t & nb03,
-        constant  int64_t & ne10,
-        constant  int64_t & ne11,
-        constant  int64_t & ne12,
-        constant  int64_t & ne13,
-        constant uint64_t & nb10,
-        constant uint64_t & nb11,
-        constant uint64_t & nb12,
-        constant uint64_t & nb13,
-        constant  int64_t & ne0,
-        constant  int64_t & ne1,
-        constant  int64_t & ne2,
-        constant  int64_t & ne3,
-        constant uint64_t & nb0,
-        constant uint64_t & nb1,
-        constant uint64_t & nb2,
-        constant uint64_t & nb3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint3 tpitg[[thread_position_in_threadgroup]],
-        uint3   ntg[[threads_per_threadgroup]]) {
-    const int64_t i03 = tgpig.z;
-    const int64_t i02 = tgpig.y;
-    const int64_t i01 = tgpig.x;
-
-    const int64_t i13 = i03 % ne13;
-    const int64_t i12 = i02 % ne12;
-    const int64_t i11 = i01 % ne11;
-
-    device const char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01;
-    device const char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11;
-    device       char * dst_ptr  = dst  + i03*nb3  + i02*nb2  + i01*nb1;
-
-    for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
-        const int i10 = i0 % ne10;
-        *((device float *)(dst_ptr + i0*nb0)) = *((device float *)(src0_ptr + i0*nb00)) * *((device float *)(src1_ptr + i10*nb10));
-    }
-}
-
-kernel void kernel_div(
-        device const char * src0,
-        device const char * src1,
-        device       char * dst,
-        constant  int64_t & ne00,
-        constant  int64_t & ne01,
-        constant  int64_t & ne02,
-        constant  int64_t & ne03,
-        constant uint64_t & nb00,
-        constant uint64_t & nb01,
-        constant uint64_t & nb02,
-        constant uint64_t & nb03,
-        constant  int64_t & ne10,
-        constant  int64_t & ne11,
-        constant  int64_t & ne12,
-        constant  int64_t & ne13,
-        constant uint64_t & nb10,
-        constant uint64_t & nb11,
-        constant uint64_t & nb12,
-        constant uint64_t & nb13,
-        constant  int64_t & ne0,
-        constant  int64_t & ne1,
-        constant  int64_t & ne2,
-        constant  int64_t & ne3,
-        constant uint64_t & nb0,
-        constant uint64_t & nb1,
-        constant uint64_t & nb2,
-        constant uint64_t & nb3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint3 tpitg[[thread_position_in_threadgroup]],
-        uint3   ntg[[threads_per_threadgroup]]) {
-    const int64_t i03 = tgpig.z;
-    const int64_t i02 = tgpig.y;
-    const int64_t i01 = tgpig.x;
-
-    const int64_t i13 = i03 % ne13;
-    const int64_t i12 = i02 % ne12;
-    const int64_t i11 = i01 % ne11;
-
-    device const char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01;
-    device const char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11;
-    device       char * dst_ptr  = dst  + i03*nb3  + i02*nb2  + i01*nb1;
-
-    for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
-        const int i10 = i0 % ne10;
-        *((device float *)(dst_ptr + i0*nb0)) = *((device float *)(src0_ptr + i0*nb00)) / *((device float *)(src1_ptr + i10*nb10));
-    }
-}
-
-// assumption: src1 is a row
-// broadcast src1 into src0
-kernel void kernel_add_row(
-        device const float4 * src0,
-        device const float4 * src1,
-        device       float4 * dst,
-        constant   uint64_t & nb [[buffer(28)]],
-        uint tpig[[thread_position_in_grid]]) {
-    dst[tpig] = src0[tpig] + src1[tpig % nb];
-}
-
-kernel void kernel_mul_row(
-        device const float4 * src0,
-        device const float4 * src1,
-        device       float4 * dst,
-        constant   uint64_t & nb  [[buffer(28)]],
-        uint tpig[[thread_position_in_grid]]) {
-    dst[tpig] = src0[tpig] * src1[tpig % nb];
-}
-
-kernel void kernel_div_row(
-        device const float4 * src0,
-        device const float4 * src1,
-        device       float4 * dst,
-        constant   uint64_t & nb  [[buffer(28)]],
-        uint tpig[[thread_position_in_grid]]) {
-    dst[tpig] = src0[tpig] / src1[tpig % nb];
-}
-
-kernel void kernel_scale(
-        device const float * src0,
-        device       float * dst,
-        constant     float & scale,
-        uint tpig[[thread_position_in_grid]]) {
-    dst[tpig] = src0[tpig] * scale;
-}
-
-kernel void kernel_scale_4(
-        device const float4 * src0,
-        device       float4 * dst,
-        constant     float  & scale,
-        uint tpig[[thread_position_in_grid]]) {
-    dst[tpig] = src0[tpig] * scale;
-}
-
-kernel void kernel_relu(
-        device const float * src0,
-        device       float * dst,
-        uint tpig[[thread_position_in_grid]]) {
-    dst[tpig] = max(0.0f, src0[tpig]);
-}
-
-kernel void kernel_tanh(
-        device const float * src0,
-        device       float * dst,
-        uint tpig[[thread_position_in_grid]]) {
-    device const float & x = src0[tpig];
-    dst[tpig] = precise::tanh(x);
-}
-
-constant float GELU_COEF_A     = 0.044715f;
-constant float GELU_QUICK_COEF = -1.702f;
-constant float SQRT_2_OVER_PI  = 0.79788456080286535587989211986876f;
-
-kernel void kernel_gelu(
-    device const float4 * src0,
-    device       float4 * dst,
-    uint tpig[[thread_position_in_grid]]) {
-    device const float4 & x = src0[tpig];
-
-    // BEWARE !!!
-    // Simply using "tanh" instead of "precise::tanh" will sometimes results in NaNs!
-    // This was observed with Falcon 7B and 40B models
-    //
-    dst[tpig] = 0.5f*x*(1.0f + precise::tanh(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
-}
-
-kernel void kernel_gelu_quick(
-    device const float4 * src0,
-    device       float4 * dst,
-    uint tpig[[thread_position_in_grid]]) {
-    device const float4 & x = src0[tpig];
-
-    dst[tpig] = x*(1.0f/(1.0f+exp(GELU_QUICK_COEF*x)));
-}
-
-kernel void kernel_silu(
-        device const float4 * src0,
-        device       float4 * dst,
-        uint tpig[[thread_position_in_grid]]) {
-    device const float4 & x = src0[tpig];
-    dst[tpig] = x / (1.0f + exp(-x));
-}
-
-kernel void kernel_sqr(
-        device const float * src0,
-        device       float * dst,
-        uint tpig[[thread_position_in_grid]]) {
-    dst[tpig] = src0[tpig] * src0[tpig];
-}
-
-kernel void kernel_sum_rows(
-        device const float * src0,
-        device       float * dst,
-        constant  int64_t & ne00,
-        constant  int64_t & ne01,
-        constant  int64_t & ne02,
-        constant  int64_t & ne03,
-        constant uint64_t & nb00,
-        constant uint64_t & nb01,
-        constant uint64_t & nb02,
-        constant uint64_t & nb03,
-        constant  int64_t & ne10,
-        constant  int64_t & ne11,
-        constant  int64_t & ne12,
-        constant  int64_t & ne13,
-        constant uint64_t & nb10,
-        constant uint64_t & nb11,
-        constant uint64_t & nb12,
-        constant uint64_t & nb13,
-        constant  int64_t & ne0,
-        constant  int64_t & ne1,
-        constant  int64_t & ne2,
-        constant  int64_t & ne3,
-        constant uint64_t & nb0,
-        constant uint64_t & nb1,
-        constant uint64_t & nb2,
-        constant uint64_t & nb3,
-        uint3 tpig[[thread_position_in_grid]]) {
-    int64_t i3 = tpig.z;
-    int64_t i2 = tpig.y;
-    int64_t i1 = tpig.x;
-
-    if (i3 >= ne03 || i2 >= ne02 || i1 >= ne01) {
-        return;
-    }
-
-    device const float * src_row = (device const float *) ((device const char *) src0 + i1*nb01 + i2*nb02 + i3*nb03);
-    device       float * dst_row = (device       float *) ((device       char *) dst  + i1*nb1  + i2*nb2  + i3*nb3);
-
-    float row_sum = 0;
-
-    for (int64_t i0 = 0; i0 < ne00; i0++) {
-        row_sum += src_row[i0];
-    }
-
-    dst_row[0] = row_sum;
-}
-
-kernel void kernel_soft_max(
-        device const float * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant     float & scale,
-        threadgroup float  * buf [[threadgroup(0)]],
-        uint  tgpig[[threadgroup_position_in_grid]],
-        uint  tpitg[[thread_position_in_threadgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint    ntg[[threads_per_threadgroup]]) {
-    const int64_t i03 = (tgpig) / (ne02*ne01);
-    const int64_t i02 = (tgpig - i03*ne02*ne01) / ne01;
-    const int64_t i01 = (tgpig - i03*ne02*ne01 - i02*ne01);
-
-    device const float * psrc0 =         src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
-    device const float * pmask = src1 != src0 ? src1                               + i01*ne00 : nullptr;
-    device       float * pdst  =         dst  + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
-
-    // parallel max
-    float lmax = -INFINITY;
-
-    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
-        lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f));
-    }
-
-    // find the max value in the block
-    float max_val = simd_max(lmax);
-    if (ntg > N_SIMDWIDTH) {
-        if (sgitg == 0) {
-            buf[tiisg] = -INFINITY;
-        }
-
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-
-        if (tiisg == 0) {
-            buf[sgitg] = max_val;
-        }
-
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-
-        max_val = buf[tiisg];
-        max_val = simd_max(max_val);
-    }
-
-    // parallel sum
-    float lsum = 0.0f;
-    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
-        const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f)) - max_val);
-        lsum += exp_psrc0;
-        pdst[i00] = exp_psrc0;
-    }
-
-    // This barrier fixes a failing test
-    // ref: https://github.com/ggerganov/ggml/pull/621#discussion_r1425156335
-    threadgroup_barrier(mem_flags::mem_none);
-
-    float sum = simd_sum(lsum);
-
-    if (ntg > N_SIMDWIDTH) {
-        if (sgitg == 0) {
-            buf[tiisg] = 0.0f;
-        }
-
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-
-        if (tiisg == 0) {
-            buf[sgitg] = sum;
-        }
-
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-
-        sum = buf[tiisg];
-        sum = simd_sum(sum);
-    }
-
-    const float inv_sum = 1.0f/sum;
-
-    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
-        pdst[i00] *= inv_sum;
-    }
-}
-
-kernel void kernel_soft_max_4(
-        device const float * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant     float & scale,
-        threadgroup float  * buf [[threadgroup(0)]],
-        uint  tgpig[[threadgroup_position_in_grid]],
-        uint  tpitg[[thread_position_in_threadgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint    ntg[[threads_per_threadgroup]]) {
-    const int64_t i03 = (tgpig) / (ne02*ne01);
-    const int64_t i02 = (tgpig - i03*ne02*ne01) / ne01;
-    const int64_t i01 = (tgpig - i03*ne02*ne01 - i02*ne01);
-
-    device const float4 * psrc4 =                (device const float4 *)(src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
-    device const float4 * pmask = src1 != src0 ? (device const float4 *)(src1 +                                      i01*ne00) : nullptr;
-    device       float4 * pdst4 =                (device       float4 *)(dst  + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
-
-    // parallel max
-    float4 lmax4 = -INFINITY;
-
-    for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
-        lmax4 = fmax(lmax4, psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f));
-    }
-
-    const float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3]));
-
-    float max_val = simd_max(lmax);
-    if (ntg > N_SIMDWIDTH) {
-        if (sgitg == 0) {
-            buf[tiisg] = -INFINITY;
-        }
-
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-
-        if (tiisg == 0) {
-            buf[sgitg] = max_val;
-        }
-
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-
-        max_val = buf[tiisg];
-        max_val = simd_max(max_val);
-    }
-
-    // parallel sum
-    float4 lsum4 = 0.0f;
-    for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
-        const float4 exp_psrc4 = exp((psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f)) - max_val);
-        lsum4 += exp_psrc4;
-        pdst4[i00] = exp_psrc4;
-    }
-
-    const float lsum = lsum4[0] + lsum4[1] + lsum4[2] + lsum4[3];
-
-    // This barrier fixes a failing test
-    // ref: https://github.com/ggerganov/ggml/pull/621#discussion_r1425156335
-    threadgroup_barrier(mem_flags::mem_none);
-
-    float sum = simd_sum(lsum);
-
-    if (ntg > N_SIMDWIDTH) {
-        if (sgitg == 0) {
-            buf[tiisg] = 0.0f;
-        }
-
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-
-        if (tiisg == 0) {
-            buf[sgitg] = sum;
-        }
-
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-
-        sum = buf[tiisg];
-        sum = simd_sum(sum);
-    }
-
-    const float inv_sum = 1.0f/sum;
-
-    for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
-        pdst4[i00] *= inv_sum;
-    }
-}
-
-kernel void kernel_diag_mask_inf(
-        device const float * src0,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant       int & n_past,
-        uint3 tpig[[thread_position_in_grid]]) {
-    const int64_t i02 = tpig[2];
-    const int64_t i01 = tpig[1];
-    const int64_t i00 = tpig[0];
-
-    if (i00 > n_past + i01) {
-        dst[i02*ne01*ne00 + i01*ne00 + i00] = -INFINITY;
-    } else {
-        dst[i02*ne01*ne00 + i01*ne00 + i00] = src0[i02*ne01*ne00 + i01*ne00 + i00];
-    }
-}
-
-kernel void kernel_diag_mask_inf_8(
-        device const float4 * src0,
-        device       float4 * dst,
-        constant    int64_t & ne00,
-        constant    int64_t & ne01,
-        constant        int & n_past,
-        uint3 tpig[[thread_position_in_grid]]) {
-
-    const int64_t i = 2*tpig[0];
-
-    dst[i+0] = src0[i+0];
-    dst[i+1] = src0[i+1];
-    int64_t i4 = 4*i;
-    const int64_t i02 = i4/(ne00*ne01); i4 -= i02*ne00*ne01;
-    const int64_t i01 = i4/(ne00);      i4 -= i01*ne00;
-    const int64_t i00 = i4;
-    for (int k = 3; k >= 0; --k) {
-        if (i00 + 4 + k <= n_past + i01) {
-            break;
-        }
-        dst[i+1][k] = -INFINITY;
-        if (i00 + k > n_past + i01) {
-            dst[i][k] = -INFINITY;
-        }
-    }
-}
-
-kernel void kernel_norm(
-        device const  void * src0,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant  uint64_t & nb01,
-        constant     float & eps,
-        threadgroup float  * sum [[threadgroup(0)]],
-        uint tgpig[[threadgroup_position_in_grid]],
-        uint tpitg[[thread_position_in_threadgroup]],
-        uint   ntg[[threads_per_threadgroup]]) {
-    device const float * x = (device const float *) ((device const char *) src0 + tgpig*nb01);
-    // MEAN
-    // parallel sum
-    sum[tpitg] = 0.0f;
-    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
-        sum[tpitg] += x[i00];
-    }
-    // reduce
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    for (uint i = ntg/2; i > 0; i /= 2) {
-        if (tpitg < i) {
-            sum[tpitg] += sum[tpitg + i];
-        }
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-    }
-    const float mean  = sum[0] / ne00;
-
-    // recenter and VARIANCE
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    device float * y = dst + tgpig*ne00;
-    sum[tpitg] = 0.0f;
-    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
-        y[i00] = x[i00] - mean;
-        sum[tpitg] += y[i00] * y[i00];
-    }
-
-    // reduce
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    for (uint i = ntg/2; i > 0; i /= 2) {
-        if (tpitg < i) {
-            sum[tpitg] += sum[tpitg + i];
-        }
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-    }
-    const float variance = sum[0] / ne00;
-
-    const float scale = 1.0f/sqrt(variance + eps);
-    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
-        y[i00] = y[i00] * scale;
-    }
-}
-
-kernel void kernel_rms_norm(
-        device const  void * src0,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant  uint64_t & nb01,
-        constant     float & eps,
-        threadgroup float  * buf [[threadgroup(0)]],
-        uint tgpig[[threadgroup_position_in_grid]],
-        uint tpitg[[thread_position_in_threadgroup]],
-        uint sgitg[[simdgroup_index_in_threadgroup]],
-        uint tiisg[[thread_index_in_simdgroup]],
-        uint   ntg[[threads_per_threadgroup]]) {
-    device const float4 * x = (device const float4 *) ((device const char *) src0 + tgpig*nb01);
-
-    float4 sumf = 0;
-    float all_sum = 0;
-
-    // parallel sum
-    for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
-        sumf += x[i00] * x[i00];
-    }
-    all_sum = sumf[0] + sumf[1] + sumf[2] + sumf[3];
-    all_sum = simd_sum(all_sum);
-    if (ntg > N_SIMDWIDTH) {
-        if (sgitg == 0) {
-            buf[tiisg] = 0.0f;
-        }
-
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-
-        if (tiisg == 0) {
-            buf[sgitg] = all_sum;
-        }
-
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-
-        all_sum = buf[tiisg];
-        all_sum = simd_sum(all_sum);
-    }
-
-    const float mean  = all_sum/ne00;
-    const float scale = 1.0f/sqrt(mean + eps);
-
-    device float4 * y = (device float4 *) (dst + tgpig*ne00);
-    for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
-        y[i00] = x[i00] * scale;
-    }
-}
-
-kernel void kernel_group_norm(
-        device const float * src0,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant   int32_t & n_groups,
-        constant     float & eps,
-        threadgroup float  * buf [[threadgroup(0)]],
-        uint tgpig[[threadgroup_position_in_grid]],
-        uint tpitg[[thread_position_in_threadgroup]],
-        uint sgitg[[simdgroup_index_in_threadgroup]],
-        uint tiisg[[thread_index_in_simdgroup]],
-        uint   ntg[[threads_per_threadgroup]]) {
-    const int64_t ne = ne00*ne01*ne02;
-    const int64_t gs = ne00*ne01*((ne02 + n_groups - 1) / n_groups);
-
-    int start = tgpig * gs;
-    int end   = start + gs;
-
-    start += tpitg;
-
-    if (end >= ne) {
-        end = ne;
-    }
-
-    float tmp = 0.0f; // partial sum for thread in warp
-
-    for (int j = start; j < end; j += ntg) {
-        tmp += src0[j];
-    }
-
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-    tmp = simd_sum(tmp);
-    if (ntg > N_SIMDWIDTH) {
-        if (sgitg == 0) {
-            buf[tiisg] = 0.0f;
-        }
-
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-
-        if (tiisg == 0) {
-            buf[sgitg] = tmp;
-        }
-
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-
-        tmp = buf[tiisg];
-        tmp = simd_sum(tmp);
-    }
-
-    const float mean = tmp / gs;
-    tmp = 0.0f;
-
-    for (int j = start; j < end; j += ntg) {
-        float xi = src0[j] - mean;
-        dst[j] = xi;
-        tmp += xi * xi;
-    }
-
-    tmp = simd_sum(tmp);
-    if (ntg > N_SIMDWIDTH) {
-        if (sgitg == 0) {
-            buf[tiisg] = 0.0f;
-        }
-
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-
-        if (tiisg == 0) {
-            buf[sgitg] = tmp;
-        }
-
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-
-        tmp = buf[tiisg];
-        tmp = simd_sum(tmp);
-    }
-
-    const float variance = tmp / gs;
-    const float scale = 1.0f/sqrt(variance + eps);
-    for (int j = start; j < end; j += ntg) {
-        dst[j] *= scale;
-    }
-}
-
-// function for calculate inner product between half a q4_0 block and 16 floats (yl), sumy is SUM(yl[i])
-// il indicates where the q4 quants begin (0 or QK4_0/4)
-// we assume that the yl's have been multiplied with the appropriate scale factor
-// that corresponds to the missing bit shifts (1, 1/16, 1/256, 1/4096)
-inline float block_q_n_dot_y(device const block_q4_0 * qb_curr, float sumy, thread float * yl, int il) {
-    float d = qb_curr->d;
-
-    float2 acc = 0.f;
-
-    device const uint16_t * qs = ((device const uint16_t *)qb_curr + 1 + il/2);
-
-    for (int i = 0; i < 8; i+=2) {
-        acc[0] += yl[i + 0] * (qs[i / 2] & 0x000F)
-                + yl[i + 1] * (qs[i / 2] & 0x0F00);
-        acc[1] += yl[i + 8] * (qs[i / 2] & 0x00F0)
-                + yl[i + 9] * (qs[i / 2] & 0xF000);
-    }
-    return d * (sumy * -8.f + acc[0] + acc[1]);
-}
-
-// function for calculate inner product between half a q4_1 block and 16 floats (yl), sumy is SUM(yl[i])
-// il indicates where the q4 quants begin (0 or QK4_0/4)
-// we assume that the yl's have been multiplied with the appropriate scale factor
-// that corresponds to the missing bit shifts (1, 1/16, 1/256, 1/4096)
-inline float block_q_n_dot_y(device const block_q4_1 * qb_curr, float sumy, thread float * yl, int il) {
-    float d = qb_curr->d;
-    float m = qb_curr->m;
-
-    float2 acc = 0.f;
-
-    device const uint16_t * qs = ((device const uint16_t *)qb_curr + 2 + il/2);
-
-    for (int i = 0; i < 8; i+=2) {
-        acc[0] += yl[i + 0] * (qs[i / 2] & 0x000F)
-                + yl[i + 1] * (qs[i / 2] & 0x0F00);
-        acc[1] += yl[i + 8] * (qs[i / 2] & 0x00F0)
-                + yl[i + 9] * (qs[i / 2] & 0xF000);
-    }
-    return d * (acc[0] + acc[1]) + sumy * m;
-}
-
-// function for calculate inner product between half a q5_0 block and 16 floats (yl), sumy is SUM(yl[i])
-// il indicates where the q5 quants begin (0 or QK5_0/4)
-// we assume that the yl's have been multiplied with the appropriate scale factor
-// that corresponds to the missing bit shifts (1, 1/16, 1/256, 1/4096)
-inline float block_q_n_dot_y(device const block_q5_0 * qb_curr, float sumy, thread float * yl, int il) {
-    float d = qb_curr->d;
-
-    float2 acc = 0.f;
-
-    device const uint16_t * qs =  ((device const uint16_t *)qb_curr + 3 + il/2);
-           const uint32_t   qh = *((device const uint32_t *)qb_curr->qh);
-
-    for (int i = 0; i < 8; i+=2) {
-        acc[0] += yl[i + 0] * ((qs[i / 2] & 0x000F) | ((qh >> (i+0+il        ) << 4 ) & 0x00010))
-                + yl[i + 1] * ((qs[i / 2] & 0x0F00) | ((qh >> (i+1+il        ) << 12) & 0x01000));
-        acc[1] += yl[i + 8] * ((qs[i / 2] & 0x00F0) | ((qh >> (i+0+il+QK5_0/2) << 8 ) & 0x00100))
-                + yl[i + 9] * ((qs[i / 2] & 0xF000) | ((qh >> (i+1+il+QK5_0/2) << 16) & 0x10000));
-    }
-    return d * (sumy * -16.f + acc[0] + acc[1]);
-}
-
-// function for calculate inner product between half a q5_1 block and 16 floats (yl), sumy is SUM(yl[i])
-// il indicates where the q5 quants begin (0 or QK5_1/4)
-// we assume that the yl's have been multiplied with the appropriate scale factor
-// that corresponds to the missing bit shifts (1, 1/16, 1/256, 1/4096)
-inline float block_q_n_dot_y(device const block_q5_1 * qb_curr, float sumy, thread float * yl, int il) {
-    float d = qb_curr->d;
-    float m = qb_curr->m;
-
-    float2 acc = 0.f;
-
-    device const uint16_t * qs =  ((device const uint16_t *)qb_curr + 4 + il/2);
-           const uint32_t   qh = *((device const uint32_t *)qb_curr->qh);
-
-    for (int i = 0; i < 8; i+=2) {
-        acc[0] += yl[i + 0] * ((qs[i / 2] & 0x000F) | ((qh >> (i+0+il        ) << 4 ) & 0x00010))
-                + yl[i + 1] * ((qs[i / 2] & 0x0F00) | ((qh >> (i+1+il        ) << 12) & 0x01000));
-        acc[1] += yl[i + 8] * ((qs[i / 2] & 0x00F0) | ((qh >> (i+0+il+QK5_0/2) << 8 ) & 0x00100))
-                + yl[i + 9] * ((qs[i / 2] & 0xF000) | ((qh >> (i+1+il+QK5_0/2) << 16) & 0x10000));
-    }
-    return d * (acc[0] + acc[1]) + sumy * m;
-}
-
-// putting them in the kernel cause a significant performance penalty
-#define N_DST 4        // each SIMD group works on 4 rows
-#define N_SIMDGROUP 2  // number of SIMD groups in a thread group
-//Note: This is a template, but strictly speaking it only applies to
-//      quantizations where the block size is 32. It also does not
-//      guard against the number of rows not being divisible by
-//      N_DST, so this is another explicit assumption of the implementation.
-template<typename block_q_type, int nr, int nsg, int nw>
-void mul_vec_q_n_f32_impl(
-        device const void  * src0,
-        device const float * src1,
-        device       float * dst,
-                   int64_t   ne00,
-                   int64_t   ne01,
-                   int64_t   ne02,
-                   int64_t   ne10,
-                   int64_t   ne12,
-                   int64_t   ne0,
-                   int64_t   ne1,
-                   uint      r2,
-                   uint      r3,
-                   uint3 tgpig, uint tiisg, uint sgitg) {
-    const int nb = ne00/QK4_0;
-
-    const int r0 = tgpig.x;
-    const int r1 = tgpig.y;
-    const int im = tgpig.z;
-
-    const int first_row = (r0 * nsg + sgitg) * nr;
-
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
-
-    const uint offset0 = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
-
-    device const block_q_type * x = (device const block_q_type *) src0 + offset0;
-    device const float        * y = (device const float        *) src1 + r1*ne10 + im*ne00*ne1;
-
-    float yl[16]; // src1 vector cache
-    float sumf[nr] = {0.f};
-
-    const int ix = (tiisg/2);
-    const int il = (tiisg%2)*8;
-
-    device const float * yb = y + ix * QK4_0 + il;
-
-    // each thread in a SIMD group deals with half a block.
-    for (int ib = ix; ib < nb; ib += nw/2) {
-        float sumy = 0;
-        for (int i = 0; i < 8; i += 2) {
-            sumy += yb[i] + yb[i+1];
-            yl[i+0] = yb[i+ 0];
-            yl[i+1] = yb[i+ 1]/256.f;
-
-            sumy += yb[i+16] + yb[i+17];
-            yl[i+8] = yb[i+16]/16.f;
-            yl[i+9] = yb[i+17]/4096.f;
-        }
-
-        for (int row = 0; row < nr; row++) {
-            sumf[row] += block_q_n_dot_y(x+ib+row*nb, sumy, yl, il);
-        }
-
-        yb += QK4_0 * 16;
-    }
-
-    for (int row = 0; row < nr; ++row) {
-        const float tot = simd_sum(sumf[row]);
-        if (tiisg == 0 && first_row + row < ne01) {
-            dst[im*ne0*ne1 + r1*ne0 + first_row + row] = tot;
-        }
-    }
-}
-
-kernel void kernel_mul_mv_q4_0_f32(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-    mul_vec_q_n_f32_impl<block_q4_0, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,r2,r3,tgpig,tiisg,sgitg);
-}
-
-kernel void kernel_mul_mv_q4_1_f32(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint tiisg[[thread_index_in_simdgroup]],
-        uint sgitg[[simdgroup_index_in_threadgroup]]) {
-     mul_vec_q_n_f32_impl<block_q4_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,r2,r3,tgpig,tiisg,sgitg);
-}
-
-kernel void kernel_mul_mv_q5_0_f32(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-    mul_vec_q_n_f32_impl<block_q5_0, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,r2,r3,tgpig,tiisg,sgitg);
-}
-
-kernel void kernel_mul_mv_q5_1_f32(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-    mul_vec_q_n_f32_impl<block_q5_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,r2,r3,tgpig,tiisg,sgitg);
-}
-
-
-#define NB_Q8_0 8
-
-void kernel_mul_mv_q8_0_f32_impl(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-    const int nr  = N_DST;
-    const int nsg = N_SIMDGROUP;
-    const int nw  = N_SIMDWIDTH;
-
-    const int nb = ne00/QK8_0;
-    const int r0 = tgpig.x;
-    const int r1 = tgpig.y;
-    const int im = tgpig.z;
-
-    const int first_row = (r0 * nsg + sgitg) * nr;
-
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
-
-    const uint offset0 = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
-
-    device const block_q8_0 * x = (device const block_q8_0 *) src0 + offset0;
-    device const float      * y = (device const float      *) src1 + r1*ne10 + im*ne00*ne1;
-
-    float yl[NB_Q8_0];
-    float sumf[nr]={0.f};
-
-    const int ix = tiisg/4;
-    const int il = tiisg%4;
-
-    device const float * yb = y + ix * QK8_0 + NB_Q8_0*il;
-
-    // each thread in a SIMD group deals with NB_Q8_0 quants at a time
-    for (int ib = ix; ib < nb; ib += nw/4) {
-        for (int i = 0; i < NB_Q8_0; ++i) {
-            yl[i] = yb[i];
-        }
-
-        for (int row = 0; row < nr; row++) {
-            device const int8_t * qs = x[ib+row*nb].qs + NB_Q8_0*il;
-            float sumq = 0.f;
-            for (int iq = 0; iq < NB_Q8_0; ++iq) {
-                sumq += qs[iq] * yl[iq];
-            }
-            sumf[row] += sumq*x[ib+row*nb].d;
-        }
-
-        yb += NB_Q8_0 * nw;
-    }
-
-    for (int row = 0; row < nr; ++row) {
-        const float tot = simd_sum(sumf[row]);
-        if (tiisg == 0 && first_row + row < ne01) {
-            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = tot;
-        }
-    }
-}
-
-[[host_name("kernel_mul_mv_q8_0_f32")]]
-kernel void kernel_mul_mv_q8_0_f32(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-    kernel_mul_mv_q8_0_f32_impl(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,r2,r3,tgpig,tiisg,sgitg);
-}
-
-#define N_F32_F32 4
-
-void kernel_mul_mv_f32_f32_impl(
-        device const  char * src0,
-        device const  char * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]]) {
-
-    const int64_t r0 = tgpig.x;
-    const int64_t rb = tgpig.y*N_F32_F32;
-    const int64_t im = tgpig.z;
-
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
-
-    const uint offset0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb02*ne02;
-
-    device const float * x = (device const float *) (src0 + offset0);
-
-    if (ne00 < 128) {
-        for (int row = 0; row < N_F32_F32; ++row) {
-            int r1 = rb + row;
-            if (r1 >= ne11) {
-                break;
-            }
-
-            device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
-
-            float sumf = 0;
-            for (int i = tiisg; i < ne00; i += 32) {
-                sumf += (float) x[i] * (float) y[i];
-            }
-
-            float all_sum = simd_sum(sumf);
-            if (tiisg == 0) {
-                dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
-            }
-        }
-    } else {
-        device const float4 * x4 = (device const float4 *)x;
-        for (int row = 0; row < N_F32_F32; ++row) {
-            int r1 = rb + row;
-            if (r1 >= ne11) {
-                break;
-            }
-
-            device const float  * y  = (device const float  *) (src1 + r1*nb11 + im*nb12);
-            device const float4 * y4 = (device const float4 *) y;
-
-            float sumf = 0;
-            for (int i = tiisg; i < ne00/4; i += 32) {
-                for (int k = 0; k < 4; ++k) sumf += (float) x4[i][k] * y4[i][k];
-            }
-
-            float all_sum = simd_sum(sumf);
-            if (tiisg == 0) {
-                for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) x[i] * y[i];
-                dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
-            }
-        }
-    }
-}
-
-[[host_name("kernel_mul_mv_f32_f32")]]
-kernel void kernel_mul_mv_f32_f32(
-        device const  char * src0,
-        device const  char * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]]) {
-    kernel_mul_mv_f32_f32_impl(src0, src1, dst, ne00, ne01, ne02, nb00, nb01, nb02, ne10, ne11, ne12, nb10, nb11, nb12, ne0, ne1, r2, r3, tgpig, tiisg);
-}
-
-#define N_F16_F16 4
-
-kernel void kernel_mul_mv_f16_f16(
-        device const  char * src0,
-        device const  char * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]]) {
-
-    const int64_t r0 = tgpig.x;
-    const int64_t rb = tgpig.y*N_F16_F16;
-    const int64_t im = tgpig.z;
-
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
-
-    const uint offset0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb02*ne02;
-
-    device const half * x = (device const half *) (src0 + offset0);
-
-    if (ne00 < 128) {
-        for (int row = 0; row < N_F16_F16; ++row) {
-            int r1 = rb + row;
-            if (r1 >= ne11) {
-                break;
-            }
-
-            device const half * y = (device const half *) (src1 + r1*nb11 + im*nb12);
-
-            float sumf = 0;
-            for (int i = tiisg; i < ne00; i += 32) {
-                sumf += (half) x[i] * (half) y[i];
-            }
-
-            float all_sum = simd_sum(sumf);
-            if (tiisg == 0) {
-                dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
-            }
-        }
-    } else {
-        device const half4 * x4 = (device const half4 *)x;
-        for (int row = 0; row < N_F16_F16; ++row) {
-            int r1 = rb + row;
-            if (r1 >= ne11) {
-                break;
-            }
-
-            device const half  * y  = (device const half  *) (src1 + r1*nb11 + im*nb12);
-            device const half4 * y4 = (device const half4 *) y;
-
-            float sumf = 0;
-            for (int i = tiisg; i < ne00/4; i += 32) {
-                for (int k = 0; k < 4; ++k) sumf += (half) x4[i][k] * y4[i][k];
-            }
-
-            float all_sum = simd_sum(sumf);
-            if (tiisg == 0) {
-                for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (half) x[i] * y[i];
-                dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
-            }
-        }
-    }
-}
-
-void kernel_mul_mv_f16_f32_1row_impl(
-        device const  char * src0,
-        device const  char * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]]) {
-
-    const int64_t r0 = tgpig.x;
-    const int64_t r1 = tgpig.y;
-    const int64_t im = tgpig.z;
-
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
-
-    const uint offset0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb02*ne02;
-
-    device const half  * x = (device const half  *) (src0 + offset0);
-    device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
-
-    float sumf = 0;
-    if (ne00 < 128) {
-        for (int i = tiisg; i < ne00; i += 32) {
-            sumf += (float) x[i] * (float) y[i];
-        }
-        float all_sum = simd_sum(sumf);
-        if (tiisg == 0) {
-            dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
-        }
-    } else {
-        device const half4  * x4 = (device const half4  *) x;
-        device const float4 * y4 = (device const float4 *) y;
-        for (int i = tiisg; i < ne00/4; i += 32) {
-            for (int k = 0; k < 4; ++k) sumf += (float)x4[i][k] * y4[i][k];
-        }
-        float all_sum = simd_sum(sumf);
-        if (tiisg == 0) {
-            for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) x[i] * y[i];
-            dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
-        }
-    }
-}
-
-[[host_name("kernel_mul_mv_f16_f32_1row")]]
-kernel void kernel_mul_mv_f16_f32_1row(
-        device const  char * src0,
-        device const  char * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]]) {
-    kernel_mul_mv_f16_f32_1row_impl(src0, src1, dst, ne00, ne01, ne02, nb00, nb01, nb02, ne10, ne11, ne12, nb10, nb11, nb12, ne0, ne1, r2, r3, tgpig, tiisg);
-}
-
-#define N_F16_F32 4
-
-void kernel_mul_mv_f16_f32_impl(
-        device const  char * src0,
-        device const  char * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint tiisg[[thread_index_in_simdgroup]]) {
-
-    const int64_t r0 = tgpig.x;
-    const int64_t rb = tgpig.y*N_F16_F32;
-    const int64_t im = tgpig.z;
-
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
-
-    const uint offset0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb02*ne02;
-
-    device const half * x = (device const half *) (src0 + offset0);
-
-    if (ne00 < 128) {
-        for (int row = 0; row < N_F16_F32; ++row) {
-            int r1 = rb + row;
-            if (r1 >= ne11) {
-                break;
-            }
-
-            device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
-
-            float sumf = 0;
-            for (int i = tiisg; i < ne00; i += 32) {
-                sumf += (float) x[i] * (float) y[i];
-            }
-
-            float all_sum = simd_sum(sumf);
-            if (tiisg == 0) {
-                dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
-            }
-        }
-    } else {
-        device const half4 * x4 = (device const half4 *)x;
-        for (int row = 0; row < N_F16_F32; ++row) {
-            int r1 = rb + row;
-            if (r1 >= ne11) {
-                break;
-            }
-
-            device const float  * y  = (device const float  *) (src1 + r1*nb11 + im*nb12);
-            device const float4 * y4 = (device const float4 *) y;
-
-            float sumf = 0;
-            for (int i = tiisg; i < ne00/4; i += 32) {
-                for (int k = 0; k < 4; ++k) sumf += (float) x4[i][k] * y4[i][k];
-            }
-
-            float all_sum = simd_sum(sumf);
-            if (tiisg == 0) {
-                for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) x[i] * y[i];
-                dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
-            }
-        }
-    }
-}
-
-[[host_name("kernel_mul_mv_f16_f32")]]
-kernel void kernel_mul_mv_f16_f32(
-        device const  char * src0,
-        device const  char * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint tiisg[[thread_index_in_simdgroup]]) {
-    kernel_mul_mv_f16_f32_impl(src0, src1, dst, ne00, ne01, ne02, nb00, nb01, nb02, ne10, ne11, ne12, nb10, nb11, nb12, ne0, ne1, r2, r3, tgpig, tiisg);
-}
-
-// Assumes row size (ne00) is a multiple of 4
-kernel void kernel_mul_mv_f16_f32_l4(
-        device const  char * src0,
-        device const  char * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint tiisg[[thread_index_in_simdgroup]]) {
-
-    const int nrows = ne11;
-    const int64_t r0 = tgpig.x;
-    const int64_t im = tgpig.z;
-
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
-
-    const uint offset0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb02*ne02;
-
-    device const half4 * x4 = (device const half4 *) (src0 + offset0);
-
-    for (int r1 = 0; r1 < nrows; ++r1) {
-        device const float4 * y4 = (device const float4 *) (src1 + r1*nb11 + im*nb12);
-
-        float sumf = 0;
-        for (int i = tiisg; i < ne00/4; i += 32) {
-            for (int k = 0; k < 4; ++k) sumf += (float) x4[i][k] * y4[i][k];
-        }
-
-        float all_sum = simd_sum(sumf);
-        if (tiisg == 0) {
-            dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
-        }
-    }
-}
-
-kernel void kernel_alibi_f32(
-        device const float * src0,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant   int64_t & ne03,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant  uint64_t & nb03,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   int64_t & ne2,
-        constant   int64_t & ne3,
-        constant  uint64_t & nb0,
-        constant  uint64_t & nb1,
-        constant  uint64_t & nb2,
-        constant  uint64_t & nb3,
-        constant     float & m0,
-        constant     float & m1,
-        constant       int & n_heads_log2_floor,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint3 tpitg[[thread_position_in_threadgroup]],
-        uint3   ntg[[threads_per_threadgroup]]) {
-    const int64_t i03 = tgpig[2];
-    const int64_t i02 = tgpig[1];
-    const int64_t i01 = tgpig[0];
-
-    const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
-
-    const int64_t i3 = n / (ne2*ne1*ne0);
-    const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
-    const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
-  //const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0);
-
-    const int64_t k = i3*ne3 + i2;
-
-    float m_k;
-    if (k < n_heads_log2_floor) {
-        m_k = pow(m0, k + 1);
-    } else {
-        m_k = pow(m1, 2 * (k - n_heads_log2_floor) + 1);
-    }
-
-    device       char * dst_row = (device char *) dst + i3*nb3 + i2*nb2 + i1*nb1;
-    device const char * src_row = (device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01;
-    for (int64_t i00 = tpitg.x; i00 < ne00; i00 += ntg.x) {
-        const  float   src_v = *(device float *)(src_row + i00*nb00);
-        device float * dst_v =  (device float *)(dst_row + i00*nb0);
-        *dst_v = i00 * m_k + src_v;
-    }
-}
-
-static float rope_yarn_ramp(const float low, const float high, const int i0) {
-    const float y = (i0 / 2 - low) / max(0.001f, high - low);
-    return 1.0f - min(1.0f, max(0.0f, y));
-}
-
-// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
-// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
-static void rope_yarn(
-    float theta_extrap, float freq_scale, float corr_dims[2], int64_t i0, float ext_factor, float mscale,
-    thread float * cos_theta, thread float * sin_theta
-) {
-    // Get n-d rotational scaling corrected for extrapolation
-    float theta_interp = freq_scale * theta_extrap;
-    float theta = theta_interp;
-    if (ext_factor != 0.0f) {
-        float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor;
-        theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
-
-        // Get n-d magnitude scaling corrected for interpolation
-        mscale *= 1.0f + 0.1f * log(1.0f / freq_scale);
-    }
-    *cos_theta = cos(theta) * mscale;
-    *sin_theta = sin(theta) * mscale;
-}
-
-// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get
-// `corr_fac(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))`
-static float rope_yarn_corr_factor(int n_dims, int n_orig_ctx, float n_rot, float base) {
-    return n_dims * log(n_orig_ctx / (n_rot * 2 * M_PI_F)) / (2 * log(base));
-}
-
-static void rope_yarn_corr_dims(
-    int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]
-) {
-    // start and end correction dims
-    dims[0] = max(0.0f,         floor(rope_yarn_corr_factor(n_dims, n_orig_ctx, beta_fast, freq_base)));
-    dims[1] = min(n_dims - 1.0f, ceil(rope_yarn_corr_factor(n_dims, n_orig_ctx, beta_slow, freq_base)));
-}
-
-typedef void (rope_t)(
-        device const    void * src0,
-        device const int32_t * src1,
-        device         float * dst,
-        constant     int64_t & ne00,
-        constant     int64_t & ne01,
-        constant     int64_t & ne02,
-        constant     int64_t & ne03,
-        constant    uint64_t & nb00,
-        constant    uint64_t & nb01,
-        constant    uint64_t & nb02,
-        constant    uint64_t & nb03,
-        constant     int64_t & ne0,
-        constant     int64_t & ne1,
-        constant     int64_t & ne2,
-        constant     int64_t & ne3,
-        constant    uint64_t & nb0,
-        constant    uint64_t & nb1,
-        constant    uint64_t & nb2,
-        constant    uint64_t & nb3,
-        constant         int & n_past,
-        constant         int & n_dims,
-        constant         int & mode,
-        constant         int & n_orig_ctx,
-        constant       float & freq_base,
-        constant       float & freq_scale,
-        constant       float & ext_factor,
-        constant       float & attn_factor,
-        constant       float & beta_fast,
-        constant       float & beta_slow,
-        uint  tiitg[[thread_index_in_threadgroup]],
-        uint3 tptg[[threads_per_threadgroup]],
-        uint3 tgpig[[threadgroup_position_in_grid]]);
-
-template<typename T>
-kernel void kernel_rope(
-        device const    void * src0,
-        device const int32_t * src1,
-        device         float * dst,
-        constant     int64_t & ne00,
-        constant     int64_t & ne01,
-        constant     int64_t & ne02,
-        constant     int64_t & ne03,
-        constant    uint64_t & nb00,
-        constant    uint64_t & nb01,
-        constant    uint64_t & nb02,
-        constant    uint64_t & nb03,
-        constant     int64_t & ne0,
-        constant     int64_t & ne1,
-        constant     int64_t & ne2,
-        constant     int64_t & ne3,
-        constant    uint64_t & nb0,
-        constant    uint64_t & nb1,
-        constant    uint64_t & nb2,
-        constant    uint64_t & nb3,
-        constant         int & n_past,
-        constant         int & n_dims,
-        constant         int & mode,
-        constant         int & n_orig_ctx,
-        constant       float & freq_base,
-        constant       float & freq_scale,
-        constant       float & ext_factor,
-        constant       float & attn_factor,
-        constant       float & beta_fast,
-        constant       float & beta_slow,
-        uint  tiitg[[thread_index_in_threadgroup]],
-        uint3 tptg[[threads_per_threadgroup]],
-        uint3 tgpig[[threadgroup_position_in_grid]]) {
-    const int64_t i3 = tgpig[2];
-    const int64_t i2 = tgpig[1];
-    const int64_t i1 = tgpig[0];
-
-    const bool is_neox = mode & 2;
-
-    float corr_dims[2];
-    rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims);
-
-    device const int32_t * pos = src1;
-
-    const int64_t p = pos[i2];
-
-    const float theta_0 = (float)p;
-    const float inv_ndims = -1.f/n_dims;
-
-    if (!is_neox) {
-        for (int64_t i0 = 2*tiitg; i0 < ne0; i0 += 2*tptg.x) {
-
-            const float theta = theta_0 * pow(freq_base, inv_ndims*i0);
-            float cos_theta, sin_theta;
-            rope_yarn(theta, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);
-
-            device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-            device       T * dst_data  = (device T *)((device char *)  dst + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
-
-            const T x0 = src[0];
-            const T x1 = src[1];
-
-            dst_data[0] = x0*cos_theta - x1*sin_theta;
-            dst_data[1] = x0*sin_theta + x1*cos_theta;
-        }
-    } else {
-        for (int64_t ic = 2*tiitg; ic < ne0; ic += 2*tptg.x) {
-            if (ic < n_dims) {
-                const int64_t ib = 0;
-
-                // simplified from `(ib * n_dims + ic) * inv_ndims`
-                const float cur_rot = inv_ndims*ic - ib;
-
-                const float theta = theta_0 * pow(freq_base, cur_rot);
-                float cos_theta, sin_theta;
-                rope_yarn(theta, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
-
-                const int64_t i0 = ib*n_dims + ic/2;
-
-                device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                device       T * dst_data  = (device T *)((device char *)  dst + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
-
-                const float x0 = src[0];
-                const float x1 = src[n_dims/2];
-
-                dst_data[0]        = x0*cos_theta - x1*sin_theta;
-                dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
-            } else {
-                const int64_t i0 = ic;
-
-                device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                device       T * dst_data  = (device T *)((device char *)  dst + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
-
-                dst_data[0] = src[0];
-                dst_data[1] = src[1];
-            }
-        }
-    }
-}
-
-template [[host_name("kernel_rope_f32")]] kernel rope_t kernel_rope<float>;
-template [[host_name("kernel_rope_f16")]] kernel rope_t kernel_rope<half>;
-
-kernel void kernel_im2col_f16(
-        device const float * x,
-        device       half * dst,
-        constant   int32_t & ofs0,
-        constant   int32_t & ofs1,
-        constant   int32_t & IW,
-        constant   int32_t & IH,
-        constant   int32_t & CHW,
-        constant   int32_t & s0,
-        constant   int32_t & s1,
-        constant   int32_t & p0,
-        constant   int32_t & p1,
-        constant   int32_t & d0,
-        constant   int32_t & d1,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint3  tgpg[[threadgroups_per_grid]],
-        uint3 tpitg[[thread_position_in_threadgroup]],
-        uint3   ntg[[threads_per_threadgroup]]) {
-    const int32_t iiw = tgpig[2] * s0 + tpitg[2] * d0 - p0;
-    const int32_t iih = tgpig[1] * s1 + tpitg[1] * d1 - p1;
-
-    const int32_t offset_dst =
-        (tpitg[0] * tgpg[1] * tgpg[2] + tgpig[1] * tgpg[2] + tgpig[2]) * CHW +
-        (tgpig[0] * (ntg[1] * ntg[2]) + tpitg[1] * ntg[2] + tpitg[2]);
-
-    if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
-        dst[offset_dst] = 0.0f;
-    } else {
-        const int32_t offset_src = tpitg[0] * ofs0 + tgpig[0] * ofs1;
-        dst[offset_dst] = x[offset_src + iih * IW + iiw];
-    }
-}
-
-kernel void kernel_upscale_f32(
-    device  const char * src0,
-    device        char * dst,
-    constant   int64_t & ne00,
-    constant   int64_t & ne01,
-    constant   int64_t & ne02,
-    constant   int64_t & ne03,
-    constant  uint64_t & nb00,
-    constant  uint64_t & nb01,
-    constant  uint64_t & nb02,
-    constant  uint64_t & nb03,
-    constant   int64_t & ne0,
-    constant   int64_t & ne1,
-    constant   int64_t & ne2,
-    constant   int64_t & ne3,
-    constant  uint64_t & nb0,
-    constant  uint64_t & nb1,
-    constant  uint64_t & nb2,
-    constant  uint64_t & nb3,
-    constant   int32_t & sf,
-    uint3 tgpig[[threadgroup_position_in_grid]],
-    uint3 tpitg[[thread_position_in_threadgroup]],
-    uint3   ntg[[threads_per_threadgroup]]) {
-
-    const int64_t i3 = tgpig.z;
-    const int64_t i2 = tgpig.y;
-    const int64_t i1 = tgpig.x;
-
-    const int64_t i03 = i3;
-    const int64_t i02 = i2;
-    const int64_t i01 = i1/sf;
-
-    device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01);
-    device       float * dst_ptr  = (device       float *) (dst  +  i3*nb3  +  i2*nb2  +  i1*nb1);
-
-    for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
-        dst_ptr[i0] = src0_ptr[i0/sf];
-    }
-}
-
-kernel void kernel_pad_f32(
-    device  const char * src0,
-    device        char * dst,
-    constant   int64_t & ne00,
-    constant   int64_t & ne01,
-    constant   int64_t & ne02,
-    constant   int64_t & ne03,
-    constant  uint64_t & nb00,
-    constant  uint64_t & nb01,
-    constant  uint64_t & nb02,
-    constant  uint64_t & nb03,
-    constant   int64_t & ne0,
-    constant   int64_t & ne1,
-    constant   int64_t & ne2,
-    constant   int64_t & ne3,
-    constant  uint64_t & nb0,
-    constant  uint64_t & nb1,
-    constant  uint64_t & nb2,
-    constant  uint64_t & nb3,
-    uint3 tgpig[[threadgroup_position_in_grid]],
-    uint3 tpitg[[thread_position_in_threadgroup]],
-    uint3   ntg[[threads_per_threadgroup]]) {
-
-    const int64_t i3 = tgpig.z;
-    const int64_t i2 = tgpig.y;
-    const int64_t i1 = tgpig.x;
-
-    const int64_t i03 = i3;
-    const int64_t i02 = i2;
-    const int64_t i01 = i1;
-
-    device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01);
-    device       float * dst_ptr  = (device       float *) (dst  +  i3*nb3  +  i2*nb2  +  i1*nb1);
-
-    if (i1 < ne01 && i2 < ne02 && i3 < ne03) {
-        for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
-            if (i0 < ne00) {
-                dst_ptr[i0] = src0_ptr[i0];
-            } else {
-                dst_ptr[i0] = 0.0f;
-            }
-        }
-
-        return;
-    }
-
-    for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
-        dst_ptr[i0] = 0.0f;
-    }
-}
-
-// bitonic sort implementation following the CUDA kernels as reference
-typedef void (argsort_t)(
-        device const float * x,
-        device     int32_t * dst,
-        constant   int64_t & ncols,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint3 tpitg[[thread_position_in_threadgroup]]);
-
-template<ggml_sort_order order>
-kernel void kernel_argsort_f32_i32(
-        device const float   * x,
-        device       int32_t * dst,
-        constant     int64_t & ncols,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint3 tpitg[[thread_position_in_threadgroup]]) {
-    // bitonic sort
-    int col = tpitg[0];
-    int row = tgpig[1];
-
-    if (col >= ncols) return;
-
-    device const float   * x_row   = x   + row * ncols;
-    device       int32_t * dst_row = dst + row * ncols;
-
-    // initialize indices
-    if (col < ncols) {
-        dst_row[col] = col;
-    }
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-
-    for (int k = 2; k <= ncols; k *= 2) {
-        for (int j = k / 2; j > 0; j /= 2) {
-            int ixj = col ^ j;
-            if (ixj > col) {
-                if ((col & k) == 0) {
-                    if (order == GGML_SORT_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) {
-                        SWAP(dst_row[col], dst_row[ixj]);
-                    }
-                } else {
-                    if (order == GGML_SORT_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) {
-                        SWAP(dst_row[col], dst_row[ixj]);
-                    }
-                }
-            }
-            threadgroup_barrier(mem_flags::mem_threadgroup);
-        }
-    }
-}
-
-template [[host_name("kernel_argsort_f32_i32_asc")]]  kernel argsort_t kernel_argsort_f32_i32<GGML_SORT_ASC>;
-template [[host_name("kernel_argsort_f32_i32_desc")]] kernel argsort_t kernel_argsort_f32_i32<GGML_SORT_DESC>;
-
-kernel void kernel_leaky_relu_f32(
-        device const float * src0,
-        device       float * dst,
-        constant     float & slope,
-        uint tpig[[thread_position_in_grid]]) {
-    dst[tpig] = src0[tpig] > 0.0f ? src0[tpig] : src0[tpig] * slope;
-}
-
-kernel void kernel_cpy_f16_f16(
-        device  const half * src0,
-        device        half * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant   int64_t & ne03,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant  uint64_t & nb03,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   int64_t & ne2,
-        constant   int64_t & ne3,
-        constant  uint64_t & nb0,
-        constant  uint64_t & nb1,
-        constant  uint64_t & nb2,
-        constant  uint64_t & nb3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint3 tpitg[[thread_position_in_threadgroup]],
-        uint3   ntg[[threads_per_threadgroup]]) {
-    const int64_t i03 = tgpig[2];
-    const int64_t i02 = tgpig[1];
-    const int64_t i01 = tgpig[0];
-
-    const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
-
-    const int64_t i3 = n / (ne2*ne1*ne0);
-    const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
-    const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
-    const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0);
-
-    device half * dst_data = (device half *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
-
-    for (int64_t i00 = tpitg.x; i00 < ne00; i00 += ntg.x) {
-        device const half * src = (device half *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
-        dst_data[i00] = src[0];
-    }
-}
-
-kernel void kernel_cpy_f16_f32(
-        device  const half * src0,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant   int64_t & ne03,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant  uint64_t & nb03,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   int64_t & ne2,
-        constant   int64_t & ne3,
-        constant  uint64_t & nb0,
-        constant  uint64_t & nb1,
-        constant  uint64_t & nb2,
-        constant  uint64_t & nb3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint3 tpitg[[thread_position_in_threadgroup]],
-        uint3   ntg[[threads_per_threadgroup]]) {
-    const int64_t i03 = tgpig[2];
-    const int64_t i02 = tgpig[1];
-    const int64_t i01 = tgpig[0];
-
-    const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
-
-    const int64_t i3 = n / (ne2*ne1*ne0);
-    const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
-    const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
-    const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0);
-
-    device float * dst_data = (device float *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
-
-    for (int64_t i00 = tpitg.x; i00 < ne00; i00 += ntg.x) {
-        device const half * src = (device half *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
-        dst_data[i00] = src[0];
-    }
-}
-
-kernel void kernel_cpy_f32_f16(
-        device const float * src0,
-        device        half * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant   int64_t & ne03,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant  uint64_t & nb03,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   int64_t & ne2,
-        constant   int64_t & ne3,
-        constant  uint64_t & nb0,
-        constant  uint64_t & nb1,
-        constant  uint64_t & nb2,
-        constant  uint64_t & nb3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint3 tpitg[[thread_position_in_threadgroup]],
-        uint3   ntg[[threads_per_threadgroup]]) {
-    const int64_t i03 = tgpig[2];
-    const int64_t i02 = tgpig[1];
-    const int64_t i01 = tgpig[0];
-
-    const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
-
-    const int64_t i3 = n / (ne2*ne1*ne0);
-    const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
-    const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
-    const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0);
-
-    device half * dst_data = (device half *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
-
-    for (int64_t i00 = tpitg.x; i00 < ne00; i00 += ntg.x) {
-        device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
-
-        dst_data[i00] = src[0];
-    }
-}
-
-kernel void kernel_cpy_f32_f32(
-        device const float * src0,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant   int64_t & ne03,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant  uint64_t & nb03,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   int64_t & ne2,
-        constant   int64_t & ne3,
-        constant  uint64_t & nb0,
-        constant  uint64_t & nb1,
-        constant  uint64_t & nb2,
-        constant  uint64_t & nb3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint3 tpitg[[thread_position_in_threadgroup]],
-        uint3   ntg[[threads_per_threadgroup]]) {
-    const int64_t i03 = tgpig[2];
-    const int64_t i02 = tgpig[1];
-    const int64_t i01 = tgpig[0];
-
-    const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
-
-    const int64_t i3 = n / (ne2*ne1*ne0);
-    const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
-    const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
-    const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0);
-
-    device float * dst_data = (device float *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
-
-    for (int64_t i00 = tpitg.x; i00 < ne00; i00 += ntg.x) {
-        device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
-
-        dst_data[i00] = src[0];
-    }
-}
-
-kernel void kernel_cpy_f32_q8_0(
-        device const float * src0,
-        device        void * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant   int64_t & ne03,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant  uint64_t & nb03,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   int64_t & ne2,
-        constant   int64_t & ne3,
-        constant  uint64_t & nb0,
-        constant  uint64_t & nb1,
-        constant  uint64_t & nb2,
-        constant  uint64_t & nb3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint3 tpitg[[thread_position_in_threadgroup]],
-        uint3   ntg[[threads_per_threadgroup]]) {
-    const int64_t i03 = tgpig[2];
-    const int64_t i02 = tgpig[1];
-    const int64_t i01 = tgpig[0];
-
-    const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
-
-    const int64_t i3 = n / (ne2*ne1*ne0);
-    const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
-    const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
-    const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0)/QK8_0;
-
-    device block_q8_0 * dst_data = (device block_q8_0 *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
-
-    for (int64_t i00 = tpitg.x*QK8_0; i00 < ne00; i00 += ntg.x*QK8_0) {
-        device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
-
-        float amax = 0.0f; // absolute max
-
-        for (int j = 0; j < QK8_0; j++) {
-            const float v = src[j];
-            amax = MAX(amax, fabs(v));
-        }
-
-        const float d = amax / ((1 << 7) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
-
-        dst_data[i00/QK8_0].d = d;
-
-        for (int j = 0; j < QK8_0; ++j) {
-            const float x0 = src[j]*id;
-
-            dst_data[i00/QK8_0].qs[j] = round(x0);
-        }
-    }
-}
-
-kernel void kernel_cpy_f32_q4_0(
-        device const float * src0,
-        device        void * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant   int64_t & ne03,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant  uint64_t & nb03,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   int64_t & ne2,
-        constant   int64_t & ne3,
-        constant  uint64_t & nb0,
-        constant  uint64_t & nb1,
-        constant  uint64_t & nb2,
-        constant  uint64_t & nb3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint3 tpitg[[thread_position_in_threadgroup]],
-        uint3   ntg[[threads_per_threadgroup]]) {
-    const int64_t i03 = tgpig[2];
-    const int64_t i02 = tgpig[1];
-    const int64_t i01 = tgpig[0];
-
-    const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
-
-    const int64_t i3 = n / (ne2*ne1*ne0);
-    const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
-    const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
-    const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0)/QK4_0;
-
-    device block_q4_0 * dst_data = (device block_q4_0 *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
-
-    for (int64_t i00 = tpitg.x*QK4_0; i00 < ne00; i00 += ntg.x*QK4_0) {
-        device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
-
-        float amax = 0.0f; // absolute max
-        float max  = 0.0f;
-
-        for (int j = 0; j < QK4_0; j++) {
-            const float v = src[j];
-            if (amax < fabs(v)) {
-                amax = fabs(v);
-                max  = v;
-            }
-        }
-
-        const float d = max / -8;
-        const float id = d ? 1.0f/d : 0.0f;
-
-        dst_data[i00/QK4_0].d = d;
-
-        for (int j = 0; j < QK4_0/2; ++j) {
-            const float x0 = src[0       + j]*id;
-            const float x1 = src[QK4_0/2 + j]*id;
-
-            const uint8_t xi0 = MIN(15, (int8_t)(x0 + 8.5f));
-            const uint8_t xi1 = MIN(15, (int8_t)(x1 + 8.5f));
-
-            dst_data[i00/QK4_0].qs[j]  = xi0;
-            dst_data[i00/QK4_0].qs[j] |= xi1 << 4;
-        }
-    }
-}
-
-kernel void kernel_cpy_f32_q4_1(
-        device const float * src0,
-        device        void * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant   int64_t & ne03,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant  uint64_t & nb03,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   int64_t & ne2,
-        constant   int64_t & ne3,
-        constant  uint64_t & nb0,
-        constant  uint64_t & nb1,
-        constant  uint64_t & nb2,
-        constant  uint64_t & nb3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint3 tpitg[[thread_position_in_threadgroup]],
-        uint3   ntg[[threads_per_threadgroup]]) {
-    const int64_t i03 = tgpig[2];
-    const int64_t i02 = tgpig[1];
-    const int64_t i01 = tgpig[0];
-
-    const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
-
-    const int64_t i3 = n / (ne2*ne1*ne0);
-    const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
-    const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
-    const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0)/QK4_1;
-
-    device block_q4_1 * dst_data = (device block_q4_1 *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
-
-    for (int64_t i00 = tpitg.x*QK4_1; i00 < ne00; i00 += ntg.x*QK4_1) {
-        device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
-
-        float min = FLT_MAX;
-        float max = -FLT_MAX;
-
-        for (int j = 0; j < QK4_1; j++) {
-            const float v = src[j];
-            if (min > v) min = v;
-            if (max < v) max = v;
-        }
-
-        const float d = (max - min) / ((1 << 4) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
-
-        dst_data[i00/QK4_1].d = d;
-        dst_data[i00/QK4_1].m = min;
-
-        for (int j = 0; j < QK4_1/2; ++j) {
-            const float x0 = (src[0       + j] - min)*id;
-            const float x1 = (src[QK4_1/2 + j] - min)*id;
-
-            const uint8_t xi0 = MIN(15, (int8_t)(x0 + 0.5f));
-            const uint8_t xi1 = MIN(15, (int8_t)(x1 + 0.5f));
-
-            dst_data[i00/QK4_1].qs[j]  = xi0;
-            dst_data[i00/QK4_1].qs[j] |= xi1 << 4;
-        }
-    }
-}
-
-kernel void kernel_concat(
-    device  const char * src0,
-    device  const char * src1,
-    device        char * dst,
-    constant   int64_t & ne00,
-    constant   int64_t & ne01,
-    constant   int64_t & ne02,
-    constant   int64_t & ne03,
-    constant  uint64_t & nb00,
-    constant  uint64_t & nb01,
-    constant  uint64_t & nb02,
-    constant  uint64_t & nb03,
-    constant   int64_t & ne10,
-    constant   int64_t & ne11,
-    constant   int64_t & ne12,
-    constant   int64_t & ne13,
-    constant  uint64_t & nb10,
-    constant  uint64_t & nb11,
-    constant  uint64_t & nb12,
-    constant  uint64_t & nb13,
-    constant   int64_t & ne0,
-    constant   int64_t & ne1,
-    constant   int64_t & ne2,
-    constant   int64_t & ne3,
-    constant  uint64_t & nb0,
-    constant  uint64_t & nb1,
-    constant  uint64_t & nb2,
-    constant  uint64_t & nb3,
-    uint3 tgpig[[threadgroup_position_in_grid]],
-    uint3 tpitg[[thread_position_in_threadgroup]],
-    uint3   ntg[[threads_per_threadgroup]]) {
-
-    const int64_t i03 = tgpig.z;
-    const int64_t i02 = tgpig.y;
-    const int64_t i01 = tgpig.x;
-
-    const int64_t i13 = i03 % ne13;
-    const int64_t i12 = i02 % ne12;
-    const int64_t i11 = i01 % ne11;
-
-    device const char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01 + tpitg.x*nb00;
-    device const char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11 + tpitg.x*nb10;
-    device       char * dst_ptr  = dst  + i03*nb3  + i02*nb2  + i01*nb1  + tpitg.x*nb0;
-
-    for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
-        if (i02 < ne02) {
-            ((device float *)dst_ptr)[0] = ((device float *)src0_ptr)[0];
-            src0_ptr += ntg.x*nb00;
-        } else {
-            ((device float *)dst_ptr)[0] = ((device float *)src1_ptr)[0];
-            src1_ptr += ntg.x*nb10;
-        }
-        dst_ptr += ntg.x*nb0;
-    }
-}
-
-//============================================ k-quants ======================================================
-
-#ifndef QK_K
-#define QK_K 256
-#else
-static_assert(QK_K == 256 || QK_K == 64, "QK_K must be 256 or 64");
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-typedef struct {
-    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
-    uint8_t qs[QK_K/4];      // quants
-    half d;           // super-block scale for quantized scales
-    half dmin;        // super-block scale for quantized mins
-} block_q2_K;
-// 84 bytes / block
-
-typedef struct {
-    uint8_t hmask[QK_K/8];     // quants - high bit
-    uint8_t qs[QK_K/4];        // quants - low 2 bits
-#if QK_K == 64
-    uint8_t scales[2];
-#else
-    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
-#endif
-    half d;             // super-block scale
-} block_q3_K;
-
-#if QK_K == 64
-typedef struct {
-    half    d[2];          // super-block scales/mins
-    uint8_t scales[2];
-    uint8_t qs[QK_K/2];    // 4-bit quants
-} block_q4_K;
-#else
-typedef struct {
-    half d;             // super-block scale for quantized scales
-    half dmin;          // super-block scale for quantized mins
-    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
-    uint8_t qs[QK_K/2];        // 4--bit quants
-} block_q4_K;
-#endif
-
-#if QK_K == 64
-typedef struct {
-    half  d;                     // super-block scales/mins
-    int8_t  scales[QK_K/16];     // 8-bit block scales
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-} block_q5_K;
-#else
-typedef struct {
-    half d;                      // super-block scale for quantized scales
-    half dmin;                   // super-block scale for quantized mins
-    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-} block_q5_K;
-// 176 bytes / block
-#endif
-
-typedef struct {
-    uint8_t ql[QK_K/2];      // quants, lower 4 bits
-    uint8_t qh[QK_K/4];      // quants, upper 2 bits
-    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
-    half d;                  // super-block scale
-} block_q6_K;
-// 210 bytes / block
-
-typedef struct {
-    half d;
-    uint16_t qs[QK_K/8];
-} block_iq2_xxs;
-// 66 bytes / block for QK_K = 256, so 2.0625 bpw
-
-typedef struct {
-    half d;
-    uint16_t qs[QK_K/8];
-    uint8_t  scales[QK_K/32];
-} block_iq2_xs;
-// 74 bytes / block for QK_K = 256, so 2.3125 bpw
-
-//====================================== dot products =========================
-
-void kernel_mul_mv_q2_K_f32_impl(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-
-    const int nb = ne00/QK_K;
-    const int r0 = tgpig.x;
-    const int r1 = tgpig.y;
-    const int im = tgpig.z;
-
-    const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
-    const int ib_row = first_row * nb;
-
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
-
-    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
-
-    device const block_q2_K * x = (device const block_q2_K *) src0 + ib_row + offset0;
-    device const float      * y = (device const float      *) src1 + r1*ne10 + im*ne00*ne1;
-
-    float yl[32];
-    float sumf[N_DST]={0.f}, all_sum;
-
-    const int step = sizeof(block_q2_K) * nb;
-
-#if QK_K == 256
-    const int ix = tiisg/8;  // 0...3
-    const int it = tiisg%8;  // 0...7
-    const int iq = it/4;     // 0 or 1
-    const int ir = it%4;     // 0...3
-    const int is = (8*ir)/16;// 0 or 1
-
-    device const float * y4 = y + ix * QK_K + 128 * iq + 8 * ir;
-
-    for (int ib = ix; ib < nb; ib += 4) {
-
-        float4 sumy = {0.f, 0.f, 0.f, 0.f};
-        for (int i = 0; i < 8; ++i) {
-            yl[i+ 0] = y4[i+ 0]; sumy[0] += yl[i+ 0];
-            yl[i+ 8] = y4[i+32]; sumy[1] += yl[i+ 8];
-            yl[i+16] = y4[i+64]; sumy[2] += yl[i+16];
-            yl[i+24] = y4[i+96]; sumy[3] += yl[i+24];
-        }
-
-        device const uint8_t  * sc = (device const uint8_t  *)x[ib].scales + 8*iq + is;
-        device const uint16_t * qs = (device const uint16_t *)x[ib].qs + 16 * iq + 4 * ir;
-        device const half     * dh = &x[ib].d;
-
-        for (int row = 0; row < N_DST; row++) {
-
-            float4 acc1 = {0.f, 0.f, 0.f, 0.f};
-            float4 acc2 = {0.f, 0.f, 0.f, 0.f};
-            for (int i = 0; i < 8; i += 2) {
-                acc1[0] += yl[i+ 0] * (qs[i/2] & 0x0003);
-                acc2[0] += yl[i+ 1] * (qs[i/2] & 0x0300);
-                acc1[1] += yl[i+ 8] * (qs[i/2] & 0x000c);
-                acc2[1] += yl[i+ 9] * (qs[i/2] & 0x0c00);
-                acc1[2] += yl[i+16] * (qs[i/2] & 0x0030);
-                acc2[2] += yl[i+17] * (qs[i/2] & 0x3000);
-                acc1[3] += yl[i+24] * (qs[i/2] & 0x00c0);
-                acc2[3] += yl[i+25] * (qs[i/2] & 0xc000);
-            }
-            float dall = dh[0];
-            float dmin = dh[1] * 1.f/16.f;
-            sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc2[0]) * (sc[0] & 0xF) * 1.f/ 1.f +
-                                 (acc1[1] + 1.f/256.f * acc2[1]) * (sc[2] & 0xF) * 1.f/ 4.f +
-                                 (acc1[2] + 1.f/256.f * acc2[2]) * (sc[4] & 0xF) * 1.f/16.f +
-                                 (acc1[3] + 1.f/256.f * acc2[3]) * (sc[6] & 0xF) * 1.f/64.f) -
-                         dmin * (sumy[0] * (sc[0] & 0xF0) + sumy[1] * (sc[2] & 0xF0) + sumy[2] * (sc[4] & 0xF0) + sumy[3] * (sc[6] & 0xF0));
-
-            qs += step/2;
-            sc += step;
-            dh += step/2;
-        }
-
-        y4 += 4 * QK_K;
-    }
-#else
-    const int ix = tiisg/2;  // 0...15
-    const int it = tiisg%2;  // 0...1
-
-    device const float * y4 = y + ix * QK_K + 8 * it;
-
-    for (int ib = ix; ib < nb; ib += 16) {
-
-        float4 sumy = {0.f, 0.f, 0.f, 0.f};
-        for (int i = 0; i < 8; ++i) {
-            yl[i+ 0] = y4[i+ 0]; sumy[0] += yl[i+ 0];
-            yl[i+ 8] = y4[i+16]; sumy[1] += yl[i+ 8];
-            yl[i+16] = y4[i+32]; sumy[2] += yl[i+16];
-            yl[i+24] = y4[i+48]; sumy[3] += yl[i+24];
-        }
-
-        device const uint8_t  * sc = (device const uint8_t  *)x[ib].scales;
-        device const uint16_t * qs = (device const uint16_t *)x[ib].qs + 4 * it;
-        device const half     * dh = &x[ib].d;
-
-        for (int row = 0; row < N_DST; row++) {
-
-            float4 acc1 = {0.f, 0.f, 0.f, 0.f};
-            float4 acc2 = {0.f, 0.f, 0.f, 0.f};
-            for (int i = 0; i < 8; i += 2) {
-                acc1[0] += yl[i+ 0] * (qs[i/2] & 0x0003);
-                acc2[0] += yl[i+ 1] * (qs[i/2] & 0x0300);
-                acc1[1] += yl[i+ 8] * (qs[i/2] & 0x000c);
-                acc2[1] += yl[i+ 9] * (qs[i/2] & 0x0c00);
-                acc1[2] += yl[i+16] * (qs[i/2] & 0x0030);
-                acc2[2] += yl[i+17] * (qs[i/2] & 0x3000);
-                acc1[3] += yl[i+24] * (qs[i/2] & 0x00c0);
-                acc2[3] += yl[i+25] * (qs[i/2] & 0xc000);
-            }
-
-            float dall = dh[0];
-            float dmin = dh[1];
-            sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc2[0]) * (sc[0] & 0xF) * 1.f/ 1.f +
-                                 (acc1[1] + 1.f/256.f * acc2[1]) * (sc[1] & 0xF) * 1.f/ 4.f +
-                                 (acc1[2] + 1.f/256.f * acc2[2]) * (sc[2] & 0xF) * 1.f/16.f +
-                                 (acc1[3] + 1.f/256.f * acc2[3]) * (sc[3] & 0xF) * 1.f/64.f) -
-                         dmin * (sumy[0] * (sc[0] >> 4) + sumy[1] * (sc[1] >> 4) + sumy[2] * (sc[2] >> 4) + sumy[3] * (sc[3] >> 4));
-
-            qs += step/2;
-            sc += step;
-            dh += step/2;
-        }
-
-        y4 += 16 * QK_K;
-    }
-#endif
-
-    for (int row = 0; row < N_DST; ++row) {
-        all_sum = simd_sum(sumf[row]);
-        if (tiisg == 0) {
-            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum;
-        }
-    }
-}
-
-[[host_name("kernel_mul_mv_q2_K_f32")]]
-kernel void kernel_mul_mv_q2_K_f32(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-
-    kernel_mul_mv_q2_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, tgpig, tiisg, sgitg);
-}
-
-#if QK_K == 256
-void kernel_mul_mv_q3_K_f32_impl(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-
-    const int nb = ne00/QK_K;
-
-    const int64_t r0 = tgpig.x;
-    const int64_t r1 = tgpig.y;
-    const int64_t im = tgpig.z;
-
-    const int first_row = (r0 * N_SIMDGROUP + sgitg) * 2;
-
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
-
-    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
-
-    device const block_q3_K * x = (device const block_q3_K *) src0 + first_row*nb + offset0;
-    device const float     * yy = (device const float      *) src1 + r1*ne10 + im*ne00*ne1;
-
-    float yl[32];
-
-    //const uint16_t kmask1 = 0x3030;
-    //const uint16_t kmask2 = 0x0f0f;
-
-    const int tid = tiisg/4;
-    const int ix  = tiisg%4;
-    const int ip  = tid/4;          // 0 or 1
-    const int il  = 2*((tid%4)/2);  // 0 or 2
-    const int ir  = tid%2;
-    const int n   = 8;
-    const int l0  = n*ir;
-
-    // One would think that the Metal compiler would figure out that ip and il can only have
-    // 4 possible states, and optimize accordingly. Well, no. It needs help, and we do it
-    // with these two tales.
-    //
-    // Possible masks for the high bit
-    const ushort4 mm[4] = {{0x0001, 0x0100, 0x0002, 0x0200},  // ip = 0, il = 0
-                           {0x0004, 0x0400, 0x0008, 0x0800},  // ip = 0, il = 2
-                           {0x0010, 0x1000, 0x0020, 0x2000},  // ip = 1, il = 0
-                           {0x0040, 0x4000, 0x0080, 0x8000}}; // ip = 1, il = 2
-
-    // Possible masks for the low 2 bits
-    const int4 qm[2] = {{0x0003, 0x0300, 0x000c, 0x0c00}, {0x0030, 0x3000, 0x00c0, 0xc000}};
-
-    const ushort4 hm = mm[2*ip + il/2];
-
-    const int shift = 2*il;
-    const float    v1 = il == 0 ? 4.f : 64.f;
-    const float    v2 = 4.f * v1;
-
-    const uint16_t s_shift1 = 4*ip;
-    const uint16_t s_shift2 = s_shift1 + il;
-
-    const int q_offset = 32*ip + l0;
-    const int y_offset = 128*ip + 32*il + l0;
-
-    const int step = sizeof(block_q3_K) * nb / 2;
-
-    device const float * y1 = yy + ix*QK_K + y_offset;
-
-    uint32_t scales32, aux32;
-    thread uint16_t * scales16 = (thread uint16_t *)&scales32;
-    thread const int8_t * scales = (thread const int8_t *)&scales32;
-
-    float sumf1[2] = {0.f};
-    float sumf2[2] = {0.f};
-    for (int i = ix; i < nb; i += 4) {
-
-        for (int l = 0; l < 8; ++l) {
-            yl[l+ 0] = y1[l+ 0];
-            yl[l+ 8] = y1[l+16];
-            yl[l+16] = y1[l+32];
-            yl[l+24] = y1[l+48];
-        }
-
-        device const uint16_t * q = (device const uint16_t *)(x[i].qs + q_offset);
-        device const uint16_t * h = (device const uint16_t *)(x[i].hmask + l0);
-        device const uint16_t * a = (device const uint16_t *)(x[i].scales);
-        device const half * dh = &x[i].d;
-
-        for (int row = 0; row < 2; ++row) {
-
-            const float d_all = (float)dh[0];
-
-            scales16[0] = a[4];
-            scales16[1] = a[5];
-            aux32 = ((scales32 >> s_shift2) << 4) & 0x30303030;
-            scales16[0] = a[il+0];
-            scales16[1] = a[il+1];
-            scales32 = ((scales32 >> s_shift1) & 0x0f0f0f0f) | aux32;
-
-            float s1 = 0, s2 = 0, s3 = 0, s4 = 0, s5 = 0, s6 = 0;
-            for (int l = 0; l < n; l += 2) {
-                const int32_t qs = q[l/2];
-                s1 += yl[l+0] * (qs & qm[il/2][0]);
-                s2 += yl[l+1] * (qs & qm[il/2][1]);
-                s3 += ((h[l/2] & hm[0]) ? 0.f : yl[l+0]) + ((h[l/2] & hm[1]) ? 0.f : yl[l+1]);
-                s4 += yl[l+16] * (qs & qm[il/2][2]);
-                s5 += yl[l+17] * (qs & qm[il/2][3]);
-                s6 += ((h[l/2] & hm[2]) ? 0.f : yl[l+16]) + ((h[l/2] & hm[3]) ? 0.f : yl[l+17]);
-            }
-            float d1 = d_all * (s1 + 1.f/256.f * s2 - s3*v1);
-            float d2 = d_all * (s4 + 1.f/256.f * s5 - s6*v2);
-            sumf1[row] += d1 * (scales[0] - 32);
-            sumf2[row] += d2 * (scales[2] - 32);
-
-            s1 = s2 = s3 = s4 = s5 = s6 = 0;
-            for (int l = 0; l < n; l += 2) {
-                const int32_t qs = q[l/2+8];
-                s1 += yl[l+8] * (qs & qm[il/2][0]);
-                s2 += yl[l+9] * (qs & qm[il/2][1]);
-                s3 += ((h[l/2+8] & hm[0]) ? 0.f : yl[l+8]) + ((h[l/2+8] & hm[1]) ? 0.f : yl[l+9]);
-                s4 += yl[l+24] * (qs & qm[il/2][2]);
-                s5 += yl[l+25] * (qs & qm[il/2][3]);
-                s6 += ((h[l/2+8] & hm[2]) ? 0.f : yl[l+24]) + ((h[l/2+8] & hm[3]) ? 0.f : yl[l+25]);
-            }
-            d1 = d_all * (s1 + 1.f/256.f * s2 - s3*v1);
-            d2 = d_all * (s4 + 1.f/256.f * s5 - s6*v2);
-            sumf1[row] += d1 * (scales[1] - 32);
-            sumf2[row] += d2 * (scales[3] - 32);
-
-            q  += step;
-            h  += step;
-            a  += step;
-            dh += step;
-
-        }
-
-        y1 += 4 * QK_K;
-
-    }
-
-    for (int row = 0; row < 2; ++row) {
-        const float sumf = (sumf1[row] + 0.25f * sumf2[row]) / (1 << shift);
-        sumf1[row] = simd_sum(sumf);
-    }
-    if (tiisg == 0) {
-        for (int row = 0; row < 2; ++row) {
-            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = sumf1[row];
-        }
-    }
-}
-#else
-void kernel_mul_mv_q3_K_f32_impl(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-
-    const int nb = ne00/QK_K;
-
-    const int64_t r0 = tgpig.x;
-    const int64_t r1 = tgpig.y;
-    const int64_t im = tgpig.z;
-
-    const int row = 2 * r0 + sgitg;
-
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
-
-    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
-
-    device const block_q3_K * x = (device const block_q3_K *) src0 + row*nb + offset0;
-    device const float     * yy = (device const float      *) src1 + r1*ne10 + im*ne00*ne1;
-
-    const int ix = tiisg/4;
-    const int il = 4 * (tiisg%4);// 0, 4, 8, 12
-    const int iq = il/8;         // 0, 0, 1, 1
-    const int in = il%8;         // 0, 4, 0, 4
-
-    float2 sum = {0.f, 0.f};
-
-    for (int i = ix; i < nb; i += 8) {
-
-        const float d_all = (float)(x[i].d);
-
-        device const uint16_t * q = (device const uint16_t *)(x[i].qs + il);
-        device const uint16_t * h = (device const uint16_t *)(x[i].hmask + in);
-        device const uint16_t * s = (device const uint16_t *)(x[i].scales);
-        device const float    * y = yy + i * QK_K + il;
-
-        const float d1 = d_all * ((int32_t)(s[0] & 0x000F) - 8);
-        const float d2 = d_all * ((int32_t)(s[0] & 0x00F0) - 128) * 1.f/64.f;
-        const float d3 = d_all * ((int32_t)(s[0] & 0x0F00) - 2048) * 1.f/4096.f;
-        const float d4 = d_all * ((int32_t)(s[0] & 0xF000) - 32768) * 1.f/262144.f;
-
-        for (int l = 0; l < 4; l += 2) {
-            const uint16_t hm = h[l/2] >> iq;
-            sum[0] += y[l+ 0] * d1 * ((int32_t)(q[l/2] & 0x0003) - ((hm & 0x0001) ? 0 :  4))
-                    + y[l+16] * d2 * ((int32_t)(q[l/2] & 0x000c) - ((hm & 0x0004) ? 0 : 16))
-                    + y[l+32] * d3 * ((int32_t)(q[l/2] & 0x0030) - ((hm & 0x0010) ? 0 : 64))
-                    + y[l+48] * d4 * ((int32_t)(q[l/2] & 0x00c0) - ((hm & 0x0040) ? 0 : 256));
-            sum[1] += y[l+ 1] * d1 * ((int32_t)(q[l/2] & 0x0300) - ((hm & 0x0100) ? 0 : 1024))
-                    + y[l+17] * d2 * ((int32_t)(q[l/2] & 0x0c00) - ((hm & 0x0400) ? 0 : 4096))
-                    + y[l+33] * d3 * ((int32_t)(q[l/2] & 0x3000) - ((hm & 0x1000) ? 0 : 16384))
-                    + y[l+49] * d4 * ((int32_t)(q[l/2] & 0xc000) - ((hm & 0x4000) ? 0 : 65536));
-        }
-
-    }
-    const float sumf = sum[0] + sum[1] * 1.f/256.f;
-
-    const float tot = simd_sum(sumf);
-    if (tiisg == 0) {
-        dst[r1*ne0 + im*ne0*ne1 + row] = tot;
-    }
-
-}
-#endif
-
-[[host_name("kernel_mul_mv_q3_K_f32")]]
-kernel void kernel_mul_mv_q3_K_f32(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-
-    kernel_mul_mv_q3_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, tgpig, tiisg, sgitg);
-}
-
-#if QK_K == 256
-void kernel_mul_mv_q4_K_f32_impl(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-
-    const uint16_t kmask1 = 0x3f3f;
-    const uint16_t kmask2 = 0x0f0f;
-    const uint16_t kmask3 = 0xc0c0;
-
-    const int ix = tiisg/8;  // 0...3
-    const int it = tiisg%8;  // 0...7
-    const int iq = it/4;     // 0 or 1
-    const int ir = it%4;     // 0...3
-
-    const int nb = ne00/QK_K;
-    const int r0 = tgpig.x;
-    const int r1 = tgpig.y;
-    const int im = tgpig.z;
-    //const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
-    const int first_row = r0 * N_DST;
-    const int ib_row = first_row * nb;
-
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
-
-    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
-
-    device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row + offset0;
-    device const float      * y = (device const float      *) src1 + r1*ne10 + im*ne00*ne1;
-
-    float yl[16];
-    float yh[16];
-    float sumf[N_DST]={0.f}, all_sum;
-
-    const int step = sizeof(block_q4_K) * nb / 2;
-
-    device const float * y4 = y + ix * QK_K + 64 * iq + 8 * ir;
-
-    uint16_t sc16[4];
-    thread const uint8_t * sc8 = (thread const uint8_t *)sc16;
-
-    for (int ib = ix; ib < nb; ib += 4) {
-
-        float4 sumy = {0.f, 0.f, 0.f, 0.f};
-        for (int i = 0; i < 8; ++i) {
-            yl[i+0] = y4[i+  0]; sumy[0] += yl[i+0];
-            yl[i+8] = y4[i+ 32]; sumy[1] += yl[i+8];
-            yh[i+0] = y4[i+128]; sumy[2] += yh[i+0];
-            yh[i+8] = y4[i+160]; sumy[3] += yh[i+8];
-        }
-
-        device const uint16_t * sc = (device const uint16_t *)x[ib].scales + iq;
-        device const uint16_t * q1 = (device const uint16_t *)x[ib].qs + 16 * iq + 4 * ir;
-        device const half     * dh = &x[ib].d;
-
-        for (int row = 0; row < N_DST; row++) {
-
-            sc16[0] = sc[0] & kmask1;
-            sc16[1] = sc[2] & kmask1;
-            sc16[2] = ((sc[4] >> 0) & kmask2) | ((sc[0] & kmask3) >> 2);
-            sc16[3] = ((sc[4] >> 4) & kmask2) | ((sc[2] & kmask3) >> 2);
-
-            device const uint16_t * q2 = q1 + 32;
-
-            float4 acc1 = {0.f, 0.f, 0.f, 0.f};
-            float4 acc2 = {0.f, 0.f, 0.f, 0.f};
-            for (int i = 0; i < 8; i += 2) {
-                acc1[0] += yl[i+0] * (q1[i/2] & 0x000F);
-                acc1[1] += yl[i+1] * (q1[i/2] & 0x0F00);
-                acc1[2] += yl[i+8] * (q1[i/2] & 0x00F0);
-                acc1[3] += yl[i+9] * (q1[i/2] & 0xF000);
-                acc2[0] += yh[i+0] * (q2[i/2] & 0x000F);
-                acc2[1] += yh[i+1] * (q2[i/2] & 0x0F00);
-                acc2[2] += yh[i+8] * (q2[i/2] & 0x00F0);
-                acc2[3] += yh[i+9] * (q2[i/2] & 0xF000);
-            }
-
-            float dall = dh[0];
-            float dmin = dh[1];
-            sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc1[1]) * sc8[0] +
-                                 (acc1[2] + 1.f/256.f * acc1[3]) * sc8[1] * 1.f/16.f +
-                                 (acc2[0] + 1.f/256.f * acc2[1]) * sc8[4] +
-                                 (acc2[2] + 1.f/256.f * acc2[3]) * sc8[5] * 1.f/16.f) -
-                         dmin * (sumy[0] * sc8[2] + sumy[1] * sc8[3] + sumy[2] * sc8[6] + sumy[3] * sc8[7]);
-
-            q1 += step;
-            sc += step;
-            dh += step;
-        }
-
-        y4 += 4 * QK_K;
-    }
-
-    for (int row = 0; row < N_DST; ++row) {
-        all_sum = simd_sum(sumf[row]);
-        if (tiisg == 0) {
-            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum;
-        }
-    }
-}
-#else
-void kernel_mul_mv_q4_K_f32_impl(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-
-    const int ix = tiisg/4;  // 0...7
-    const int it = tiisg%4;  // 0...3
-
-    const int nb = ne00/QK_K;
-    const int r0 = tgpig.x;
-    const int r1 = tgpig.y;
-    const int im = tgpig.z;
-    const int first_row = r0 * N_DST;
-    const int ib_row = first_row * nb;
-
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
-
-    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
-
-    device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row + offset0;
-    device const float      * y = (device const float      *) src1 + r1*ne10 + im*ne00*ne1;
-
-    float yl[8];
-    float yh[8];
-    float sumf[N_DST]={0.f}, all_sum;
-
-    const int step = sizeof(block_q4_K) * nb / 2;
-
-    device const float * y4 = y + ix * QK_K + 8 * it;
-
-    uint16_t sc16[4];
-
-    for (int ib = ix; ib < nb; ib += 8) {
-
-        float2 sumy = {0.f, 0.f};
-        for (int i = 0; i < 8; ++i) {
-            yl[i] = y4[i+ 0]; sumy[0] += yl[i];
-            yh[i] = y4[i+32]; sumy[1] += yh[i];
-        }
-
-        device const uint16_t * sc = (device const uint16_t *)x[ib].scales;
-        device const uint16_t * qs = (device const uint16_t *)x[ib].qs + 4 * it;
-        device const half     * dh = x[ib].d;
-
-        for (int row = 0; row < N_DST; row++) {
-
-            sc16[0] = sc[0] & 0x000f;
-            sc16[1] = sc[0] & 0x0f00;
-            sc16[2] = sc[0] & 0x00f0;
-            sc16[3] = sc[0] & 0xf000;
-
-            float2 acc1 = {0.f, 0.f};
-            float2 acc2 = {0.f, 0.f};
-            for (int i = 0; i < 8; i += 2) {
-                acc1[0] += yl[i+0] * (qs[i/2] & 0x000F);
-                acc1[1] += yl[i+1] * (qs[i/2] & 0x0F00);
-                acc2[0] += yh[i+0] * (qs[i/2] & 0x00F0);
-                acc2[1] += yh[i+1] * (qs[i/2] & 0xF000);
-            }
-
-            float dall = dh[0];
-            float dmin = dh[1];
-            sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc1[1]) * sc16[0] +
-                                 (acc2[0] + 1.f/256.f * acc2[1]) * sc16[1] * 1.f/4096.f) -
-                         dmin * 1.f/16.f * (sumy[0] * sc16[2] + sumy[1] * sc16[3] * 1.f/256.f);
-
-            qs += step;
-            sc += step;
-            dh += step;
-        }
-
-        y4 += 8 * QK_K;
-    }
-
-    for (int row = 0; row < N_DST; ++row) {
-        all_sum = simd_sum(sumf[row]);
-        if (tiisg == 0) {
-            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum;
-        }
-    }
-}
-#endif
-
-[[host_name("kernel_mul_mv_q4_K_f32")]]
-kernel void kernel_mul_mv_q4_K_f32(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint tiisg[[thread_index_in_simdgroup]],
-        uint sgitg[[simdgroup_index_in_threadgroup]]) {
-
-    kernel_mul_mv_q4_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, tgpig, tiisg, sgitg);
-}
-
-void kernel_mul_mv_q5_K_f32_impl(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-
-    const int nb = ne00/QK_K;
-
-    const int64_t r0 = tgpig.x;
-    const int64_t r1 = tgpig.y;
-    const int im = tgpig.z;
-
-    const int first_row = (r0 * N_SIMDGROUP + sgitg) * 2;
-
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
-
-    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
-
-    device const block_q5_K * x = (device const block_q5_K *) src0 + first_row*nb + offset0;
-    device const float     * yy = (device const float      *) src1 + r1*ne10 + im*ne00*ne1;
-
-    float sumf[2]={0.f};
-
-    const int step = sizeof(block_q5_K) * nb;
-
-#if QK_K == 256
-#
-    float yl[16], yh[16];
-
-    const uint16_t kmask1 = 0x3f3f;
-    const uint16_t kmask2 = 0x0f0f;
-    const uint16_t kmask3 = 0xc0c0;
-
-    const int tid = tiisg/4;
-    const int ix  = tiisg%4;
-    const int iq  = tid/4;
-    const int ir  = tid%4;
-    const int n   = 8;
-
-    const int l0 = n*ir;
-    const int q_offset = 32*iq + l0;
-    const int y_offset = 64*iq + l0;
-
-    const uint8_t hm1 = 1u << (2*iq);
-    const uint8_t hm2 = hm1 << 1;
-    const uint8_t hm3 = hm1 << 4;
-    const uint8_t hm4 = hm2 << 4;
-
-    uint16_t sc16[4];
-    thread const uint8_t * sc8 = (thread const uint8_t *)sc16;
-
-    device const float * y1 = yy + ix*QK_K + y_offset;
-
-    for (int i = ix; i < nb; i += 4) {
-
-        device const uint8_t * q1 = x[i].qs + q_offset;
-        device const uint8_t * qh = x[i].qh + l0;
-        device const half * dh = &x[i].d;
-        device const uint16_t * a = (device const uint16_t *)x[i].scales + iq;
-
-        device const float * y2 = y1 + 128;
-        float4 sumy = {0.f, 0.f, 0.f, 0.f};
-        for (int l = 0; l < 8; ++l) {
-            yl[l+0] = y1[l+ 0]; sumy[0] += yl[l+0];
-            yl[l+8] = y1[l+32]; sumy[1] += yl[l+8];
-            yh[l+0] = y2[l+ 0]; sumy[2] += yh[l+0];
-            yh[l+8] = y2[l+32]; sumy[3] += yh[l+8];
-        }
-
-        for (int row = 0; row < 2; ++row) {
-
-            device const uint8_t * q2 = q1 + 64;
-
-            sc16[0] = a[0] & kmask1;
-            sc16[1] = a[2] & kmask1;
-            sc16[2] = ((a[4] >> 0) & kmask2) | ((a[0] & kmask3) >> 2);
-            sc16[3] = ((a[4] >> 4) & kmask2) | ((a[2] & kmask3) >> 2);
-
-            float4 acc1 = {0.f};
-            float4 acc2 = {0.f};
-            for (int l = 0; l < n; ++l) {
-                uint8_t h = qh[l];
-                acc1[0] += yl[l+0] * (q1[l] & 0x0F);
-                acc1[1] += yl[l+8] * (q1[l] & 0xF0);
-                acc1[2] += yh[l+0] * (q2[l] & 0x0F);
-                acc1[3] += yh[l+8] * (q2[l] & 0xF0);
-                acc2[0] += h & hm1 ? yl[l+0] : 0.f;
-                acc2[1] += h & hm2 ? yl[l+8] : 0.f;
-                acc2[2] += h & hm3 ? yh[l+0] : 0.f;
-                acc2[3] += h & hm4 ? yh[l+8] : 0.f;
-            }
-            const float dall = dh[0];
-            const float dmin = dh[1];
-            sumf[row] += dall * (sc8[0] * (acc1[0] +  16.f*acc2[0]) +
-                                 sc8[1] * (acc1[1]/16.f + 16.f*acc2[1]) +
-                                 sc8[4] * (acc1[2] +  16.f*acc2[2]) +
-                                 sc8[5] * (acc1[3]/16.f + 16.f*acc2[3])) -
-                         dmin * (sumy[0] * sc8[2] + sumy[1] * sc8[3] + sumy[2] * sc8[6] + sumy[3] * sc8[7]);
-
-            q1 += step;
-            qh += step;
-            dh += step/2;
-            a  += step/2;
-
-        }
-
-        y1 += 4 * QK_K;
-
-    }
-#else
-    float yl[8], yh[8];
-
-    const int il = 4 * (tiisg/8);  // 0, 4, 8, 12
-    const int ix = tiisg%8;
-    const int iq = il/8;         // 0, 0, 1, 1
-    const int in = il%8;         // 0, 4, 0, 4
-
-    device const float * y = yy + ix*QK_K + il;
-
-    for (int i = ix; i < nb; i += 8) {
-
-        for (int l = 0; l < 4; ++l) {
-            yl[l+0] = y[l+ 0];
-            yl[l+4] = y[l+16];
-            yh[l+0] = y[l+32];
-            yh[l+4] = y[l+48];
-        }
-
-        device const half * dh = &x[i].d;
-        device const uint8_t * q = x[i].qs + il;
-        device const uint8_t * h = x[i].qh + in;
-        device const int8_t  * s = x[i].scales;
-
-        for (int row = 0; row < 2; ++row) {
-
-            const float d = dh[0];
-
-            float2 acc = {0.f, 0.f};
-            for (int l = 0; l < 4; ++l) {
-                const uint8_t hl = h[l] >> iq;
-                acc[0] += yl[l+0] * s[0] * ((int16_t)(q[l+ 0] & 0x0F) - (hl & 0x01 ? 0 : 16))
-                        + yl[l+4] * s[1] * ((int16_t)(q[l+16] & 0x0F) - (hl & 0x04 ? 0 : 16));
-                acc[1] += yh[l+0] * s[2] * ((int16_t)(q[l+ 0] & 0xF0) - (hl & 0x10 ? 0 : 256))
-                        + yh[l+4] * s[3] * ((int16_t)(q[l+16] & 0xF0) - (hl & 0x40 ? 0 : 256));
-            }
-            sumf[row] += d * (acc[0] + 1.f/16.f * acc[1]);
-
-            q += step;
-            h += step;
-            s += step;
-            dh += step/2;
-
-        }
-
-        y += 8 * QK_K;
-    }
-#endif
-
-    for (int row = 0; row < 2; ++row) {
-        const float tot = simd_sum(sumf[row]);
-        if (tiisg == 0) {
-            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = tot;
-        }
-    }
-}
-
-[[host_name("kernel_mul_mv_q5_K_f32")]]
-kernel void kernel_mul_mv_q5_K_f32(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-
-    kernel_mul_mv_q5_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, tgpig, tiisg, sgitg);
-}
-
-void kernel_mul_mv_q6_K_f32_impl(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-
-    const uint8_t kmask1 = 0x03;
-    const uint8_t kmask2 = 0x0C;
-    const uint8_t kmask3 = 0x30;
-    const uint8_t kmask4 = 0xC0;
-
-    const int nb = ne00/QK_K;
-
-    const int64_t r0 = tgpig.x;
-    const int64_t r1 = tgpig.y;
-    const int     im = tgpig.z;
-
-    const int row = 2 * r0 + sgitg;
-
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
-
-    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
-
-    device const block_q6_K * x = (device const block_q6_K *) src0 + row * nb + offset0;
-    device const float     * yy = (device const float      *) src1 + r1*ne10 + im*ne00*ne1;
-
-    float sumf = 0;
-
-#if QK_K == 256
-    const int tid  = tiisg/2;
-    const int ix   = tiisg%2;
-    const int ip   = tid/8;         // 0 or 1
-    const int il   = tid%8;
-    const int n    = 4;
-    const int l0   = n*il;
-    const int is   = 8*ip + l0/16;
-
-    const int y_offset = 128*ip + l0;
-    const int q_offset_l = 64*ip + l0;
-    const int q_offset_h = 32*ip + l0;
-
-    for (int i = ix; i < nb; i += 2) {
-
-        device const uint8_t * q1 = x[i].ql + q_offset_l;
-        device const uint8_t * q2 = q1 + 32;
-        device const uint8_t * qh = x[i].qh + q_offset_h;
-        device const int8_t  * sc = x[i].scales + is;
-
-        device const float * y = yy + i * QK_K + y_offset;
-
-        const float dall = x[i].d;
-
-        float4 sums = {0.f, 0.f, 0.f, 0.f};
-        for (int l = 0; l < n; ++l) {
-            sums[0] += y[l+ 0] * ((int8_t)((q1[l] & 0xF) | ((qh[l] & kmask1) << 4)) - 32);
-            sums[1] += y[l+32] * ((int8_t)((q2[l] & 0xF) | ((qh[l] & kmask2) << 2)) - 32);
-            sums[2] += y[l+64] * ((int8_t)((q1[l]  >> 4) | ((qh[l] & kmask3) << 0)) - 32);
-            sums[3] += y[l+96] * ((int8_t)((q2[l]  >> 4) | ((qh[l] & kmask4) >> 2)) - 32);
-        }
-
-        sumf += dall * (sums[0] * sc[0] + sums[1] * sc[2] + sums[2] * sc[4] + sums[3] * sc[6]);
-
-    }
-
-#else
-    const int ix  = tiisg/4;
-    const int il  = 4*(tiisg%4);
-
-    for (int i = ix; i < nb; i += 8) {
-        device const float * y = yy + i * QK_K + il;
-        device const uint8_t * ql = x[i].ql + il;
-        device const uint8_t * qh = x[i].qh + il;
-        device const int8_t  * s  = x[i].scales;
-
-        const float d = x[i].d;
-
-        float4 sums = {0.f, 0.f, 0.f, 0.f};
-        for (int l = 0; l < 4; ++l) {
-            sums[0] += y[l+ 0] * ((int8_t)((ql[l+ 0] & 0xF) | ((qh[l] & kmask1) << 4)) - 32);
-            sums[1] += y[l+16] * ((int8_t)((ql[l+16] & 0xF) | ((qh[l] & kmask2) << 2)) - 32);
-            sums[2] += y[l+32] * ((int8_t)((ql[l+ 0] >>  4) | ((qh[l] & kmask3) >> 0)) - 32);
-            sums[3] += y[l+48] * ((int8_t)((ql[l+16] >>  4) | ((qh[l] & kmask4) >> 2)) - 32);
-        }
-        sumf += d * (sums[0] * s[0] + sums[1] * s[1] + sums[2] * s[2] + sums[3] * s[3]);
-    }
-
-#endif
-
-    const float tot = simd_sum(sumf);
-    if (tiisg == 0) {
-        dst[r1*ne0 + im*ne0*ne1 + row] = tot;
-    }
-}
-
-[[host_name("kernel_mul_mv_q6_K_f32")]]
-kernel void kernel_mul_mv_q6_K_f32(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-
-    kernel_mul_mv_q6_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, tgpig, tiisg, sgitg);
-}
-
-// ======================= "True" 2-bit
-
-constexpr constant static uint64_t iq2xxs_grid[256] = {
-    0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
-    0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x08080808082b0808,
-    0x08080808082b082b, 0x08080808082b2b08, 0x08080808082b2b2b, 0x0808080819080819,
-    0x0808080819081908, 0x0808080819190808, 0x0808080819192b08, 0x08080808192b0819,
-    0x08080808192b1908, 0x080808082b080808, 0x080808082b08082b, 0x080808082b082b2b,
-    0x080808082b2b082b, 0x0808081908080819, 0x0808081908081908, 0x0808081908190808,
-    0x0808081908191919, 0x0808081919080808, 0x080808192b081908, 0x080808192b192b08,
-    0x0808082b08080808, 0x0808082b0808082b, 0x0808082b082b082b, 0x0808082b2b08082b,
-    0x0808190808080819, 0x0808190808081908, 0x0808190808190808, 0x08081908082b0819,
-    0x08081908082b1908, 0x0808190819080808, 0x080819081908082b, 0x0808190819082b08,
-    0x08081908192b0808, 0x080819082b080819, 0x080819082b081908, 0x080819082b190808,
-    0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b, 0x0808191908082b08,
-    0x08081919082b0808, 0x080819191908192b, 0x08081919192b2b19, 0x080819192b080808,
-    0x080819192b190819, 0x0808192b08082b19, 0x0808192b08190808, 0x0808192b19080808,
-    0x0808192b2b081908, 0x0808192b2b2b1908, 0x08082b0808080808, 0x08082b0808081919,
-    0x08082b0808082b08, 0x08082b0808191908, 0x08082b08082b2b08, 0x08082b0819080819,
-    0x08082b0819081908, 0x08082b0819190808, 0x08082b081919082b, 0x08082b082b082b08,
-    0x08082b1908081908, 0x08082b1919080808, 0x08082b2b0808082b, 0x08082b2b08191908,
-    0x0819080808080819, 0x0819080808081908, 0x0819080808190808, 0x08190808082b0819,
-    0x0819080819080808, 0x08190808192b0808, 0x081908082b081908, 0x081908082b190808,
-    0x081908082b191919, 0x0819081908080808, 0x0819081908082b08, 0x08190819082b0808,
-    0x0819081919190808, 0x0819081919192b2b, 0x081908192b080808, 0x0819082b082b1908,
-    0x0819082b19081919, 0x0819190808080808, 0x0819190808082b08, 0x08191908082b0808,
-    0x08191908082b1919, 0x0819190819082b19, 0x081919082b080808, 0x0819191908192b08,
-    0x08191919192b082b, 0x0819192b08080808, 0x0819192b0819192b, 0x08192b0808080819,
-    0x08192b0808081908, 0x08192b0808190808, 0x08192b0819080808, 0x08192b082b080819,
-    0x08192b1908080808, 0x08192b1908081919, 0x08192b192b2b0808, 0x08192b2b19190819,
-    0x082b080808080808, 0x082b08080808082b, 0x082b080808082b2b, 0x082b080819081908,
-    0x082b0808192b0819, 0x082b08082b080808, 0x082b08082b08082b, 0x082b0819082b2b19,
-    0x082b081919082b08, 0x082b082b08080808, 0x082b082b0808082b, 0x082b190808080819,
-    0x082b190808081908, 0x082b190808190808, 0x082b190819080808, 0x082b19081919192b,
-    0x082b191908080808, 0x082b191919080819, 0x082b1919192b1908, 0x082b192b2b190808,
-    0x082b2b0808082b08, 0x082b2b08082b0808, 0x082b2b082b191908, 0x082b2b2b19081908,
-    0x1908080808080819, 0x1908080808081908, 0x1908080808190808, 0x1908080808192b08,
-    0x19080808082b0819, 0x19080808082b1908, 0x1908080819080808, 0x1908080819082b08,
-    0x190808081919192b, 0x19080808192b0808, 0x190808082b080819, 0x190808082b081908,
-    0x190808082b190808, 0x1908081908080808, 0x19080819082b0808, 0x19080819192b0819,
-    0x190808192b080808, 0x190808192b081919, 0x1908082b08080819, 0x1908082b08190808,
-    0x1908082b19082b08, 0x1908082b1919192b, 0x1908082b192b2b08, 0x1908190808080808,
-    0x1908190808082b08, 0x19081908082b0808, 0x190819082b080808, 0x190819082b192b19,
-    0x190819190819082b, 0x19081919082b1908, 0x1908192b08080808, 0x19082b0808080819,
-    0x19082b0808081908, 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919,
-    0x19082b1908080808, 0x19082b1919192b08, 0x19082b19192b0819, 0x19082b192b08082b,
-    0x19082b2b19081919, 0x19082b2b2b190808, 0x1919080808080808, 0x1919080808082b08,
-    0x1919080808190819, 0x1919080808192b19, 0x19190808082b0808, 0x191908082b080808,
-    0x191908082b082b08, 0x1919081908081908, 0x191908191908082b, 0x191908192b2b1908,
-    0x1919082b2b190819, 0x191919082b190808, 0x191919082b19082b, 0x1919191908082b2b,
-    0x1919192b08080819, 0x1919192b19191908, 0x19192b0808080808, 0x19192b0808190819,
-    0x19192b0808192b19, 0x19192b08192b1908, 0x19192b1919080808, 0x19192b2b08082b08,
-    0x192b080808081908, 0x192b080808190808, 0x192b080819080808, 0x192b0808192b2b08,
-    0x192b081908080808, 0x192b081919191919, 0x192b082b08192b08, 0x192b082b192b0808,
-    0x192b190808080808, 0x192b190808081919, 0x192b191908190808, 0x192b19190819082b,
-    0x192b19192b081908, 0x192b2b081908082b, 0x2b08080808080808, 0x2b0808080808082b,
-    0x2b08080808082b2b, 0x2b08080819080819, 0x2b0808082b08082b, 0x2b08081908081908,
-    0x2b08081908192b08, 0x2b08081919080808, 0x2b08082b08190819, 0x2b08190808080819,
-    0x2b08190808081908, 0x2b08190808190808, 0x2b08190808191919, 0x2b08190819080808,
-    0x2b081908192b0808, 0x2b08191908080808, 0x2b0819191908192b, 0x2b0819192b191908,
-    0x2b08192b08082b19, 0x2b08192b19080808, 0x2b08192b192b0808, 0x2b082b080808082b,
-    0x2b082b1908081908, 0x2b082b2b08190819, 0x2b19080808081908, 0x2b19080808190808,
-    0x2b190808082b1908, 0x2b19080819080808, 0x2b1908082b2b0819, 0x2b1908190819192b,
-    0x2b1908192b080808, 0x2b19082b19081919, 0x2b19190808080808, 0x2b191908082b082b,
-    0x2b19190819081908, 0x2b19191919190819, 0x2b192b082b080819, 0x2b192b19082b0808,
-    0x2b2b08080808082b, 0x2b2b080819190808, 0x2b2b08082b081919, 0x2b2b081908082b19,
-    0x2b2b082b08080808, 0x2b2b190808192b08, 0x2b2b2b0819190808, 0x2b2b2b1908081908,
-};
-
-constexpr constant static uint64_t iq2xs_grid[512] = {
-    0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
-    0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b,
-    0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
-    0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b,
-    0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919,
-    0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x080808082b080808,
-    0x080808082b08082b, 0x080808082b081919, 0x080808082b082b08, 0x080808082b190819,
-    0x080808082b191908, 0x080808082b192b19, 0x080808082b2b0808, 0x0808081908080819,
-    0x0808081908081908, 0x080808190808192b, 0x0808081908082b19, 0x0808081908190808,
-    0x080808190819082b, 0x0808081908191919, 0x0808081908192b08, 0x0808081908192b2b,
-    0x08080819082b0819, 0x08080819082b1908, 0x0808081919080808, 0x080808191908082b,
-    0x0808081919081919, 0x0808081919082b08, 0x0808081919190819, 0x0808081919191908,
-    0x08080819192b0808, 0x08080819192b2b08, 0x080808192b080819, 0x080808192b081908,
-    0x080808192b190808, 0x0808082b08080808, 0x0808082b0808082b, 0x0808082b08081919,
-    0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908, 0x0808082b082b0808,
-    0x0808082b19080819, 0x0808082b19081908, 0x0808082b19190808, 0x0808082b19191919,
-    0x0808082b2b080808, 0x0808082b2b082b2b, 0x0808190808080819, 0x0808190808081908,
-    0x080819080808192b, 0x0808190808082b19, 0x0808190808190808, 0x080819080819082b,
-    0x0808190808191919, 0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908,
-    0x0808190819080808, 0x080819081908082b, 0x0808190819081919, 0x0808190819082b08,
-    0x0808190819190819, 0x0808190819191908, 0x080819081919192b, 0x08081908192b0808,
-    0x080819082b080819, 0x080819082b081908, 0x080819082b190808, 0x0808191908080808,
-    0x080819190808082b, 0x0808191908081919, 0x0808191908082b08, 0x0808191908190819,
-    0x0808191908191908, 0x08081919082b0808, 0x0808191919080819, 0x0808191919081908,
-    0x0808191919190808, 0x08081919192b0819, 0x080819192b080808, 0x0808192b08080819,
-    0x0808192b08081908, 0x0808192b08190808, 0x0808192b082b192b, 0x0808192b19080808,
-    0x0808192b1908082b, 0x0808192b2b081908, 0x08082b0808080808, 0x08082b080808082b,
-    0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808082b2b, 0x08082b0808190819,
-    0x08082b0808191908, 0x08082b08082b0808, 0x08082b08082b1919, 0x08082b0819080819,
-    0x08082b0819081908, 0x08082b0819190808, 0x08082b0819192b08, 0x08082b082b080808,
-    0x08082b082b2b0808, 0x08082b082b2b2b2b, 0x08082b1908080819, 0x08082b1908081908,
-    0x08082b1908190808, 0x08082b1919080808, 0x08082b192b080819, 0x08082b192b082b19,
-    0x08082b2b08080808, 0x08082b2b082b0808, 0x08082b2b082b2b08, 0x08082b2b2b19192b,
-    0x08082b2b2b2b0808, 0x0819080808080819, 0x0819080808081908, 0x081908080808192b,
-    0x0819080808082b19, 0x0819080808190808, 0x081908080819082b, 0x0819080808191919,
-    0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908, 0x0819080819080808,
-    0x081908081908082b, 0x0819080819081919, 0x0819080819082b08, 0x0819080819190819,
-    0x0819080819191908, 0x08190808192b0808, 0x08190808192b2b2b, 0x081908082b080819,
-    0x081908082b081908, 0x081908082b190808, 0x0819081908080808, 0x081908190808082b,
-    0x0819081908081919, 0x0819081908082b08, 0x0819081908190819, 0x0819081908191908,
-    0x08190819082b0808, 0x0819081919080819, 0x0819081919081908, 0x0819081919190808,
-    0x081908192b080808, 0x081908192b191908, 0x081908192b19192b, 0x0819082b08080819,
-    0x0819082b08081908, 0x0819082b0808192b, 0x0819082b08190808, 0x0819082b19080808,
-    0x0819082b192b0808, 0x0819190808080808, 0x081919080808082b, 0x0819190808081919,
-    0x0819190808082b08, 0x0819190808190819, 0x0819190808191908, 0x08191908082b0808,
-    0x0819190819080819, 0x0819190819081908, 0x0819190819082b19, 0x0819190819190808,
-    0x08191908192b1908, 0x081919082b080808, 0x0819191908080819, 0x0819191908081908,
-    0x0819191908190808, 0x0819191919080808, 0x0819192b08080808, 0x0819192b08191908,
-    0x0819192b19082b19, 0x08192b0808080819, 0x08192b0808081908, 0x08192b0808190808,
-    0x08192b080819082b, 0x08192b0819080808, 0x08192b0819191908, 0x08192b082b08192b,
-    0x08192b1908080808, 0x08192b1908081919, 0x08192b19192b192b, 0x08192b2b19190819,
-    0x08192b2b2b2b2b19, 0x082b080808080808, 0x082b08080808082b, 0x082b080808081919,
-    0x082b080808082b08, 0x082b080808082b2b, 0x082b080808190819, 0x082b080808191908,
-    0x082b0808082b0808, 0x082b080819080819, 0x082b080819081908, 0x082b080819190808,
-    0x082b08082b080808, 0x082b08082b2b0808, 0x082b081908080819, 0x082b081908081908,
-    0x082b081908190808, 0x082b081919080808, 0x082b081919082b08, 0x082b0819192b1919,
-    0x082b082b08080808, 0x082b082b082b082b, 0x082b082b2b080808, 0x082b082b2b2b2b08,
-    0x082b190808080819, 0x082b190808081908, 0x082b190808190808, 0x082b1908082b2b19,
-    0x082b190819080808, 0x082b191908080808, 0x082b191919080819, 0x082b19191919082b,
-    0x082b19192b192b19, 0x082b192b08080819, 0x082b192b08192b2b, 0x082b192b2b2b192b,
-    0x082b2b0808080808, 0x082b2b0808082b08, 0x082b2b0808082b2b, 0x082b2b08082b0808,
-    0x082b2b0819191919, 0x082b2b082b082b08, 0x082b2b082b2b082b, 0x082b2b19192b2b08,
-    0x082b2b192b190808, 0x082b2b2b08082b08, 0x082b2b2b082b0808, 0x082b2b2b2b08082b,
-    0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819, 0x1908080808081908,
-    0x190808080808192b, 0x1908080808082b19, 0x1908080808190808, 0x190808080819082b,
-    0x1908080808191919, 0x1908080808192b08, 0x19080808082b0819, 0x19080808082b1908,
-    0x1908080819080808, 0x190808081908082b, 0x1908080819081919, 0x1908080819082b08,
-    0x1908080819082b2b, 0x1908080819190819, 0x1908080819191908, 0x19080808192b0808,
-    0x19080808192b1919, 0x190808082b080819, 0x190808082b081908, 0x190808082b190808,
-    0x1908081908080808, 0x190808190808082b, 0x1908081908081919, 0x1908081908082b08,
-    0x1908081908190819, 0x1908081908191908, 0x19080819082b0808, 0x1908081919080819,
-    0x1908081919081908, 0x1908081919190808, 0x190808192b080808, 0x190808192b081919,
-    0x190808192b2b082b, 0x1908082b08080819, 0x1908082b08081908, 0x1908082b08190808,
-    0x1908082b0819082b, 0x1908082b082b2b19, 0x1908082b19080808, 0x1908190808080808,
-    0x190819080808082b, 0x1908190808081919, 0x1908190808082b08, 0x1908190808190819,
-    0x1908190808191908, 0x1908190808192b19, 0x19081908082b0808, 0x1908190819080819,
-    0x1908190819081908, 0x1908190819190808, 0x190819082b080808, 0x190819082b191908,
-    0x1908191908080819, 0x1908191908081908, 0x1908191908190808, 0x19081919082b1908,
-    0x1908191919080808, 0x190819192b192b2b, 0x1908192b08080808, 0x1908192b08082b2b,
-    0x1908192b19081908, 0x1908192b19190808, 0x19082b0808080819, 0x19082b0808081908,
-    0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919, 0x19082b0819191908,
-    0x19082b08192b082b, 0x19082b1908080808, 0x19082b1908190819, 0x19082b1919081908,
-    0x19082b1919190808, 0x19082b19192b2b19, 0x19082b2b08081908, 0x1919080808080808,
-    0x191908080808082b, 0x1919080808081919, 0x1919080808082b08, 0x1919080808190819,
-    0x1919080808191908, 0x19190808082b0808, 0x19190808082b2b08, 0x1919080819080819,
-    0x1919080819081908, 0x1919080819190808, 0x191908082b080808, 0x1919081908080819,
-    0x1919081908081908, 0x1919081908190808, 0x1919081908191919, 0x1919081919080808,
-    0x191908191908082b, 0x1919082b08080808, 0x1919082b19081908, 0x1919082b2b2b2b2b,
-    0x1919190808080819, 0x1919190808081908, 0x1919190808190808, 0x19191908082b0819,
-    0x1919190819080808, 0x19191908192b0808, 0x191919082b080819, 0x191919082b2b0819,
-    0x1919191908080808, 0x1919191908082b08, 0x191919192b080808, 0x191919192b082b08,
-    0x1919192b082b0819, 0x1919192b192b2b08, 0x1919192b2b2b0819, 0x19192b0808080808,
-    0x19192b0808191908, 0x19192b0819080819, 0x19192b0819190808, 0x19192b082b192b19,
-    0x19192b1908192b2b, 0x19192b1919080808, 0x19192b191908082b, 0x19192b2b2b081919,
-    0x192b080808080819, 0x192b080808081908, 0x192b080808190808, 0x192b080819080808,
-    0x192b080819191908, 0x192b0808192b082b, 0x192b08082b08192b, 0x192b08082b2b2b19,
-    0x192b081908080808, 0x192b082b082b1908, 0x192b082b19082b2b, 0x192b082b2b19082b,
-    0x192b190808080808, 0x192b19080819192b, 0x192b191908190808, 0x192b191919080808,
-    0x192b191919081919, 0x192b19192b2b1908, 0x192b2b0808080819, 0x192b2b08192b2b2b,
-    0x192b2b19082b1919, 0x192b2b2b0808192b, 0x192b2b2b19191908, 0x192b2b2b192b082b,
-    0x2b08080808080808, 0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08,
-    0x2b08080808190819, 0x2b08080808191908, 0x2b080808082b0808, 0x2b080808082b2b2b,
-    0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808082b080808,
-    0x2b0808082b08082b, 0x2b0808082b2b2b08, 0x2b0808082b2b2b2b, 0x2b08081908080819,
-    0x2b08081908081908, 0x2b0808190808192b, 0x2b08081908190808, 0x2b08081919080808,
-    0x2b08081919190819, 0x2b08081919192b19, 0x2b08082b08080808, 0x2b08082b082b0808,
-    0x2b08082b2b080808, 0x2b08082b2b08082b, 0x2b08082b2b2b0808, 0x2b08082b2b2b2b08,
-    0x2b08190808080819, 0x2b08190808081908, 0x2b08190808190808, 0x2b0819080819082b,
-    0x2b08190808191919, 0x2b08190819080808, 0x2b081908192b0808, 0x2b0819082b082b19,
-    0x2b08191908080808, 0x2b08191919081908, 0x2b0819192b2b1919, 0x2b08192b08192b08,
-    0x2b08192b192b2b2b, 0x2b082b0808080808, 0x2b082b0808082b08, 0x2b082b08082b1919,
-    0x2b082b0819192b2b, 0x2b082b082b080808, 0x2b082b082b08082b, 0x2b082b082b2b2b08,
-    0x2b082b190808192b, 0x2b082b2b082b082b, 0x2b082b2b2b080808, 0x2b082b2b2b082b08,
-    0x2b082b2b2b19192b, 0x2b082b2b2b2b2b08, 0x2b19080808080819, 0x2b19080808081908,
-    0x2b19080808190808, 0x2b19080819080808, 0x2b1908081919192b, 0x2b1908082b081908,
-    0x2b19081908080808, 0x2b190819082b082b, 0x2b190819192b1908, 0x2b19082b1919192b,
-    0x2b19082b2b082b19, 0x2b19190808080808, 0x2b19190808081919, 0x2b19190819081908,
-    0x2b19190819190808, 0x2b19190819192b08, 0x2b191919082b2b19, 0x2b1919192b190808,
-    0x2b1919192b19082b, 0x2b19192b19080819, 0x2b192b0819190819, 0x2b192b082b2b192b,
-    0x2b192b1919082b19, 0x2b192b2b08191919, 0x2b192b2b192b0808, 0x2b2b080808080808,
-    0x2b2b08080808082b, 0x2b2b080808082b08, 0x2b2b080808082b2b, 0x2b2b0808082b0808,
-    0x2b2b0808082b2b2b, 0x2b2b08082b2b0808, 0x2b2b081919190819, 0x2b2b081919192b19,
-    0x2b2b08192b2b192b, 0x2b2b082b08080808, 0x2b2b082b0808082b, 0x2b2b082b08082b08,
-    0x2b2b082b082b2b2b, 0x2b2b082b2b080808, 0x2b2b082b2b2b0808, 0x2b2b190819080808,
-    0x2b2b19082b191919, 0x2b2b192b192b1919, 0x2b2b192b2b192b08, 0x2b2b2b0808082b2b,
-    0x2b2b2b08082b0808, 0x2b2b2b08082b082b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b0808,
-    0x2b2b2b082b2b2b08, 0x2b2b2b1908081908, 0x2b2b2b192b081908, 0x2b2b2b192b08192b,
-    0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
-};
-
-constexpr constant static uint8_t ksigns_iq2xs[128] = {
-      0, 129, 130,   3, 132,   5,   6, 135, 136,   9,  10, 139,  12, 141, 142,  15,
-    144,  17,  18, 147,  20, 149, 150,  23,  24, 153, 154,  27, 156,  29,  30, 159,
-    160,  33,  34, 163,  36, 165, 166,  39,  40, 169, 170,  43, 172,  45,  46, 175,
-     48, 177, 178,  51, 180,  53,  54, 183, 184,  57,  58, 187,  60, 189, 190,  63,
-    192,  65,  66, 195,  68, 197, 198,  71,  72, 201, 202,  75, 204,  77,  78, 207,
-     80, 209, 210,  83, 212,  85,  86, 215, 216,  89,  90, 219,  92, 221, 222,  95,
-     96, 225, 226,  99, 228, 101, 102, 231, 232, 105, 106, 235, 108, 237, 238, 111,
-    240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255,
-};
-
-constexpr constant static uint8_t kmask_iq2xs[8] = {1, 2, 4, 8, 16, 32, 64, 128};
-
-void kernel_mul_mv_iq2_xxs_f32_impl(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        threadgroup int8_t * shared_values [[threadgroup(0)]],
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-
-    const int nb = ne00/QK_K;
-    const int r0 = tgpig.x;
-    const int r1 = tgpig.y;
-    const int im = tgpig.z;
-
-    const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
-    const int ib_row = first_row * nb;
-
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
-
-    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
-
-    device const block_iq2_xxs * x = (device const block_iq2_xxs *) src0 + ib_row + offset0;
-    device const float         * y = (device const float         *) src1 + r1*ne10 + im*ne00*ne1;
-
-    float yl[32];
-    float sumf[N_DST]={0.f}, all_sum;
-
-    const int nb32 = nb * (QK_K / 32);
-
-    threadgroup uint64_t * values = (threadgroup uint64_t *)shared_values;
-    threadgroup uint8_t  * shared_signs = (threadgroup uint8_t *)(values + 256);
-    {
-        int nval = 4;
-        int pos  = (32*sgitg + tiisg)*nval;
-        for (int i = 0; i < nval; ++i) values[pos + i] = iq2xxs_grid[pos + i];
-        nval = 2;
-        pos  = (32*sgitg + tiisg)*nval;
-        for (int i = 0; i < nval; ++i) shared_signs[pos+i] = ksigns_iq2xs[pos+i];
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-    }
-
-#if QK_K == 256
-    const int ix = tiisg;
-
-    device const float * y4 = y + 32 * ix;
-
-    for (int ib32 = ix; ib32 < nb32; ib32 += 32) {
-
-        for (int i = 0; i < 32; ++i) {
-            yl[i] = y4[i];
-        }
-
-        const int ibl = ib32 / (QK_K / 32);
-        const int ib  = ib32 % (QK_K / 32);
-
-        device const block_iq2_xxs * xr = x + ibl;
-        device const uint16_t * q2 = xr->qs + 4 * ib;
-        device const half * dh = &xr->d;
-
-        for (int row = 0; row < N_DST; row++) {
-
-            const float db = dh[0];
-            device const uint8_t * aux8 = (device const uint8_t *)q2;
-            const uint32_t aux32 = q2[2] | (q2[3] << 16);
-            const float d = db * (0.5f + (aux32 >> 28));
-
-            float sum = 0;
-            for (int l = 0; l < 4; ++l) {
-                const threadgroup uint8_t * grid = (const threadgroup uint8_t *)(values + aux8[l]);
-                const uint8_t signs = shared_signs[(aux32 >> 7*l) & 127];
-                for (int j = 0; j < 8; ++j) {
-                    sum += yl[8*l + j] * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
-                }
-            }
-            sumf[row] += d * sum;
-
-            dh += nb*sizeof(block_iq2_xxs)/2;
-            q2 += nb*sizeof(block_iq2_xxs)/2;
-        }
-
-        y4 += 32 * 32;
-    }
-#else
-    // TODO
-#endif
-
-    for (int row = 0; row < N_DST; ++row) {
-        all_sum = simd_sum(sumf[row]);
-        if (tiisg == 0) {
-            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum * 0.25f;
-        }
-    }
-}
-
-[[host_name("kernel_mul_mv_iq2_xxs_f32")]]
-kernel void kernel_mul_mv_iq2_xxs_f32(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        threadgroup int8_t * shared_values [[threadgroup(0)]],
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-
-    kernel_mul_mv_iq2_xxs_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
-}
-
-void kernel_mul_mv_iq2_xs_f32_impl(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        threadgroup int8_t * shared_values [[threadgroup(0)]],
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-
-    const int nb = ne00/QK_K;
-    const int r0 = tgpig.x;
-    const int r1 = tgpig.y;
-    const int im = tgpig.z;
-
-    const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
-    const int ib_row = first_row * nb;
-
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
-
-    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
-
-    device const block_iq2_xs * x = (device const block_iq2_xs *) src0 + ib_row + offset0;
-    device const float        * y = (device const float        *) src1 + r1*ne10 + im*ne00*ne1;
-
-    float yl[32];
-    float sumf[N_DST]={0.f}, all_sum;
-
-    const int nb32 = nb * (QK_K / 32);
-
-    threadgroup uint64_t * values = (threadgroup uint64_t *)shared_values;
-    threadgroup uint8_t  * shared_signs = (threadgroup uint8_t *)(values + 512);
-    {
-        int nval = 8;
-        int pos  = (32*sgitg + tiisg)*nval;
-        for (int i = 0; i < nval; ++i) values[pos + i] = iq2xs_grid[pos + i];
-        nval = 2;
-        pos  = (32*sgitg + tiisg)*nval;
-        for (int i = 0; i < nval; ++i) shared_signs[pos+i] = ksigns_iq2xs[pos+i];
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-    }
-
-#if QK_K == 256
-    const int ix = tiisg;
-
-    device const float * y4 = y + 32 * ix;
-
-    for (int ib32 = ix; ib32 < nb32; ib32 += 32) {
-
-        for (int i = 0; i < 32; ++i) {
-            yl[i] = y4[i];
-        }
-
-        const int ibl = ib32 / (QK_K / 32);
-        const int ib  = ib32 % (QK_K / 32);
-
-        device const block_iq2_xs * xr = x + ibl;
-        device const uint16_t * q2 = xr->qs + 4 * ib;
-        device const uint8_t  * sc = xr->scales + ib;
-        device const half * dh = &xr->d;
-
-        for (int row = 0; row < N_DST; row++) {
-
-            const float db = dh[0];
-            const uint8_t ls1 = sc[0] & 0xf;
-            const uint8_t ls2 = sc[0] >>  4;
-            const float d1 = db * (0.5f + ls1);
-            const float d2 = db * (0.5f + ls2);
-
-            float sum1 = 0, sum2 = 0;
-            for (int l = 0; l < 2; ++l) {
-                const threadgroup uint8_t * grid = (const threadgroup uint8_t *)(values + (q2[l] & 511));
-                const uint8_t signs = shared_signs[(q2[l] >> 9)];
-                for (int j = 0; j < 8; ++j) {
-                    sum1 += yl[8*l + j] * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
-                }
-            }
-            for (int l = 2; l < 4; ++l) {
-                const threadgroup uint8_t * grid = (const threadgroup uint8_t *)(values + (q2[l] & 511));
-                const uint8_t signs = shared_signs[(q2[l] >> 9)];
-                for (int j = 0; j < 8; ++j) {
-                    sum2 += yl[8*l + j] * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
-                }
-            }
-            sumf[row] += d1 * sum1 + d2 * sum2;
-
-            dh += nb*sizeof(block_iq2_xs)/2;
-            q2 += nb*sizeof(block_iq2_xs)/2;
-            sc += nb*sizeof(block_iq2_xs);
-        }
-
-        y4 += 32 * 32;
-    }
-#else
-    // TODO
-#endif
-
-    for (int row = 0; row < N_DST; ++row) {
-        all_sum = simd_sum(sumf[row]);
-        if (tiisg == 0) {
-            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum * 0.25f;
-        }
-    }
-}
-
-[[host_name("kernel_mul_mv_iq2_xs_f32")]]
-kernel void kernel_mul_mv_iq2_xs_f32(
-        device const  void * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant   int64_t & ne01,
-        constant   int64_t & ne02,
-        constant  uint64_t & nb00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant   int64_t & ne10,
-        constant   int64_t & ne11,
-        constant   int64_t & ne12,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb12,
-        constant   int64_t & ne0,
-        constant   int64_t & ne1,
-        constant   uint    & r2,
-        constant   uint    & r3,
-        threadgroup int8_t * shared_values [[threadgroup(0)]],
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
-
-    kernel_mul_mv_iq2_xs_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
-}
-
-//============================= templates and their specializations =============================
-
-// NOTE: this is not dequantizing - we are simply fitting the template
-template <typename type4x4>
-void dequantize_f32(device const float4x4 * src, short il, thread type4x4 & reg) {
-    float4x4 temp = *(((device float4x4 *)src));
-    for (int i = 0; i < 16; i++){
-        reg[i/4][i%4] = temp[i/4][i%4];
-    }
-}
-
-template <typename type4x4>
-void dequantize_f16(device const half4x4 * src, short il, thread type4x4 & reg) {
-    half4x4 temp = *(((device half4x4 *)src));
-    for (int i = 0; i < 16; i++){
-        reg[i/4][i%4] = temp[i/4][i%4];
-    }
-}
-
-template <typename type4x4>
-void dequantize_q4_0(device const block_q4_0 *xb, short il, thread type4x4 & reg) {
-    device const uint16_t * qs = ((device const uint16_t *)xb + 1);
-    const float d1 = il ? (xb->d / 16.h) : xb->d;
-    const float d2 = d1 / 256.f;
-    const float md = -8.h * xb->d;
-    const ushort mask0 = il ? 0x00F0 : 0x000F;
-    const ushort mask1 = mask0 << 8;
-
-    for (int i=0;i<8;i++) {
-        reg[i/2][2*(i%2)+0] = d1 * (qs[i] & mask0) + md;
-        reg[i/2][2*(i%2)+1] = d2 * (qs[i] & mask1) + md;
-    }
-}
-
-template <typename type4x4>
-void dequantize_q4_1(device const block_q4_1 *xb, short il, thread type4x4 & reg) {
-    device const uint16_t * qs = ((device const uint16_t *)xb + 2);
-    const float d1 = il ? (xb->d / 16.h) : xb->d;
-    const float d2 = d1 / 256.f;
-    const float  m = xb->m;
-    const ushort mask0 = il ? 0x00F0 : 0x000F;
-    const ushort mask1 = mask0 << 8;
-
-    for (int i=0;i<8;i++) {
-        reg[i/2][2*(i%2)+0] = ((qs[i] & mask0) * d1) + m;
-        reg[i/2][2*(i%2)+1] = ((qs[i] & mask1) * d2) + m;
-    }
-}
-
-template <typename type4x4>
-void dequantize_q5_0(device const block_q5_0 *xb, short il, thread type4x4 & reg) {
-    device const uint16_t * qs = ((device const uint16_t *)xb + 3);
-    const float d = xb->d;
-    const float md = -16.h * xb->d;
-    const ushort mask = il ? 0x00F0 : 0x000F;
-
-    const uint32_t qh = *((device const uint32_t *)xb->qh);
-
-    const int x_mv = il ? 4 : 0;
-
-    const int gh_mv = il ? 12 : 0;
-    const int gh_bk = il ?  0 : 4;
-
-    for (int i = 0; i < 8; i++) {
-        // extract the 5-th bits for x0 and x1
-        const uint8_t xh_0 = ((qh >> (gh_mv + 2*i  )) << gh_bk) & 0x10;
-        const uint8_t xh_1 = ((qh >> (gh_mv + 2*i+1)) << gh_bk) & 0x10;
-
-        // combine the 4-bits from qs with the 5th bit
-        const int32_t x0 = ((((qs[i]     ) & mask) >> x_mv) | xh_0);
-        const int32_t x1 = ((((qs[i] >> 8) & mask) >> x_mv) | xh_1);
-
-        reg[i/2][2*(i%2)+0] = d * x0 + md;
-        reg[i/2][2*(i%2)+1] = d * x1 + md;
-    }
-}
-
-template <typename type4x4>
-void dequantize_q5_1(device const block_q5_1 *xb, short il, thread type4x4 & reg) {
-    device const uint16_t * qs = ((device const uint16_t *)xb + 4);
-    const float d = xb->d;
-    const float m = xb->m;
-    const ushort mask = il ? 0x00F0 : 0x000F;
-
-    const uint32_t qh = *((device const uint32_t *)xb->qh);
-
-    const int x_mv = il ? 4 : 0;
-
-    const int gh_mv = il ? 12 : 0;
-    const int gh_bk = il ?  0 : 4;
-
-    for (int i = 0; i < 8; i++) {
-        // extract the 5-th bits for x0 and x1
-        const uint8_t xh_0 = ((qh >> (gh_mv + 2*i  )) << gh_bk) & 0x10;
-        const uint8_t xh_1 = ((qh >> (gh_mv + 2*i+1)) << gh_bk) & 0x10;
-
-        // combine the 4-bits from qs with the 5th bit
-        const int32_t x0 = ((((qs[i]     ) & mask) >> x_mv) | xh_0);
-        const int32_t x1 = ((((qs[i] >> 8) & mask) >> x_mv) | xh_1);
-
-        reg[i/2][2*(i%2)+0] = d * x0 + m;
-        reg[i/2][2*(i%2)+1] = d * x1 + m;
-    }
-}
-
-template <typename type4x4>
-void dequantize_q8_0(device const block_q8_0 *xb, short il, thread type4x4 & reg) {
-    device const int8_t * qs = ((device const int8_t *)xb->qs);
-    const half d = xb->d;
-
-    for (int i = 0; i < 16; i++) {
-        reg[i/4][i%4] = (qs[i + 16*il] * d);
-    }
-}
-
-template <typename type4x4>
-void dequantize_q2_K(device const block_q2_K *xb, short il, thread type4x4 & reg) {
-    const float d = xb->d;
-    const float min = xb->dmin;
-    device const uint8_t * q = (device const uint8_t *)xb->qs;
-    float dl, ml;
-    uint8_t sc = xb->scales[il];
-
-#if QK_K == 256
-    q = q + 32*(il/8) + 16*(il&1);
-    il = (il/2)%4;
-#endif
-    half  coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h);
-    uchar mask = il>1 ? (il>2 ? 192    : 48)     : (il>0 ? 12    : 3);
-    dl = d * (sc & 0xF) * coef, ml = min * (sc >> 4);
-    for (int i = 0; i < 16; ++i) {
-        reg[i/4][i%4] = dl * (q[i] & mask) - ml;
-    }
-}
-
-template <typename type4x4>
-void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg) {
-    const half d_all = xb->d;
-    device const uint8_t * q = (device const uint8_t *)xb->qs;
-    device const uint8_t * h = (device const uint8_t *)xb->hmask;
-    device const int8_t * scales = (device const int8_t *)xb->scales;
-
-#if QK_K == 256
-    q = q + 32 * (il/8) + 16 * (il&1);
-    h = h + 16 * (il&1);
-    uint8_t m = 1 << (il/2);
-    uint16_t kmask1 = (il/4)>1 ? ((il/4)>2 ? 192 : 48) : \
-                                 ((il/4)>0 ? 12  : 3);
-    uint16_t kmask2 = il/8 ? 0xF0 : 0x0F;
-    uint16_t scale_2 = scales[il%8], scale_1 = scales[8 + il%4];
-    int16_t  dl_int = (il/4)&1 ? (scale_2&kmask2) | ((scale_1&kmask1) << 2)
-                               : (scale_2&kmask2) | ((scale_1&kmask1) << 4);
-    float dl = il<8 ? d_all * (dl_int - 32.f) : d_all * (dl_int / 16.f - 32.f);
-    const float ml = 4.f * dl;
-
-    il = (il/2) & 3;
-    const half    coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h);
-    const uint8_t mask = il>1 ? (il>2 ? 192    : 48)     : (il>0 ? 12    : 3);
-    dl *= coef;
-
-    for (int i = 0; i < 16; ++i) {
-        reg[i/4][i%4] = dl * (q[i] & mask) - (h[i] & m ? 0 : ml);
-    }
-#else
-    float    kcoef = il&1 ? 1.f/16.f : 1.f;
-    uint16_t kmask = il&1 ? 0xF0     : 0x0F;
-    float    dl = d_all * ((scales[il/2] & kmask) * kcoef - 8);
-    float    coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h);
-    uint8_t  mask = il>1 ? (il>2 ? 192    : 48)     : (il>0 ? 12    : 3);
-    uint8_t  m = 1<<(il*2);
-    for (int i = 0; i < 16; ++i) {
-        reg[i/4][i%4] = coef * dl * ((q[i] & mask) - ((h[i%8] & (m * (1 + i/8))) ? 0 : 4.f/coef));
-    }
-#endif
-}
-
-static inline uchar2 get_scale_min_k4_just2(int j, int k, device const uchar * q) {
-    return j < 4 ? uchar2{uchar(q[j+0+k] & 63), uchar(q[j+4+k] & 63)}
-                 : uchar2{uchar((q[j+4+k] & 0xF) | ((q[j-4+k] & 0xc0) >> 2)), uchar((q[j+4+k] >> 4) | ((q[j-0+k] & 0xc0) >> 2))};
-}
-
-template <typename type4x4>
-void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg) {
-    device const uchar * q = xb->qs;
-
-#if QK_K == 256
-    short is = (il/4) * 2;
-    q = q + (il/4) * 32 + 16 * (il&1);
-    il = il & 3;
-    const uchar2 sc = get_scale_min_k4_just2(is, il/2, xb->scales);
-    const float d   = il < 2 ? xb->d : xb->d / 16.h;
-    const float min = xb->dmin;
-    const float dl = d * sc[0];
-    const float ml = min * sc[1];
-#else
-    q = q + 16 * (il&1);
-    device const uint8_t * s = xb->scales;
-    device const half2 * dh = (device const half2 *)xb->d;
-    const float2 d = (float2)dh[0];
-    const float dl = il<2 ? d[0] * (s[0]&0xF) : d[0] * (s[1]&0xF)/16.h;
-    const float ml = il<2 ? d[1] * (s[0]>>4)  : d[1] * (s[1]>>4);
-#endif
-    const ushort mask = il<2 ? 0x0F : 0xF0;
-    for (int i = 0; i < 16; ++i) {
-        reg[i/4][i%4] = dl * (q[i] & mask) - ml;
-    }
-}
-
-template <typename type4x4>
-void dequantize_q5_K(device const block_q5_K *xb, short il, thread type4x4 & reg) {
-    device const uint8_t * q  = xb->qs;
-    device const uint8_t * qh = xb->qh;
-
-#if QK_K == 256
-    short is = (il/4) * 2;
-    q  = q + 32 * (il/4) + 16 * (il&1);
-    qh = qh + 16 * (il&1);
-    uint8_t ul = 1 << (il/2);
-    il = il & 3;
-    const uchar2 sc = get_scale_min_k4_just2(is, il/2, xb->scales);
-    const float d = il < 2 ? xb->d : xb->d / 16.f;
-    const float min = xb->dmin;
-    const float dl = d * sc[0];
-    const float ml = min * sc[1];
-
-    const ushort mask  = il<2 ? 0x0F : 0xF0;
-    const float qh_val = il<2 ? 16.f : 256.f;
-    for (int i = 0; i < 16; ++i) {
-        reg[i/4][i%4] = dl * ((q[i] & mask) + (qh[i] & ul ? qh_val : 0)) - ml;
-    }
-#else
-    q = q + 16 * (il&1);
-    device const int8_t * s = xb->scales;
-    const float dl = xb->d * s[il];
-    uint8_t m = 1<<(il*2);
-    const float  coef = il<2 ? 1.f  : 1.f/16.f;
-    const ushort mask = il<2 ? 0x0F : 0xF0;
-    for (int i = 0; i < 16; ++i) {
-        reg[i/4][i%4] = coef * dl * ((q[i] & mask) - (qh[i%8] & (m*(1+i/8)) ? 0.f : 16.f/coef));
-    }
-#endif
-}
-
-template <typename type4x4>
-void dequantize_q6_K(device const block_q6_K *xb, short il, thread type4x4 & reg) {
-    const half d_all = xb->d;
-    device const uint8_t * ql = (device const uint8_t *)xb->ql;
-    device const uint8_t * qh = (device const uint8_t *)xb->qh;
-    device const int8_t * scales = (device const int8_t *)xb->scales;
-
-#if QK_K == 256
-    ql = ql + 64*(il/8) + 32*((il/2)&1) + 16*(il&1);
-    qh = qh + 32*(il/8) + 16*(il&1);
-    float sc = scales[(il%2) + 2 * ((il/2))];
-    il = (il/2) & 3;
-#else
-    ql = ql + 16 * (il&1);
-    float sc = scales[il];
-#endif
-    const uint16_t  kmask1 = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3);
-    const uint16_t  kmask2 = il>1 ? 0xF0              : 0x0F;
-    const float       coef = il>1 ? 1.f/16.f          : 1.f;
-    const float ml = d_all * sc * 32.f;
-    const float dl = d_all * sc * coef;
-    for (int i = 0; i < 16; ++i) {
-        const half q = il&1 ? ((ql[i] & kmask2) | ((qh[i] & kmask1) << 2))
-                            : ((ql[i] & kmask2) | ((qh[i] & kmask1) << 4));
-        reg[i/4][i%4] = dl * q - ml;
-    }
-}
-
-template <typename type4x4>
-void dequantize_iq2_xxs(device const block_iq2_xxs * xb, short il, thread type4x4 & reg) {
-    // il is 0...15 for QK_K = 256 => index of block of 32 is il/2
-    const float d = xb->d;
-    const int ib32 = il/2;
-    il = il%2;
-    // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16
-    // each block of 32 needs 2 uint32_t's for the quants & scale, so 4 uint16_t's.
-    device const uint16_t * q2 = xb->qs + 4*ib32;
-    const uint32_t aux32_g = q2[0] | (q2[1] << 16);
-    const uint32_t aux32_s = q2[2] | (q2[3] << 16);
-    thread const uint8_t * aux8 = (thread const uint8_t *)&aux32_g;
-    const float dl = d * (0.5f + (aux32_s >> 28)) * 0.25f;
-    constant uint8_t * grid = (constant uint8_t *)(iq2xxs_grid + aux8[2*il+0]);
-    uint8_t signs = ksigns_iq2xs[(aux32_s >> 14*il) & 127];
-    for (int i = 0; i < 8; ++i) {
-        reg[i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f);
-    }
-    grid = (constant uint8_t *)(iq2xxs_grid + aux8[2*il+1]);
-    signs = ksigns_iq2xs[(aux32_s >> (14*il+7)) & 127];
-    for (int i = 0; i < 8; ++i) {
-        reg[2+i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f);
-    }
-}
-
-template <typename type4x4>
-void dequantize_iq2_xs(device const block_iq2_xs * xb, short il, thread type4x4 & reg) {
-    // il is 0...15 for QK_K = 256 => index of block of 32 is il/2
-    const float d = xb->d;
-    const int ib32 = il/2;
-    il = il%2;
-    // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16
-    device const uint16_t * q2 = xb->qs + 4*ib32;
-    const float dl = d * (0.5f + ((xb->scales[ib32] >> 4*il) & 0xf)) * 0.25f;
-    constant uint8_t * grid = (constant uint8_t *)(iq2xs_grid + (q2[2*il+0] & 511));
-    uint8_t signs = ksigns_iq2xs[q2[2*il+0] >> 9];
-    for (int i = 0; i < 8; ++i) {
-        reg[i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f);
-    }
-    grid = (constant uint8_t *)(iq2xs_grid + (q2[2*il+1] & 511));
-    signs = ksigns_iq2xs[q2[2*il+1] >> 9];
-    for (int i = 0; i < 8; ++i) {
-        reg[2+i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f);
-    }
-}
-
-template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread float4x4 &)>
-kernel void kernel_get_rows(
-        device const  void * src0,
-        device const  char * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant   int64_t & ne10,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb1,
-        constant  uint64_t & nb2,
-        uint3                tgpig[[threadgroup_position_in_grid]],
-        uint                 tiitg[[thread_index_in_threadgroup]],
-        uint3                tptg [[threads_per_threadgroup]]) {
-    //const int64_t i = tgpig;
-    //const int64_t r = ((device int32_t *) src1)[i];
-
-    const int64_t i10 = tgpig.x;
-    const int64_t i11 = tgpig.y;
-
-    const int64_t r = ((device int32_t *) ((device char *) src1 + i11*nb11 + i10*nb10))[0];
-
-    const int64_t i02 = i11;
-
-    for (int64_t ind = tiitg; ind < ne00/16; ind += tptg.x) {
-        float4x4 temp;
-        dequantize_func(
-            ((device const block_q *) ((device char *) src0 + r*nb01 + i02*nb02)) + ind/nl, ind%nl, temp);
-        *(((device float4x4 *) ((device char *) dst + i11*nb2 + i10*nb1)) + ind) = temp;
-    }
-}
-
-kernel void kernel_get_rows_f32(
-        device const  void * src0,
-        device const  char * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant   int64_t & ne10,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb1,
-        constant  uint64_t & nb2,
-        uint3                tgpig[[threadgroup_position_in_grid]],
-        uint                 tiitg[[thread_index_in_threadgroup]],
-        uint3                tptg [[threads_per_threadgroup]]) {
-    const int64_t i10 = tgpig.x;
-    const int64_t i11 = tgpig.y;
-
-    const int64_t r = ((device int32_t *) ((device char *) src1 + i11*nb11 + i10*nb10))[0];
-
-    const int64_t i02 = i11;
-
-    for (int ind = tiitg; ind < ne00; ind += tptg.x) {
-        ((device float *) ((device char *) dst + i11*nb2 + i10*nb1))[ind] =
-            ((device float *) ((device char *) src0 + r*nb01 + i02*nb02))[ind];
-    }
-}
-
-kernel void kernel_get_rows_f16(
-        device const  void * src0,
-        device const  char * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant   int64_t & ne10,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb1,
-        constant  uint64_t & nb2,
-        uint3                tgpig[[threadgroup_position_in_grid]],
-        uint                 tiitg[[thread_index_in_threadgroup]],
-        uint3                tptg [[threads_per_threadgroup]]) {
-    const int64_t i10 = tgpig.x;
-    const int64_t i11 = tgpig.y;
-
-    const int64_t r = ((device int32_t *) ((device char *) src1 + i11*nb11 + i10*nb10))[0];
-
-    const int64_t i02 = i11;
-
-    for (int ind = tiitg; ind < ne00; ind += tptg.x) {
-        ((device float *) ((device char *) dst + i11*nb2 + i10*nb1))[ind] =
-            ((device half *) ((device char *) src0 + r*nb01 + i02*nb02))[ind];
-    }
-}
-
-kernel void kernel_get_rows_i32(
-        device const  void * src0,
-        device const  char * src1,
-        device     int32_t * dst,
-        constant   int64_t & ne00,
-        constant  uint64_t & nb01,
-        constant  uint64_t & nb02,
-        constant   int64_t & ne10,
-        constant  uint64_t & nb10,
-        constant  uint64_t & nb11,
-        constant  uint64_t & nb1,
-        constant  uint64_t & nb2,
-        uint3                tgpig[[threadgroup_position_in_grid]],
-        uint                 tiitg[[thread_index_in_threadgroup]],
-        uint3                tptg [[threads_per_threadgroup]]) {
-    const int64_t i10 = tgpig.x;
-    const int64_t i11 = tgpig.y;
-
-    const int64_t r = ((device int32_t *) ((device char *) src1 + i11*nb11 + i10*nb10))[0];
-
-    const int64_t i02 = i11;
-
-    for (int ind = tiitg; ind < ne00; ind += tptg.x) {
-        ((device int32_t *) ((device char *) dst + i11*nb2 + i10*nb1))[ind] =
-            ((device int32_t *) ((device char *) src0 + r*nb01 + i02*nb02))[ind];
-    }
-}
-
-
-#define BLOCK_SIZE_M 64 // 8 simdgroup matrices from matrix A
-#define BLOCK_SIZE_N 32 // 4 simdgroup matrices from matrix B
-#define BLOCK_SIZE_K 32
-#define THREAD_MAT_M 4 // each thread take 4 simdgroup matrices from matrix A
-#define THREAD_MAT_N 2 // each thread take 2 simdgroup matrices from matrix B
-#define THREAD_PER_BLOCK 128
-#define THREAD_PER_ROW 2 // 2 thread for each row in matrix A to load numbers
-#define THREAD_PER_COL 4 // 4 thread for each row in matrix B to load numbers
-#define SG_MAT_SIZE 64 // simdgroup matrix is of shape 8x8
-#define SG_MAT_ROW 8
-
-// each block_q contains 16*nl weights
-template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread half4x4 &)>
-void kernel_mul_mm_impl(device const  uchar * src0,
-                        device const  uchar * src1,
-                        device        float * dst,
-                        constant    int64_t & ne00,
-                        constant    int64_t & ne02,
-                        constant   uint64_t & nb01,
-                        constant   uint64_t & nb02,
-                        constant    int64_t & ne12,
-                        constant   uint64_t & nb10,
-                        constant   uint64_t & nb11,
-                        constant   uint64_t & nb12,
-                        constant    int64_t & ne0,
-                        constant    int64_t & ne1,
-                        constant       uint & r2,
-                        constant       uint & r3,
-                        threadgroup   uchar * shared_memory [[threadgroup(0)]],
-                        uint3                 tgpig[[threadgroup_position_in_grid]],
-                        uint                  tiitg[[thread_index_in_threadgroup]],
-                        uint                  sgitg[[simdgroup_index_in_threadgroup]]) {
-
-    threadgroup half  * sa = (threadgroup half  *)(shared_memory);
-    threadgroup float * sb = (threadgroup float *)(shared_memory + 4096);
-
-    const uint r0 = tgpig.y;
-    const uint r1 = tgpig.x;
-    const uint im = tgpig.z;
-
-    // if this block is of 64x32 shape or smaller
-    short n_rows = (ne0 - r0 * BLOCK_SIZE_M < BLOCK_SIZE_M) ? (ne0 - r0 * BLOCK_SIZE_M) : BLOCK_SIZE_M;
-    short n_cols = (ne1 - r1 * BLOCK_SIZE_N < BLOCK_SIZE_N) ? (ne1 - r1 * BLOCK_SIZE_N) : BLOCK_SIZE_N;
-
-    // a thread shouldn't load data outside of the matrix
-    short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1;
-    short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1;
-
-    simdgroup_half8x8  ma[4];
-    simdgroup_float8x8 mb[2];
-    simdgroup_float8x8 c_res[8];
-    for (int i = 0; i < 8; i++){
-        c_res[i] = make_filled_simdgroup_matrix<float, 8>(0.f);
-    }
-
-    short il = (tiitg % THREAD_PER_ROW);
-
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
-
-    uint   offset0 = (i12/r2)*nb02 + (i13/r3)*(nb02*ne02);
-    ushort offset1 = il/nl;
-
-    device const block_q * x = (device const block_q *)(src0 + (r0 * BLOCK_SIZE_M + thread_row) * nb01 + offset0) + offset1;
-    device const float   * y = (device const float   *)(src1
-        + nb12 * im
-        + nb11 * (r1 * BLOCK_SIZE_N + thread_col)
-        + nb10 * (BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL)));
-
-    for (int loop_k = 0; loop_k < ne00; loop_k += BLOCK_SIZE_K) {
-        // load data and store to threadgroup memory
-        half4x4 temp_a;
-        dequantize_func(x, il, temp_a);
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-
-        #pragma unroll(16)
-        for (int i = 0; i < 16; i++) {
-            *(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \
-            +                     (tiitg % THREAD_PER_ROW) * 16 + (i / 8) * 8) \
-            +                     (tiitg / THREAD_PER_ROW) % 8  + (i & 7) * 8) = temp_a[i/4][i%4];
-        }
-
-        *(threadgroup float2x4 *)(sb + (tiitg % THREAD_PER_COL) * 8 * 32 + 8 * (tiitg / THREAD_PER_COL)) = *((device float2x4 *)y);
-
-        il = (il + 2 < nl) ? il + 2 : il % 2;
-        x  = (il < 2) ? x + (2+nl-1)/nl : x;
-        y += BLOCK_SIZE_K;
-
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-
-        // load matrices from threadgroup memory and conduct outer products
-        threadgroup half  * lsma = (sa + THREAD_MAT_M * SG_MAT_SIZE * (sgitg % 2));
-        threadgroup float * lsmb = (sb + THREAD_MAT_N * SG_MAT_SIZE * (sgitg / 2));
-
-        #pragma unroll(4)
-        for (int ik = 0; ik < BLOCK_SIZE_K / 8; ik++) {
-            #pragma unroll(4)
-            for (int i = 0; i < 4; i++) {
-                simdgroup_load(ma[i],lsma + SG_MAT_SIZE * i);
-            }
-            simdgroup_barrier(mem_flags::mem_none);
-            #pragma unroll(2)
-            for (int i = 0; i < 2; i++) {
-                simdgroup_load(mb[i],lsmb + SG_MAT_SIZE * i);
-            }
-
-            lsma += BLOCK_SIZE_M / SG_MAT_ROW * SG_MAT_SIZE;
-            lsmb += BLOCK_SIZE_N / SG_MAT_ROW * SG_MAT_SIZE;
-
-            #pragma unroll(8)
-            for (int i = 0; i < 8; i++){
-                simdgroup_multiply_accumulate(c_res[i], mb[i/4], ma[i%4], c_res[i]);
-            }
-        }
-    }
-
-    if ((r0 + 1) * BLOCK_SIZE_M <= ne0 && (r1 + 1) * BLOCK_SIZE_N <= ne1) {
-        device float * C = dst + (BLOCK_SIZE_M * r0 + 32 * (sgitg &  1)) \
-                               + (BLOCK_SIZE_N * r1 + 16 * (sgitg >> 1)) * ne0 + im*ne1*ne0;
-        for (int i = 0; i < 8; i++) {
-            simdgroup_store(c_res[i], C + 8 * (i%4) + 8 * ne0 * (i/4), ne0);
-        }
-    } else {
-        // block is smaller than 64x32, we should avoid writing data outside of the matrix
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-        threadgroup float * temp_str = ((threadgroup float *)shared_memory) \
-                                      + 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;
-        for (int i = 0; i < 8; i++) {
-            simdgroup_store(c_res[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M);
-        }
-
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-
-        device float * C = dst + (BLOCK_SIZE_M * r0) + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0;
-        if (sgitg == 0) {
-            for (int i = 0; i < n_rows; i++) {
-                for (int j = tiitg; j < n_cols; j += BLOCK_SIZE_N) {
-                    *(C + i + j * ne0) = *(temp_str + i + j * BLOCK_SIZE_M);
-                }
-            }
-        }
-    }
-}
-
-// same as kernel_mul_mm_impl, but src1 and dst are accessed via indices stored in src1ids
-template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread half4x4 &)>
-void kernel_mul_mm_id_impl(
-        device const  uchar * src0,
-        device const  uchar * src1,
-        thread        short * src1ids,
-        device        float * dst,
-        constant    int64_t & ne00,
-        constant    int64_t & ne02,
-        constant   uint64_t & nb01,
-        constant   uint64_t & nb02,
-        constant    int64_t & ne12,
-        constant   uint64_t & nb10,
-        constant   uint64_t & nb11,
-        constant   uint64_t & nb12,
-        constant    int64_t & ne0,
-                    int64_t   ne1,
-        constant       uint & r2,
-        constant       uint & r3,
-        threadgroup   uchar * shared_memory,
-        uint3                 tgpig[[threadgroup_position_in_grid]],
-        uint                  tiitg[[thread_index_in_threadgroup]],
-        uint                  sgitg[[simdgroup_index_in_threadgroup]]) {
-
-    threadgroup half  * sa = (threadgroup half  *)(shared_memory);
-    threadgroup float * sb = (threadgroup float *)(shared_memory + 4096);
-
-    const uint r0 = tgpig.y;
-    const uint r1 = tgpig.x;
-    const uint im = tgpig.z;
-
-    if (r1 * BLOCK_SIZE_N >= ne1) return;
-
-    // if this block is of 64x32 shape or smaller
-    short n_rows = (ne0 - r0 * BLOCK_SIZE_M < BLOCK_SIZE_M) ? (ne0 - r0 * BLOCK_SIZE_M) : BLOCK_SIZE_M;
-    short n_cols = (ne1 - r1 * BLOCK_SIZE_N < BLOCK_SIZE_N) ? (ne1 - r1 * BLOCK_SIZE_N) : BLOCK_SIZE_N;
-
-    // a thread shouldn't load data outside of the matrix
-    short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1;
-    short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1;
-
-    simdgroup_half8x8  ma[4];
-    simdgroup_float8x8 mb[2];
-    simdgroup_float8x8 c_res[8];
-    for (int i = 0; i < 8; i++){
-        c_res[i] = make_filled_simdgroup_matrix<float, 8>(0.f);
-    }
-
-    short il = (tiitg % THREAD_PER_ROW);
-
-    const uint i12 = im%ne12;
-    const uint i13 = im/ne12;
-
-    uint   offset0 = (i12/r2)*nb02 + (i13/r3)*(nb02*ne02);
-    ushort offset1 = il/nl;
-
-    device const block_q * x = (device const block_q *)(src0 + (r0 * BLOCK_SIZE_M + thread_row) * nb01 + offset0) + offset1;
-    device const float   * y = (device const float   *)(src1
-        + nb12 * im
-        + nb11 * src1ids[r1 * BLOCK_SIZE_N + thread_col]
-        + nb10 * (BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL)));
-
-    for (int loop_k = 0; loop_k < ne00; loop_k += BLOCK_SIZE_K) {
-        // load data and store to threadgroup memory
-        half4x4 temp_a;
-        dequantize_func(x, il, temp_a);
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-
-        for (int i = 0; i < 16; i++) {
-            *(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \
-            +                     (tiitg % THREAD_PER_ROW) * 16 + (i / 8) * 8) \
-            +                     (tiitg / THREAD_PER_ROW) % 8  + (i & 7) * 8) = temp_a[i/4][i%4];
-        }
-
-        *(threadgroup float2x4 *)(sb + (tiitg % THREAD_PER_COL) * 8 * 32 + 8 * (tiitg / THREAD_PER_COL)) = *((device float2x4 *)y);
-
-        il = (il + 2 < nl) ? il + 2 : il % 2;
-        x  = (il < 2) ? x + (2+nl-1)/nl : x;
-        y += BLOCK_SIZE_K;
-
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-
-        // load matrices from threadgroup memory and conduct outer products
-        threadgroup half  * lsma = (sa + THREAD_MAT_M * SG_MAT_SIZE * (sgitg % 2));
-        threadgroup float * lsmb = (sb + THREAD_MAT_N * SG_MAT_SIZE * (sgitg / 2));
-
-        for (int ik = 0; ik < BLOCK_SIZE_K / 8; ik++) {
-            for (int i = 0; i < 4; i++) {
-                simdgroup_load(ma[i],lsma + SG_MAT_SIZE * i);
-            }
-            simdgroup_barrier(mem_flags::mem_none);
-            for (int i = 0; i < 2; i++) {
-                simdgroup_load(mb[i],lsmb + SG_MAT_SIZE * i);
-            }
-
-            lsma += BLOCK_SIZE_M / SG_MAT_ROW * SG_MAT_SIZE;
-            lsmb += BLOCK_SIZE_N / SG_MAT_ROW * SG_MAT_SIZE;
-
-            for (int i = 0; i < 8; i++){
-                simdgroup_multiply_accumulate(c_res[i], mb[i/4], ma[i%4], c_res[i]);
-            }
-        }
-    }
-
-    {
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-        threadgroup float * temp_str = ((threadgroup float *)shared_memory) \
-                                      + 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;
-        for (int i = 0; i < 8; i++) {
-            simdgroup_store(c_res[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M);
-        }
-
-        threadgroup_barrier(mem_flags::mem_threadgroup);
-
-        device float * C = dst + (BLOCK_SIZE_M * r0) + im*ne1*ne0;
-        if (sgitg == 0) {
-            for (int i = 0; i < n_rows; i++) {
-                for (int j = tiitg; j < n_cols; j += BLOCK_SIZE_N) {
-                    *(C + i + src1ids[j + r1*BLOCK_SIZE_N] * ne0) = *(temp_str + i + j * BLOCK_SIZE_M);
-                }
-            }
-        }
-    }
-}
-
-template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread half4x4 &)>
-kernel void kernel_mul_mm(device const  uchar * src0,
-                          device const  uchar * src1,
-                          device        float * dst,
-                          constant    int64_t & ne00,
-                          constant    int64_t & ne02,
-                          constant   uint64_t & nb01,
-                          constant   uint64_t & nb02,
-                          constant    int64_t & ne12,
-                          constant   uint64_t & nb10,
-                          constant   uint64_t & nb11,
-                          constant   uint64_t & nb12,
-                          constant    int64_t & ne0,
-                          constant    int64_t & ne1,
-                          constant       uint & r2,
-                          constant       uint & r3,
-                          threadgroup   uchar * shared_memory [[threadgroup(0)]],
-                          uint3                 tgpig[[threadgroup_position_in_grid]],
-                          uint                  tiitg[[thread_index_in_threadgroup]],
-                          uint                  sgitg[[simdgroup_index_in_threadgroup]]) {
-    kernel_mul_mm_impl<block_q, nl, dequantize_func>(
-        src0,
-        src1,
-        dst,
-        ne00,
-        ne02,
-        nb01,
-        nb02,
-        ne12,
-        nb10,
-        nb11,
-        nb12,
-        ne0,
-        ne1,
-        r2,
-        r3,
-        shared_memory,
-        tgpig,
-        tiitg,
-        sgitg);
-}
-
-template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread half4x4 &)>
-kernel void kernel_mul_mm_id(
-        device const   uchar * ids,
-        device const   uchar * src1,
-        device         float * dst,
-        constant    uint64_t & nbi1,
-        constant     int64_t & ne00,
-        constant     int64_t & ne02,
-        constant    uint64_t & nb01,
-        constant    uint64_t & nb02,
-        constant     int64_t & ne12,
-        constant     int64_t & ne13,
-        constant    uint64_t & nb10,
-        constant    uint64_t & nb11,
-        constant    uint64_t & nb12,
-        constant     int64_t & ne0,
-        constant     int64_t & ne1,
-        constant    uint64_t & nb1,
-        constant        uint & r2,
-        constant        uint & r3,
-        constant         int & idx,
-        device const   uchar * src00,
-        device const   uchar * src01,
-        device const   uchar * src02,
-        device const   uchar * src03,
-        device const   uchar * src04,
-        device const   uchar * src05,
-        device const   uchar * src06,
-        device const   uchar * src07,
-        threadgroup    uchar * shared_memory [[threadgroup(0)]],
-        uint3                  tgpig[[threadgroup_position_in_grid]],
-        uint                   tiitg[[thread_index_in_threadgroup]],
-        uint                   sgitg[[simdgroup_index_in_threadgroup]]) {
-    device const uchar * src0s[8] = {src00, src01, src02, src03, src04, src05, src06, src07};
-
-    // expert id
-    const int32_t id = tgpig.z/(ne12*ne13);
-
-    tgpig.z = tgpig.z%(ne12*ne13);
-
-    // row indices of src1 for expert id
-    int64_t _ne1 = 0;
-    short src1ids[512];
-
-    for (int64_t i1 = 0; i1 < ne1; i1++) {
-        if (((device int32_t *) (ids + i1*nbi1))[idx] == id) {
-            src1ids[_ne1++] = i1;
-        }
-    }
-
-    kernel_mul_mm_id_impl<block_q, nl, dequantize_func>(
-        src0s[id],
-        src1,
-        src1ids,
-        dst,
-        ne00,
-        ne02,
-        nb01,
-        nb02,
-        ne12,
-        nb10,
-        nb11,
-        nb12,
-        ne0,
-        _ne1,
-        r2,
-        r3,
-        shared_memory,
-        tgpig,
-        tiitg,
-        sgitg);
-}
-
-#if QK_K == 256
-#define QK_NL 16
-#else
-#define QK_NL 4
-#endif
-
-//
-// get rows
-//
-
-typedef void (get_rows_t)(
-        device const void * src0,
-        device const char * src1,
-        device      float * dst,
-        constant  int64_t & ne00,
-        constant uint64_t & nb01,
-        constant uint64_t & nb02,
-        constant  int64_t & ne10,
-        constant uint64_t & nb10,
-        constant uint64_t & nb11,
-        constant uint64_t & nb1,
-        constant uint64_t & nb2,
-        uint3, uint, uint3);
-
-//template [[host_name("kernel_get_rows_f32")]]  kernel get_rows_t kernel_get_rows<float4x4,   1, dequantize_f32>;
-//template [[host_name("kernel_get_rows_f16")]]  kernel get_rows_t kernel_get_rows<half4x4,    1, dequantize_f16>;
-template [[host_name("kernel_get_rows_q4_0")]] kernel get_rows_t kernel_get_rows<block_q4_0, 2, dequantize_q4_0>;
-template [[host_name("kernel_get_rows_q4_1")]] kernel get_rows_t kernel_get_rows<block_q4_1, 2, dequantize_q4_1>;
-template [[host_name("kernel_get_rows_q5_0")]] kernel get_rows_t kernel_get_rows<block_q5_0, 2, dequantize_q5_0>;
-template [[host_name("kernel_get_rows_q5_1")]] kernel get_rows_t kernel_get_rows<block_q5_1, 2, dequantize_q5_1>;
-template [[host_name("kernel_get_rows_q8_0")]] kernel get_rows_t kernel_get_rows<block_q8_0, 2, dequantize_q8_0>;
-template [[host_name("kernel_get_rows_q2_K")]] kernel get_rows_t kernel_get_rows<block_q2_K, QK_NL, dequantize_q2_K>;
-template [[host_name("kernel_get_rows_q3_K")]] kernel get_rows_t kernel_get_rows<block_q3_K, QK_NL, dequantize_q3_K>;
-template [[host_name("kernel_get_rows_q4_K")]] kernel get_rows_t kernel_get_rows<block_q4_K, QK_NL, dequantize_q4_K>;
-template [[host_name("kernel_get_rows_q5_K")]] kernel get_rows_t kernel_get_rows<block_q5_K, QK_NL, dequantize_q5_K>;
-template [[host_name("kernel_get_rows_q6_K")]] kernel get_rows_t kernel_get_rows<block_q6_K, QK_NL, dequantize_q6_K>;
-template [[host_name("kernel_get_rows_iq2_xxs")]] kernel get_rows_t kernel_get_rows<block_iq2_xxs, QK_NL, dequantize_iq2_xxs>;
-template [[host_name("kernel_get_rows_iq2_xs")]]  kernel get_rows_t kernel_get_rows<block_iq2_xs,  QK_NL, dequantize_iq2_xs>;
-
-//
-// matrix-matrix multiplication
-//
-
-typedef void (mat_mm_t)(
-        device const  uchar * src0,
-        device const  uchar * src1,
-        device        float * dst,
-        constant    int64_t & ne00,
-        constant    int64_t & ne02,
-        constant   uint64_t & nb01,
-        constant   uint64_t & nb02,
-        constant    int64_t & ne12,
-        constant   uint64_t & nb10,
-        constant   uint64_t & nb11,
-        constant   uint64_t & nb12,
-        constant    int64_t & ne0,
-        constant    int64_t & ne1,
-        constant       uint & r2,
-        constant       uint & r3,
-        threadgroup   uchar *,
-        uint3, uint, uint);
-
-template [[host_name("kernel_mul_mm_f32_f32")]]  kernel mat_mm_t kernel_mul_mm<float4x4,   1,     dequantize_f32>;
-template [[host_name("kernel_mul_mm_f16_f32")]]  kernel mat_mm_t kernel_mul_mm<half4x4,    1,     dequantize_f16>;
-template [[host_name("kernel_mul_mm_q4_0_f32")]] kernel mat_mm_t kernel_mul_mm<block_q4_0, 2,     dequantize_q4_0>;
-template [[host_name("kernel_mul_mm_q4_1_f32")]] kernel mat_mm_t kernel_mul_mm<block_q4_1, 2,     dequantize_q4_1>;
-template [[host_name("kernel_mul_mm_q5_0_f32")]] kernel mat_mm_t kernel_mul_mm<block_q5_0, 2,     dequantize_q5_0>;
-template [[host_name("kernel_mul_mm_q5_1_f32")]] kernel mat_mm_t kernel_mul_mm<block_q5_1, 2,     dequantize_q5_1>;
-template [[host_name("kernel_mul_mm_q8_0_f32")]] kernel mat_mm_t kernel_mul_mm<block_q8_0, 2,     dequantize_q8_0>;
-template [[host_name("kernel_mul_mm_q2_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q2_K, QK_NL, dequantize_q2_K>;
-template [[host_name("kernel_mul_mm_q3_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q3_K, QK_NL, dequantize_q3_K>;
-template [[host_name("kernel_mul_mm_q4_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q4_K, QK_NL, dequantize_q4_K>;
-template [[host_name("kernel_mul_mm_q5_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q5_K, QK_NL, dequantize_q5_K>;
-template [[host_name("kernel_mul_mm_q6_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q6_K, QK_NL, dequantize_q6_K>;
-template [[host_name("kernel_mul_mm_iq2_xxs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq2_xxs, QK_NL, dequantize_iq2_xxs>;
-template [[host_name("kernel_mul_mm_iq2_xs_f32")]]  kernel mat_mm_t kernel_mul_mm<block_iq2_xs,  QK_NL, dequantize_iq2_xs>;
-
-//
-// indirect matrix-matrix multiplication
-//
-
-typedef void (mat_mm_id_t)(
-        device const   uchar * ids,
-        device const   uchar * src1,
-        device         float * dst,
-        constant    uint64_t & nbi1,
-        constant     int64_t & ne00,
-        constant     int64_t & ne02,
-        constant    uint64_t & nb01,
-        constant    uint64_t & nb02,
-        constant     int64_t & ne12,
-        constant     int64_t & ne13,
-        constant    uint64_t & nb10,
-        constant    uint64_t & nb11,
-        constant    uint64_t & nb12,
-        constant     int64_t & ne0,
-        constant     int64_t & ne1,
-        constant    uint64_t & nb1,
-        constant        uint & r2,
-        constant        uint & r3,
-        constant         int & idx,
-        device const   uchar * src00,
-        device const   uchar * src01,
-        device const   uchar * src02,
-        device const   uchar * src03,
-        device const   uchar * src04,
-        device const   uchar * src05,
-        device const   uchar * src06,
-        device const   uchar * src07,
-        threadgroup    uchar *,
-        uint3, uint, uint);
-
-template [[host_name("kernel_mul_mm_id_f32_f32")]]  kernel mat_mm_id_t kernel_mul_mm_id<float4x4,   1,     dequantize_f32>;
-template [[host_name("kernel_mul_mm_id_f16_f32")]]  kernel mat_mm_id_t kernel_mul_mm_id<half4x4,    1,     dequantize_f16>;
-template [[host_name("kernel_mul_mm_id_q4_0_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_q4_0, 2,     dequantize_q4_0>;
-template [[host_name("kernel_mul_mm_id_q4_1_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_q4_1, 2,     dequantize_q4_1>;
-template [[host_name("kernel_mul_mm_id_q5_0_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_q5_0, 2,     dequantize_q5_0>;
-template [[host_name("kernel_mul_mm_id_q5_1_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_q5_1, 2,     dequantize_q5_1>;
-template [[host_name("kernel_mul_mm_id_q8_0_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_q8_0, 2,     dequantize_q8_0>;
-template [[host_name("kernel_mul_mm_id_q2_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_q2_K, QK_NL, dequantize_q2_K>;
-template [[host_name("kernel_mul_mm_id_q3_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_q3_K, QK_NL, dequantize_q3_K>;
-template [[host_name("kernel_mul_mm_id_q4_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_q4_K, QK_NL, dequantize_q4_K>;
-template [[host_name("kernel_mul_mm_id_q5_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_q5_K, QK_NL, dequantize_q5_K>;
-template [[host_name("kernel_mul_mm_id_q6_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_q6_K, QK_NL, dequantize_q6_K>;
-template [[host_name("kernel_mul_mm_id_iq2_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq2_xxs, QK_NL, dequantize_iq2_xxs>;
-template [[host_name("kernel_mul_mm_id_iq2_xs_f32")]]  kernel mat_mm_id_t kernel_mul_mm_id<block_iq2_xs,  QK_NL, dequantize_iq2_xs>;
-
-//
-// matrix-vector multiplication
-//
-
-[[host_name("kernel_mul_mv_id_f32_f32")]]
-kernel void kernel_mul_mv_id_f32_f32(
-        device const    char * ids,
-        device const    char * src1,
-        device         float * dst,
-        constant    uint64_t & nbi1,
-        constant     int64_t & ne00,
-        constant     int64_t & ne01,
-        constant     int64_t & ne02,
-        constant    uint64_t & nb00,
-        constant    uint64_t & nb01,
-        constant    uint64_t & nb02,
-        constant     int64_t & ne10,
-        constant     int64_t & ne11,
-        constant     int64_t & ne12,
-        constant     int64_t & ne13,
-        constant    uint64_t & nb10,
-        constant    uint64_t & nb11,
-        constant    uint64_t & nb12,
-        constant     int64_t & ne0,
-        constant     int64_t & ne1,
-        constant    uint64_t & nb1,
-        constant        uint & r2,
-        constant        uint & r3,
-        constant         int & idx,
-        device const    char * src00,
-        device const    char * src01,
-        device const    char * src02,
-        device const    char * src03,
-        device const    char * src04,
-        device const    char * src05,
-        device const    char * src06,
-        device const    char * src07,
-        uint3                  tgpig[[threadgroup_position_in_grid]],
-        uint                   tiitg[[thread_index_in_threadgroup]],
-        uint                   tiisg[[thread_index_in_simdgroup]],
-        uint                   sgitg[[simdgroup_index_in_threadgroup]]) {
-    device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07};
-
-    const int64_t bid = tgpig.z/(ne12*ne13);
-
-    tgpig.z = tgpig.z%(ne12*ne13);
-
-    const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx];
-
-    kernel_mul_mv_f32_f32_impl(
-        src0[id],
-        src1 + bid*nb11,
-        dst  + bid*ne0,
-        ne00,
-        ne01,
-        ne02,
-        nb00,
-        nb01,
-        nb02,
-        ne10,
-        ne11,
-        ne12,
-        nb10,
-        nb11,
-        nb12,
-        ne0,
-        ne1,
-        r2,
-        r3,
-        tgpig,
-        tiisg);
-}
-
-[[host_name("kernel_mul_mv_id_f16_f32")]]
-kernel void kernel_mul_mv_id_f16_f32(
-        device const    char * ids,
-        device const    char * src1,
-        device         float * dst,
-        constant    uint64_t & nbi1,
-        constant     int64_t & ne00,
-        constant     int64_t & ne01,
-        constant     int64_t & ne02,
-        constant    uint64_t & nb00,
-        constant    uint64_t & nb01,
-        constant    uint64_t & nb02,
-        constant     int64_t & ne10,
-        constant     int64_t & ne11,
-        constant     int64_t & ne12,
-        constant     int64_t & ne13,
-        constant    uint64_t & nb10,
-        constant    uint64_t & nb11,
-        constant    uint64_t & nb12,
-        constant     int64_t & ne0,
-        constant     int64_t & ne1,
-        constant    uint64_t & nb1,
-        constant        uint & r2,
-        constant        uint & r3,
-        constant         int & idx,
-        device const    char * src00,
-        device const    char * src01,
-        device const    char * src02,
-        device const    char * src03,
-        device const    char * src04,
-        device const    char * src05,
-        device const    char * src06,
-        device const    char * src07,
-        uint3                  tgpig[[threadgroup_position_in_grid]],
-        uint                   tiitg[[thread_index_in_threadgroup]],
-        uint                   tiisg[[thread_index_in_simdgroup]],
-        uint                   sgitg[[simdgroup_index_in_threadgroup]]) {
-    device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07};
-
-    const int64_t bid = tgpig.z/(ne12*ne13);
-
-    tgpig.z = tgpig.z%(ne12*ne13);
-
-    const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx];
-
-    kernel_mul_mv_f16_f32_impl(
-        src0[id],
-        src1 + bid*nb11,
-        dst  + bid*ne0,
-        ne00,
-        ne01,
-        ne02,
-        nb00,
-        nb01,
-        nb02,
-        ne10,
-        ne11,
-        ne12,
-        nb10,
-        nb11,
-        nb12,
-        ne0,
-        ne1,
-        r2,
-        r3,
-        tgpig,
-        tiisg);
-}
-
-[[host_name("kernel_mul_mv_id_q8_0_f32")]]
-kernel void kernel_mul_mv_id_q8_0_f32(
-        device const    char * ids,
-        device const    char * src1,
-        device         float * dst,
-        constant    uint64_t & nbi1,
-        constant     int64_t & ne00,
-        constant     int64_t & ne01,
-        constant     int64_t & ne02,
-        constant    uint64_t & nb00,
-        constant    uint64_t & nb01,
-        constant    uint64_t & nb02,
-        constant     int64_t & ne10,
-        constant     int64_t & ne11,
-        constant     int64_t & ne12,
-        constant     int64_t & ne13,
-        constant    uint64_t & nb10,
-        constant    uint64_t & nb11,
-        constant    uint64_t & nb12,
-        constant     int64_t & ne0,
-        constant     int64_t & ne1,
-        constant    uint64_t & nb1,
-        constant        uint & r2,
-        constant        uint & r3,
-        constant         int & idx,
-        device const    char * src00,
-        device const    char * src01,
-        device const    char * src02,
-        device const    char * src03,
-        device const    char * src04,
-        device const    char * src05,
-        device const    char * src06,
-        device const    char * src07,
-        uint3                  tgpig[[threadgroup_position_in_grid]],
-        uint                   tiitg[[thread_index_in_threadgroup]],
-        uint                   tiisg[[thread_index_in_simdgroup]],
-        uint                   sgitg[[simdgroup_index_in_threadgroup]]) {
-    device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07};
-
-    const int64_t bid = tgpig.z/(ne12*ne13);
-
-    tgpig.z = tgpig.z%(ne12*ne13);
-
-    const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx];
-
-    kernel_mul_mv_q8_0_f32_impl(
-        src0[id],
-        (device const float *) (src1 + bid*nb11),
-        dst + bid*ne0,
-        ne00,
-        ne01,
-        ne02,
-        ne10,
-        ne12,
-        ne0,
-        ne1,
-        r2,
-        r3,
-        tgpig,
-        tiisg,
-        sgitg);
-}
-
-[[host_name("kernel_mul_mv_id_q4_0_f32")]]
-kernel void kernel_mul_mv_id_q4_0_f32(
-        device const    char * ids,
-        device const    char * src1,
-        device         float * dst,
-        constant    uint64_t & nbi1,
-        constant     int64_t & ne00,
-        constant     int64_t & ne01,
-        constant     int64_t & ne02,
-        constant    uint64_t & nb00,
-        constant    uint64_t & nb01,
-        constant    uint64_t & nb02,
-        constant     int64_t & ne10,
-        constant     int64_t & ne11,
-        constant     int64_t & ne12,
-        constant     int64_t & ne13,
-        constant    uint64_t & nb10,
-        constant    uint64_t & nb11,
-        constant    uint64_t & nb12,
-        constant     int64_t & ne0,
-        constant     int64_t & ne1,
-        constant    uint64_t & nb1,
-        constant        uint & r2,
-        constant        uint & r3,
-        constant         int & idx,
-        device const    char * src00,
-        device const    char * src01,
-        device const    char * src02,
-        device const    char * src03,
-        device const    char * src04,
-        device const    char * src05,
-        device const    char * src06,
-        device const    char * src07,
-        uint3                  tgpig[[threadgroup_position_in_grid]],
-        uint                   tiitg[[thread_index_in_threadgroup]],
-        uint                   tiisg[[thread_index_in_simdgroup]],
-        uint                   sgitg[[simdgroup_index_in_threadgroup]]) {
-    device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07};
-
-    const int64_t bid = tgpig.z/(ne12*ne13);
-
-    tgpig.z = tgpig.z%(ne12*ne13);
-
-    const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx];
-
-    mul_vec_q_n_f32_impl<block_q4_0, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(
-        src0[id],
-        (device const float *) (src1 + bid*nb11),
-        dst + bid*ne0,
-        ne00,
-        ne01,
-        ne02,
-        ne10,
-        ne12,
-        ne0,
-        ne1,
-        r2,
-        r3,
-        tgpig,
-        tiisg,
-        sgitg);
-}
-
-[[host_name("kernel_mul_mv_id_q4_1_f32")]]
-kernel void kernel_mul_mv_id_q4_1_f32(
-        device const    char * ids,
-        device const    char * src1,
-        device         float * dst,
-        constant    uint64_t & nbi1,
-        constant     int64_t & ne00,
-        constant     int64_t & ne01,
-        constant     int64_t & ne02,
-        constant    uint64_t & nb00,
-        constant    uint64_t & nb01,
-        constant    uint64_t & nb02,
-        constant     int64_t & ne10,
-        constant     int64_t & ne11,
-        constant     int64_t & ne12,
-        constant     int64_t & ne13,
-        constant    uint64_t & nb10,
-        constant    uint64_t & nb11,
-        constant    uint64_t & nb12,
-        constant     int64_t & ne0,
-        constant     int64_t & ne1,
-        constant    uint64_t & nb1,
-        constant        uint & r2,
-        constant        uint & r3,
-        constant         int & idx,
-        device const    char * src00,
-        device const    char * src01,
-        device const    char * src02,
-        device const    char * src03,
-        device const    char * src04,
-        device const    char * src05,
-        device const    char * src06,
-        device const    char * src07,
-        uint3                  tgpig[[threadgroup_position_in_grid]],
-        uint                   tiitg[[thread_index_in_threadgroup]],
-        uint                   tiisg[[thread_index_in_simdgroup]],
-        uint                   sgitg[[simdgroup_index_in_threadgroup]]) {
-    device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07};
-
-    const int64_t bid = tgpig.z/(ne12*ne13);
-
-    tgpig.z = tgpig.z%(ne12*ne13);
-
-    const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx];
-
-    mul_vec_q_n_f32_impl<block_q4_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(
-        src0[id],
-        (device const float *) (src1 + bid*nb11),
-        dst + bid*ne0,
-        ne00,
-        ne01,
-        ne02,
-        ne10,
-        ne12,
-        ne0,
-        ne1,
-        r2,
-        r3,
-        tgpig,
-        tiisg,
-        sgitg);
-}
-
-[[host_name("kernel_mul_mv_id_q5_0_f32")]]
-kernel void kernel_mul_mv_id_q5_0_f32(
-        device const    char * ids,
-        device const    char * src1,
-        device         float * dst,
-        constant    uint64_t & nbi1,
-        constant     int64_t & ne00,
-        constant     int64_t & ne01,
-        constant     int64_t & ne02,
-        constant    uint64_t & nb00,
-        constant    uint64_t & nb01,
-        constant    uint64_t & nb02,
-        constant     int64_t & ne10,
-        constant     int64_t & ne11,
-        constant     int64_t & ne12,
-        constant     int64_t & ne13,
-        constant    uint64_t & nb10,
-        constant    uint64_t & nb11,
-        constant    uint64_t & nb12,
-        constant     int64_t & ne0,
-        constant     int64_t & ne1,
-        constant    uint64_t & nb1,
-        constant        uint & r2,
-        constant        uint & r3,
-        constant         int & idx,
-        device const    char * src00,
-        device const    char * src01,
-        device const    char * src02,
-        device const    char * src03,
-        device const    char * src04,
-        device const    char * src05,
-        device const    char * src06,
-        device const    char * src07,
-        uint3                  tgpig[[threadgroup_position_in_grid]],
-        uint                   tiitg[[thread_index_in_threadgroup]],
-        uint                   tiisg[[thread_index_in_simdgroup]],
-        uint                   sgitg[[simdgroup_index_in_threadgroup]]) {
-    device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07};
-
-    const int64_t bid = tgpig.z/(ne12*ne13);
-
-    tgpig.z = tgpig.z%(ne12*ne13);
-
-    const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx];
-
-    mul_vec_q_n_f32_impl<block_q5_0, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(
-        src0[id],
-        (device const float *) (src1 + bid*nb11),
-        dst + bid*ne0,
-        ne00,
-        ne01,
-        ne02,
-        ne10,
-        ne12,
-        ne0,
-        ne1,
-        r2,
-        r3,
-        tgpig,
-        tiisg,
-        sgitg);
-}
-
-[[host_name("kernel_mul_mv_id_q5_1_f32")]]
-kernel void kernel_mul_mv_id_q5_1_f32(
-        device const    char * ids,
-        device const    char * src1,
-        device         float * dst,
-        constant    uint64_t & nbi1,
-        constant     int64_t & ne00,
-        constant     int64_t & ne01,
-        constant     int64_t & ne02,
-        constant    uint64_t & nb00,
-        constant    uint64_t & nb01,
-        constant    uint64_t & nb02,
-        constant     int64_t & ne10,
-        constant     int64_t & ne11,
-        constant     int64_t & ne12,
-        constant     int64_t & ne13,
-        constant    uint64_t & nb10,
-        constant    uint64_t & nb11,
-        constant    uint64_t & nb12,
-        constant     int64_t & ne0,
-        constant     int64_t & ne1,
-        constant    uint64_t & nb1,
-        constant        uint & r2,
-        constant        uint & r3,
-        constant         int & idx,
-        device const    char * src00,
-        device const    char * src01,
-        device const    char * src02,
-        device const    char * src03,
-        device const    char * src04,
-        device const    char * src05,
-        device const    char * src06,
-        device const    char * src07,
-        uint3                  tgpig[[threadgroup_position_in_grid]],
-        uint                   tiitg[[thread_index_in_threadgroup]],
-        uint                   tiisg[[thread_index_in_simdgroup]],
-        uint                   sgitg[[simdgroup_index_in_threadgroup]]) {
-    device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07};
-
-    const int64_t bid = tgpig.z/(ne12*ne13);
-
-    tgpig.z = tgpig.z%(ne12*ne13);
-
-    const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx];
-
-    mul_vec_q_n_f32_impl<block_q5_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(
-        src0[id],
-        (device const float *) (src1 + bid*nb11),
-        dst + bid*ne0,
-        ne00,
-        ne01,
-        ne02,
-        ne10,
-        ne12,
-        ne0,
-        ne1,
-        r2,
-        r3,
-        tgpig,
-        tiisg,
-        sgitg);
-}
-
-[[host_name("kernel_mul_mv_id_q2_K_f32")]]
-kernel void kernel_mul_mv_id_q2_K_f32(
-        device const    char * ids,
-        device const    char * src1,
-        device         float * dst,
-        constant    uint64_t & nbi1,
-        constant     int64_t & ne00,
-        constant     int64_t & ne01,
-        constant     int64_t & ne02,
-        constant    uint64_t & nb00,
-        constant    uint64_t & nb01,
-        constant    uint64_t & nb02,
-        constant     int64_t & ne10,
-        constant     int64_t & ne11,
-        constant     int64_t & ne12,
-        constant     int64_t & ne13,
-        constant    uint64_t & nb10,
-        constant    uint64_t & nb11,
-        constant    uint64_t & nb12,
-        constant     int64_t & ne0,
-        constant     int64_t & ne1,
-        constant    uint64_t & nb1,
-        constant        uint & r2,
-        constant        uint & r3,
-        constant         int & idx,
-        device const    char * src00,
-        device const    char * src01,
-        device const    char * src02,
-        device const    char * src03,
-        device const    char * src04,
-        device const    char * src05,
-        device const    char * src06,
-        device const    char * src07,
-        uint3                  tgpig[[threadgroup_position_in_grid]],
-        uint                   tiitg[[thread_index_in_threadgroup]],
-        uint                   tiisg[[thread_index_in_simdgroup]],
-        uint                   sgitg[[simdgroup_index_in_threadgroup]]) {
-    device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07};
-
-    const int64_t bid = tgpig.z/(ne12*ne13);
-
-    tgpig.z = tgpig.z%(ne12*ne13);
-
-    const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx];
-
-    kernel_mul_mv_q2_K_f32_impl(
-        src0[id],
-        (device const float *) (src1 + bid*nb11),
-        dst + bid*ne0,
-        ne00,
-        ne01,
-        ne02,
-        ne10,
-        ne12,
-        ne0,
-        ne1,
-        r2,
-        r3,
-        tgpig,
-        tiisg,
-        sgitg);
-}
-
-[[host_name("kernel_mul_mv_id_q3_K_f32")]]
-kernel void kernel_mul_mv_id_q3_K_f32(
-        device const    char * ids,
-        device const    char * src1,
-        device         float * dst,
-        constant    uint64_t & nbi1,
-        constant     int64_t & ne00,
-        constant     int64_t & ne01,
-        constant     int64_t & ne02,
-        constant    uint64_t & nb00,
-        constant    uint64_t & nb01,
-        constant    uint64_t & nb02,
-        constant     int64_t & ne10,
-        constant     int64_t & ne11,
-        constant     int64_t & ne12,
-        constant     int64_t & ne13,
-        constant    uint64_t & nb10,
-        constant    uint64_t & nb11,
-        constant    uint64_t & nb12,
-        constant     int64_t & ne0,
-        constant     int64_t & ne1,
-        constant    uint64_t & nb1,
-        constant        uint & r2,
-        constant        uint & r3,
-        constant         int & idx,
-        device const    char * src00,
-        device const    char * src01,
-        device const    char * src02,
-        device const    char * src03,
-        device const    char * src04,
-        device const    char * src05,
-        device const    char * src06,
-        device const    char * src07,
-        uint3                  tgpig[[threadgroup_position_in_grid]],
-        uint                   tiitg[[thread_index_in_threadgroup]],
-        uint                   tiisg[[thread_index_in_simdgroup]],
-        uint                   sgitg[[simdgroup_index_in_threadgroup]]) {
-    device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07};
-
-    const int64_t bid = tgpig.z/(ne12*ne13);
-
-    tgpig.z = tgpig.z%(ne12*ne13);
-
-    const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx];
-
-    kernel_mul_mv_q3_K_f32_impl(
-        src0[id],
-        (device const float *) (src1 + bid*nb11),
-        dst + bid*ne0,
-        ne00,
-        ne01,
-        ne02,
-        ne10,
-        ne12,
-        ne0,
-        ne1,
-        r2,
-        r3,
-        tgpig,
-        tiisg,
-        sgitg);
-}
-
-[[host_name("kernel_mul_mv_id_q4_K_f32")]]
-kernel void kernel_mul_mv_id_q4_K_f32(
-        device const    char * ids,
-        device const    char * src1,
-        device         float * dst,
-        constant    uint64_t & nbi1,
-        constant     int64_t & ne00,
-        constant     int64_t & ne01,
-        constant     int64_t & ne02,
-        constant    uint64_t & nb00,
-        constant    uint64_t & nb01,
-        constant    uint64_t & nb02,
-        constant     int64_t & ne10,
-        constant     int64_t & ne11,
-        constant     int64_t & ne12,
-        constant     int64_t & ne13,
-        constant    uint64_t & nb10,
-        constant    uint64_t & nb11,
-        constant    uint64_t & nb12,
-        constant     int64_t & ne0,
-        constant     int64_t & ne1,
-        constant    uint64_t & nb1,
-        constant        uint & r2,
-        constant        uint & r3,
-        constant         int & idx,
-        device const    char * src00,
-        device const    char * src01,
-        device const    char * src02,
-        device const    char * src03,
-        device const    char * src04,
-        device const    char * src05,
-        device const    char * src06,
-        device const    char * src07,
-        uint3                  tgpig[[threadgroup_position_in_grid]],
-        uint                   tiitg[[thread_index_in_threadgroup]],
-        uint                   tiisg[[thread_index_in_simdgroup]],
-        uint                   sgitg[[simdgroup_index_in_threadgroup]]) {
-    device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07};
-
-    const int64_t bid = tgpig.z/(ne12*ne13);
-
-    tgpig.z = tgpig.z%(ne12*ne13);
-
-    const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx];
-
-    kernel_mul_mv_q4_K_f32_impl(
-        src0[id],
-        (device const float *) (src1 + bid*nb11),
-        dst + bid*ne0,
-        ne00,
-        ne01,
-        ne02,
-        ne10,
-        ne12,
-        ne0,
-        ne1,
-        r2,
-        r3,
-        tgpig,
-        tiisg,
-        sgitg);
-}
-
-[[host_name("kernel_mul_mv_id_q5_K_f32")]]
-kernel void kernel_mul_mv_id_q5_K_f32(
-        device const    char * ids,
-        device const    char * src1,
-        device         float * dst,
-        constant    uint64_t & nbi1,
-        constant     int64_t & ne00,
-        constant     int64_t & ne01,
-        constant     int64_t & ne02,
-        constant    uint64_t & nb00,
-        constant    uint64_t & nb01,
-        constant    uint64_t & nb02,
-        constant     int64_t & ne10,
-        constant     int64_t & ne11,
-        constant     int64_t & ne12,
-        constant     int64_t & ne13,
-        constant    uint64_t & nb10,
-        constant    uint64_t & nb11,
-        constant    uint64_t & nb12,
-        constant     int64_t & ne0,
-        constant     int64_t & ne1,
-        constant    uint64_t & nb1,
-        constant        uint & r2,
-        constant        uint & r3,
-        constant         int & idx,
-        device const    char * src00,
-        device const    char * src01,
-        device const    char * src02,
-        device const    char * src03,
-        device const    char * src04,
-        device const    char * src05,
-        device const    char * src06,
-        device const    char * src07,
-        uint3                  tgpig[[threadgroup_position_in_grid]],
-        uint                   tiitg[[thread_index_in_threadgroup]],
-        uint                   tiisg[[thread_index_in_simdgroup]],
-        uint                   sgitg[[simdgroup_index_in_threadgroup]]) {
-    device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07};
-
-    const int64_t bid = tgpig.z/(ne12*ne13);
-
-    tgpig.z = tgpig.z%(ne12*ne13);
-
-    const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx];
-
-    kernel_mul_mv_q5_K_f32_impl(
-        src0[id],
-        (device const float *) (src1 + bid*nb11),
-        dst + bid*ne0,
-        ne00,
-        ne01,
-        ne02,
-        ne10,
-        ne12,
-        ne0,
-        ne1,
-        r2,
-        r3,
-        tgpig,
-        tiisg,
-        sgitg);
-}
-
-[[host_name("kernel_mul_mv_id_q6_K_f32")]]
-kernel void kernel_mul_mv_id_q6_K_f32(
-        device const    char * ids,
-        device const    char * src1,
-        device         float * dst,
-        constant    uint64_t & nbi1,
-        constant     int64_t & ne00,
-        constant     int64_t & ne01,
-        constant     int64_t & ne02,
-        constant    uint64_t & nb00,
-        constant    uint64_t & nb01,
-        constant    uint64_t & nb02,
-        constant     int64_t & ne10,
-        constant     int64_t & ne11,
-        constant     int64_t & ne12,
-        constant     int64_t & ne13,
-        constant    uint64_t & nb10,
-        constant    uint64_t & nb11,
-        constant    uint64_t & nb12,
-        constant     int64_t & ne0,
-        constant     int64_t & ne1,
-        constant    uint64_t & nb1,
-        constant        uint & r2,
-        constant        uint & r3,
-        constant         int & idx,
-        device const    char * src00,
-        device const    char * src01,
-        device const    char * src02,
-        device const    char * src03,
-        device const    char * src04,
-        device const    char * src05,
-        device const    char * src06,
-        device const    char * src07,
-        uint3                  tgpig[[threadgroup_position_in_grid]],
-        uint                   tiitg[[thread_index_in_threadgroup]],
-        uint                   tiisg[[thread_index_in_simdgroup]],
-        uint                   sgitg[[simdgroup_index_in_threadgroup]]) {
-    device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07};
-
-    const int64_t bid = tgpig.z/(ne12*ne13);
-
-    tgpig.z = tgpig.z%(ne12*ne13);
-
-    const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx];
-
-    kernel_mul_mv_q6_K_f32_impl(
-        src0[id],
-        (device const float *) (src1 + bid*nb11),
-        dst + bid*ne0,
-        ne00,
-        ne01,
-        ne02,
-        ne10,
-        ne12,
-        ne0,
-        ne1,
-        r2,
-        r3,
-        tgpig,
-        tiisg,
-        sgitg);
-}
-
-[[host_name("kernel_mul_mv_id_iq2_xxs_f32")]]
-kernel void kernel_mul_mv_id_iq2_xxs_f32(
-        device const    char * ids,
-        device const    char * src1,
-        device         float * dst,
-        constant    uint64_t & nbi1,
-        constant     int64_t & ne00,
-        constant     int64_t & ne01,
-        constant     int64_t & ne02,
-        constant    uint64_t & nb00,
-        constant    uint64_t & nb01,
-        constant    uint64_t & nb02,
-        constant     int64_t & ne10,
-        constant     int64_t & ne11,
-        constant     int64_t & ne12,
-        constant     int64_t & ne13,
-        constant    uint64_t & nb10,
-        constant    uint64_t & nb11,
-        constant    uint64_t & nb12,
-        constant     int64_t & ne0,
-        constant     int64_t & ne1,
-        constant    uint64_t & nb1,
-        constant        uint & r2,
-        constant        uint & r3,
-        constant         int & idx,
-        device const    char * src00,
-        device const    char * src01,
-        device const    char * src02,
-        device const    char * src03,
-        device const    char * src04,
-        device const    char * src05,
-        device const    char * src06,
-        device const    char * src07,
-        threadgroup int8_t   * shared_values [[threadgroup(0)]],
-        uint3                  tgpig[[threadgroup_position_in_grid]],
-        uint                   tiitg[[thread_index_in_threadgroup]],
-        uint                   tiisg[[thread_index_in_simdgroup]],
-        uint                   sgitg[[simdgroup_index_in_threadgroup]]) {
-    device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07};
-
-    const int64_t bid = tgpig.z/(ne12*ne13);
-
-    tgpig.z = tgpig.z%(ne12*ne13);
-
-    const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx];
-
-    kernel_mul_mv_iq2_xxs_f32_impl(
-        src0[id],
-        (device const float *) (src1 + bid*nb11),
-        dst + bid*ne0,
-        ne00,
-        ne01,
-        ne02,
-        ne10,
-        ne12,
-        ne0,
-        ne1,
-        r2,
-        r3,
-        shared_values,
-        tgpig,
-        tiisg,
-        sgitg);
-}
-
-[[host_name("kernel_mul_mv_id_iq2_xs_f32")]]
-kernel void kernel_mul_mv_id_iq2_xs_f32(
-        device const    char * ids,
-        device const    char * src1,
-        device         float * dst,
-        constant    uint64_t & nbi1,
-        constant     int64_t & ne00,
-        constant     int64_t & ne01,
-        constant     int64_t & ne02,
-        constant    uint64_t & nb00,
-        constant    uint64_t & nb01,
-        constant    uint64_t & nb02,
-        constant     int64_t & ne10,
-        constant     int64_t & ne11,
-        constant     int64_t & ne12,
-        constant     int64_t & ne13,
-        constant    uint64_t & nb10,
-        constant    uint64_t & nb11,
-        constant    uint64_t & nb12,
-        constant     int64_t & ne0,
-        constant     int64_t & ne1,
-        constant    uint64_t & nb1,
-        constant        uint & r2,
-        constant        uint & r3,
-        constant         int & idx,
-        device const    char * src00,
-        device const    char * src01,
-        device const    char * src02,
-        device const    char * src03,
-        device const    char * src04,
-        device const    char * src05,
-        device const    char * src06,
-        device const    char * src07,
-        threadgroup int8_t   * shared_values [[threadgroup(0)]],
-        uint3                  tgpig[[threadgroup_position_in_grid]],
-        uint                   tiitg[[thread_index_in_threadgroup]],
-        uint                   tiisg[[thread_index_in_simdgroup]],
-        uint                   sgitg[[simdgroup_index_in_threadgroup]]) {
-    device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07};
-
-    const int64_t bid = tgpig.z/(ne12*ne13);
-
-    tgpig.z = tgpig.z%(ne12*ne13);
-
-    const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx];
-
-    kernel_mul_mv_iq2_xs_f32_impl(
-        src0[id],
-        (device const float *) (src1 + bid*nb11),
-        dst + bid*ne0,
-        ne00,
-        ne01,
-        ne02,
-        ne10,
-        ne12,
-        ne0,
-        ne1,
-        r2,
-        r3,
-        shared_values,
-        tgpig,
-        tiisg,
-        sgitg);
-}
diff --git a/ggml/src/ggml-opencl.cpp b/ggml/src/ggml-opencl.cpp
deleted file mode 100644
index 2bb9363..0000000
--- a/ggml/src/ggml-opencl.cpp
+++ /dev/null
@@ -1,2204 +0,0 @@
-#include "ggml.h"
-#include "ggml-opencl.h"
-#include "ggml-backend-impl.h"
-
-#include <array>
-#include <atomic>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include <limits>
-#include <sstream>
-#include <vector>
-
-#define CL_TARGET_OPENCL_VERSION 120
-#include <clblast.h>
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-#define CL_DMMV_LOCAL_SIZE 32
-
-#ifndef K_QUANTS_PER_ITERATION
-#define K_QUANTS_PER_ITERATION 1
-#else
-static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
-#endif
-
-#define MULTILINE_QUOTE(...) #__VA_ARGS__
-static std::string program_source = MULTILINE_QUOTE(
-
-typedef char int8_t;
-typedef uchar uint8_t;
-typedef short int16_t;
-typedef ushort uint16_t;
-typedef int int32_t;
-typedef uint uint32_t;
-
-struct __attribute__ ((packed)) block_q4_0
-{
-    half d;
-    uint8_t qs[QK4_0 / 2];
-};
-
-struct __attribute__ ((packed)) block_q4_1
-{
-    half d;
-    half m;
-    uint8_t qs[QK4_1 / 2];
-};
-
-struct __attribute__ ((packed)) block_q5_0
-{
-    half d;
-    uint32_t qh;
-    uint8_t qs[QK5_0 / 2];
-};
-
-struct __attribute__ ((packed)) block_q5_1
-{
-    half d;
-    half m;
-    uint32_t qh;
-    uint8_t qs[QK5_1 / 2];
-};
-
-struct __attribute__ ((packed)) block_q8_0
-{
-    half d;
-    int8_t qs[QK8_0];
-};
-
-struct __attribute__((packed)) block_q2_K
-{
-    uint8_t scales[16];
-    uint8_t qs[64];
-    half d;
-    half dmin;
-};
-
-struct __attribute__((packed)) block_q3_K
-{
-    uint8_t hmask[32];
-    uint8_t qs[64];
-    uint8_t scales[12];
-    half d;
-};
-
-struct __attribute__((packed)) block_q4_K
-{
-    half d;
-    half dmin;
-    uint8_t scales[12];
-    uint8_t qs[128];
-};
-
-struct __attribute__((packed)) block_q5_K
-{
-    half d;
-    half dmin;
-    uint8_t scales[12];
-    uint8_t qh[32];
-    uint8_t qs[128];
-};
-
-struct __attribute__((packed)) block_q6_K
-{
-    uint8_t ql[128];
-    uint8_t qh[64];
-    int8_t scales[16];
-    half d;
-};
-
-__kernel void convert_fp16_to_fp32(__global half* x, __global float* y) {
-    const uint i = get_global_id(0);
-
-    y[i] = vload_half(0, &x[i]);
-}
-
-void dequantize_q4_0(__global const struct block_q4_0* x, const int ib, const int iqs, float* v0, float* v1) {
-    const float d = vload_half(0, &x[ib].d);
-
-    const uint8_t vui = x[ib].qs[iqs];
-
-    const int8_t vi0 = vui & 0xF;
-    const int8_t vi1 = vui >> 4;
-
-    *v0 = (vi0 - 8)*d;
-    *v1 = (vi1 - 8)*d;
-}
-void dequantize_q4_1(__global const struct block_q4_1* x, const int ib, const int iqs, float* v0, float* v1) {
-    const float d = vload_half(0, &x[ib].d);
-    const float m = vload_half(0, &x[ib].m);
-
-    const uint8_t vui = x[ib].qs[iqs];
-
-    const int8_t vi0 = vui & 0xF;
-    const int8_t vi1 = vui >> 4;
-
-    *v0 = vi0*d + m;
-    *v1 = vi1*d + m;
-}
-void dequantize_q5_0(__global const struct block_q5_0* x, const int ib, const int iqs, float* v0, float* v1) {
-    const float d = vload_half(0, &x[ib].d);
-
-    uint32_t qh = x[ib].qh;
-
-    const uint8_t xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
-    const uint8_t xh_1 = ((qh >> (iqs + 12))     ) & 0x10;
-
-    const int32_t x0 = ((x[ib].qs[iqs] & 0xf) | xh_0) - 16;
-    const int32_t x1 = ((x[ib].qs[iqs] >>  4) | xh_1) - 16;
-
-    *v0 = x0*d;
-    *v1 = x1*d;
-}
-void dequantize_q5_1(__global const struct block_q5_1* x, const int ib, const int iqs, float* v0, float* v1) {
-    const float d = vload_half(0, &x[ib].d);
-    const float m = vload_half(0, &x[ib].m);
-
-    uint32_t qh = x[ib].qh;
-
-    const uint8_t xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
-    const uint8_t xh_1 = ((qh >> (iqs + 12))     ) & 0x10;
-
-    const int32_t x0 = ((x[ib].qs[iqs] & 0xf) | xh_0);
-    const int32_t x1 = ((x[ib].qs[iqs] >>  4) | xh_1);
-
-    *v0 = x0*d + m;
-    *v1 = x1*d + m;
-}
-void dequantize_q8_0(__global const struct block_q8_0* x, const int ib, const int iqs, float* v0, float* v1) {
-    const float d = vload_half(0, &x[ib].d);
-
-    const int8_t vi0 = x[ib].qs[iqs + 0];
-    const int8_t vi1 = x[ib].qs[iqs + 1];
-
-    *v0 = vi0*d;
-    *v1 = vi1*d;
-}
-void convert_f16(__global half* x, const int ib, const int iqs, float* v0, float* v1){
-    *v0 = vload_half(0, &x[ib + 0]);
-    *v1 = vload_half(0, &x[ib + 1]);
-}
-);
-
-static std::string k_quants_source = MULTILINE_QUOTE(
-inline void get_scale_min_k4(int j, const __global uint8_t *q, uint8_t *d, uint8_t *m)
-{
-    if (j < 4)
-    {
-        *d = q[j] & 63;
-        *m = q[j + 4] & 63;
-    }
-    else
-    {
-        *d = (q[j + 4] & 0xF) | ((q[j - 4] >> 6) << 4);
-        *m = (q[j + 4] >> 4) | ((q[j - 0] >> 6) << 4);
-    }
-}
-
-__kernel void dequantize_block_q2_K(__global const struct block_q2_K *x, __global float *yy)
-{
-    const int i = get_group_id(0) + get_global_offset(0);
-    const int tid = get_local_id(0);
-    const int n = tid / 32;
-    const int l = tid - 32 * n;
-    const int is = 8 * n + l / 16;
-
-    const uint8_t q = x[i].qs[32 * n + l];
-    __global float *y = yy + get_group_id(0) * QK_K + 128 * n;
-
-    const float dall = vload_half(0, &x[i].d);
-    const float dmin = vload_half(0, &x[i].dmin);
-
-    y[l + 0] = dall * (x[i].scales[is + 0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is + 0] >> 4);
-    y[l + 32] = dall * (x[i].scales[is + 2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is + 2] >> 4);
-    y[l + 64] = dall * (x[i].scales[is + 4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is + 4] >> 4);
-    y[l + 96] = dall * (x[i].scales[is + 6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is + 6] >> 4);
-}
-
-__kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __global float *yy)
-{
-    int r = get_local_id(0) / 4;
-    int i = get_group_id(0) + get_global_offset(0);
-    int tid = r / 2;
-    int is0 = r % 2;
-    int l0 = 16 * is0 + 4 * (get_local_id(0) % 4);
-    int n = tid / 4;
-    int j = tid - 4 * n;
-
-    uint8_t m = 1 << (4 * n + j);
-    int is = 8 * n + 2 * j + is0;
-    int shift = 2 * j;
-
-    int8_t us = is < 4 ? (x[i].scales[is - 0] & 0xF) | (((x[i].scales[is + 8] >> 0) & 3) << 4)
-              : is < 8 ? (x[i].scales[is - 0] & 0xF) | (((x[i].scales[is + 4] >> 2) & 3) << 4)
-              : is < 12  ? (x[i].scales[is - 8] >> 4) | (((x[i].scales[is + 0] >> 4) & 3) << 4)
-              : (x[i].scales[is - 8] >> 4) | (((x[i].scales[is - 4] >> 6) & 3) << 4);
-    float d_all = vload_half(0, &x[i].d);
-    float dl = d_all * (us - 32);
-
-    __global float *y = yy + get_group_id(0) * QK_K + 128 * n + 32 * j;
-    const __global uint8_t *q = x[i].qs + 32 * n;
-    const __global uint8_t *hm = x[i].hmask;
-
-    for (int l = l0; l < l0 + 4; ++l)
-        y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
-}
-
-__kernel void dequantize_block_q4_K(__global const struct block_q4_K *x, __global float *yy)
-{
-    const int i = get_group_id(0) + get_global_offset(0);
-    const int tid = get_local_id(0);
-    const int il = tid / 8;
-    const int ir = tid % 8;
-    const int is = 2 * il;
-    const int n = 4;
-
-    __global float *y = yy + get_group_id(0) * QK_K + 64 * il + n * ir;
-
-    const float dall = vload_half(0, &x[i].d);
-    const float dmin = vload_half(0, &x[i].dmin);
-
-    __global const uint8_t *q = x[i].qs + 32 * il + n * ir;
-
-    uint8_t sc, m;
-    get_scale_min_k4(is + 0, x[i].scales, &sc, &m);
-    float d1 = dall * sc;
-    float m1 = dmin * m;
-    get_scale_min_k4(is + 1, x[i].scales, &sc, &m);
-    float d2 = dall * sc;
-    float m2 = dmin * m;
-    for (int l = 0; l < n; ++l)
-    {
-        y[l + 0] = d1 * (q[l] & 0xF) - m1;
-        y[l + 32] = d2 * (q[l] >> 4) - m2;
-    }
-}
-
-__kernel void dequantize_block_q5_K(__global const struct block_q5_K *x, __global float *yy)
-{
-    const int i = get_group_id(0) + get_global_offset(0);
-    const int tid = get_local_id(0);
-    const int il = tid / 16;
-    const int ir = tid % 16;
-    const int is = 2 * il;
-
-    __global float *y = yy + get_group_id(0) * QK_K + 64 * il + 2 * ir;
-
-    const float dall = vload_half(0, &x[i].d);
-    const float dmin = vload_half(0, &x[i].dmin);
-
-    __global const uint8_t *ql = x[i].qs + 32 * il + 2 * ir;
-    __global const uint8_t *qh = x[i].qh + 2 * ir;
-
-    uint8_t sc, m;
-    get_scale_min_k4(is + 0, x[i].scales, &sc, &m);
-    const float d1 = dall * sc;
-    const float m1 = dmin * m;
-    get_scale_min_k4(is + 1, x[i].scales, &sc, &m);
-    const float d2 = dall * sc;
-    const float m2 = dmin * m;
-
-    uint8_t hm = 1 << (2 * il);
-    y[0] = d1 * ((ql[0] & 0xF) + (qh[0] & hm ? 16 : 0)) - m1;
-    y[1] = d1 * ((ql[1] & 0xF) + (qh[1] & hm ? 16 : 0)) - m1;
-    hm <<= 1;
-    y[32] = d2 * ((ql[0] >> 4) + (qh[0] & hm ? 16 : 0)) - m2;
-    y[33] = d2 * ((ql[1] >> 4) + (qh[1] & hm ? 16 : 0)) - m2;
-}
-
-__kernel void dequantize_block_q6_K(__global const struct block_q6_K *x, __global float *yy)
-{
-    const int i = get_group_id(0) + get_global_offset(0);
-    const int tid = get_local_id(0);
-    const int ip = tid / 32;
-    const int il = tid - 32 * ip;
-    const int is = 8 * ip + il / 16;
-
-    __global float *y = yy + get_group_id(0) * QK_K + 128 * ip + il;
-
-    const float d = vload_half(0, &x[i].d);
-
-    __global const uint8_t *ql = x[i].ql + 64 * ip + il;
-    const uint8_t qh = x[i].qh[32 * ip + il];
-    __global const int8_t *sc = x[i].scales + is;
-
-    y[0] = d * sc[0] * ((int8_t)((ql[0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
-    y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
-    y[64] = d * sc[4] * ((int8_t)((ql[0] >> 4) | (((qh >> 4) & 3) << 4)) - 32);
-    y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32);
-}
-
-__kernel void dequantize_mul_mat_vec_q2_K(__global const struct block_q2_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) {
-
-    const int row = get_group_id(0);
-
-    const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row + get_global_offset(0);
-
-    __global const struct block_q2_K * x = xx + ib0;
-
-    const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION;  // 0...31 or 0...15
-    const int ix  = get_local_id(0)%K_QUANTS_PER_ITERATION;  // 0 or 0,1
-
-    const int step = 16/K_QUANTS_PER_ITERATION;
-
-    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
-    const int in = tid - step*im;                        // 0...15 or 0...7
-
-    const int l0 = K_QUANTS_PER_ITERATION*in;            // 0...15 or 0...14 in steps of 2
-    const int q_offset = 32*im + l0;
-    const int s_offset = 8*im;
-    const int y_offset = 128*im + l0;
-
-    tmp[16 * ix + tid] = 0;
-
-    uint32_t aux[4];
-    const uint8_t * d = (const uint8_t *)aux;
-    const uint8_t * m = (const uint8_t *)(aux + 2);
-
-    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
-
-        __global const float   * y = yy + i * QK_K + y_offset;
-        __global const uint8_t * q = x[i].qs + q_offset;
-
-        const float dall = vload_half(0, &x[i].d);
-        const float dmin = vload_half(0, &x[i].dmin);
-
-        __global const uint32_t * a = (__global const uint32_t *)(x[i].scales + s_offset);
-        aux[0] = a[0] & 0x0f0f0f0f;
-        aux[1] = a[1] & 0x0f0f0f0f;
-        aux[2] = (a[0] >> 4) & 0x0f0f0f0f;
-        aux[3] = (a[1] >> 4) & 0x0f0f0f0f;
-
-        float sum1 = 0, sum2 = 0;
-        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
-            sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3)
-                  + y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3)
-                  + y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3)
-                  + y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3)
-                  + y[l+16] * d[1] * ((q[l+16] >> 0) & 3)
-                  + y[l+48] * d[3] * ((q[l+16] >> 2) & 3)
-                  + y[l+80] * d[5] * ((q[l+16] >> 4) & 3)
-                  +y[l+112] * d[7] * ((q[l+16] >> 6) & 3);
-            sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6]
-                  + y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7];
-
-        }
-        tmp[16 * ix + tid] += dall * sum1 - dmin * sum2;
-
-    }
-
-    // sum up partial sums and write back result
-    barrier(CLK_LOCAL_MEM_FENCE);
-    for (int s=16; s>0; s>>=1) {
-        if (tid < s) {
-            tmp[tid] += tmp[tid + s];
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-    if (tid == 0) {
-        dst[row] = tmp[0];
-    }
-}
-
-__kernel void dequantize_mul_mat_vec_q3_K(__global const struct block_q3_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) {
-    const uint16_t kmask1 = 0x0303;
-    const uint16_t kmask2 = 0x0f0f;
-
-    const int row = get_group_id(0);
-
-    const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row + get_global_offset(0);
-
-    __global const struct block_q3_K * x = xx + ib0;
-
-    const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION;  // 0...31 or 0...16
-    const int ix  = get_local_id(0)%K_QUANTS_PER_ITERATION;  // 0 or 0,1
-
-    const int n  = K_QUANTS_PER_ITERATION;               // iterations in the inner loop
-    const int step = 16/K_QUANTS_PER_ITERATION;
-    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
-    const int in = tid - step*im;                        // 0....15 or 0...7
-
-    const uint8_t m = 1 << (4*im);
-
-    const int l0 = n*in;                                 // 0...15 or 0...14 in steps of 2
-    const int q_offset =  32*im + l0;
-    const int y_offset = 128*im + l0;
-
-    uint16_t utmp[4];
-    const int8_t * s = (const int8_t *)utmp;
-
-    const uint16_t s_shift = 4*im;
-
-    tmp[16 * ix + tid] = 0;
-
-    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
-
-        __global const float   * y  = yy + i * QK_K + y_offset;
-        __global const uint8_t * q = x[i].qs + q_offset;
-        __global const uint8_t * h = x[i].hmask + l0;
-
-        __global const uint16_t * a = (__global const uint16_t *)x[i].scales;
-        utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4);
-        utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4);
-        utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4);
-        utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4);
-
-        const float d = vload_half(0, &x[i].d);
-
-        float sum = 0;
-        for (int l = 0; l < n; ++l) {
-            sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4))
-                 + y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4))
-                 + y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4))
-                 + y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4));
-            sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4))
-                 + y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4))
-                 + y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4))
-                + y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4));
-        }
-        tmp[16 * ix + tid] += d * sum;
-
-    }
-
-    // sum up partial sums and write back result
-    barrier(CLK_LOCAL_MEM_FENCE);
-    for (int s=16; s>0; s>>=1) {
-        if (tid < s) {
-            tmp[tid] += tmp[tid + s];
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-    if (tid == 0) {
-        dst[row] = tmp[0];
-    }
-}
-
-__kernel void dequantize_mul_mat_vec_q4_K(__global const struct block_q4_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) {
-
-    //to rename it later, just to test now
-    const uint16_t kmask1 = 0x3f3f;
-    const uint16_t kmask2 = 0x0f0f;
-    const uint16_t kmask3 = 0xc0c0;
-
-    const int row = get_group_id(0);
-    const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row + get_global_offset(0);
-
-    const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION;  // 0...15
-    const int ix  = get_local_id(0)%K_QUANTS_PER_ITERATION;
-
-    const int step = 8/K_QUANTS_PER_ITERATION;
-
-    const int il  = tid/step;     // 0...3
-    const int ir  = tid - step*il;// 0...3
-    const int n   = 2*K_QUANTS_PER_ITERATION;
-
-    const int im = il/2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
-    const int in = il%2;
-
-    const int l0 = n*(2*ir + in);
-    const int q_offset = 32*im + l0;
-    const int y_offset = 64*im + l0;
-
-    uint16_t aux[4];
-    const uint8_t * sc = (const uint8_t *)aux;
-
-    __global const struct block_q4_K * x = xx + ib0;
-
-    tmp[16 * ix + tid] = 0;
-
-    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
-
-        __global const uint8_t * q1 = x[i].qs + q_offset;
-        __global const uint8_t * q2 = q1 + 64;
-        __global const float   * y1 = yy + i*QK_K + y_offset;
-        __global const float   * y2 = y1 + 128;
-
-        const float dall = vload_half(0, &x[i].d);
-        const float dmin = vload_half(0, &x[i].dmin);
-
-        __global const uint16_t * a = (__global const uint16_t *)x[i].scales;
-        aux[0] = a[im+0] & kmask1;
-        aux[1] = a[im+2] & kmask1;
-        aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
-        aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
-
-        float4 s = (float4)(0.f);
-        float smin = 0;
-        for (int l = 0; l < n; ++l) {
-            s.x += y1[l] * (q1[l] & 0xF); s.y += y1[l+32] * (q1[l] >> 4);
-            s.z += y2[l] * (q2[l] & 0xF); s.w += y2[l+32] * (q2[l] >> 4);
-            smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
-        }
-        tmp[16 * ix + tid] += dall * (s.x * sc[0] + s.y * sc[1] + s.z * sc[4] + s.w * sc[5]) - dmin * smin;
-
-    }
-
-    // sum up partial sums and write back result
-    barrier(CLK_LOCAL_MEM_FENCE);
-    for (int s=16; s>0; s>>=1) {
-        if (tid < s) {
-            tmp[tid] += tmp[tid + s];
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-    if (tid == 0) {
-        dst[row] = tmp[0];
-    }
-}
-
-__kernel void dequantize_mul_mat_vec_q5_K(__global const struct block_q5_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) {
-
-    const uint16_t kmask1 = 0x3f3f;
-    const uint16_t kmask2 = 0x0f0f;
-    const uint16_t kmask3 = 0xc0c0;
-
-    const int row = get_group_id(0);
-    const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row + get_global_offset(0);
-
-    const int tid = get_local_id(0)/2;  // 0...15
-    const int ix  = get_local_id(0)%2;
-
-    const int il  = tid/4;     // 0...3
-    const int ir  = tid - 4*il;// 0...3
-    const int n   = 2;
-
-    const int im = il/2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
-    const int in = il%2;
-
-    const int l0 = n*(2*ir + in);
-    const int q_offset = 32*im + l0;
-    const int y_offset = 64*im + l0;
-
-    const uint8_t hm1  = 1 << (2*im);
-    const uint8_t hm2  = hm1 << 4;
-
-    uint16_t aux[4];
-    const uint8_t * sc = (const uint8_t *)aux;
-
-    __global const struct block_q5_K * x = xx + ib0;
-
-    tmp[16 * ix + tid] = 0;
-
-    for (int i = ix; i < num_blocks_per_row; i += 2) {
-
-        __global const uint8_t * ql1 = x[i].qs + q_offset;
-        __global const uint8_t * ql2 = ql1 + 64;
-        __global const uint8_t * qh  = x[i].qh + l0;
-        __global const float   * y1  = yy + i*QK_K + y_offset;
-        __global const float   * y2  = y1 + 128;
-
-        const float dall = vload_half(0, &x[i].d);
-        const float dmin = vload_half(0, &x[i].dmin);
-
-        __global const uint16_t * a = (__global const uint16_t *)x[i].scales;
-        aux[0] = a[im+0] & kmask1;
-        aux[1] = a[im+2] & kmask1;
-        aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
-        aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
-
-        float4 sum = (float4)(0.f);
-        float smin = 0;
-        for (int l = 0; l < n; ++l) {
-            sum.x += y1[l+ 0] * ((ql1[l+ 0] & 0xF) + (qh[l+ 0] & (hm1 << 0) ? 16 : 0))
-                   + y1[l+16] * ((ql1[l+16] & 0xF) + (qh[l+16] & (hm1 << 0) ? 16 : 0));
-            sum.y += y1[l+32] * ((ql1[l+ 0] >>  4) + (qh[l+ 0] & (hm1 << 1) ? 16 : 0))
-                   + y1[l+48] * ((ql1[l+16] >>  4) + (qh[l+16] & (hm1 << 1) ? 16 : 0));
-            sum.z += y2[l+ 0] * ((ql2[l+ 0] & 0xF) + (qh[l+ 0] & (hm2 << 0) ? 16 : 0))
-                   + y2[l+16] * ((ql2[l+16] & 0xF) + (qh[l+16] & (hm2 << 0) ? 16 : 0));
-            sum.w += y2[l+32] * ((ql2[l+ 0] >>  4) + (qh[l+ 0] & (hm2 << 1) ? 16 : 0))
-                   + y2[l+48] * ((ql2[l+16] >>  4) + (qh[l+16] & (hm2 << 1) ? 16 : 0));
-            smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3]
-                  + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
-        }
-        tmp[16 * ix + tid] += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin;
-
-    }
-
-    // sum up partial sums and write back result
-    barrier(CLK_LOCAL_MEM_FENCE);
-    for (int s=16; s>0; s>>=1) {
-        if (tid < s) {
-            tmp[tid] += tmp[tid + s];
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-    if (tid == 0) {
-        dst[row] = tmp[0];
-    }
-}
-
-__kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx, __local float* tmp, __global const float * yy, __global float * dst, const int ncols) {
-
-    const int row = get_group_id(0);
-
-    const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row + get_global_offset(0);
-
-    __global const struct block_q6_K * x = xx + ib0;
-
-    const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION;  // 0...31 or 0...16
-    const int ix  = get_local_id(0)%K_QUANTS_PER_ITERATION;  // 0 or 0, 1
-
-    const int step = 16/K_QUANTS_PER_ITERATION;          // 16 or 8
-
-    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
-    const int in = tid - step*im;                        // 0...15 or 0...7
-
-\n#if K_QUANTS_PER_ITERATION == 1\n
-    const int l0 = K_QUANTS_PER_ITERATION*in;            // 0...15
-    const int is = 0;
-
-\n#else\n
-
-    const int l0 = 4 * in;                               // 0, 4, 8, ..., 28
-    const int is = in / 4;
-
-\n#endif\n
-
-    const int ql_offset = 64*im + l0;
-    const int qh_offset = 32*im + l0;
-    const int s_offset  =  8*im + is;
-    const int y_offset = 128*im + l0;
-
-    tmp[16 * ix + tid] = 0; // partial sum for thread in warp
-
-    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
-
-        __global const float   * y  = yy + i * QK_K + y_offset;
-        __global const uint8_t * ql = x[i].ql + ql_offset;
-        __global const uint8_t * qh = x[i].qh + qh_offset;
-        __global const int8_t  * s  = x[i].scales + s_offset;
-
-        const float d = vload_half(0, &x[i].d);
-
-\n#if K_QUANTS_PER_ITERATION == 1\n
-        float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
-                  + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
-                  + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
-                  + y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32)
-                  + y[64] * s[4] * d * ((int8_t)((ql[ 0]  >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32)
-                  + y[80] * s[5] * d * ((int8_t)((ql[16]  >> 4) | ((qh[16] & 0x30) >> 0)) - 32)
-                  + y[96] * s[6] * d * ((int8_t)((ql[32]  >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
-                  +y[112] * s[7] * d * ((int8_t)((ql[48]  >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
-        tmp[16 * ix + tid] += sum;
-\n#else\n
-        float sum = 0;
-        for (int l = 0; l < 4; ++l) {
-            sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
-                 + y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32)
-                 + y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0]  >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32)
-                 + y[l+96] * s[6] * d * ((int8_t)((ql[l+32]  >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
-        }
-        tmp[16 * ix + tid] += sum;
-\n#endif\n
-
-    }
-
-    // sum up partial sums and write back result
-    barrier(CLK_LOCAL_MEM_FENCE);
-    for (int s=16; s>0; s>>=1) {
-        if (tid < s) {
-            tmp[tid] += tmp[tid + s];
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-    if (tid == 0) {
-        dst[row] = tmp[0];
-    }
-}
-
-);
-
-
-std::string dequant_template = MULTILINE_QUOTE(
-__kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) {
-    const int i = get_group_id(0)*get_local_size(0) + get_local_id(0)*2;
-
-    if (i >= get_global_size(0)) {
-        return;
-    }
-
-    const uint qk = QUANT_K;
-    const uint qr = QUANT_R;
-
-    const int ib = i/qk + get_global_offset(0); // block index
-    const int iqs = (i%qk)/qr; // quant index
-    const int iybs = i - i%qk; // y block start index
-    const int y_offset = qr == 1 ? 1 : qk/2;
-
-    // dequantize
-    float v0, v1;
-    DEQUANT_FUNC(x, ib, iqs, &v0, &v1);
-    y[iybs + iqs + 0] = v0;
-    y[iybs + iqs + y_offset] = v1;
-}
-);
-
-std::string dequant_mul_mat_vec_template = MULTILINE_QUOTE(
-__kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) {
-    const int local_size = get_local_size(0);
-    const int row = get_group_id(0);
-    const int tid = get_local_id(0);
-
-    const uint qk = QUANT_K;
-    const uint qr = QUANT_R;
-
-    const int col_step = local_size * 2;
-    const int y_offset = qr == 1 ? 1 : qk/2;
-
-    x += get_global_offset(0);
-
-    tmp[tid] = 0;
-
-    for (int col = tid*2; col < ncols; col += col_step) {
-        const int ib = (row*ncols + col)/qk; // block index
-        const int iqs = (col%qk)/qr; // quant index
-        const int iybs = col - col%qk; // y block start index
-
-        // dequantize
-        float v0, v1;
-        DEQUANT_FUNC(x, ib, iqs, &v0, &v1);
-
-        // matrix multiplication
-        tmp[tid] += v0 * y[iybs + iqs + 0];
-        tmp[tid] += v1 * y[iybs + iqs + y_offset];
-    }
-
-    // sum up partial sums and write back result
-    barrier(CLK_LOCAL_MEM_FENCE);
-    for (int s=local_size/2; s>0; s>>=1) {
-        if (tid < s) {
-            tmp[tid] += tmp[tid + s];
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-    if (tid == 0) {
-        dst[row] = tmp[0];
-    }
-}
-);
-
-
-std::string mul_template = MULTILINE_QUOTE(
-__kernel void KERNEL_NAME(__global TYPE* x, const int x_offset, __global TYPE* y, const int y_offset, __global TYPE* dst, const int dst_offset, const int ky) {
-    const int i = get_group_id(0)*get_local_size(0) + get_local_id(0);
-
-    if (i >= get_global_size(0)) {
-        return;
-    }
-
-    dst[dst_offset + i] = x[x_offset + i] * y[y_offset + i%ky];
-}
-);
-
-#define CL_CHECK(err)                                               \
-    do {                                                            \
-        cl_int err_ = (err);                                        \
-        if (err_ != CL_SUCCESS) {                                   \
-            fprintf(stderr, "ggml_opencl: %s error %d at %s:%d\n",  \
-                #err, err_, __FILE__, __LINE__);                    \
-            exit(1);                                                \
-        }                                                           \
-    } while (0)
-
-#define CLBLAST_CHECK(err)                                          \
-    do {                                                            \
-        CLBlastStatusCode err_ = (err);                             \
-        if (err_ != CLBlastSuccess) {                               \
-            fprintf(stderr, "ggml_opencl: %s error %d at %s:%d\n",  \
-                #err, err_, __FILE__, __LINE__);                    \
-            exit(1);                                                \
-        }                                                           \
-    } while (0)
-
-std::array<std::string, 5> dequant_str_keys = {
-    "KERNEL_NAME", "X_TYPE", "QUANT_K", "QUANT_R", "DEQUANT_FUNC"
-};
-
-std::array<std::string, 30> dequant_str_values = {
-    "dequantize_row_q4_0", "struct block_q4_0", "QK4_0", "QR4_0", "dequantize_q4_0",
-    "dequantize_row_q4_1", "struct block_q4_1", "QK4_1", "QR4_1", "dequantize_q4_1",
-    "dequantize_row_q5_0", "struct block_q5_0", "QK5_0", "QR5_0", "dequantize_q5_0",
-    "dequantize_row_q5_1", "struct block_q5_1", "QK5_1", "QR5_1", "dequantize_q5_1",
-    "dequantize_row_q8_0", "struct block_q8_0", "QK8_0", "QR8_0", "dequantize_q8_0",
-    "convert_row_f16", "half", "1", "1", "convert_f16"
-};
-
-std::array<std::string, 30> dequant_mul_mat_vec_str_values = {
-    "dequantize_mul_mat_vec_q4_0", "struct block_q4_0", "QK4_0", "QR4_0", "dequantize_q4_0",
-    "dequantize_mul_mat_vec_q4_1", "struct block_q4_1", "QK4_1", "QR4_1", "dequantize_q4_1",
-    "dequantize_mul_mat_vec_q5_0", "struct block_q5_0", "QK5_0", "QR5_0", "dequantize_q5_0",
-    "dequantize_mul_mat_vec_q5_1", "struct block_q5_1", "QK5_1", "QR5_1", "dequantize_q5_1",
-    "dequantize_mul_mat_vec_q8_0", "struct block_q8_0", "QK8_0", "QR8_0", "dequantize_q8_0",
-    "convert_mul_mat_vec_f16", "half", "1", "1", "convert_f16"
-};
-
-std::array<std::string, 2> mul_str_keys = {
-    "KERNEL_NAME", "TYPE"
-};
-std::array<std::string, 2> mul_str_values = {
-    "mul_f32", "float"
-};
-
-static std::string& replace(std::string& s, const std::string& from, const std::string& to) {
-    size_t pos = 0;
-    while ((pos = s.find(from, pos)) != std::string::npos) {
-         s.replace(pos, from.length(), to);
-         pos += to.length();
-    }
-    return s;
-}
-
-static std::string generate_kernels() {
-    std::stringstream src;
-    src << program_source << '\n';
-    src << k_quants_source << '\n';
-    for (size_t i = 0; i < dequant_str_values.size(); i += dequant_str_keys.size()) {
-        std::string dequant_kernel = dequant_template;
-        std::string dmmv_kernel = dequant_mul_mat_vec_template;
-        for (size_t j = 0; j < dequant_str_keys.size(); j++) {
-            replace(dequant_kernel, dequant_str_keys[j], dequant_str_values[i + j]);
-            replace(dmmv_kernel, dequant_str_keys[j], dequant_mul_mat_vec_str_values[i + j]);
-        }
-        src << dequant_kernel << '\n';
-        src << dmmv_kernel << '\n';
-    }
-    for (size_t i = 0; i < mul_str_values.size(); i += mul_str_keys.size()) {
-        std::string mul_kernel = mul_template;
-        for (size_t j = 0; j < mul_str_keys.size(); j++) {
-            replace(mul_kernel, mul_str_keys[j], mul_str_values[i + j]);
-        }
-        src << mul_kernel << '\n';
-    }
-
-    return src.str();
-}
-
-static cl_platform_id platform;
-static cl_device_id device;
-static cl_context context;
-static cl_command_queue queue;
-static cl_program program;
-static cl_kernel convert_row_f16_cl;
-static cl_kernel dequantize_row_q4_0_cl, dequantize_row_q4_1_cl, dequantize_row_q5_0_cl, dequantize_row_q5_1_cl, dequantize_row_q8_0_cl;
-static cl_kernel dequantize_mul_mat_vec_q4_0_cl, dequantize_mul_mat_vec_q4_1_cl, dequantize_mul_mat_vec_q5_0_cl, dequantize_mul_mat_vec_q5_1_cl, dequantize_mul_mat_vec_q8_0_cl, convert_mul_mat_vec_f16_cl;
-static cl_kernel dequantize_block_q2_k_cl, dequantize_block_q3_k_cl, dequantize_block_q4_k_cl, dequantize_block_q5_k_cl, dequantize_block_q6_k_cl;
-static cl_kernel dequantize_mul_mat_vec_q2_K_cl, dequantize_mul_mat_vec_q3_K_cl, dequantize_mul_mat_vec_q4_K_cl, dequantize_mul_mat_vec_q5_K_cl, dequantize_mul_mat_vec_q6_K_cl;
-static cl_kernel mul_f32_cl;
-static bool fp16_support;
-
-static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer) {
-    cl_program p;
-    char *program_log;
-    size_t program_size;
-    size_t log_size;
-    int err;
-
-    program_size = strlen(program_buffer);
-
-    p = clCreateProgramWithSource(ctx, 1, (const char**)&program_buffer, &program_size, &err);
-    if(err < 0) {
-        fprintf(stderr, "OpenCL error creating program");
-        exit(1);
-    }
-
-    std::string compile_opts = "-cl-mad-enable -cl-unsafe-math-optimizations -cl-finite-math-only -cl-fast-relaxed-math "
-                               "-DQK4_0=32 -DQR4_0=2 -DQK4_1=32 -DQR4_1=2 -DQK5_0=32 -DQR5_0=2 -DQK5_1=32 -DQR5_1=2 -DQK8_0=32 -DQR8_0=1 "
-                               "-DQK_K=256 -DK_QUANTS_PER_ITERATION=" + std::to_string(K_QUANTS_PER_ITERATION);
-
-    err = clBuildProgram(p, 0, NULL, compile_opts.c_str(), NULL, NULL);
-    if(err < 0) {
-
-        clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
-        program_log = (char*) malloc(log_size + 1);
-        program_log[log_size] = '\0';
-        clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, log_size + 1, program_log, NULL);
-        fprintf(stderr, "ggml_opencl: kernel compile error:\n\n%s\n", program_log);
-        free(program_log);
-        exit(1);
-    }
-
-    return p;
-}
-
-void ggml_cl_init(void) {
-    static bool initialized = false;
-    if (initialized) {
-        return;
-    }
-    initialized = true;
-
-    cl_int err;
-
-    struct cl_device;
-    struct cl_platform {
-        cl_platform_id id;
-        unsigned number;
-        char name[128];
-        char vendor[128];
-        struct cl_device * devices;
-        unsigned n_devices;
-        struct cl_device * default_device;
-    };
-
-    struct cl_device {
-        struct cl_platform * platform;
-        cl_device_id id;
-        unsigned number;
-        cl_device_type type;
-        char name[128];
-    };
-
-    enum { NPLAT = 16, NDEV = 16 };
-
-    struct cl_platform platforms[NPLAT];
-    unsigned n_platforms = 0;
-    struct cl_device devices[NDEV];
-    unsigned n_devices = 0;
-    struct cl_device * default_device = NULL;
-
-    platform = NULL;
-    device = NULL;
-
-    cl_platform_id platform_ids[NPLAT];
-    CL_CHECK(clGetPlatformIDs(NPLAT, platform_ids, &n_platforms));
-
-    for (unsigned i = 0; i < n_platforms; i++) {
-        struct cl_platform * p = &platforms[i];
-        p->number = i;
-        p->id = platform_ids[i];
-        CL_CHECK(clGetPlatformInfo(p->id, CL_PLATFORM_NAME, sizeof(p->name), &p->name, NULL));
-        CL_CHECK(clGetPlatformInfo(p->id, CL_PLATFORM_VENDOR, sizeof(p->vendor), &p->vendor, NULL));
-
-        cl_device_id device_ids[NDEV];
-        cl_int clGetDeviceIDsError = clGetDeviceIDs(p->id, CL_DEVICE_TYPE_ALL, NDEV, device_ids, &p->n_devices);
-        if (clGetDeviceIDsError == CL_DEVICE_NOT_FOUND) {
-            p->n_devices = 0;
-        } else {
-            CL_CHECK(clGetDeviceIDsError);
-        }
-        p->devices = p->n_devices > 0 ? &devices[n_devices] : NULL;
-        p->default_device = NULL;
-
-        for (unsigned j = 0; j < p->n_devices; j++) {
-            struct cl_device * d = &devices[n_devices];
-            d->number = n_devices++;
-            d->id = device_ids[j];
-            d->platform = p;
-            CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_NAME, sizeof(d->name), &d->name, NULL));
-            CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_TYPE, sizeof(d->type), &d->type, NULL));
-
-            if (p->default_device == NULL && d->type == CL_DEVICE_TYPE_GPU) {
-                p->default_device = d;
-            }
-        }
-
-        if (default_device == NULL && p->default_device != NULL) {
-            default_device = p->default_device;
-        }
-    }
-
-    if (n_devices == 0) {
-        fprintf(stderr, "ggml_opencl: could find any OpenCL devices.\n");
-        exit(1);
-    }
-
-    char * user_platform_string = getenv("GGML_OPENCL_PLATFORM");
-    char * user_device_string = getenv("GGML_OPENCL_DEVICE");
-    int user_platform_number = -1;
-    int user_device_number = -1;
-
-    unsigned n;
-    if (user_platform_string != NULL && sscanf(user_platform_string, " %u", &n) == 1 && n < n_platforms) {
-        user_platform_number = (int)n;
-    }
-    if (user_device_string != NULL && sscanf(user_device_string, " %u", &n) == 1 && n < n_devices) {
-        user_device_number = (int)n;
-    }
-    if (user_platform_number != -1 && user_device_number != -1) {
-        cl_platform* platform = &platforms[user_platform_number];
-        if ((unsigned)user_device_number >= platform->n_devices) {
-            fprintf(stderr, "ggml_opencl: invalid device number %d\n", user_device_number);
-            exit(1);
-        }
-        default_device = &platform->devices[user_device_number];
-    } else {
-
-        struct cl_device * selected_devices = devices;
-        unsigned n_selected_devices = n_devices;
-
-        if (user_platform_number == -1 && user_platform_string != NULL && user_platform_string[0] != 0) {
-            for (unsigned i = 0; i < n_platforms; i++) {
-                struct cl_platform * p = &platforms[i];
-                if (strstr(p->name, user_platform_string) != NULL ||
-                    strstr(p->vendor, user_platform_string) != NULL) {
-                    user_platform_number = (int)i;
-                    break;
-                }
-            }
-            if (user_platform_number == -1) {
-                fprintf(stderr, "ggml_opencl: no platform matching '%s' was found.\n", user_platform_string);
-                exit(1);
-            }
-        }
-        if (user_platform_number != -1) {
-            struct cl_platform * p = &platforms[user_platform_number];
-            selected_devices = p->devices;
-            n_selected_devices = p->n_devices;
-            default_device = p->default_device;
-            if (n_selected_devices == 0) {
-                fprintf(stderr, "ggml_opencl: selected platform '%s' does not have any devices.\n", p->name);
-                exit(1);
-            }
-        }
-
-        if (user_device_number == -1 && user_device_string != NULL && user_device_string[0] != 0) {
-            for (unsigned i = 0; i < n_selected_devices; i++) {
-                struct cl_device * d = &selected_devices[i];
-                if (strstr(d->name, user_device_string) != NULL) {
-                    user_device_number = d->number;
-                    break;
-                }
-            }
-            if (user_device_number == -1) {
-                fprintf(stderr, "ggml_opencl: no device matching '%s' was found.\n", user_device_string);
-                exit(1);
-            }
-        }
-        if (user_device_number != -1) {
-            selected_devices = &devices[user_device_number];
-            n_selected_devices = 1;
-            default_device = &selected_devices[0];
-        }
-
-        GGML_ASSERT(n_selected_devices > 0);
-
-        if (default_device == NULL) {
-            default_device = &selected_devices[0];
-        }
-    }
-
-    fprintf(stderr, "ggml_opencl: selecting platform: '%s'\n", default_device->platform->name);
-    fprintf(stderr, "ggml_opencl: selecting device: '%s'\n", default_device->name);
-    if (default_device->type != CL_DEVICE_TYPE_GPU) {
-        fprintf(stderr, "ggml_opencl: warning, not a GPU: '%s'.\n", default_device->name);
-    }
-
-    platform = default_device->platform->id;
-    device = default_device->id;
-
-    size_t ext_str_size;
-    clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, NULL, &ext_str_size);
-    char *ext_buffer = (char *)alloca(ext_str_size + 1);
-    clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, ext_str_size, ext_buffer, NULL);
-    ext_buffer[ext_str_size] = '\0'; // ensure it is null terminated
-    // Check if ext_buffer contains cl_khr_fp16
-    fp16_support = strstr(ext_buffer, "cl_khr_fp16") != NULL;
-    fprintf(stderr, "ggml_opencl: device FP16 support: %s\n", fp16_support ? "true" : "false");
-
-    cl_context_properties properties[] = {
-        (intptr_t)CL_CONTEXT_PLATFORM, (intptr_t)platform, 0
-    };
-
-    CL_CHECK((context = clCreateContext(properties, 1, &device, NULL, NULL, &err), err));
-
-    CL_CHECK((queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err),
-        (err != CL_INVALID_QUEUE_PROPERTIES && err != CL_INVALID_VALUE ? err :
-        (queue = clCreateCommandQueue(context, device, 0, &err), err)
-    )));
-
-    const std::string kernel_src = generate_kernels();
-
-    program = build_program_from_source(context, device, kernel_src.c_str());
-
-    // FP16 to FP32 kernel
-    CL_CHECK((convert_row_f16_cl = clCreateKernel(program, "convert_row_f16", &err), err));
-
-    // Dequantize kernels
-    CL_CHECK((dequantize_row_q4_0_cl = clCreateKernel(program, "dequantize_row_q4_0", &err), err));
-    CL_CHECK((dequantize_row_q4_1_cl = clCreateKernel(program, "dequantize_row_q4_1", &err), err));
-    CL_CHECK((dequantize_row_q5_0_cl = clCreateKernel(program, "dequantize_row_q5_0", &err), err));
-    CL_CHECK((dequantize_row_q5_1_cl = clCreateKernel(program, "dequantize_row_q5_1", &err), err));
-    CL_CHECK((dequantize_row_q8_0_cl = clCreateKernel(program, "dequantize_row_q8_0", &err), err));
-    CL_CHECK((dequantize_row_q8_0_cl = clCreateKernel(program, "dequantize_row_q8_0", &err), err));
-    CL_CHECK((dequantize_block_q2_k_cl = clCreateKernel(program, "dequantize_block_q2_K", &err), err));
-    CL_CHECK((dequantize_block_q3_k_cl = clCreateKernel(program, "dequantize_block_q3_K", &err), err));
-    CL_CHECK((dequantize_block_q4_k_cl = clCreateKernel(program, "dequantize_block_q4_K", &err), err));
-    CL_CHECK((dequantize_block_q5_k_cl = clCreateKernel(program, "dequantize_block_q5_K", &err), err));
-    CL_CHECK((dequantize_block_q6_k_cl = clCreateKernel(program, "dequantize_block_q6_K", &err), err));
-
-    // dequant mul mat kernel
-    CL_CHECK((dequantize_mul_mat_vec_q4_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q4_0", &err), err));
-    CL_CHECK((dequantize_mul_mat_vec_q4_1_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q4_1", &err), err));
-    CL_CHECK((dequantize_mul_mat_vec_q5_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_0", &err), err));
-    CL_CHECK((dequantize_mul_mat_vec_q5_1_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_1", &err), err));
-    CL_CHECK((dequantize_mul_mat_vec_q8_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q8_0", &err), err));
-    CL_CHECK((convert_mul_mat_vec_f16_cl = clCreateKernel(program, "convert_mul_mat_vec_f16", &err), err));
-    CL_CHECK((dequantize_mul_mat_vec_q2_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q2_K", &err), err));
-    CL_CHECK((dequantize_mul_mat_vec_q3_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q3_K", &err), err));
-    CL_CHECK((dequantize_mul_mat_vec_q4_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q4_K", &err), err));
-    CL_CHECK((dequantize_mul_mat_vec_q5_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_K", &err), err));
-    CL_CHECK((dequantize_mul_mat_vec_q6_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q6_K", &err), err));
-
-    // mul kernel
-    CL_CHECK((mul_f32_cl = clCreateKernel(program, "mul_f32", &err), err));
-}
-
-static cl_kernel* ggml_get_to_fp32_cl(ggml_type type) {
-    switch (type) {
-        case GGML_TYPE_Q4_0:
-            return &dequantize_row_q4_0_cl;
-        case GGML_TYPE_Q4_1:
-            return &dequantize_row_q4_1_cl;
-        case GGML_TYPE_Q5_0:
-            return &dequantize_row_q5_0_cl;
-        case GGML_TYPE_Q5_1:
-            return &dequantize_row_q5_1_cl;
-        case GGML_TYPE_Q8_0:
-            return &dequantize_row_q8_0_cl;
-        case GGML_TYPE_Q2_K:
-            return &dequantize_block_q2_k_cl;
-        case GGML_TYPE_Q3_K:
-            return &dequantize_block_q3_k_cl;
-        case GGML_TYPE_Q4_K:
-            return &dequantize_block_q4_k_cl;
-        case GGML_TYPE_Q5_K:
-            return &dequantize_block_q5_k_cl;
-        case GGML_TYPE_Q6_K:
-            return &dequantize_block_q6_k_cl;
-        case GGML_TYPE_F16:
-            return &convert_row_f16_cl;
-        default:
-            return nullptr;
-    }
-}
-
-static size_t ggml_cl_global_denom(ggml_type type) {
-    switch (type) {
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-        case GGML_TYPE_Q5_0:
-        case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q8_0:
-            return 1;
-        case GGML_TYPE_Q2_K:
-        case GGML_TYPE_Q3_K:
-            return 4;
-        case GGML_TYPE_Q4_K:
-            return 8;
-        case GGML_TYPE_Q5_K:
-        case GGML_TYPE_Q6_K:
-            return 4;
-        case GGML_TYPE_F16:
-        default:
-            return 1;
-    }
-}
-
-static size_t ggml_cl_local_size(ggml_type type) {
-    switch (type) {
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-        case GGML_TYPE_Q5_0:
-        case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q8_0:
-            return 0;
-        case GGML_TYPE_Q2_K:
-        case GGML_TYPE_Q3_K:
-            return 64;
-        case GGML_TYPE_Q4_K:
-            return 32;
-        case GGML_TYPE_Q5_K:
-        case GGML_TYPE_Q6_K:
-            return 64;
-        case GGML_TYPE_F16:
-        default:
-            return 0;
-    }
-}
-
-static cl_kernel* ggml_get_dequantize_mul_mat_vec_cl(ggml_type type) {
-    switch (type) {
-        case GGML_TYPE_Q4_0:
-            return &dequantize_mul_mat_vec_q4_0_cl;
-        case GGML_TYPE_Q4_1:
-            return &dequantize_mul_mat_vec_q4_1_cl;
-        case GGML_TYPE_Q5_0:
-            return &dequantize_mul_mat_vec_q5_0_cl;
-        case GGML_TYPE_Q5_1:
-            return &dequantize_mul_mat_vec_q5_1_cl;
-        case GGML_TYPE_Q8_0:
-            return &dequantize_mul_mat_vec_q8_0_cl;
-        case GGML_TYPE_F16:
-            return &convert_mul_mat_vec_f16_cl;
-        case GGML_TYPE_Q2_K:
-            return &dequantize_mul_mat_vec_q2_K_cl;
-        case GGML_TYPE_Q3_K:
-            return &dequantize_mul_mat_vec_q3_K_cl;
-        case GGML_TYPE_Q4_K:
-            return &dequantize_mul_mat_vec_q4_K_cl;
-        case GGML_TYPE_Q5_K:
-            return &dequantize_mul_mat_vec_q5_K_cl;
-        case GGML_TYPE_Q6_K:
-            return &dequantize_mul_mat_vec_q6_K_cl;
-        default:
-            return nullptr;
-    }
-}
-
-// buffer pool for cl
-#define MAX_CL_BUFFERS 256
-
-struct scoped_spin_lock {
-    std::atomic_flag& lock;
-    scoped_spin_lock(std::atomic_flag& lock) : lock(lock) {
-        while (lock.test_and_set(std::memory_order_acquire)) {
-            ; // spin
-        }
-    }
-    ~scoped_spin_lock() {
-        lock.clear(std::memory_order_release);
-    }
-    scoped_spin_lock(const scoped_spin_lock&) = delete;
-    scoped_spin_lock& operator=(const scoped_spin_lock&) = delete;
-};
-
-struct cl_buffer {
-    cl_mem mem;
-    size_t size = 0;
-};
-
-static cl_buffer g_cl_buffer_pool[MAX_CL_BUFFERS];
-static std::atomic_flag g_cl_pool_lock = ATOMIC_FLAG_INIT;
-
-static cl_mem ggml_cl_pool_malloc(size_t size, size_t * actual_size) {
-    scoped_spin_lock lock(g_cl_pool_lock);
-    cl_int err;
-
-    int best_i = -1;
-    size_t best_size = std::numeric_limits<size_t>::max(); //smallest unused buffer that fits our needs
-    int worst_i = -1;
-    size_t worst_size = 0; //largest unused buffer seen so far
-    for (int i = 0; i < MAX_CL_BUFFERS; ++i) {
-        cl_buffer &b = g_cl_buffer_pool[i];
-        if (b.size > 0 && b.size >= size && b.size < best_size)
-        {
-            best_i = i;
-            best_size = b.size;
-        }
-        if (b.size > 0 && b.size > worst_size)
-        {
-            worst_i = i;
-            worst_size = b.size;
-        }
-    }
-    if(best_i!=-1) //found the smallest buffer that fits our needs
-    {
-        cl_buffer& b = g_cl_buffer_pool[best_i];
-        cl_mem mem = b.mem;
-        *actual_size = b.size;
-        b.size = 0;
-        return mem;
-    }
-    if(worst_i!=-1) //no buffer that fits our needs, resize largest one to save memory
-    {
-         cl_buffer& b = g_cl_buffer_pool[worst_i];
-         cl_mem mem = b.mem;
-         b.size = 0;
-         clReleaseMemObject(mem);
-    }
-    cl_mem mem;
-    CL_CHECK((mem = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err), err));
-    *actual_size = size;
-    return mem;
-}
-
-static void ggml_cl_pool_free(cl_mem mem, size_t size) {
-    scoped_spin_lock lock(g_cl_pool_lock);
-
-    for (int i = 0; i < MAX_CL_BUFFERS; ++i) {
-        cl_buffer& b = g_cl_buffer_pool[i];
-        if (b.size == 0) {
-            b.mem = mem;
-            b.size = size;
-            return;
-        }
-    }
-    fprintf(stderr, "WARNING: cl buffer pool full, increase MAX_CL_BUFFERS\n");
-    clReleaseMemObject(mem);
-}
-
-void ggml_cl_free_data(const struct ggml_tensor* tensor) {
-    if (tensor->backend != GGML_BACKEND_GPU) {
-        return;
-    }
-
-    cl_mem mem = (cl_mem)tensor->extra;
-    clReleaseMemObject(mem);
-}
-
-static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t offset, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, cl_event* ev) {
-    cl_int err;
-    const uint64_t ne0 = src->ne[0];
-    const uint64_t ne1 = src->ne[1];
-    const uint64_t nb0 = src->nb[0];
-    const uint64_t nb1 = src->nb[1];
-    const uint64_t nb2 = src->nb[2];
-    const uint64_t nb3 = src->nb[3];
-    const enum ggml_type type = src->type;
-    const size_t ts = ggml_type_size(type);
-    const size_t bs = ggml_blck_size(type);
-    const uint64_t row_size = ts*ne0/bs;
-
-    const char * x = (const char *) src->data + i2*nb2 + i3*nb3;
-    if (nb0 == ts && nb1 == row_size) {
-        return clEnqueueWriteBuffer(queue, dst, CL_FALSE, offset, ne1*row_size, x, 0, NULL, ev);
-    }
-    if (nb0 == ts) {
-        const size_t buffer_origin[3] = { offset, 0, 0 };
-        const size_t host_origin[3] = { 0, 0, 0 };
-        const size_t region[3] = { row_size, ne1, 1 };
-        return clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, row_size, 0, nb1, 0, x, 0, NULL, ev);
-    }
-    std::vector<cl_event> events;
-    if (ev && ne1>1) events.reserve(ne1-1);
-    for (uint64_t i1 = 0; i1 < ne1; i1++) {
-        // pretend the row is a matrix with cols=1
-        const size_t buffer_origin[3] = { offset + i1*row_size, 0, 0 };
-        const size_t host_origin[3] = { 0, 0, 0 };
-        const size_t region[3] = { ts, ne0/bs, 1 };
-        // if an event is requested, make the last write wait for all previous writes to complete
-        if (ev && i1) {
-            events.push_back(*ev);
-        }
-        cl_uint nevents = i1 == ne1-1 ? events.size() : 0U;
-        err = clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, ts, 0, nb0, 0, x + i1*nb1, nevents, nevents ? events.data() : nullptr, ev);
-        if (err != CL_SUCCESS) {
-            for (auto event : events) {
-                clReleaseEvent(event);
-            }
-            return err;
-        }
-    }
-    for (auto event : events) {
-        CL_CHECK(clReleaseEvent(event));
-    }
-    return CL_SUCCESS;
-}
-
-static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    const int64_t ne03 = src0->ne[3];
-    const int64_t ne10 = src1->ne[0];
-    const int64_t ne11 = src1->ne[1];
-    const int64_t ne12 = src1->ne[2];
-    const int64_t ne13 = src1->ne[3];
-    const int nb2  = dst->nb[2];
-    const int nb3  = dst->nb[3];
-    size_t x_size;
-    size_t d_size;
-
-    cl_mem d_X = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &x_size); // src0
-    cl_mem d_Y = (cl_mem) src1->extra; // src1 is already on device, broadcasted.
-    cl_mem d_D = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &d_size); // dst
-
-
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
-            cl_event ev;
-
-            // copy src0 to device
-            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, &ev));
-
-            const int64_t i13 = i03%ne13;
-            const int64_t i12 = i02%ne12;
-            const int i1 = i13*ne12*ne11 + i12*ne11;
-
-            cl_int x_offset = 0;
-            cl_int y_offset = i1*ne10;
-            cl_int d_offset = 0;
-
-            size_t global = ne00 * ne01;
-            cl_int ky = ne10 * ne11;
-
-            CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X));
-            CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset));
-            CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y));
-            CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset));
-            CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D));
-            CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset));
-            CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky));
-            CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
-
-            CL_CHECK(clReleaseEvent(ev));
-            CL_CHECK(clFinish(queue));
-
-            // copy dst to host
-            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
-            CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * ne00*ne01, d, 0, NULL, NULL));
-        }
-    }
-    ggml_cl_pool_free(d_X, x_size);
-    ggml_cl_pool_free(d_D, d_size);
-}
-
-void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
-    GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
-    ggml_cl_mul_f32(src0, src1, dst);
-}
-
-static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    const int64_t ne03 = src0->ne[3];
-
-    const int64_t ne10 = src1->ne[0];
-    const int64_t ne11 = src1->ne[1];
-    const int64_t ne12 = src1->ne[2];
-    const int64_t ne13 = src1->ne[3];
-
-    const int nb2  = dst->nb[2];
-    const int nb3  = dst->nb[3];
-
-    const int64_t r2 = ne12 / ne02;
-    const int64_t r3 = ne13 / ne03;
-
-    const float alpha = 1.0f;
-    const float beta = 0.0f;
-    const int x_ne = ne01 * ne00;
-    const int y_ne = ne11 * ne10;
-    const int d_ne = ne11 * ne01;
-
-    size_t x_size;
-    size_t y_size;
-    size_t d_size;
-    cl_mem d_X;
-    if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
-        d_X = (cl_mem) src0->extra;
-    } else {
-        d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
-    }
-    cl_mem d_Y = src1->backend == GGML_BACKEND_GPU ? (cl_mem) src1->extra : ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
-    cl_mem d_D =  dst->backend == GGML_BACKEND_GPU ? (cl_mem)  dst->extra : ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
-
-    size_t x_offset = 0;
-
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        // TODO: copy src0 here when r3>1
-        for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                if (src0->backend == GGML_BACKEND_GPU) {
-                    x_offset = (i03 * ne02 + i02) * x_ne;
-                } else {
-                    // copy src0 to device
-                    CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
-                }
-
-                for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
-                    // copy src1 to device
-                    if (src1->backend == GGML_BACKEND_CPU) {
-                        CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
-                    }
-
-                    CL_CHECK(clFinish(queue));
-
-                    // compute
-                    cl_event ev_sgemm;
-                    clblast::StatusCode status = clblast::Gemm<cl_float>(clblast::Layout::kColMajor,
-                                                               clblast::Transpose::kYes, clblast::Transpose::kNo,
-                                                               ne01, ne11, ne10,
-                                                               alpha,
-                                                               d_X, x_offset, ne00,
-                                                               d_Y, 0, ne10,
-                                                               beta,
-                                                               d_D, 0, ne01,
-                                                               &queue, &ev_sgemm);
-
-                    if (status != clblast::StatusCode::kSuccess) {
-                        GGML_ASSERT(false);
-                    }
-
-                    // copy dst to host
-                    if (dst->backend == GGML_BACKEND_CPU) {
-                        float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
-                        CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
-                    }
-                }
-            }
-        }
-    }
-
-    if (src0->backend != GGML_BACKEND_GPU) {
-        ggml_cl_pool_free(d_X, x_size);
-    }
-    if (src1->backend != GGML_BACKEND_GPU) {
-        ggml_cl_pool_free(d_Y, y_size);
-    }
-    if (dst->backend != GGML_BACKEND_GPU) {
-        ggml_cl_pool_free(d_D, d_size);
-    }
-}
-
-static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t wsize) {
-    GGML_ASSERT(fp16_support);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    const int64_t ne03 = src0->ne[3];
-
-    const int64_t ne10 = src1->ne[0];
-    const int64_t ne11 = src1->ne[1];
-    const int64_t ne12 = src1->ne[2];
-    const int64_t ne13 = src1->ne[3];
-
-    const int nb10 = src1->nb[0];
-    const int nb11 = src1->nb[1];
-    const int nb12 = src1->nb[2];
-    const int nb13 = src1->nb[3];
-
-    const int nb2  = dst->nb[2];
-    const int nb3  = dst->nb[3];
-
-    const int64_t r2 = ne12 / ne02;
-    const int64_t r3 = ne13 / ne03;
-
-    const ggml_fp16_t alpha = ggml_fp32_to_fp16(1.0f);
-    const ggml_fp16_t beta = ggml_fp32_to_fp16(0.0f);
-    const int x_ne = ne01 * ne00;
-    const int y_ne = ne11 * ne10;
-    const int d_ne = ne11 * ne01;
-
-    GGML_ASSERT(wsize >= sizeof(ggml_fp16_t) * y_ne);
-    GGML_ASSERT(wsize >= sizeof(ggml_fp16_t) * d_ne);
-    ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata;
-
-    size_t x_size;
-    size_t y_size;
-    size_t d_size;
-    cl_mem d_X;
-    if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
-        d_X = (cl_mem) src0->extra;
-    } else {
-        d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
-    }
-    cl_mem d_Y = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * y_ne, &y_size);
-    cl_mem d_D = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * d_ne, &d_size);
-
-    bool src1_cont_rows = nb10 == sizeof(float);
-    bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float);
-
-    size_t x_offset = 0;
-
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        // TODO: copy src0 here when r3>1
-        for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                if (src0->backend == GGML_BACKEND_GPU) {
-                    x_offset = (i03 * ne02 + i02) * x_ne;
-                } else {
-                    // copy src0 to device
-                    CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
-                }
-
-                // FIXME: convert on device
-
-                for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
-                    // convert src1 to fp16
-                    // TODO: use multiple threads
-                    char * src1i = (char *) src1->data + i13*nb13 + i12*nb12;
-                    if (src1_cont_rows) {
-                        if (src1_cont_cols) {
-                            ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
-                        }
-                        else {
-                            for (int64_t i11 = 0; i11 < ne11; i11++) {
-                                ggml_fp32_to_fp16_row((float *) (src1i + i11*nb11), tmp + i11*ne10, ne10);
-                            }
-                        }
-                    }
-                    else {
-                        for (int64_t i11 = 0; i11 < ne11; i11++) {
-                            for (int64_t i10 = 0; i10 < ne10; i10++) {
-                                // very slow due to no inlining
-                                tmp[i11*ne10 + i10] = ggml_fp32_to_fp16(*(float *) (src1i + i11*nb11 + i10*nb10));
-                            }
-                        }
-                    }
-
-                    // copy src1 to device
-                    CL_CHECK(clEnqueueWriteBuffer(queue, d_Y, false, 0, sizeof(ggml_fp16_t) * y_ne, tmp, 0, NULL, NULL));
-
-                    CL_CHECK(clFinish(queue));
-
-                    // compute
-                    cl_event ev_sgemm;
-                    clblast::StatusCode status = clblast::Gemm<cl_half>(clblast::Layout::kColMajor,
-                                                               clblast::Transpose::kYes, clblast::Transpose::kNo,
-                                                               ne01, ne11, ne10,
-                                                               alpha,
-                                                               d_X, x_offset, ne00,
-                                                               d_Y, 0, ne10,
-                                                               beta,
-                                                               d_D, 0, ne01,
-                                                               &queue, &ev_sgemm);
-
-                    if (status != clblast::StatusCode::kSuccess) {
-                        GGML_ASSERT(false);
-                    }
-
-                    // copy dst to host, then convert to float
-                    if (dst->backend == GGML_BACKEND_CPU) {
-                        CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
-                        float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
-                        ggml_fp16_to_fp32_row(tmp, d, d_ne);
-                    } else {
-                        // FIXME: convert dst to fp32 on device
-                    }
-                }
-            }
-        }
-    }
-
-    if (src0->backend != GGML_BACKEND_GPU) {
-        ggml_cl_pool_free(d_X, x_size);
-    }
-    ggml_cl_pool_free(d_Y, y_size);
-    ggml_cl_pool_free(d_D, d_size);
-}
-
-static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    const int64_t ne03 = src0->ne[3];
-
-    const int64_t ne10 = src1->ne[0];
-    const int64_t ne11 = src1->ne[1];
-    const int64_t ne12 = src1->ne[2];
-    const int64_t ne13 = src1->ne[3];
-
-    const int nb2  = dst->nb[2];
-    const int nb3  = dst->nb[3];
-    const ggml_type type = src0->type;
-    const bool mul_mat_vec = ne11 == 1 && ne00%2 == 0;
-
-    const int64_t r2 = ne12 / ne02;
-    const int64_t r3 = ne13 / ne03;
-
-    const float alpha = 1.0f;
-    const float beta = 0.0f;
-    const int x_ne = ne01 * ne00;
-    const int y_ne = ne11 * ne10;
-    const int d_ne = ne11 * ne01;
-    const int x_bps = x_ne / ggml_blck_size(type); // blocks per 2D slice
-    const size_t q_sz = ggml_type_size(type) * x_bps;
-
-    size_t x_size;
-    size_t y_size;
-    size_t d_size;
-    size_t q_size;
-    cl_mem d_X;
-    if (!mul_mat_vec) {
-        d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
-    }
-    cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
-    cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
-    cl_mem d_Q;
-    if (src0->backend == GGML_BACKEND_CPU) {
-        d_Q = ggml_cl_pool_malloc(q_sz, &q_size);
-    }
-
-    cl_kernel* to_fp32_cl = ggml_get_to_fp32_cl(type);
-    cl_kernel* dmmv = ggml_get_dequantize_mul_mat_vec_cl(type);
-    GGML_ASSERT(to_fp32_cl != nullptr);
-
-    const size_t global_denom = ggml_cl_global_denom(type);
-    const size_t local = mul_mat_vec ? CL_DMMV_LOCAL_SIZE : ggml_cl_local_size(type);
-
-    size_t ev_idx = 0;
-    std::vector<cl_event> events;
-
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        // TODO: copy and dequantize src0 here when r3>1
-        for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                // copy src0 to device if necessary
-                if (src0->backend == GGML_BACKEND_CPU) {
-                    events.emplace_back();
-                    CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
-                } else if (src0->backend == GGML_BACKEND_GPU) {
-                    d_Q = (cl_mem) src0->extra;
-                } else {
-                    GGML_ASSERT(false);
-                }
-
-                if (!mul_mat_vec) {
-                    // convert src0 to fp32 on device
-                    const size_t global = x_ne / global_denom;
-                    const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
-                    CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
-                    CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
-                    CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, &offset, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
-                }
-
-                for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
-                    if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
-                        // copy src1 to device
-                        events.emplace_back();
-                        CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++));
-
-                        // compute
-                        const size_t global = ne01 * local;
-                        const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
-                        const cl_int ncols = ne00;
-                        events.emplace_back();
-                        CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
-                        CL_CHECK(clSetKernelArg(*dmmv, 1, sizeof(float) * local, NULL));
-                        CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y));
-                        CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D));
-                        CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols));
-                        CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, &offset, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
-                    } else { // CLBlast matrix matrix multiplication
-                        // copy src1 to device
-                        CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
-
-                        // wait for conversion
-                        CL_CHECK(clFinish(queue));
-
-                        // compute
-                        events.emplace_back();
-                        clblast::StatusCode status = clblast::Gemm<cl_float>(clblast::Layout::kColMajor,
-                                                                   clblast::Transpose::kYes, clblast::Transpose::kNo,
-                                                                   ne01, ne11, ne10,
-                                                                   alpha,
-                                                                   d_X, 0, ne00,
-                                                                   d_Y, 0, ne10,
-                                                                   beta,
-                                                                   d_D, 0, ne01,
-                                                                   &queue, events.data() + ev_idx++);
-
-                        if (status != clblast::StatusCode::kSuccess) {
-                            GGML_ASSERT(false);
-                        }
-                    }
-
-                    // copy dst to host
-                    float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
-                    CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL));
-                    for (auto *event : events) {
-                        clReleaseEvent(event);
-                    }
-
-                    ev_idx = 0;
-                    events.clear();
-                }
-            }
-        }
-    }
-
-    if (!mul_mat_vec) {
-        ggml_cl_pool_free(d_X, x_size);
-    }
-    ggml_cl_pool_free(d_Y, y_size);
-    ggml_cl_pool_free(d_D, d_size);
-    if (src0->backend == GGML_BACKEND_CPU) {
-        ggml_cl_pool_free(d_Q, q_size);
-    }
-}
-
-
-bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, const struct ggml_tensor * dst) {
-    const int64_t ne10 = src1->ne[0];
-
-    const int64_t ne0 = dst->ne[0];
-    const int64_t ne1 = dst->ne[1];
-
-    // TODO: find the optimal values for these
-    if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
-        src1->type == GGML_TYPE_F32 &&
-        dst->type == GGML_TYPE_F32 &&
-        ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_GPU)) {
-        return true;
-    }
-
-    return false;
-}
-
-static bool ggml_cl_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * /* dst */) {
-    // If device doesn't support FP16
-    if (!fp16_support) {
-        return false;
-    }
-
-    size_t src0_sz = ggml_nbytes(src0);
-    size_t src1_sz = ggml_nbytes(src1);
-
-    // mul_mat_q: src0 is converted to fp32 on device
-    size_t mul_mat_q_transfer = src0_sz + src1_sz;
-
-    // mul_mat_f16: src1 is converted to fp16 on cpu
-    size_t mul_mat_f16_transfer = src0_sz + sizeof(ggml_fp16_t) * ggml_nelements(src1);
-
-    // choose the smaller one to transfer to the device
-    // TODO: this is not always the best choice due to the overhead of converting to fp16
-    return mul_mat_f16_transfer < mul_mat_q_transfer;
-}
-
-void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize) {
-    GGML_ASSERT(ggml_cl_can_mul_mat(src0, src1, dst));
-
-    if (src0->type == GGML_TYPE_F32) {
-        ggml_cl_mul_mat_f32(src0, src1, dst);
-    }
-    else if (src0->type == GGML_TYPE_F16) {
-        if (ggml_cl_mul_mat_use_f16(src0, src1, dst)) {
-            ggml_cl_mul_mat_f16(src0, src1, dst, wdata, wsize);
-        }
-        else {
-            ggml_cl_mul_mat_q_f32(src0, src1, dst);
-        }
-    }
-    else if (ggml_is_quantized(src0->type)) {
-        ggml_cl_mul_mat_q_f32(src0, src1, dst);
-    }
-    else {
-        GGML_ASSERT(false);
-    }
-}
-
-size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
-    if (src0->type == GGML_TYPE_F16 && ggml_cl_mul_mat_use_f16(src0, src1, dst)) {
-        return sizeof(ggml_fp16_t) * std::max(src1->ne[0] * src1->ne[1], dst->ne[0] * dst->ne[1]);
-    }
-    return 0;
-}
-
-void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
-    const int64_t ne0 = tensor->ne[0];
-    const int64_t ne1 = tensor->ne[1];
-    const int64_t ne2 = tensor->ne[2];
-    const int64_t ne3 = tensor->ne[3];
-
-    const ggml_type type = tensor->type;
-    const size_t s_sz = ggml_type_size(type) * (size_t) (ne0 * ne1 / ggml_blck_size(type));
-    const size_t q_sz = s_sz * (size_t) (ne2 * ne3);
-
-    size_t q_size;
-    cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size);
-
-    tensor->data = data;
-    // copy tensor to device
-    size_t offset = 0;
-    for (int64_t i3 = 0; i3 < ne3; i3++) {
-        for (int64_t i2 = 0; i2 < ne2; i2++) {
-            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, dst, offset, tensor, i3, i2, NULL));
-            offset += s_sz;
-        }
-    }
-
-    CL_CHECK(clFinish(queue));
-
-    tensor->extra = dst;
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
-}
-
-// ggml-backend
-
-// buffer
-
-struct ggml_backend_opencl_buffer_context {
-    ~ggml_backend_opencl_buffer_context() {
-        if (buffer) {
-            clReleaseMemObject(buffer);
-        }
-        for (auto * sub_buffer : sub_buffers) {
-            clReleaseMemObject(sub_buffer);
-        }
-    }
-
-    cl_mem buffer;
-    std::vector<cl_mem> sub_buffers;
-};
-
-static void * const cl_ptr_base = (void *)(uintptr_t) 0x1000;
-
-static const char * ggml_backend_opencl_buffer_get_name(ggml_backend_buffer_t buffer) {
-    return "OpenCL";
-
-    GGML_UNUSED(buffer);
-}
-
-static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-    ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
-    delete ctx;
-}
-
-static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
-    return cl_ptr_base;
-
-    GGML_UNUSED(buffer);
-}
-
-static void ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
-    if (tensor->view_src != NULL && tensor->view_offs == 0) {
-        tensor->extra = tensor->view_src->extra;
-    } else {
-        ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
-        cl_buffer_region region = {(size_t)((char *)tensor->data - (char *)cl_ptr_base), ggml_nbytes(tensor)};
-        cl_int err;
-        cl_mem sub_buffer = clCreateSubBuffer(ctx->buffer, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
-        CL_CHECK(err);
-        ctx->sub_buffers.push_back(sub_buffer);
-        tensor->extra = sub_buffer;
-    }
-    tensor->backend = GGML_BACKEND_GPU;
-}
-
-static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-    cl_mem tensor_buffer = (cl_mem) tensor->extra;
-    CL_CHECK(clEnqueueWriteBuffer(queue, tensor_buffer, true, offset, size, data, 0, NULL, NULL));
-    CL_CHECK(clFinish(queue));
-
-    GGML_UNUSED(buffer);
-}
-
-static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-    cl_mem tensor_buffer = (cl_mem) tensor->extra;
-    CL_CHECK(clEnqueueReadBuffer(queue, tensor_buffer, true, offset, size, data, 0, NULL, NULL));
-    CL_CHECK(clFinish(queue));
-
-    GGML_UNUSED(buffer);
-}
-
-static void ggml_backend_opencl_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
-    ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
-    CL_CHECK(clEnqueueFillBuffer(queue, ctx->buffer, &value, sizeof(value), 0, buffer->size, 0, NULL, NULL));
-    CL_CHECK(clFinish(queue));
-}
-
-static void ggml_backend_opencl_buffer_reset(ggml_backend_buffer_t buffer) {
-    ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
-    for (auto * sub_buffer : ctx->sub_buffers) {
-        clReleaseMemObject(sub_buffer);
-    }
-    ctx->sub_buffers.clear();
-}
-
-static ggml_backend_buffer_i ggml_backend_opencl_buffer_interface = {
-    /* .get_name        = */ ggml_backend_opencl_buffer_get_name,
-    /* .free_buffer     = */ ggml_backend_opencl_buffer_free_buffer,
-    /* .get_base        = */ ggml_backend_opencl_buffer_get_base,
-    /* .init_tensor     = */ ggml_backend_opencl_buffer_init_tensor,
-    /* .set_tensor      = */ ggml_backend_opencl_buffer_set_tensor,
-    /* .get_tensor      = */ ggml_backend_opencl_buffer_get_tensor,
-    /* .cpy_tensor      = */ NULL,
-    /* .clear           = */ ggml_backend_opencl_buffer_clear,
-    /* .reset           = */ ggml_backend_opencl_buffer_reset,
-};
-
-// buffer type
-
-static const char * ggml_backend_opencl_buffer_type_name(ggml_backend_buffer_type_t buffer_type) {
-    return "OpenCL";
-
-    GGML_UNUSED(buffer_type);
-}
-
-static ggml_backend_buffer_t ggml_backend_opencl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buffer_type, size_t size) {
-    ggml_cl_init();
-
-    cl_int err;
-    cl_mem mem = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err);
-    if (err != CL_SUCCESS) {
-        fprintf(stderr, "%s: failed to allocate %.2f MiB\n", __func__, size / 1024.0 / 1024.0);
-        return nullptr;
-    }
-
-    ggml_backend_opencl_buffer_context * ctx = new ggml_backend_opencl_buffer_context{mem, {}};
-
-    return ggml_backend_buffer_init(buffer_type, ggml_backend_opencl_buffer_interface, ctx, size);
-}
-
-static size_t ggml_backend_opencl_buffer_type_get_alignment(ggml_backend_buffer_type_t buffer_type) {
-    // FIXME: not thread safe, device may not be initialized yet
-    static cl_uint alignment = -1;
-    if (alignment == (cl_uint)-1) {
-        ggml_cl_init();
-        clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &alignment, NULL);
-    }
-    return alignment;
-
-    GGML_UNUSED(buffer_type);
-}
-
-static bool ggml_backend_opencl_buffer_type_supports_backend(ggml_backend_buffer_type_t buffer_type, ggml_backend_t backend) {
-    //return ggml_backend_is_opencl(backend); // opencl must be used through the cpu backend
-    return ggml_backend_is_cpu(backend);
-
-    GGML_UNUSED(buffer_type);
-}
-
-static ggml_backend_buffer_type_i ggml_backend_opencl_buffer_type_interface = {
-    /* .get_name         = */ ggml_backend_opencl_buffer_type_name,
-    /* .alloc_buffer     = */ ggml_backend_opencl_buffer_type_alloc_buffer,
-    /* .get_alignment    = */ ggml_backend_opencl_buffer_type_get_alignment,
-    /* .get_alloc_size   = */ NULL,
-    /* .supports_backend = */ ggml_backend_opencl_buffer_type_supports_backend,
-    /* .is_host          = */ NULL,
-};
-
-
-ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type() {
-    static ggml_backend_buffer_type buffer_type = {
-        /* .iface   = */ ggml_backend_opencl_buffer_type_interface,
-        /* .context = */ nullptr,
-    };
-
-    return &buffer_type;
-}
-
-#if 0
-// host buffer type
-
-static const char * ggml_backend_opencl_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
-    return "CL_Host";
-
-    GGML_UNUSED(buft);
-}
-
-static const char * ggml_backend_opencl_host_buffer_name(ggml_backend_buffer_t buffer) {
-    return "CL_Host";
-
-    GGML_UNUSED(buffer);
-}
-
-static void ggml_backend_opencl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-    ggml_cl_host_free(buffer->context);
-}
-
-static ggml_backend_buffer_t ggml_backend_opencl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-    void * ptr = ggml_cl_host_malloc(size);
-
-    if (ptr == nullptr) {
-        // fallback to cpu buffer
-        return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
-    }
-
-    ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
-    buffer->buft = buft;
-    buffer->iface.get_name = ggml_backend_opencl_host_buffer_name;
-    buffer->iface.free_buffer = ggml_backend_opencl_host_buffer_free_buffer;
-
-    return buffer;
-}
-
-ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type() {
-    static struct ggml_backend_buffer_type ggml_backend_opencl_buffer_type_host = {
-        /* .iface    = */ {
-            /* .get_name         = */ ggml_backend_opencl_host_buffer_type_name,
-            /* .alloc_buffer     = */ ggml_backend_opencl_host_buffer_type_alloc_buffer,
-            /* .get_alignment    = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
-            /* .get_alloc_size   = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
-            /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
-            /* .is_host          = */ ggml_backend_cpu_buffer_type()->iface.is_host,
-        },
-        /* .context  = */ nullptr,
-    };
-
-    return &ggml_backend_opencl_buffer_type_host;
-}
-
-// backend
-
-static const char * ggml_backend_opencl_name(ggml_backend_t backend) {
-    return "OpenCL";
-
-    GGML_UNUSED(backend);
-}
-
-static void ggml_backend_opencl_free(ggml_backend_t backend) {
-    GGML_UNUSED(backend);
-}
-
-static ggml_backend_buffer_type_t ggml_backend_opencl_get_default_buffer_type(ggml_backend_t backend) {
-    return ggml_backend_opencl_buffer_type();
-
-    GGML_UNUSED(backend);
-}
-
-static bool ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * graph) {
-    for (int i = 0; i < graph->n_nodes; ++i) {
-        ggml_tensor * node = graph->nodes[i];
-        switch (node->op) {
-            case GGML_OP_MUL_MAT:
-                ggml_cl_mul_mat(node->src[0], node->src[1], node, nullptr, 0);
-                break;
-            case GGML_OP_MUL:
-                ggml_cl_mul(node->src[0], node->src[1], node);
-                break;
-            default:
-                GGML_ASSERT(false);
-        }
-    }
-
-    return true;
-
-    GGML_UNUSED(backend);
-}
-
-static bool ggml_backend_opencl_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
-    switch (op->op) {
-        case GGML_OP_MUL_MAT:
-            return ggml_cl_can_mul_mat(op->src[0], op->src[1], op);
-        case GGML_OP_MUL:
-            // return ggml_can_repeat_rows(op->src[1], op->src[0]);
-            return true;
-        default:
-            return false;
-    }
-
-    GGML_UNUSED(backend);
-}
-
-static ggml_backend_i opencl_backend_i = {
-    /* .get_name                = */ ggml_backend_opencl_name,
-    /* .free                    = */ ggml_backend_opencl_free,
-    /* .get_default_buffer_type = */ ggml_backend_opencl_get_default_buffer_type,
-    /* .set_tensor_async        = */ NULL,
-    /* .get_tensor_async        = */ NULL,
-    /* .cpy_tensor_from_async   = */ NULL,
-    /* .cpy_tensor_to_async     = */ NULL,
-    /* .synchronize             = */ NULL,
-    /* .graph_plan_create       = */ NULL,
-    /* .graph_plan_free         = */ NULL,
-    /* .graph_plan_compute      = */ NULL,
-    /* .graph_compute           = */ ggml_backend_opencl_graph_compute,
-    /* .supports_op             = */ ggml_backend_opencl_supports_op,
-};
-
-ggml_backend_t ggml_backend_opencl_init() {
-    ggml_backend_t backend = new ggml_backend {
-        /* .interface = */ opencl_backend_i,
-        /* .context   = */ nullptr
-    };
-
-    return backend;
-}
-
-bool ggml_backend_is_opencl(ggml_backend_t backend) {
-    return backend && backend->iface.get_name == ggml_backend_opencl_name;
-}
-#endif
diff --git a/ggml/src/ggml-opencl.h b/ggml/src/ggml-opencl.h
deleted file mode 100644
index 919b00d..0000000
--- a/ggml/src/ggml-opencl.h
+++ /dev/null
@@ -1,35 +0,0 @@
-#pragma once
-
-#include "ggml.h"
-#include "ggml-backend.h"
-
-#ifdef  __cplusplus
-extern "C" {
-#endif
-
-GGML_API void ggml_cl_init(void);
-
-GGML_API void   ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
-GGML_API bool   ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, const struct ggml_tensor * dst);
-GGML_API size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
-GGML_API void   ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
-
-// GGML_API void * ggml_cl_host_malloc(size_t size);
-// GGML_API void   ggml_cl_host_free(void * ptr);
-
-GGML_API void ggml_cl_free_data(const struct ggml_tensor* tensor);
-
-GGML_API void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor);
-
-// backend API
-
-// GGML_API ggml_backend_t ggml_backend_opencl_init(void);
-
-// GGML_API bool ggml_backend_is_opencl(ggml_backend_t backend);
-
-GGML_API ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type(void);
-// GGML_API ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type(void);
-
-#ifdef  __cplusplus
-}
-#endif
diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
deleted file mode 100644
index 601d155..0000000
--- a/ggml/src/ggml-quants.c
+++ /dev/null
@@ -1,7732 +0,0 @@
-#include "ggml-quants.h"
-#include "ggml-impl.h"
-
-#include <math.h>
-#include <string.h>
-#include <assert.h>
-#include <float.h>
-
-#ifdef __ARM_NEON
-
-// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
-//
-//   $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
-//
-#include <arm_neon.h>
-
-#else
-
-#ifdef __wasm_simd128__
-#include <wasm_simd128.h>
-#else
-#if defined(__POWER9_VECTOR__) || defined(__powerpc64__)
-#include <altivec.h>
-#undef bool
-#define bool _Bool
-#else
-#if defined(_MSC_VER) || defined(__MINGW32__)
-#include <intrin.h>
-#else
-#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__)
-#if !defined(__riscv)
-#include <immintrin.h>
-#endif
-#endif
-#endif
-#endif
-#endif
-#endif
-
-#ifdef __riscv_v_intrinsic
-#include <riscv_vector.h>
-#endif
-
-#undef MIN
-#undef MAX
-
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
-#define MAX(a, b) ((a) > (b) ? (a) : (b))
-
-#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
-
-#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
-// multiply int8_t, add results pairwise twice
-static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
-    // Get absolute values of x vectors
-    const __m128i ax = _mm_sign_epi8(x, x);
-    // Sign the values of the y vectors
-    const __m128i sy = _mm_sign_epi8(y, x);
-    // Perform multiplication and create 16-bit values
-    const __m128i dot = _mm_maddubs_epi16(ax, sy);
-    const __m128i ones = _mm_set1_epi16(1);
-    return _mm_madd_epi16(ones, dot);
-}
-
-#if __AVX__ || __AVX2__ || __AVX512F__
-// horizontally add 8 floats
-static inline float hsum_float_8(const __m256 x) {
-    __m128 res = _mm256_extractf128_ps(x, 1);
-    res = _mm_add_ps(res, _mm256_castps256_ps128(x));
-    res = _mm_add_ps(res, _mm_movehl_ps(res, res));
-    res = _mm_add_ss(res, _mm_movehdup_ps(res));
-    return _mm_cvtss_f32(res);
-}
-
-// horizontally add 8 int32_t
-static inline int hsum_i32_8(const __m256i a) {
-    const __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1));
-    const __m128i hi64 = _mm_unpackhi_epi64(sum128, sum128);
-    const __m128i sum64 = _mm_add_epi32(hi64, sum128);
-    const __m128i hi32  = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1));
-    return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
-}
-
-// horizontally add 4 int32_t
-static inline int hsum_i32_4(const __m128i a) {
-    const __m128i hi64 = _mm_unpackhi_epi64(a, a);
-    const __m128i sum64 = _mm_add_epi32(hi64, a);
-    const __m128i hi32  = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1));
-    return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
-}
-
-#if defined(__AVX2__) || defined(__AVX512F__)
-// spread 32 bits to 32 bytes { 0x00, 0xFF }
-static inline __m256i bytes_from_bits_32(const uint8_t * x) {
-    uint32_t x32;
-    memcpy(&x32, x, sizeof(uint32_t));
-    const __m256i shuf_mask = _mm256_set_epi64x(
-            0x0303030303030303, 0x0202020202020202,
-            0x0101010101010101, 0x0000000000000000);
-    __m256i bytes = _mm256_shuffle_epi8(_mm256_set1_epi32(x32), shuf_mask);
-    const __m256i bit_mask = _mm256_set1_epi64x(0x7fbfdfeff7fbfdfe);
-    bytes = _mm256_or_si256(bytes, bit_mask);
-    return _mm256_cmpeq_epi8(bytes, _mm256_set1_epi64x(-1));
-}
-
-// Unpack 32 4-bit fields into 32 bytes
-// The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval
-static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi)
-{
-    const __m128i tmp = _mm_loadu_si128((const __m128i *)rsi);
-    const __m256i bytes = MM256_SET_M128I(_mm_srli_epi16(tmp, 4), tmp);
-    const __m256i lowMask = _mm256_set1_epi8( 0xF );
-    return _mm256_and_si256(lowMask, bytes);
-}
-
-// add int16_t pairwise and return as float vector
-static inline __m256 sum_i16_pairs_float(const __m256i x) {
-    const __m256i ones = _mm256_set1_epi16(1);
-    const __m256i summed_pairs = _mm256_madd_epi16(ones, x);
-    return _mm256_cvtepi32_ps(summed_pairs);
-}
-
-static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
-#if __AVXVNNI__
-    const __m256i zero = _mm256_setzero_si256();
-    const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy);
-    return _mm256_cvtepi32_ps(summed_pairs);
-#else
-    // Perform multiplication and create 16-bit values
-    const __m256i dot = _mm256_maddubs_epi16(ax, sy);
-    return sum_i16_pairs_float(dot);
-#endif
-}
-
-// multiply int8_t, add results pairwise twice and return as float vector
-static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
-#if __AVXVNNIINT8__
-    const __m256i zero = _mm256_setzero_si256();
-    const __m256i summed_pairs = _mm256_dpbssd_epi32(zero, x, y);
-    return _mm256_cvtepi32_ps(summed_pairs);
-#else
-    // Get absolute values of x vectors
-    const __m256i ax = _mm256_sign_epi8(x, x);
-    // Sign the values of the y vectors
-    const __m256i sy = _mm256_sign_epi8(y, x);
-    return mul_sum_us8_pairs_float(ax, sy);
-#endif
-}
-
-static inline __m128i packNibbles( __m256i bytes )
-{
-    // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
-#if __AVX512F__
-    const __m256i bytes_srli_4 = _mm256_srli_epi16(bytes, 4);   // 0000_0000_abcd_0000
-    bytes = _mm256_or_si256(bytes, bytes_srli_4);               // 0000_abcd_abcd_efgh
-    return _mm256_cvtepi16_epi8(bytes);                         // abcd_efgh
-#else
-    const __m256i lowByte = _mm256_set1_epi16( 0xFF );
-    __m256i high = _mm256_andnot_si256( lowByte, bytes );
-    __m256i low = _mm256_and_si256( lowByte, bytes );
-    high = _mm256_srli_epi16( high, 4 );
-    bytes = _mm256_or_si256( low, high );
-
-    // Compress uint16_t lanes into bytes
-    __m128i r0 = _mm256_castsi256_si128( bytes );
-    __m128i r1 = _mm256_extracti128_si256( bytes, 1 );
-    return _mm_packus_epi16( r0, r1 );
-#endif
-}
-#elif defined(__AVX__)
-// spread 32 bits to 32 bytes { 0x00, 0xFF }
-static inline __m256i bytes_from_bits_32(const uint8_t * x) {
-    uint32_t x32;
-    memcpy(&x32, x, sizeof(uint32_t));
-    const __m128i shuf_maskl = _mm_set_epi64x(0x0101010101010101, 0x0000000000000000);
-    const __m128i shuf_maskh = _mm_set_epi64x(0x0303030303030303, 0x0202020202020202);
-    __m128i bytesl = _mm_shuffle_epi8(_mm_set1_epi32(x32), shuf_maskl);
-    __m128i bytesh = _mm_shuffle_epi8(_mm_set1_epi32(x32), shuf_maskh);
-    const __m128i bit_mask = _mm_set1_epi64x(0x7fbfdfeff7fbfdfe);
-    bytesl = _mm_or_si128(bytesl, bit_mask);
-    bytesh = _mm_or_si128(bytesh, bit_mask);
-    bytesl = _mm_cmpeq_epi8(bytesl, _mm_set1_epi64x(-1));
-    bytesh = _mm_cmpeq_epi8(bytesh, _mm_set1_epi64x(-1));
-    return MM256_SET_M128I(bytesh, bytesl);
-}
-
-// Unpack 32 4-bit fields into 32 bytes
-// The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval
-static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi)
-{
-    // Load 16 bytes from memory
-    __m128i tmpl = _mm_loadu_si128((const __m128i *)rsi);
-    __m128i tmph = _mm_srli_epi16(tmpl, 4);
-    const __m128i lowMask = _mm_set1_epi8(0xF);
-    tmpl = _mm_and_si128(lowMask, tmpl);
-    tmph = _mm_and_si128(lowMask, tmph);
-    return MM256_SET_M128I(tmph, tmpl);
-}
-
-// add int16_t pairwise and return as float vector
-static inline __m256 sum_i16_pairs_float(const __m128i xh, const __m128i xl) {
-    const __m128i ones = _mm_set1_epi16(1);
-    const __m128i summed_pairsl = _mm_madd_epi16(ones, xl);
-    const __m128i summed_pairsh = _mm_madd_epi16(ones, xh);
-    const __m256i summed_pairs = MM256_SET_M128I(summed_pairsh, summed_pairsl);
-    return _mm256_cvtepi32_ps(summed_pairs);
-}
-
-static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
-    const __m128i axl = _mm256_castsi256_si128(ax);
-    const __m128i axh = _mm256_extractf128_si256(ax, 1);
-    const __m128i syl = _mm256_castsi256_si128(sy);
-    const __m128i syh = _mm256_extractf128_si256(sy, 1);
-    // Perform multiplication and create 16-bit values
-    const __m128i dotl = _mm_maddubs_epi16(axl, syl);
-    const __m128i doth = _mm_maddubs_epi16(axh, syh);
-    return sum_i16_pairs_float(doth, dotl);
-}
-
-// multiply int8_t, add results pairwise twice and return as float vector
-static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
-    const __m128i xl = _mm256_castsi256_si128(x);
-    const __m128i xh = _mm256_extractf128_si256(x, 1);
-    const __m128i yl = _mm256_castsi256_si128(y);
-    const __m128i yh = _mm256_extractf128_si256(y, 1);
-    // Get absolute values of x vectors
-    const __m128i axl = _mm_sign_epi8(xl, xl);
-    const __m128i axh = _mm_sign_epi8(xh, xh);
-    // Sign the values of the y vectors
-    const __m128i syl = _mm_sign_epi8(yl, xl);
-    const __m128i syh = _mm_sign_epi8(yh, xh);
-    // Perform multiplication and create 16-bit values
-    const __m128i dotl = _mm_maddubs_epi16(axl, syl);
-    const __m128i doth = _mm_maddubs_epi16(axh, syh);
-    return sum_i16_pairs_float(doth, dotl);
-}
-
-static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 )
-{
-    // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
-    const __m128i lowByte = _mm_set1_epi16( 0xFF );
-    __m128i high = _mm_andnot_si128( lowByte, bytes1 );
-    __m128i low = _mm_and_si128( lowByte, bytes1 );
-    high = _mm_srli_epi16( high, 4 );
-    bytes1 = _mm_or_si128( low, high );
-    high = _mm_andnot_si128( lowByte, bytes2 );
-    low = _mm_and_si128( lowByte, bytes2 );
-    high = _mm_srli_epi16( high, 4 );
-    bytes2 = _mm_or_si128( low, high );
-
-    return _mm_packus_epi16( bytes1, bytes2);
-}
-#endif
-#elif defined(__SSSE3__)
-// horizontally add 4x4 floats
-static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128 c, const __m128 d) {
-    __m128 res_0 =_mm_hadd_ps(a, b);
-    __m128 res_1 =_mm_hadd_ps(c, d);
-    __m128 res =_mm_hadd_ps(res_0, res_1);
-    res =_mm_hadd_ps(res, res);
-    res =_mm_hadd_ps(res, res);
-
-    return _mm_cvtss_f32(res);
-}
-#endif // __AVX__ || __AVX2__ || __AVX512F__
-#endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
-
-#if defined(__ARM_NEON)
-#if !defined(__aarch64__)
-
-// 64-bit compatibility
-
-// vaddvq_s16
-// vpaddq_s16
-// vpaddq_s32
-// vaddvq_s32
-// vaddvq_f32
-// vmaxvq_f32
-// vcvtnq_s32_f32
-// vzip1_u8
-// vzip2_u8
-
-inline static int32_t vaddvq_s16(int16x8_t v) {
-    return
-        (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
-        (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
-        (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
-        (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
-}
-
-inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
-    int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
-    int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
-    return vcombine_s16(a0, b0);
-}
-
-inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) {
-    int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a));
-    int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b));
-    return vcombine_s32(a0, b0);
-}
-
-inline static int32_t vaddvq_s32(int32x4_t v) {
-    return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
-}
-
-inline static float vaddvq_f32(float32x4_t v) {
-    return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
-}
-
-inline static float vmaxvq_f32(float32x4_t v) {
-    return
-        MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
-            MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
-}
-
-inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
-    int32x4_t res;
-
-    res[0] = roundf(vgetq_lane_f32(v, 0));
-    res[1] = roundf(vgetq_lane_f32(v, 1));
-    res[2] = roundf(vgetq_lane_f32(v, 2));
-    res[3] = roundf(vgetq_lane_f32(v, 3));
-
-    return res;
-}
-
-inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
-    uint8x8_t res;
-
-    res[0] = a[0]; res[1] = b[0];
-    res[2] = a[1]; res[3] = b[1];
-    res[4] = a[2]; res[5] = b[2];
-    res[6] = a[3]; res[7] = b[3];
-
-    return res;
-}
-
-inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
-    uint8x8_t res;
-
-    res[0] = a[4]; res[1] = b[4];
-    res[2] = a[5]; res[3] = b[5];
-    res[4] = a[6]; res[5] = b[6];
-    res[6] = a[7]; res[7] = b[7];
-
-    return res;
-}
-
-// vld1q_s16_x2
-// vld1q_u8_x2
-// vld1q_u8_x4
-// vld1q_s8_x2
-// vld1q_s8_x4
-// TODO: double-check these work correctly
-
-typedef struct ggml_int16x8x2_t {
-    int16x8_t val[2];
-} ggml_int16x8x2_t;
-
-inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) {
-    ggml_int16x8x2_t res;
-
-    res.val[0] = vld1q_s16(ptr + 0);
-    res.val[1] = vld1q_s16(ptr + 8);
-
-    return res;
-}
-
-typedef struct ggml_uint8x16x2_t {
-    uint8x16_t val[2];
-} ggml_uint8x16x2_t;
-
-inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) {
-    ggml_uint8x16x2_t res;
-
-    res.val[0] = vld1q_u8(ptr + 0);
-    res.val[1] = vld1q_u8(ptr + 16);
-
-    return res;
-}
-
-typedef struct ggml_uint8x16x4_t {
-    uint8x16_t val[4];
-} ggml_uint8x16x4_t;
-
-inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) {
-    ggml_uint8x16x4_t res;
-
-    res.val[0] = vld1q_u8(ptr + 0);
-    res.val[1] = vld1q_u8(ptr + 16);
-    res.val[2] = vld1q_u8(ptr + 32);
-    res.val[3] = vld1q_u8(ptr + 48);
-
-    return res;
-}
-
-typedef struct ggml_int8x16x2_t {
-    int8x16_t val[2];
-} ggml_int8x16x2_t;
-
-inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) {
-    ggml_int8x16x2_t res;
-
-    res.val[0] = vld1q_s8(ptr + 0);
-    res.val[1] = vld1q_s8(ptr + 16);
-
-    return res;
-}
-
-typedef struct ggml_int8x16x4_t {
-    int8x16_t val[4];
-} ggml_int8x16x4_t;
-
-inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
-    ggml_int8x16x4_t res;
-
-    res.val[0] = vld1q_s8(ptr + 0);
-    res.val[1] = vld1q_s8(ptr + 16);
-    res.val[2] = vld1q_s8(ptr + 32);
-    res.val[3] = vld1q_s8(ptr + 48);
-
-    return res;
-}
-
-#else
-
-#define ggml_int16x8x2_t  int16x8x2_t
-#define ggml_uint8x16x2_t uint8x16x2_t
-#define ggml_uint8x16x4_t uint8x16x4_t
-#define ggml_int8x16x2_t  int8x16x2_t
-#define ggml_int8x16x4_t  int8x16x4_t
-
-#define ggml_vld1q_s16_x2 vld1q_s16_x2
-#define ggml_vld1q_u8_x2  vld1q_u8_x2
-#define ggml_vld1q_u8_x4  vld1q_u8_x4
-#define ggml_vld1q_s8_x2  vld1q_s8_x2
-#define ggml_vld1q_s8_x4  vld1q_s8_x4
-
-#endif
-
-#if !defined(__ARM_FEATURE_DOTPROD)
-
-inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
-    const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b));
-    const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));
-
-    return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1)));
-}
-
-#else
-
-#define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c)
-
-#endif
-
-#endif
-
-#if defined(__ARM_NEON) || defined(__wasm_simd128__)
-#define B1(c,s,n)  0x ## n ## c ,  0x ## n ## s
-#define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s)
-#define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s)
-#define B4(c,s,n) B3(c,s,n ## c), B3(c,s,n ## s)
-#define B5(c,s,n) B4(c,s,n ## c), B4(c,s,n ## s)
-#define B6(c,s,n) B5(c,s,n ## c), B5(c,s,n ## s)
-#define B7(c,s,n) B6(c,s,n ## c), B6(c,s,n ## s)
-#define B8(c,s  ) B7(c,s,     c), B7(c,s,     s)
-
-// precomputed tables for expanding 8bits to 8 bytes:
-static const uint64_t table_b2b_0[1 << 8] = { B8(00, 10) }; // ( b) << 4
-static const uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4
-#endif
-
-// reference implementation for deterministic creation of model files
-void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) {
-    static const int qk = QK4_0;
-
-    assert(k % qk == 0);
-
-    const int nb = k / qk;
-
-    for (int i = 0; i < nb; i++) {
-        float amax = 0.0f; // absolute max
-        float max  = 0.0f;
-
-        for (int j = 0; j < qk; j++) {
-            const float v = x[i*qk + j];
-            if (amax < fabsf(v)) {
-                amax = fabsf(v);
-                max  = v;
-            }
-        }
-
-        const float d  = max / -8;
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].d = GGML_FP32_TO_FP16(d);
-
-        for (int j = 0; j < qk/2; ++j) {
-            const float x0 = x[i*qk + 0    + j]*id;
-            const float x1 = x[i*qk + qk/2 + j]*id;
-
-            const uint8_t xi0 = MIN(15, (int8_t)(x0 + 8.5f));
-            const uint8_t xi1 = MIN(15, (int8_t)(x1 + 8.5f));
-
-            y[i].qs[j]  = xi0;
-            y[i].qs[j] |= xi1 << 4;
-        }
-    }
-}
-
-void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
-    quantize_row_q4_0_reference(x, y, k);
-}
-
-void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k) {
-    const int qk = QK4_1;
-
-    assert(k % qk == 0);
-
-    const int nb = k / qk;
-
-    for (int i = 0; i < nb; i++) {
-        float min = FLT_MAX;
-        float max = -FLT_MAX;
-
-        for (int j = 0; j < qk; j++) {
-            const float v = x[i*qk + j];
-
-            if (v < min) min = v;
-            if (v > max) max = v;
-        }
-
-        const float d  = (max - min) / ((1 << 4) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].d = GGML_FP32_TO_FP16(d);
-        y[i].m = GGML_FP32_TO_FP16(min);
-
-        for (int j = 0; j < qk/2; ++j) {
-            const float x0 = (x[i*qk + 0    + j] - min)*id;
-            const float x1 = (x[i*qk + qk/2 + j] - min)*id;
-
-            const uint8_t xi0 = MIN(15, (int8_t)(x0 + 0.5f));
-            const uint8_t xi1 = MIN(15, (int8_t)(x1 + 0.5f));
-
-            y[i].qs[j]  = xi0;
-            y[i].qs[j] |= xi1 << 4;
-        }
-    }
-}
-
-void quantize_row_q4_1(const float * restrict x, void * restrict y, int k) {
-    quantize_row_q4_1_reference(x, y, k);
-}
-
-void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k) {
-    static const int qk = QK5_0;
-
-    assert(k % qk == 0);
-
-    const int nb = k / qk;
-
-    for (int i = 0; i < nb; i++) {
-        float amax = 0.0f; // absolute max
-        float max  = 0.0f;
-
-        for (int j = 0; j < qk; j++) {
-            const float v = x[i*qk + j];
-            if (amax < fabsf(v)) {
-                amax = fabsf(v);
-                max  = v;
-            }
-        }
-
-        const float d  = max / -16;
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].d = GGML_FP32_TO_FP16(d);
-
-        uint32_t qh = 0;
-
-        for (int j = 0; j < qk/2; ++j) {
-            const float x0 = x[i*qk + 0    + j]*id;
-            const float x1 = x[i*qk + qk/2 + j]*id;
-
-            const uint8_t xi0 = MIN(31, (int8_t)(x0 + 16.5f));
-            const uint8_t xi1 = MIN(31, (int8_t)(x1 + 16.5f));
-
-            y[i].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4);
-
-            // get the 5-th bit and store it in qh at the right position
-            qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
-            qh |= ((xi1 & 0x10u) >> 4) << (j + qk/2);
-        }
-
-        memcpy(&y[i].qh, &qh, sizeof(qh));
-    }
-}
-
-void quantize_row_q5_0(const float * restrict x, void * restrict y, int k) {
-    quantize_row_q5_0_reference(x, y, k);
-}
-
-void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k) {
-    const int qk = QK5_1;
-
-    assert(k % qk == 0);
-
-    const int nb = k / qk;
-
-    for (int i = 0; i < nb; i++) {
-        float min = FLT_MAX;
-        float max = -FLT_MAX;
-
-        for (int j = 0; j < qk; j++) {
-            const float v = x[i*qk + j];
-
-            if (v < min) min = v;
-            if (v > max) max = v;
-        }
-
-        const float d  = (max - min) / ((1 << 5) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].d = GGML_FP32_TO_FP16(d);
-        y[i].m = GGML_FP32_TO_FP16(min);
-
-        uint32_t qh = 0;
-
-        for (int j = 0; j < qk/2; ++j) {
-            const float x0 = (x[i*qk + 0    + j] - min)*id;
-            const float x1 = (x[i*qk + qk/2 + j] - min)*id;
-
-            const uint8_t xi0 = (uint8_t)(x0 + 0.5f);
-            const uint8_t xi1 = (uint8_t)(x1 + 0.5f);
-
-            y[i].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4);
-
-            // get the 5-th bit and store it in qh at the right position
-            qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
-            qh |= ((xi1 & 0x10u) >> 4) << (j + qk/2);
-        }
-
-        memcpy(&y[i].qh, &qh, sizeof(y[i].qh));
-    }
-}
-
-void quantize_row_q5_1(const float * restrict x, void * restrict y, int k) {
-    quantize_row_q5_1_reference(x, y, k);
-}
-
-// reference implementation for deterministic creation of model files
-void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k) {
-    assert(k % QK8_0 == 0);
-    const int nb = k / QK8_0;
-
-    for (int i = 0; i < nb; i++) {
-        float amax = 0.0f; // absolute max
-
-        for (int j = 0; j < QK8_0; j++) {
-            const float v = x[i*QK8_0 + j];
-            amax = MAX(amax, fabsf(v));
-        }
-
-        const float d = amax / ((1 << 7) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].d = GGML_FP32_TO_FP16(d);
-
-        for (int j = 0; j < QK8_0; ++j) {
-            const float x0 = x[i*QK8_0 + j]*id;
-
-            y[i].qs[j] = roundf(x0);
-        }
-    }
-}
-
-void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) {
-    assert(QK8_0 == 32);
-    assert(k % QK8_0 == 0);
-    const int nb = k / QK8_0;
-
-    block_q8_0 * restrict y = vy;
-
-#if defined(__ARM_NEON)
-    for (int i = 0; i < nb; i++) {
-        float32x4_t srcv [8];
-        float32x4_t asrcv[8];
-        float32x4_t amaxv[8];
-
-        for (int j = 0; j < 8; j++) srcv[j]  = vld1q_f32(x + i*32 + 4*j);
-        for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[j]);
-
-        for (int j = 0; j < 4; j++) amaxv[2*j] = vmaxq_f32(asrcv[2*j], asrcv[2*j+1]);
-        for (int j = 0; j < 2; j++) amaxv[4*j] = vmaxq_f32(amaxv[4*j], amaxv[4*j+2]);
-        for (int j = 0; j < 1; j++) amaxv[8*j] = vmaxq_f32(amaxv[8*j], amaxv[8*j+4]);
-
-        const float amax = vmaxvq_f32(amaxv[0]);
-
-        const float d = amax / ((1 << 7) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].d = GGML_FP32_TO_FP16(d);
-
-        for (int j = 0; j < 8; j++) {
-            const float32x4_t v  = vmulq_n_f32(srcv[j], id);
-            const int32x4_t   vi = vcvtnq_s32_f32(v);
-
-            y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0);
-            y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1);
-            y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2);
-            y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3);
-        }
-    }
-#elif defined(__wasm_simd128__)
-    for (int i = 0; i < nb; i++) {
-        v128_t srcv [8];
-        v128_t asrcv[8];
-        v128_t amaxv[8];
-
-        for (int j = 0; j < 8; j++) srcv[j]  = wasm_v128_load(x + i*32 + 4*j);
-        for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]);
-
-        for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]);
-        for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]);
-        for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]);
-
-        const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0),
-                                   wasm_f32x4_extract_lane(amaxv[0], 1)),
-                               MAX(wasm_f32x4_extract_lane(amaxv[0], 2),
-                                   wasm_f32x4_extract_lane(amaxv[0], 3)));
-
-        const float d = amax / ((1 << 7) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].d = GGML_FP32_TO_FP16(d);
-
-        for (int j = 0; j < 8; j++) {
-            const v128_t v  = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id));
-            const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v);
-
-            y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0);
-            y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1);
-            y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2);
-            y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3);
-        }
-    }
-#elif defined(__AVX2__) || defined(__AVX__)
-    for (int i = 0; i < nb; i++) {
-        // Load elements into 4 AVX vectors
-        __m256 v0 = _mm256_loadu_ps( x );
-        __m256 v1 = _mm256_loadu_ps( x + 8 );
-        __m256 v2 = _mm256_loadu_ps( x + 16 );
-        __m256 v3 = _mm256_loadu_ps( x + 24 );
-        x += 32;
-
-        // Compute max(abs(e)) for the block
-        const __m256 signBit = _mm256_set1_ps( -0.0f );
-        __m256 maxAbs = _mm256_andnot_ps( signBit, v0 );
-        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) );
-        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) );
-        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) );
-
-        __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) );
-        max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) );
-        max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) );
-        const float maxScalar = _mm_cvtss_f32( max4 );
-
-        // Quantize these floats
-        const float d = maxScalar / 127.f;
-        y[i].d = GGML_FP32_TO_FP16(d);
-        const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f;
-        const __m256 mul = _mm256_set1_ps( id );
-
-        // Apply the multiplier
-        v0 = _mm256_mul_ps( v0, mul );
-        v1 = _mm256_mul_ps( v1, mul );
-        v2 = _mm256_mul_ps( v2, mul );
-        v3 = _mm256_mul_ps( v3, mul );
-
-        // Round to nearest integer
-        v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST );
-        v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST );
-        v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST );
-        v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST );
-
-        // Convert floats to integers
-        __m256i i0 = _mm256_cvtps_epi32( v0 );
-        __m256i i1 = _mm256_cvtps_epi32( v1 );
-        __m256i i2 = _mm256_cvtps_epi32( v2 );
-        __m256i i3 = _mm256_cvtps_epi32( v3 );
-
-#if defined(__AVX2__)
-        // Convert int32 to int16
-        i0 = _mm256_packs_epi32( i0, i1 );	// 0, 1, 2, 3,  8, 9, 10, 11,  4, 5, 6, 7, 12, 13, 14, 15
-        i2 = _mm256_packs_epi32( i2, i3 );	// 16, 17, 18, 19,  24, 25, 26, 27,  20, 21, 22, 23, 28, 29, 30, 31
-                                            // Convert int16 to int8
-        i0 = _mm256_packs_epi16( i0, i2 );	// 0, 1, 2, 3,  8, 9, 10, 11,  16, 17, 18, 19,  24, 25, 26, 27,  4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-
-        // We got our precious signed bytes, but the order is now wrong
-        // These AVX2 pack instructions process 16-byte pieces independently
-        // The following instruction is fixing the order
-        const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 );
-        i0 = _mm256_permutevar8x32_epi32( i0, perm );
-
-        _mm256_storeu_si256((__m256i *)y[i].qs, i0);
-#else
-        // Since we don't have in AVX some necessary functions,
-        // we split the registers in half and call AVX2 analogs from SSE
-        __m128i ni0 = _mm256_castsi256_si128( i0 );
-        __m128i ni1 = _mm256_extractf128_si256( i0, 1);
-        __m128i ni2 = _mm256_castsi256_si128( i1 );
-        __m128i ni3 = _mm256_extractf128_si256( i1, 1);
-        __m128i ni4 = _mm256_castsi256_si128( i2 );
-        __m128i ni5 = _mm256_extractf128_si256( i2, 1);
-        __m128i ni6 = _mm256_castsi256_si128( i3 );
-        __m128i ni7 = _mm256_extractf128_si256( i3, 1);
-
-        // Convert int32 to int16
-        ni0 = _mm_packs_epi32( ni0, ni1 );
-        ni2 = _mm_packs_epi32( ni2, ni3 );
-        ni4 = _mm_packs_epi32( ni4, ni5 );
-        ni6 = _mm_packs_epi32( ni6, ni7 );
-        // Convert int16 to int8
-        ni0 = _mm_packs_epi16( ni0, ni2 );
-        ni4 = _mm_packs_epi16( ni4, ni6 );
-
-        _mm_storeu_si128((__m128i *)(y[i].qs +  0), ni0);
-        _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4);
-#endif
-    }
-#elif defined(__riscv_v_intrinsic)
-
-    size_t vl = __riscv_vsetvl_e32m4(QK8_0);
-
-    for (int i = 0; i < nb; i++) {
-        // load elements
-        vfloat32m4_t v_x   = __riscv_vle32_v_f32m4(x+i*QK8_0, vl);
-
-        vfloat32m4_t vfabs = __riscv_vfabs_v_f32m4(v_x, vl);
-        vfloat32m1_t tmp   = __riscv_vfmv_v_f_f32m1(0.0f, vl);
-        vfloat32m1_t vmax  = __riscv_vfredmax_vs_f32m4_f32m1(vfabs, tmp, vl);
-        float amax = __riscv_vfmv_f_s_f32m1_f32(vmax);
-
-        const float d = amax / ((1 << 7) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].d = GGML_FP32_TO_FP16(d);
-
-        vfloat32m4_t x0 = __riscv_vfmul_vf_f32m4(v_x, id, vl);
-
-        // convert to integer
-        vint16m2_t   vi = __riscv_vfncvt_x_f_w_i16m2(x0, vl);
-        vint8m1_t    vs = __riscv_vncvt_x_x_w_i8m1(vi, vl);
-
-        // store result
-        __riscv_vse8_v_i8m1(y[i].qs , vs, vl);
-    }
-#else
-    GGML_UNUSED(nb);
-    // scalar
-    quantize_row_q8_0_reference(x, y, k);
-#endif
-}
-
-// reference implementation for deterministic creation of model files
-void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k) {
-    assert(QK8_1 == 32);
-    assert(k % QK8_1 == 0);
-    const int nb = k / QK8_1;
-
-    for (int i = 0; i < nb; i++) {
-        float amax = 0.0f; // absolute max
-
-        for (int j = 0; j < QK8_1; j++) {
-            const float v = x[i*QK8_1 + j];
-            amax = MAX(amax, fabsf(v));
-        }
-
-        const float d = amax / ((1 << 7) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].d = d;
-
-        int sum = 0;
-
-        for (int j = 0; j < QK8_1/2; ++j) {
-            const float v0 = x[i*QK8_1           + j]*id;
-            const float v1 = x[i*QK8_1 + QK8_1/2 + j]*id;
-
-            y[i].qs[          j] = roundf(v0);
-            y[i].qs[QK8_1/2 + j] = roundf(v1);
-
-            sum += y[i].qs[          j];
-            sum += y[i].qs[QK8_1/2 + j];
-        }
-
-        y[i].s = sum*d;
-    }
-}
-
-void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
-    assert(k % QK8_1 == 0);
-    const int nb = k / QK8_1;
-
-    block_q8_1 * restrict y = vy;
-
-#if defined(__ARM_NEON)
-    for (int i = 0; i < nb; i++) {
-        float32x4_t srcv [8];
-        float32x4_t asrcv[8];
-        float32x4_t amaxv[8];
-
-        for (int j = 0; j < 8; j++) srcv[j]  = vld1q_f32(x + i*32 + 4*j);
-        for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[j]);
-
-        for (int j = 0; j < 4; j++) amaxv[2*j] = vmaxq_f32(asrcv[2*j], asrcv[2*j+1]);
-        for (int j = 0; j < 2; j++) amaxv[4*j] = vmaxq_f32(amaxv[4*j], amaxv[4*j+2]);
-        for (int j = 0; j < 1; j++) amaxv[8*j] = vmaxq_f32(amaxv[8*j], amaxv[8*j+4]);
-
-        const float amax = vmaxvq_f32(amaxv[0]);
-
-        const float d = amax / ((1 << 7) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].d = d;
-
-        int32x4_t accv = vdupq_n_s32(0);
-
-        for (int j = 0; j < 8; j++) {
-            const float32x4_t v  = vmulq_n_f32(srcv[j], id);
-            const int32x4_t   vi = vcvtnq_s32_f32(v);
-
-            y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0);
-            y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1);
-            y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2);
-            y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3);
-
-            accv = vaddq_s32(accv, vi);
-        }
-
-        y[i].s = d * vaddvq_s32(accv);
-    }
-#elif defined(__wasm_simd128__)
-    for (int i = 0; i < nb; i++) {
-        v128_t srcv [8];
-        v128_t asrcv[8];
-        v128_t amaxv[8];
-
-        for (int j = 0; j < 8; j++) srcv[j]  = wasm_v128_load(x + i*32 + 4*j);
-        for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]);
-
-        for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]);
-        for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]);
-        for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]);
-
-        const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0),
-                                   wasm_f32x4_extract_lane(amaxv[0], 1)),
-                               MAX(wasm_f32x4_extract_lane(amaxv[0], 2),
-                                   wasm_f32x4_extract_lane(amaxv[0], 3)));
-
-        const float d = amax / ((1 << 7) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].d = d;
-
-        v128_t accv = wasm_i32x4_splat(0);
-
-        for (int j = 0; j < 8; j++) {
-            const v128_t v  = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id));
-            const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v);
-
-            y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0);
-            y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1);
-            y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2);
-            y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3);
-
-            accv = wasm_i32x4_add(accv, vi);
-        }
-
-        y[i].s = d * (wasm_i32x4_extract_lane(accv, 0) +
-                      wasm_i32x4_extract_lane(accv, 1) +
-                      wasm_i32x4_extract_lane(accv, 2) +
-                      wasm_i32x4_extract_lane(accv, 3));
-    }
-#elif defined(__AVX2__) || defined(__AVX__)
-    for (int i = 0; i < nb; i++) {
-        // Load elements into 4 AVX vectors
-        __m256 v0 = _mm256_loadu_ps( x );
-        __m256 v1 = _mm256_loadu_ps( x + 8 );
-        __m256 v2 = _mm256_loadu_ps( x + 16 );
-        __m256 v3 = _mm256_loadu_ps( x + 24 );
-        x += 32;
-
-        // Compute max(abs(e)) for the block
-        const __m256 signBit = _mm256_set1_ps( -0.0f );
-        __m256 maxAbs = _mm256_andnot_ps( signBit, v0 );
-        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) );
-        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) );
-        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) );
-
-        __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) );
-        max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) );
-        max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) );
-        const float maxScalar = _mm_cvtss_f32( max4 );
-
-        // Quantize these floats
-        const float d = maxScalar / 127.f;
-        y[i].d = d;
-        const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f;
-        const __m256 mul = _mm256_set1_ps( id );
-
-        // Apply the multiplier
-        v0 = _mm256_mul_ps( v0, mul );
-        v1 = _mm256_mul_ps( v1, mul );
-        v2 = _mm256_mul_ps( v2, mul );
-        v3 = _mm256_mul_ps( v3, mul );
-
-        // Round to nearest integer
-        v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST );
-        v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST );
-        v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST );
-        v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST );
-
-        // Convert floats to integers
-        __m256i i0 = _mm256_cvtps_epi32( v0 );
-        __m256i i1 = _mm256_cvtps_epi32( v1 );
-        __m256i i2 = _mm256_cvtps_epi32( v2 );
-        __m256i i3 = _mm256_cvtps_epi32( v3 );
-
-#if defined(__AVX2__)
-        // Compute the sum of the quants and set y[i].s
-        y[i].s = d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3)));
-
-        // Convert int32 to int16
-        i0 = _mm256_packs_epi32( i0, i1 );	// 0, 1, 2, 3,  8, 9, 10, 11,  4, 5, 6, 7, 12, 13, 14, 15
-        i2 = _mm256_packs_epi32( i2, i3 );	// 16, 17, 18, 19,  24, 25, 26, 27,  20, 21, 22, 23, 28, 29, 30, 31
-                                            // Convert int16 to int8
-        i0 = _mm256_packs_epi16( i0, i2 );	// 0, 1, 2, 3,  8, 9, 10, 11,  16, 17, 18, 19,  24, 25, 26, 27,  4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-
-        // We got our precious signed bytes, but the order is now wrong
-        // These AVX2 pack instructions process 16-byte pieces independently
-        // The following instruction is fixing the order
-        const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 );
-        i0 = _mm256_permutevar8x32_epi32( i0, perm );
-
-        _mm256_storeu_si256((__m256i *)y[i].qs, i0);
-#else
-        // Since we don't have in AVX some necessary functions,
-        // we split the registers in half and call AVX2 analogs from SSE
-        __m128i ni0 = _mm256_castsi256_si128( i0 );
-        __m128i ni1 = _mm256_extractf128_si256( i0, 1);
-        __m128i ni2 = _mm256_castsi256_si128( i1 );
-        __m128i ni3 = _mm256_extractf128_si256( i1, 1);
-        __m128i ni4 = _mm256_castsi256_si128( i2 );
-        __m128i ni5 = _mm256_extractf128_si256( i2, 1);
-        __m128i ni6 = _mm256_castsi256_si128( i3 );
-        __m128i ni7 = _mm256_extractf128_si256( i3, 1);
-
-        // Compute the sum of the quants and set y[i].s
-        const __m128i s0 = _mm_add_epi32(_mm_add_epi32(ni0, ni1), _mm_add_epi32(ni2, ni3));
-        const __m128i s1 = _mm_add_epi32(_mm_add_epi32(ni4, ni5), _mm_add_epi32(ni6, ni7));
-        y[i].s = d * hsum_i32_4(_mm_add_epi32(s0, s1));
-
-        // Convert int32 to int16
-        ni0 = _mm_packs_epi32( ni0, ni1 );
-        ni2 = _mm_packs_epi32( ni2, ni3 );
-        ni4 = _mm_packs_epi32( ni4, ni5 );
-        ni6 = _mm_packs_epi32( ni6, ni7 );
-        // Convert int16 to int8
-        ni0 = _mm_packs_epi16( ni0, ni2 );
-        ni4 = _mm_packs_epi16( ni4, ni6 );
-
-        _mm_storeu_si128((__m128i *)(y[i].qs +  0), ni0);
-        _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4);
-#endif
-    }
-#elif defined(__riscv_v_intrinsic)
-
-    size_t vl = __riscv_vsetvl_e32m4(QK8_1);
-
-    for (int i = 0; i < nb; i++) {
-        // load elements
-        vfloat32m4_t v_x   = __riscv_vle32_v_f32m4(x+i*QK8_1, vl);
-
-        vfloat32m4_t vfabs = __riscv_vfabs_v_f32m4(v_x, vl);
-        vfloat32m1_t tmp   = __riscv_vfmv_v_f_f32m1(0.0, vl);
-        vfloat32m1_t vmax  = __riscv_vfredmax_vs_f32m4_f32m1(vfabs, tmp, vl);
-        float amax = __riscv_vfmv_f_s_f32m1_f32(vmax);
-
-        const float d  = amax / ((1 << 7) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].d = d;
-
-        vfloat32m4_t x0 = __riscv_vfmul_vf_f32m4(v_x, id, vl);
-
-        // convert to integer
-        vint16m2_t   vi = __riscv_vfncvt_x_f_w_i16m2(x0, vl);
-        vint8m1_t    vs = __riscv_vncvt_x_x_w_i8m1(vi, vl);
-
-        // store result
-        __riscv_vse8_v_i8m1(y[i].qs , vs, vl);
-
-        // compute sum for y[i].s
-        vint16m1_t tmp2 = __riscv_vmv_v_x_i16m1(0, vl);
-        vint16m1_t vwrs = __riscv_vwredsum_vs_i8m1_i16m1(vs, tmp2, vl);
-
-        // set y[i].s
-        int sum = __riscv_vmv_x_s_i16m1_i16(vwrs);
-        y[i].s = sum*d;
-    }
-#else
-    GGML_UNUSED(nb);
-    // scalar
-    quantize_row_q8_1_reference(x, y, k);
-#endif
-}
-
-void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k) {
-    static const int qk = QK4_0;
-
-    assert(k % qk == 0);
-
-    const int nb = k / qk;
-
-    for (int i = 0; i < nb; i++) {
-        const float d = GGML_FP16_TO_FP32(x[i].d);
-
-        for (int j = 0; j < qk/2; ++j) {
-            const int x0 = (x[i].qs[j] & 0x0F) - 8;
-            const int x1 = (x[i].qs[j] >>   4) - 8;
-
-            y[i*qk + j + 0   ] = x0*d;
-            y[i*qk + j + qk/2] = x1*d;
-        }
-    }
-}
-
-void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k) {
-    static const int qk = QK4_1;
-
-    assert(k % qk == 0);
-
-    const int nb = k / qk;
-
-    for (int i = 0; i < nb; i++) {
-        const float d = GGML_FP16_TO_FP32(x[i].d);
-        const float m = GGML_FP16_TO_FP32(x[i].m);
-
-        for (int j = 0; j < qk/2; ++j) {
-            const int x0 = (x[i].qs[j] & 0x0F);
-            const int x1 = (x[i].qs[j] >>   4);
-
-            y[i*qk + j + 0   ] = x0*d + m;
-            y[i*qk + j + qk/2] = x1*d + m;
-        }
-    }
-}
-
-void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k) {
-    static const int qk = QK5_0;
-
-    assert(k % qk == 0);
-
-    const int nb = k / qk;
-
-    for (int i = 0; i < nb; i++) {
-        const float d = GGML_FP16_TO_FP32(x[i].d);
-
-        uint32_t qh;
-        memcpy(&qh, x[i].qh, sizeof(qh));
-
-        for (int j = 0; j < qk/2; ++j) {
-            const uint8_t xh_0 = ((qh >> (j +  0)) << 4) & 0x10;
-            const uint8_t xh_1 = ((qh >> (j + 12))     ) & 0x10;
-
-            const int32_t x0 = ((x[i].qs[j] & 0x0F) | xh_0) - 16;
-            const int32_t x1 = ((x[i].qs[j] >>   4) | xh_1) - 16;
-
-            y[i*qk + j + 0   ] = x0*d;
-            y[i*qk + j + qk/2] = x1*d;
-        }
-    }
-}
-
-void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k) {
-    static const int qk = QK5_1;
-
-    assert(k % qk == 0);
-
-    const int nb = k / qk;
-
-    for (int i = 0; i < nb; i++) {
-        const float d = GGML_FP16_TO_FP32(x[i].d);
-        const float m = GGML_FP16_TO_FP32(x[i].m);
-
-        uint32_t qh;
-        memcpy(&qh, x[i].qh, sizeof(qh));
-
-        for (int j = 0; j < qk/2; ++j) {
-            const uint8_t xh_0 = ((qh >> (j +  0)) << 4) & 0x10;
-            const uint8_t xh_1 = ((qh >> (j + 12))     ) & 0x10;
-
-            const int x0 = (x[i].qs[j] & 0x0F) | xh_0;
-            const int x1 = (x[i].qs[j] >>   4) | xh_1;
-
-            y[i*qk + j + 0   ] = x0*d + m;
-            y[i*qk + j + qk/2] = x1*d + m;
-        }
-    }
-}
-
-void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int k) {
-    static const int qk = QK8_0;
-
-    assert(k % qk == 0);
-
-    const int nb = k / qk;
-
-    for (int i = 0; i < nb; i++) {
-        const float d = GGML_FP16_TO_FP32(x[i].d);
-
-        for (int j = 0; j < qk; ++j) {
-            y[i*qk + j] = x[i].qs[j]*d;
-        }
-    }
-}
-
-//
-// 2-6 bit quantization in super-blocks
-//
-
-//
-// ===================== Helper functions
-//
-static inline int nearest_int(float fval) {
-    assert(fval <= 4194303.f);
-    float val = fval + 12582912.f;
-    int i; memcpy(&i, &val, sizeof(int));
-    return (i & 0x007fffff) - 0x00400000;
-}
-
-static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, int rmse_type) {
-    float max = 0;
-    float amax = 0;
-    for (int i = 0; i < n; ++i) {
-        float ax = fabsf(x[i]);
-        if (ax > amax) { amax = ax; max = x[i]; }
-    }
-    if (amax < 1e-30f) { // all zero
-        for (int i = 0; i < n; ++i) {
-            L[i] = 0;
-        }
-        return 0.f;
-    }
-    float iscale = -nmax / max;
-    if (rmse_type == 0) {
-        for (int i = 0; i < n; ++i) {
-            int l = nearest_int(iscale * x[i]);
-            L[i] = nmax + MAX(-nmax, MIN(nmax-1, l));
-        }
-        return 1/iscale;
-    }
-    bool return_early = false;
-    if (rmse_type < 0) {
-        rmse_type = -rmse_type;
-        return_early = true;
-    }
-    int weight_type = rmse_type%2;
-    float sumlx = 0;
-    float suml2 = 0;
-    for (int i = 0; i < n; ++i) {
-        int l = nearest_int(iscale * x[i]);
-        l = MAX(-nmax, MIN(nmax-1, l));
-        L[i] = l + nmax;
-        float w = weight_type == 1 ? x[i] * x[i] : 1;
-        sumlx += w*x[i]*l;
-        suml2 += w*l*l;
-    }
-    float scale = sumlx/suml2;
-    if (return_early) return suml2 > 0 ? 0.5f*(scale + 1/iscale) : 1/iscale;
-    float best = scale * sumlx;
-    for (int is = -9; is <= 9; ++is) {
-        if (is == 0) {
-            continue;
-        }
-        iscale = -(nmax + 0.1f*is) / max;
-        sumlx = suml2 = 0;
-        for (int i = 0; i < n; ++i) {
-            int l = nearest_int(iscale * x[i]);
-            l = MAX(-nmax, MIN(nmax-1, l));
-            float w = weight_type == 1 ? x[i] * x[i] : 1;
-            sumlx += w*x[i]*l;
-            suml2 += w*l*l;
-        }
-        if (suml2 > 0 && sumlx*sumlx > best*suml2) {
-            for (int i = 0; i < n; ++i) {
-                int l = nearest_int(iscale * x[i]);
-                L[i] = nmax + MAX(-nmax, MIN(nmax-1, l));
-            }
-            scale = sumlx/suml2; best = scale*sumlx;
-        }
-    }
-    return scale;
-}
-
-static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, bool do_rmse) {
-    float max = 0;
-    float amax = 0;
-    for (int i = 0; i < n; ++i) {
-        float ax = fabsf(x[i]);
-        if (ax > amax) { amax = ax; max = x[i]; }
-    }
-    if (!amax) { // all zero
-        for (int i = 0; i < n; ++i) { L[i] = 0; }
-        return 0.f;
-    }
-    float iscale = -nmax / max;
-    if (do_rmse) {
-        float sumlx = 0;
-        float suml2 = 0;
-        for (int i = 0; i < n; ++i) {
-            int l = nearest_int(iscale * x[i]);
-            l = MAX(-nmax, MIN(nmax-1, l));
-            L[i] = l;
-            float w = x[i]*x[i];
-            sumlx += w*x[i]*l;
-            suml2 += w*l*l;
-        }
-        for (int itry = 0; itry < 5; ++itry) {
-            int n_changed = 0;
-            for (int i = 0; i < n; ++i) {
-                float w = x[i]*x[i];
-                float slx = sumlx - w*x[i]*L[i];
-                if (slx > 0) {
-                    float sl2 = suml2 - w*L[i]*L[i];
-                    int new_l = nearest_int(x[i] * sl2 / slx);
-                    new_l = MAX(-nmax, MIN(nmax-1, new_l));
-                    if (new_l != L[i]) {
-                        slx += w*x[i]*new_l;
-                        sl2 += w*new_l*new_l;
-                        if (sl2 > 0 && slx*slx*suml2 > sumlx*sumlx*sl2) {
-                            L[i] = new_l; sumlx = slx; suml2 = sl2;
-                            ++n_changed;
-                        }
-                    }
-                }
-            }
-            if (!n_changed) {
-                break;
-            }
-        }
-        for (int i = 0; i < n; ++i) {
-            L[i] += nmax;
-        }
-        return sumlx / suml2;
-    }
-    for (int i = 0; i < n; ++i) {
-        int l = nearest_int(iscale * x[i]);
-        l = MAX(-nmax, MIN(nmax-1, l));
-        L[i] = l + nmax;
-    }
-    return 1/iscale;
-}
-
-static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, float * restrict the_min,
-        int ntry, float alpha) {
-    float min = x[0];
-    float max = x[0];
-    for (int i = 1; i < n; ++i) {
-        if (x[i] < min) min = x[i];
-        if (x[i] > max) max = x[i];
-    }
-    if (max == min) {
-        for (int i = 0; i < n; ++i) L[i] = 0;
-        *the_min = 0;
-        return 0.f;
-    }
-    if (min > 0) min = 0;
-    float iscale = nmax/(max - min);
-    float scale = 1/iscale;
-    for (int itry = 0; itry < ntry; ++itry) {
-        float sumlx = 0; int suml2 = 0;
-        bool did_change = false;
-        for (int i = 0; i < n; ++i) {
-            int l = nearest_int(iscale*(x[i] - min));
-            l = MAX(0, MIN(nmax, l));
-            if (l != L[i]) {
-                L[i] = l;
-                did_change = true;
-            }
-            sumlx += (x[i] - min)*l;
-            suml2 += l*l;
-        }
-        scale = sumlx/suml2;
-        float sum = 0;
-        for (int i = 0; i < n; ++i) {
-            sum += x[i] - scale*L[i];
-        }
-        min = alpha*min + (1 - alpha)*sum/n;
-        if (min > 0) min = 0;
-        iscale = 1/scale;
-        if (!did_change) break;
-    }
-    *the_min = -min;
-    return scale;
-}
-
-static float make_qkx2_quants(int n, int nmax, const float * restrict x, const float * restrict weights,
-        uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux,
-        float rmin, float rdelta, int nstep, bool use_mad) {
-    float min = x[0];
-    float max = x[0];
-    float sum_w = weights[0];
-    float sum_x = sum_w * x[0];
-#ifdef HAVE_BUGGY_APPLE_LINKER
-    // use 'volatile' to prevent unroll and work around a bug in Apple ld64 1015.7
-    for (volatile int i = 1; i < n; ++i) {
-#else
-    for (int i = 1; i < n; ++i) {
-#endif
-        if (x[i] < min) min = x[i];
-        if (x[i] > max) max = x[i];
-        float w = weights[i];
-        sum_w += w;
-        sum_x += w * x[i];
-    }
-    if (min > 0) min = 0;
-    if (max == min) {
-        for (int i = 0; i < n; ++i) L[i] = 0;
-        *the_min = -min;
-        return 0.f;
-    }
-    float iscale = nmax/(max - min);
-    float scale = 1/iscale;
-    float best_mad = 0;
-    for (int i = 0; i < n; ++i) {
-        int l = nearest_int(iscale*(x[i] - min));
-        L[i] = MAX(0, MIN(nmax, l));
-        float diff = scale * L[i] + min - x[i];
-        diff = use_mad ? fabsf(diff) : diff * diff;
-        float w = weights[i];
-        best_mad += w * diff;
-    }
-    if (nstep < 1) {
-        *the_min = -min;
-        return scale;
-    }
-    for (int is = 0; is <= nstep; ++is) {
-        iscale = (rmin + rdelta*is + nmax)/(max - min);
-        float sum_l = 0, sum_l2 = 0, sum_xl = 0;
-        for (int i = 0; i < n; ++i) {
-            int l = nearest_int(iscale*(x[i] - min));
-            l = MAX(0, MIN(nmax, l));
-            Laux[i] = l;
-            float w = weights[i];
-            sum_l += w*l;
-            sum_l2 += w*l*l;
-            sum_xl += w*l*x[i];
-        }
-        float D = sum_w * sum_l2 - sum_l * sum_l;
-        if (D > 0) {
-            float this_scale = (sum_w * sum_xl - sum_x * sum_l)/D;
-            float this_min   = (sum_l2 * sum_x - sum_l * sum_xl)/D;
-            if (this_min > 0) {
-                this_min = 0;
-                this_scale = sum_xl / sum_l2;
-            }
-            float mad = 0;
-            for (int i = 0; i < n; ++i) {
-                float diff = this_scale * Laux[i] + this_min - x[i];
-                diff = use_mad ? fabsf(diff) : diff * diff;
-                float w = weights[i];
-                mad += w * diff;
-            }
-            if (mad < best_mad) {
-                for (int i = 0; i < n; ++i) {
-                    L[i] = Laux[i];
-                }
-                best_mad = mad;
-                scale = this_scale;
-                min = this_min;
-            }
-        }
-    }
-    *the_min = -min;
-    return scale;
-}
-
-#if QK_K == 256
-static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * restrict d, uint8_t * restrict m) {
-    if (j < 4) {
-        *d = q[j] & 63; *m = q[j + 4] & 63;
-    } else {
-        *d = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4);
-        *m = (q[j+4] >>  4) | ((q[j-0] >> 6) << 4);
-    }
-}
-#endif
-
-//========================- 2-bit (de)-quantization
-
-void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k) {
-    assert(k % QK_K == 0);
-    const int nb = k / QK_K;
-
-    uint8_t L[QK_K];
-    uint8_t Laux[16];
-    float   weights[16];
-    float mins[QK_K/16];
-    float scales[QK_K/16];
-
-    const float q4scale = 15.f;
-
-    for (int i = 0; i < nb; i++) {
-        float max_scale = 0; // as we are deducting the min, scales are always positive
-        float max_min = 0;
-        for (int j = 0; j < QK_K/16; ++j) {
-            for (int l = 0; l < 16; ++l) weights[l] = fabsf(x[16*j + l]);
-            scales[j] = make_qkx2_quants(16, 3, x + 16*j, weights, L + 16*j, &mins[j], Laux, -0.5f, 0.1f, 15, true);
-            float scale = scales[j];
-            if (scale > max_scale) {
-                max_scale = scale;
-            }
-            float min = mins[j];
-            if (min > max_min) {
-                max_min = min;
-            }
-        }
-
-        if (max_scale > 0) {
-            float iscale = q4scale/max_scale;
-            for (int j = 0; j < QK_K/16; ++j) {
-                int l = nearest_int(iscale*scales[j]);
-                y[i].scales[j] = l;
-            }
-            y[i].d = GGML_FP32_TO_FP16(max_scale/q4scale);
-        } else {
-            for (int j = 0; j < QK_K/16; ++j) y[i].scales[j] = 0;
-            y[i].d = GGML_FP32_TO_FP16(0.f);
-        }
-        if (max_min > 0) {
-            float iscale = q4scale/max_min;
-            for (int j = 0; j < QK_K/16; ++j) {
-                int l = nearest_int(iscale*mins[j]);
-                y[i].scales[j] |= (l << 4);
-            }
-            y[i].dmin = GGML_FP32_TO_FP16(max_min/q4scale);
-        } else {
-            y[i].dmin = GGML_FP32_TO_FP16(0.f);
-        }
-        for (int j = 0; j < QK_K/16; ++j) {
-            const float d = GGML_FP16_TO_FP32(y[i].d) * (y[i].scales[j] & 0xF);
-            if (!d) continue;
-            const float dm = GGML_FP16_TO_FP32(y[i].dmin) * (y[i].scales[j] >> 4);
-            for (int ii = 0; ii < 16; ++ii) {
-                int l = nearest_int((x[16*j + ii] + dm)/d);
-                l = MAX(0, MIN(3, l));
-                L[16*j + ii] = l;
-            }
-        }
-
-#if QK_K == 256
-        for (int j = 0; j < QK_K; j += 128) {
-            for (int l = 0; l < 32; ++l) {
-                y[i].qs[j/4 + l] = L[j + l] | (L[j + l + 32] << 2) | (L[j + l + 64] << 4) | (L[j + l + 96] << 6);
-            }
-        }
-#else
-        for (int l = 0; l < 16; ++l) {
-            y[i].qs[l] = L[l] | (L[l + 16] << 2) | (L[l + 32] << 4) | (L[l + 48] << 6);
-        }
-#endif
-
-        x += QK_K;
-
-    }
-}
-
-void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k) {
-    assert(k % QK_K == 0);
-    const int nb = k / QK_K;
-
-    for (int i = 0; i < nb; i++) {
-
-        const float d = GGML_FP16_TO_FP32(x[i].d);
-        const float min = GGML_FP16_TO_FP32(x[i].dmin);
-
-        const uint8_t * q = x[i].qs;
-
-#if QK_K == 256
-        int is = 0;
-        float dl, ml;
-        for (int n = 0; n < QK_K; n += 128) {
-            int shift = 0;
-            for (int j = 0; j < 4; ++j) {
-
-                uint8_t sc = x[i].scales[is++];
-                dl = d * (sc & 0xF); ml = min * (sc >> 4);
-                for (int l = 0; l < 16; ++l) *y++ = dl * ((int8_t)((q[l] >> shift) & 3)) - ml;
-
-                sc = x[i].scales[is++];
-                dl = d * (sc & 0xF); ml = min * (sc >> 4);
-                for (int l = 0; l < 16; ++l) *y++ = dl * ((int8_t)((q[l+16] >> shift) & 3)) - ml;
-
-                shift += 2;
-            }
-            q += 32;
-        }
-#else
-        float dl1 = d * (x[i].scales[0] & 0xF), ml1 = min * (x[i].scales[0] >> 4);
-        float dl2 = d * (x[i].scales[1] & 0xF), ml2 = min * (x[i].scales[1] >> 4);
-        float dl3 = d * (x[i].scales[2] & 0xF), ml3 = min * (x[i].scales[2] >> 4);
-        float dl4 = d * (x[i].scales[3] & 0xF), ml4 = min * (x[i].scales[3] >> 4);
-        for (int l = 0; l < 16; ++l) {
-            y[l+ 0] = dl1 * ((int8_t)((q[l] >> 0) & 3)) - ml1;
-            y[l+16] = dl2 * ((int8_t)((q[l] >> 2) & 3)) - ml2;
-            y[l+32] = dl3 * ((int8_t)((q[l] >> 4) & 3)) - ml3;
-            y[l+48] = dl4 * ((int8_t)((q[l] >> 6) & 3)) - ml4;
-        }
-        y += QK_K;
-#endif
-    }
-}
-
-void quantize_row_q2_K(const float * restrict x, void * restrict vy, int k) {
-    quantize_row_q2_K_reference(x, vy, k);
-}
-
-size_t ggml_quantize_q2_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
-    (void)hist; // TODO: collect histograms
-
-    for (int j = 0; j < n; j += k) {
-        block_q2_K * restrict y = (block_q2_K *)dst + j/QK_K;
-        quantize_row_q2_K_reference(src + j, y, k);
-    }
-    return (n/QK_K*sizeof(block_q2_K));
-}
-
-//========================= 3-bit (de)-quantization
-
-void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k) {
-    assert(k % QK_K == 0);
-    const int nb = k / QK_K;
-
-    int8_t L[QK_K];
-    float scales[QK_K / 16];
-
-    for (int i = 0; i < nb; i++) {
-
-        float max_scale = 0;
-        float amax = 0;
-        for (int j = 0; j < QK_K/16; ++j) {
-            scales[j] = make_q3_quants(16, 4, x + 16*j, L + 16*j, true);
-            float scale = fabsf(scales[j]);
-            if (scale > amax) {
-                amax = scale; max_scale = scales[j];
-            }
-        }
-
-#if QK_K == 256
-        memset(y[i].scales, 0, 12);
-        if (max_scale) {
-            float iscale = -32.f/max_scale;
-            for (int j = 0; j < QK_K/16; ++j) {
-                int8_t l = nearest_int(iscale*scales[j]);
-                l = MAX(-32, MIN(31, l)) + 32;
-                if (j < 8) {
-                    y[i].scales[j] = l & 0xF;
-                } else {
-                    y[i].scales[j-8] |= ((l & 0xF) << 4);
-                }
-                l >>= 4;
-                y[i].scales[j%4 + 8] |= (l << (2*(j/4)));
-            }
-            y[i].d = GGML_FP32_TO_FP16(1/iscale);
-        } else {
-            y[i].d = GGML_FP32_TO_FP16(0.f);
-        }
-
-        int8_t sc;
-        for (int j = 0; j < QK_K/16; ++j) {
-            sc = j < 8 ? y[i].scales[j] & 0xF : y[i].scales[j-8] >> 4;
-            sc = (sc | (((y[i].scales[8 + j%4] >> (2*(j/4))) & 3) << 4)) - 32;
-            float d = GGML_FP16_TO_FP32(y[i].d) * sc;
-            if (!d) {
-                continue;
-            }
-            for (int ii = 0; ii < 16; ++ii) {
-                int l = nearest_int(x[16*j + ii]/d);
-                l = MAX(-4, MIN(3, l));
-                L[16*j + ii] = l + 4;
-            }
-        }
-#else
-        if (max_scale) {
-            float iscale = -8.f/max_scale;
-            for (int j = 0; j < QK_K/16; j+=2) {
-                int l1 = nearest_int(iscale*scales[j]);
-                l1 = 8 + MAX(-8, MIN(7, l1));
-                int l2 = nearest_int(iscale*scales[j+1]);
-                l2 = 8 + MAX(-8, MIN(7, l2));
-                y[i].scales[j/2] = l1 | (l2 << 4);
-            }
-            y[i].d = GGML_FP32_TO_FP16(1/iscale);
-        } else {
-            for (int j = 0; j < QK_K/16; j+=2) {
-                y[i].scales[j/2] = 0;
-            }
-            y[i].d = GGML_FP32_TO_FP16(0.f);
-        }
-        for (int j = 0; j < QK_K/16; ++j) {
-            int s = j%2 == 0 ? y[i].scales[j/2] & 0xF : y[i].scales[j/2] >> 4;
-            float d = GGML_FP16_TO_FP32(y[i].d) * (s - 8);
-            if (!d) {
-                continue;
-            }
-            for (int ii = 0; ii < 16; ++ii) {
-                int l = nearest_int(x[16*j + ii]/d);
-                l = MAX(-4, MIN(3, l));
-                L[16*j + ii] = l + 4;
-            }
-        }
-#endif
-
-        memset(y[i].hmask, 0, QK_K/8);
-        // We put the high-bit for the 1st 8 quants into bit 0, the next 8 into bit 1, etc.
-        int m = 0;
-        uint8_t hm = 1;
-        for (int j = 0; j < QK_K; ++j) {
-            if (L[j] > 3) {
-                y[i].hmask[m] |= hm;
-                L[j] -= 4;
-            }
-            if (++m == QK_K/8) {
-                m = 0; hm <<= 1;
-            }
-        }
-#if QK_K == 256
-        for (int j = 0; j < QK_K; j += 128) {
-            for (int l = 0; l < 32; ++l) {
-                y[i].qs[j/4 + l] = L[j + l] | (L[j + l + 32] << 2) | (L[j + l + 64] << 4) | (L[j + l + 96] << 6);
-            }
-        }
-#else
-        for (int l = 0; l < 16; ++l) {
-            y[i].qs[l] = L[l] | (L[l + 16] << 2) | (L[l + 32] << 4) | (L[l + 48] << 6);
-        }
-#endif
-
-        x += QK_K;
-    }
-}
-
-#if QK_K == 256
-void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k) {
-    assert(k % QK_K == 0);
-    const int nb = k / QK_K;
-
-    const uint32_t kmask1 = 0x03030303;
-    const uint32_t kmask2 = 0x0f0f0f0f;
-
-    uint32_t aux[4];
-    const int8_t * scales = (const int8_t*)aux;
-
-    for (int i = 0; i < nb; i++) {
-
-        const float d_all = GGML_FP16_TO_FP32(x[i].d);
-
-        const uint8_t * restrict q = x[i].qs;
-        const uint8_t * restrict hm = x[i].hmask;
-        uint8_t m = 1;
-
-        memcpy(aux, x[i].scales, 12);
-        uint32_t tmp = aux[2];
-        aux[2] = ((aux[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
-        aux[3] = ((aux[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
-        aux[0] = (aux[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
-        aux[1] = (aux[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
-
-        int is = 0;
-        float dl;
-        for (int n = 0; n < QK_K; n += 128) {
-            int shift = 0;
-            for (int j = 0; j < 4; ++j) {
-
-                dl = d_all * (scales[is++] - 32);
-                for (int l = 0; l < 16; ++l) {
-                    *y++ = dl * ((int8_t)((q[l+ 0] >> shift) & 3) - ((hm[l+ 0] & m) ? 0 : 4));
-                }
-
-                dl = d_all * (scales[is++] - 32);
-                for (int l = 0; l < 16; ++l) {
-                    *y++ = dl * ((int8_t)((q[l+16] >> shift) & 3) - ((hm[l+16] & m) ? 0 : 4));
-                }
-
-                shift += 2;
-                m <<= 1;
-            }
-            q += 32;
-        }
-
-    }
-}
-#else
-void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k) {
-    assert(k % QK_K == 0);
-    assert(QK_K == 64);
-    const int nb = k / QK_K;
-
-    for (int i = 0; i < nb; i++) {
-
-        const float d_all = GGML_FP16_TO_FP32(x[i].d);
-
-        const uint8_t * restrict q = x[i].qs;
-        const uint8_t * restrict hm = x[i].hmask;
-
-        const float d1 = d_all * ((x[i].scales[0] & 0xF) - 8);
-        const float d2 = d_all * ((x[i].scales[0] >>  4) - 8);
-        const float d3 = d_all * ((x[i].scales[1] & 0xF) - 8);
-        const float d4 = d_all * ((x[i].scales[1] >>  4) - 8);
-
-        for (int l=0; l<8; ++l) {
-            uint8_t h = hm[l];
-            y[l+ 0] = d1 * ((int8_t)((q[l+0] >> 0) & 3) - ((h & 0x01) ? 0 : 4));
-            y[l+ 8] = d1 * ((int8_t)((q[l+8] >> 0) & 3) - ((h & 0x02) ? 0 : 4));
-            y[l+16] = d2 * ((int8_t)((q[l+0] >> 2) & 3) - ((h & 0x04) ? 0 : 4));
-            y[l+24] = d2 * ((int8_t)((q[l+8] >> 2) & 3) - ((h & 0x08) ? 0 : 4));
-            y[l+32] = d3 * ((int8_t)((q[l+0] >> 4) & 3) - ((h & 0x10) ? 0 : 4));
-            y[l+40] = d3 * ((int8_t)((q[l+8] >> 4) & 3) - ((h & 0x20) ? 0 : 4));
-            y[l+48] = d4 * ((int8_t)((q[l+0] >> 6) & 3) - ((h & 0x40) ? 0 : 4));
-            y[l+56] = d4 * ((int8_t)((q[l+8] >> 6) & 3) - ((h & 0x80) ? 0 : 4));
-        }
-        y += QK_K;
-    }
-}
-#endif
-
-void quantize_row_q3_K(const float * restrict x, void * restrict vy, int k) {
-    quantize_row_q3_K_reference(x, vy, k);
-}
-
-size_t ggml_quantize_q3_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
-    (void)hist; // TODO: collect histograms
-
-    for (int j = 0; j < n; j += k) {
-        block_q3_K * restrict y = (block_q3_K *)dst + j/QK_K;
-        quantize_row_q3_K_reference(src + j, y, k);
-    }
-    return (n/QK_K*sizeof(block_q3_K));
-}
-
-// ====================== 4-bit (de)-quantization
-
-void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k) {
-    assert(k % QK_K == 0);
-    const int nb = k / QK_K;
-
-    uint8_t L[QK_K];
-    uint8_t Laux[32];
-    float   weights[32];
-    float mins[QK_K/32];
-    float scales[QK_K/32];
-
-    for (int i = 0; i < nb; i++) {
-
-        float max_scale = 0; // as we are deducting the min, scales are always positive
-        float max_min = 0;
-        for (int j = 0; j < QK_K/32; ++j) {
-            //scales[j] = make_qkx1_quants(32, 15, x + 32*j, L + 32*j, &mins[j], 9, 0.5f);
-            float sum_x2 = 0;
-            for (int l = 0; l < 32; ++l) sum_x2 += x[32*j + l] * x[32*j + l];
-            float av_x = sqrtf(sum_x2/32);
-            for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
-            scales[j] = make_qkx2_quants(32, 15, x + 32*j, weights, L + 32*j, &mins[j], Laux, -1.f, 0.1f, 20, false);
-            float scale = scales[j];
-            if (scale > max_scale) {
-                max_scale = scale;
-            }
-            float min = mins[j];
-            if (min > max_min) {
-                max_min = min;
-            }
-        }
-
-#if QK_K == 256
-        float inv_scale = max_scale > 0 ? 63.f/max_scale : 0.f;
-        float inv_min   = max_min   > 0 ? 63.f/max_min   : 0.f;
-        for (int j = 0; j < QK_K/32; ++j) {
-            uint8_t ls = nearest_int(inv_scale*scales[j]);
-            uint8_t lm = nearest_int(inv_min*mins[j]);
-            ls = MIN(63, ls);
-            lm = MIN(63, lm);
-            if (j < 4) {
-                y[i].scales[j] = ls;
-                y[i].scales[j+4] = lm;
-            } else {
-                y[i].scales[j+4] = (ls & 0xF) | ((lm & 0xF) << 4);
-                y[i].scales[j-4] |= ((ls >> 4) << 6);
-                y[i].scales[j-0] |= ((lm >> 4) << 6);
-            }
-        }
-        y[i].d = GGML_FP32_TO_FP16(max_scale/63.f);
-        y[i].dmin = GGML_FP32_TO_FP16(max_min/63.f);
-
-        uint8_t sc, m;
-        for (int j = 0; j < QK_K/32; ++j) {
-            get_scale_min_k4(j, y[i].scales, &sc, &m);
-            const float d = GGML_FP16_TO_FP32(y[i].d) * sc;
-            if (!d) continue;
-            const float dm = GGML_FP16_TO_FP32(y[i].dmin) * m;
-            for (int ii = 0; ii < 32; ++ii) {
-                int l = nearest_int((x[32*j + ii] + dm)/d);
-                l = MAX(0, MIN(15, l));
-                L[32*j + ii] = l;
-            }
-        }
-#else
-        const float s_factor = 15.f;
-        float inv_scale = max_scale > 0 ? s_factor/max_scale : 0.f;
-        float inv_min   = max_min   > 0 ? s_factor/max_min   : 0.f;
-        int d1 = nearest_int(inv_scale*scales[0]);
-        int m1 = nearest_int(inv_min*mins[0]);
-        int d2 = nearest_int(inv_scale*scales[1]);
-        int m2 = nearest_int(inv_min*mins[1]);
-        y[i].scales[0] = d1 | (m1 << 4);
-        y[i].scales[1] = d2 | (m2 << 4);
-        y[i].d[0] = GGML_FP32_TO_FP16(max_scale/s_factor);
-        y[i].d[1] = GGML_FP32_TO_FP16(max_min/s_factor);
-
-        float sumlx = 0;
-        int   suml2 = 0;
-        for (int j = 0; j < QK_K/32; ++j) {
-            const uint8_t sd = y[i].scales[j] & 0xF;
-            const uint8_t sm = y[i].scales[j] >>  4;
-            const float d = GGML_FP16_TO_FP32(y[i].d[0]) * sd;
-            if (!d) continue;
-            const float m = GGML_FP16_TO_FP32(y[i].d[1]) * sm;
-            for (int ii = 0; ii < 32; ++ii) {
-                int l = nearest_int((x[32*j + ii] + m)/d);
-                l = MAX(0, MIN(15, l));
-                L[32*j + ii] = l;
-                sumlx += (x[32*j + ii] + m)*l*sd;
-                suml2 += l*l*sd*sd;
-            }
-        }
-        if (suml2) {
-            y[i].d[0] = GGML_FP32_TO_FP16(sumlx/suml2);
-        }
-#endif
-        uint8_t * q = y[i].qs;
-        for (int j = 0; j < QK_K; j += 64) {
-            for (int l = 0; l < 32; ++l) q[l] = L[j + l] | (L[j + l + 32] << 4);
-            q += 32;
-        }
-
-        x += QK_K;
-
-    }
-}
-
-void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k) {
-    assert(k % QK_K == 0);
-    const int nb = k / QK_K;
-
-    for (int i = 0; i < nb; i++) {
-
-        const uint8_t * q = x[i].qs;
-
-#if QK_K == 256
-
-        const float d   = GGML_FP16_TO_FP32(x[i].d);
-        const float min = GGML_FP16_TO_FP32(x[i].dmin);
-
-        int is = 0;
-        uint8_t sc, m;
-        for (int j = 0; j < QK_K; j += 64) {
-            get_scale_min_k4(is + 0, x[i].scales, &sc, &m);
-            const float d1 = d * sc; const float m1 = min * m;
-            get_scale_min_k4(is + 1, x[i].scales, &sc, &m);
-            const float d2 = d * sc; const float m2 = min * m;
-            for (int l = 0; l < 32; ++l) *y++ = d1 * (q[l] & 0xF) - m1;
-            for (int l = 0; l < 32; ++l) *y++ = d2 * (q[l]  >> 4) - m2;
-            q += 32; is += 2;
-        }
-#else
-        const float dall = GGML_FP16_TO_FP32(x[i].d[0]);
-        const float mall = GGML_FP16_TO_FP32(x[i].d[1]);
-        const float d1 = dall * (x[i].scales[0] & 0xF), m1 = mall * (x[i].scales[0] >> 4);
-        const float d2 = dall * (x[i].scales[1] & 0xF), m2 = mall * (x[i].scales[1] >> 4);
-        for (int l = 0; l < 32; ++l) {
-            y[l+ 0] = d1 * (q[l] & 0xF) - m1;
-            y[l+32] = d2 * (q[l] >>  4) - m2;
-        }
-        y += QK_K;
-#endif
-
-    }
-}
-
-void quantize_row_q4_K(const float * restrict x, void * restrict vy, int k) {
-    assert(k % QK_K == 0);
-    block_q4_K * restrict y = vy;
-    quantize_row_q4_K_reference(x, y, k);
-}
-
-size_t ggml_quantize_q4_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
-    assert(k % QK_K == 0);
-    (void)hist; // TODO: collect histograms
-
-    for (int j = 0; j < n; j += k) {
-        block_q4_K * restrict y = (block_q4_K *)dst + j/QK_K;
-        quantize_row_q4_K_reference(src + j, y, k);
-    }
-    return (n/QK_K*sizeof(block_q4_K));
-}
-
-// ====================== 5-bit (de)-quantization
-
-void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k) {
-    assert(k % QK_K == 0);
-    const int nb = k / QK_K;
-
-#if QK_K == 256
-    uint8_t L[QK_K];
-    float mins[QK_K/32];
-    float scales[QK_K/32];
-    float weights[32];
-    uint8_t Laux[32];
-#else
-    int8_t L[QK_K];
-    float scales[QK_K/16];
-#endif
-
-    for (int i = 0; i < nb; i++) {
-
-#if QK_K == 256
-
-        float max_scale = 0; // as we are deducting the min, scales are always positive
-        float max_min = 0;
-        for (int j = 0; j < QK_K/32; ++j) {
-            //scales[j] = make_qkx1_quants(32, 31, x + 32*j, L + 32*j, &mins[j], 9, 0.5f);
-            float sum_x2 = 0;
-            for (int l = 0; l < 32; ++l) sum_x2 += x[32*j + l] * x[32*j + l];
-            float av_x = sqrtf(sum_x2/32);
-            for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
-            scales[j] = make_qkx2_quants(32, 31, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.5f, 0.1f, 15, false);
-            float scale = scales[j];
-            if (scale > max_scale) {
-                max_scale = scale;
-            }
-            float min = mins[j];
-            if (min > max_min) {
-                max_min = min;
-            }
-        }
-
-        float inv_scale = max_scale > 0 ? 63.f/max_scale : 0.f;
-        float inv_min   = max_min   > 0 ? 63.f/max_min   : 0.f;
-        for (int j = 0; j < QK_K/32; ++j) {
-            uint8_t ls = nearest_int(inv_scale*scales[j]);
-            uint8_t lm = nearest_int(inv_min*mins[j]);
-            ls = MIN(63, ls);
-            lm = MIN(63, lm);
-            if (j < 4) {
-                y[i].scales[j] = ls;
-                y[i].scales[j+4] = lm;
-            } else {
-                y[i].scales[j+4] = (ls & 0xF) | ((lm & 0xF) << 4);
-                y[i].scales[j-4] |= ((ls >> 4) << 6);
-                y[i].scales[j-0] |= ((lm >> 4) << 6);
-            }
-        }
-        y[i].d = GGML_FP32_TO_FP16(max_scale/63.f);
-        y[i].dmin = GGML_FP32_TO_FP16(max_min/63.f);
-
-        uint8_t sc, m;
-        for (int j = 0; j < QK_K/32; ++j) {
-            get_scale_min_k4(j, y[i].scales, &sc, &m);
-            const float d = GGML_FP16_TO_FP32(y[i].d) * sc;
-            if (!d) continue;
-            const float dm = GGML_FP16_TO_FP32(y[i].dmin) * m;
-            for (int ii = 0; ii < 32; ++ii) {
-                int l = nearest_int((x[32*j + ii] + dm)/d);
-                l = MAX(0, MIN(31, l));
-                L[32*j + ii] = l;
-            }
-        }
-
-        uint8_t * restrict qh = y[i].qh;
-        uint8_t * restrict ql = y[i].qs;
-        memset(qh, 0, QK_K/8);
-
-        uint8_t m1 = 1, m2 = 2;
-        for (int n = 0; n < QK_K; n += 64) {
-            for (int j = 0; j < 32; ++j) {
-                int l1 = L[n + j];
-                if (l1 > 15) {
-                    l1 -= 16; qh[j] |= m1;
-                }
-                int l2 = L[n + j + 32];
-                if (l2 > 15) {
-                    l2 -= 16; qh[j] |= m2;
-                }
-                ql[j] = l1 | (l2 << 4);
-            }
-            m1 <<= 2; m2 <<= 2;
-            ql += 32;
-        }
-#else
-        float max_scale = 0, amax = 0;
-        for (int j = 0; j < QK_K/16; ++j) {
-            scales[j] = make_qx_quants(16, 16, x + 16*j, L + 16*j, 1);
-            float abs_scale = fabsf(scales[j]);
-            if (abs_scale > amax) {
-                amax = abs_scale;
-                max_scale = scales[j];
-            }
-        }
-
-        float iscale = -128.f/max_scale;
-        for (int j = 0; j < QK_K/16; ++j) {
-            int l = nearest_int(iscale*scales[j]);
-            y[i].scales[j] = MAX(-128, MIN(127, l));
-        }
-        y[i].d = GGML_FP32_TO_FP16(1/iscale);
-
-        for (int j = 0; j < QK_K/16; ++j) {
-            const float d = GGML_FP16_TO_FP32(y[i].d) * y[i].scales[j];
-            if (!d) continue;
-            for (int ii = 0; ii < 16; ++ii) {
-                int l = nearest_int(x[16*j + ii]/d);
-                l = MAX(-16, MIN(15, l));
-                L[16*j + ii] = l + 16;
-            }
-        }
-
-        uint8_t * restrict qh = y[i].qh;
-        uint8_t * restrict ql = y[i].qs;
-        memset(qh, 0, QK_K/8);
-
-        for (int j = 0; j < 32; ++j) {
-            int jm = j%8;
-            int is = j/8;
-            int l1 = L[j];
-            if (l1 > 15) {
-                l1 -= 16; qh[jm] |= (1 << is);
-            }
-            int l2 = L[j + 32];
-            if (l2 > 15) {
-                l2 -= 16; qh[jm] |= (1 << (4 + is));
-            }
-            ql[j] = l1 | (l2 << 4);
-        }
-#endif
-
-        x += QK_K;
-
-    }
-}
-
-void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k) {
-    assert(k % QK_K == 0);
-    const int nb = k / QK_K;
-
-    for (int i = 0; i < nb; i++) {
-
-        const uint8_t * ql = x[i].qs;
-        const uint8_t * qh = x[i].qh;
-
-#if QK_K == 256
-
-        const float d = GGML_FP16_TO_FP32(x[i].d);
-        const float min = GGML_FP16_TO_FP32(x[i].dmin);
-
-        int is = 0;
-        uint8_t sc, m;
-        uint8_t u1 = 1, u2 = 2;
-        for (int j = 0; j < QK_K; j += 64) {
-            get_scale_min_k4(is + 0, x[i].scales, &sc, &m);
-            const float d1 = d * sc; const float m1 = min * m;
-            get_scale_min_k4(is + 1, x[i].scales, &sc, &m);
-            const float d2 = d * sc; const float m2 = min * m;
-            for (int l = 0; l < 32; ++l) *y++ = d1 * ((ql[l] & 0xF) + (qh[l] & u1 ? 16 : 0)) - m1;
-            for (int l = 0; l < 32; ++l) *y++ = d2 * ((ql[l]  >> 4) + (qh[l] & u2 ? 16 : 0)) - m2;
-            ql += 32; is += 2;
-            u1 <<= 2; u2 <<= 2;
-        }
-#else
-        float d = GGML_FP16_TO_FP32(x[i].d);
-        const int8_t * restrict s = x[i].scales;
-        for (int l = 0; l < 8; ++l) {
-            y[l+ 0] = d * s[0] * ((ql[l+ 0] & 0xF) - (qh[l] & 0x01 ? 0 : 16));
-            y[l+ 8] = d * s[0] * ((ql[l+ 8] & 0xF) - (qh[l] & 0x02 ? 0 : 16));
-            y[l+16] = d * s[1] * ((ql[l+16] & 0xF) - (qh[l] & 0x04 ? 0 : 16));
-            y[l+24] = d * s[1] * ((ql[l+24] & 0xF) - (qh[l] & 0x08 ? 0 : 16));
-            y[l+32] = d * s[2] * ((ql[l+ 0] >>  4) - (qh[l] & 0x10 ? 0 : 16));
-            y[l+40] = d * s[2] * ((ql[l+ 8] >>  4) - (qh[l] & 0x20 ? 0 : 16));
-            y[l+48] = d * s[3] * ((ql[l+16] >>  4) - (qh[l] & 0x40 ? 0 : 16));
-            y[l+56] = d * s[3] * ((ql[l+24] >>  4) - (qh[l] & 0x80 ? 0 : 16));
-        }
-        y += QK_K;
-#endif
-    }
-}
-
-void quantize_row_q5_K(const float * restrict x, void * restrict vy, int k) {
-    assert(k % QK_K == 0);
-    block_q5_K * restrict y = vy;
-    quantize_row_q5_K_reference(x, y, k);
-}
-
-size_t ggml_quantize_q5_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
-    assert(k % QK_K == 0);
-    (void)hist; // TODO: collect histograms
-
-    for (int j = 0; j < n; j += k) {
-        block_q5_K * restrict y = (block_q5_K *)dst + j/QK_K;
-        quantize_row_q5_K_reference(src + j, y, k);
-    }
-    return (n/QK_K*sizeof(block_q5_K));
-}
-
-// ====================== 6-bit (de)-quantization
-
-void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k) {
-    assert(k % QK_K == 0);
-    const int nb = k / QK_K;
-
-    int8_t L[QK_K];
-    float   scales[QK_K/16];
-
-    for (int i = 0; i < nb; i++) {
-
-        float max_scale = 0;
-        float max_abs_scale = 0;
-
-        for (int ib = 0; ib < QK_K/16; ++ib) {
-
-            const float scale = make_qx_quants(16, 32, x + 16*ib, L + 16*ib, 1);
-            scales[ib] = scale;
-
-            const float abs_scale = fabsf(scale);
-            if (abs_scale > max_abs_scale) {
-                max_abs_scale = abs_scale;
-                max_scale = scale;
-            }
-
-        }
-
-        if (!max_abs_scale) {
-            memset(&y[i], 0, sizeof(block_q6_K));
-            y[i].d = GGML_FP32_TO_FP16(0.f);
-            x += QK_K;
-            continue;
-        }
-
-        float iscale = -128.f/max_scale;
-        y[i].d = GGML_FP32_TO_FP16(1/iscale);
-        for (int ib = 0; ib < QK_K/16; ++ib) {
-            y[i].scales[ib] = MIN(127, nearest_int(iscale*scales[ib]));
-        }
-
-        for (int j = 0; j < QK_K/16; ++j) {
-            float d = GGML_FP16_TO_FP32(y[i].d) * y[i].scales[j];
-            if (!d) {
-                continue;
-            }
-            for (int ii = 0; ii < 16; ++ii) {
-                int l = nearest_int(x[16*j + ii]/d);
-                l = MAX(-32, MIN(31, l));
-                L[16*j + ii] = l + 32;
-            }
-        }
-
-        uint8_t * restrict ql = y[i].ql;
-        uint8_t * restrict qh = y[i].qh;
-#if QK_K == 256
-        for (int j = 0; j < QK_K; j += 128) {
-            for (int l = 0; l < 32; ++l) {
-                const uint8_t q1 = L[j + l +  0] & 0xF;
-                const uint8_t q2 = L[j + l + 32] & 0xF;
-                const uint8_t q3 = L[j + l + 64] & 0xF;
-                const uint8_t q4 = L[j + l + 96] & 0xF;
-                ql[l+ 0] = q1 | (q3 << 4);
-                ql[l+32] = q2 | (q4 << 4);
-                qh[l] = (L[j + l] >> 4) | ((L[j + l + 32] >> 4) << 2) | ((L[j + l + 64] >> 4) << 4) | ((L[j + l + 96] >> 4) << 6);
-            }
-            ql += 64;
-            qh += 32;
-        }
-#else
-        for (int l = 0; l < 32; ++l) {
-            const uint8_t q1 = L[l +  0] & 0xF;
-            const uint8_t q2 = L[l + 32] & 0xF;
-            ql[l] = q1 | (q2 << 4);
-        }
-        for (int l = 0; l < 16; ++l) {
-            qh[l] = (L[l] >> 4) | ((L[l + 16] >> 4) << 2) | ((L[l + 32] >> 4) << 4) | ((L[l + 48] >> 4) << 6);
-        }
-#endif
-
-        x += QK_K;
-
-    }
-}
-
-void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k) {
-    assert(k % QK_K == 0);
-    const int nb = k / QK_K;
-
-    for (int i = 0; i < nb; i++) {
-
-        const float d = GGML_FP16_TO_FP32(x[i].d);
-
-        const uint8_t * restrict ql = x[i].ql;
-        const uint8_t * restrict qh = x[i].qh;
-        const int8_t  * restrict sc = x[i].scales;
-
-#if QK_K == 256
-        for (int n = 0; n < QK_K; n += 128) {
-            for (int l = 0; l < 32; ++l) {
-                int is = l/16;
-                const int8_t q1 = (int8_t)((ql[l +  0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
-                const int8_t q2 = (int8_t)((ql[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
-                const int8_t q3 = (int8_t)((ql[l +  0]  >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
-                const int8_t q4 = (int8_t)((ql[l + 32]  >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
-                y[l +  0] = d * sc[is + 0] * q1;
-                y[l + 32] = d * sc[is + 2] * q2;
-                y[l + 64] = d * sc[is + 4] * q3;
-                y[l + 96] = d * sc[is + 6] * q4;
-            }
-            y  += 128;
-            ql += 64;
-            qh += 32;
-            sc += 8;
-        }
-#else
-        for (int l = 0; l < 16; ++l) {
-            const int8_t q1 = (int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
-            const int8_t q2 = (int8_t)((ql[l+16] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
-            const int8_t q3 = (int8_t)((ql[l+ 0]  >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
-            const int8_t q4 = (int8_t)((ql[l+16]  >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
-            y[l+ 0] = d * sc[0] * q1;
-            y[l+16] = d * sc[1] * q2;
-            y[l+32] = d * sc[2] * q3;
-            y[l+48] = d * sc[3] * q4;
-        }
-        y  += 64;
-#endif
-
-    }
-}
-
-void quantize_row_q6_K(const float * restrict x, void * restrict vy, int k) {
-    assert(k % QK_K == 0);
-    block_q6_K * restrict y = vy;
-    quantize_row_q6_K_reference(x, y, k);
-}
-
-size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist) {
-    assert(k % QK_K == 0);
-    (void)hist; // TODO: collect histograms
-
-    for (int j = 0; j < n; j += k) {
-        block_q6_K * restrict y = (block_q6_K *)dst + j/QK_K;
-        quantize_row_q6_K_reference(src + j, y, k);
-    }
-    return (n/QK_K*sizeof(block_q6_K));
-}
-
-// ====================== "True" 2-bit (de)-quantization
-
-static const  uint64_t iq2xxs_grid[256] = {
-    0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
-    0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x08080808082b0808,
-    0x08080808082b082b, 0x08080808082b2b08, 0x08080808082b2b2b, 0x0808080819080819,
-    0x0808080819081908, 0x0808080819190808, 0x0808080819192b08, 0x08080808192b0819,
-    0x08080808192b1908, 0x080808082b080808, 0x080808082b08082b, 0x080808082b082b2b,
-    0x080808082b2b082b, 0x0808081908080819, 0x0808081908081908, 0x0808081908190808,
-    0x0808081908191919, 0x0808081919080808, 0x080808192b081908, 0x080808192b192b08,
-    0x0808082b08080808, 0x0808082b0808082b, 0x0808082b082b082b, 0x0808082b2b08082b,
-    0x0808190808080819, 0x0808190808081908, 0x0808190808190808, 0x08081908082b0819,
-    0x08081908082b1908, 0x0808190819080808, 0x080819081908082b, 0x0808190819082b08,
-    0x08081908192b0808, 0x080819082b080819, 0x080819082b081908, 0x080819082b190808,
-    0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b, 0x0808191908082b08,
-    0x08081919082b0808, 0x080819191908192b, 0x08081919192b2b19, 0x080819192b080808,
-    0x080819192b190819, 0x0808192b08082b19, 0x0808192b08190808, 0x0808192b19080808,
-    0x0808192b2b081908, 0x0808192b2b2b1908, 0x08082b0808080808, 0x08082b0808081919,
-    0x08082b0808082b08, 0x08082b0808191908, 0x08082b08082b2b08, 0x08082b0819080819,
-    0x08082b0819081908, 0x08082b0819190808, 0x08082b081919082b, 0x08082b082b082b08,
-    0x08082b1908081908, 0x08082b1919080808, 0x08082b2b0808082b, 0x08082b2b08191908,
-    0x0819080808080819, 0x0819080808081908, 0x0819080808190808, 0x08190808082b0819,
-    0x0819080819080808, 0x08190808192b0808, 0x081908082b081908, 0x081908082b190808,
-    0x081908082b191919, 0x0819081908080808, 0x0819081908082b08, 0x08190819082b0808,
-    0x0819081919190808, 0x0819081919192b2b, 0x081908192b080808, 0x0819082b082b1908,
-    0x0819082b19081919, 0x0819190808080808, 0x0819190808082b08, 0x08191908082b0808,
-    0x08191908082b1919, 0x0819190819082b19, 0x081919082b080808, 0x0819191908192b08,
-    0x08191919192b082b, 0x0819192b08080808, 0x0819192b0819192b, 0x08192b0808080819,
-    0x08192b0808081908, 0x08192b0808190808, 0x08192b0819080808, 0x08192b082b080819,
-    0x08192b1908080808, 0x08192b1908081919, 0x08192b192b2b0808, 0x08192b2b19190819,
-    0x082b080808080808, 0x082b08080808082b, 0x082b080808082b2b, 0x082b080819081908,
-    0x082b0808192b0819, 0x082b08082b080808, 0x082b08082b08082b, 0x082b0819082b2b19,
-    0x082b081919082b08, 0x082b082b08080808, 0x082b082b0808082b, 0x082b190808080819,
-    0x082b190808081908, 0x082b190808190808, 0x082b190819080808, 0x082b19081919192b,
-    0x082b191908080808, 0x082b191919080819, 0x082b1919192b1908, 0x082b192b2b190808,
-    0x082b2b0808082b08, 0x082b2b08082b0808, 0x082b2b082b191908, 0x082b2b2b19081908,
-    0x1908080808080819, 0x1908080808081908, 0x1908080808190808, 0x1908080808192b08,
-    0x19080808082b0819, 0x19080808082b1908, 0x1908080819080808, 0x1908080819082b08,
-    0x190808081919192b, 0x19080808192b0808, 0x190808082b080819, 0x190808082b081908,
-    0x190808082b190808, 0x1908081908080808, 0x19080819082b0808, 0x19080819192b0819,
-    0x190808192b080808, 0x190808192b081919, 0x1908082b08080819, 0x1908082b08190808,
-    0x1908082b19082b08, 0x1908082b1919192b, 0x1908082b192b2b08, 0x1908190808080808,
-    0x1908190808082b08, 0x19081908082b0808, 0x190819082b080808, 0x190819082b192b19,
-    0x190819190819082b, 0x19081919082b1908, 0x1908192b08080808, 0x19082b0808080819,
-    0x19082b0808081908, 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919,
-    0x19082b1908080808, 0x19082b1919192b08, 0x19082b19192b0819, 0x19082b192b08082b,
-    0x19082b2b19081919, 0x19082b2b2b190808, 0x1919080808080808, 0x1919080808082b08,
-    0x1919080808190819, 0x1919080808192b19, 0x19190808082b0808, 0x191908082b080808,
-    0x191908082b082b08, 0x1919081908081908, 0x191908191908082b, 0x191908192b2b1908,
-    0x1919082b2b190819, 0x191919082b190808, 0x191919082b19082b, 0x1919191908082b2b,
-    0x1919192b08080819, 0x1919192b19191908, 0x19192b0808080808, 0x19192b0808190819,
-    0x19192b0808192b19, 0x19192b08192b1908, 0x19192b1919080808, 0x19192b2b08082b08,
-    0x192b080808081908, 0x192b080808190808, 0x192b080819080808, 0x192b0808192b2b08,
-    0x192b081908080808, 0x192b081919191919, 0x192b082b08192b08, 0x192b082b192b0808,
-    0x192b190808080808, 0x192b190808081919, 0x192b191908190808, 0x192b19190819082b,
-    0x192b19192b081908, 0x192b2b081908082b, 0x2b08080808080808, 0x2b0808080808082b,
-    0x2b08080808082b2b, 0x2b08080819080819, 0x2b0808082b08082b, 0x2b08081908081908,
-    0x2b08081908192b08, 0x2b08081919080808, 0x2b08082b08190819, 0x2b08190808080819,
-    0x2b08190808081908, 0x2b08190808190808, 0x2b08190808191919, 0x2b08190819080808,
-    0x2b081908192b0808, 0x2b08191908080808, 0x2b0819191908192b, 0x2b0819192b191908,
-    0x2b08192b08082b19, 0x2b08192b19080808, 0x2b08192b192b0808, 0x2b082b080808082b,
-    0x2b082b1908081908, 0x2b082b2b08190819, 0x2b19080808081908, 0x2b19080808190808,
-    0x2b190808082b1908, 0x2b19080819080808, 0x2b1908082b2b0819, 0x2b1908190819192b,
-    0x2b1908192b080808, 0x2b19082b19081919, 0x2b19190808080808, 0x2b191908082b082b,
-    0x2b19190819081908, 0x2b19191919190819, 0x2b192b082b080819, 0x2b192b19082b0808,
-    0x2b2b08080808082b, 0x2b2b080819190808, 0x2b2b08082b081919, 0x2b2b081908082b19,
-    0x2b2b082b08080808, 0x2b2b190808192b08, 0x2b2b2b0819190808, 0x2b2b2b1908081908,
-};
-
-static const uint64_t iq2xs_grid[512] = {
-    0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
-    0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b,
-    0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
-    0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b,
-    0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919,
-    0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x080808082b080808,
-    0x080808082b08082b, 0x080808082b081919, 0x080808082b082b08, 0x080808082b190819,
-    0x080808082b191908, 0x080808082b192b19, 0x080808082b2b0808, 0x0808081908080819,
-    0x0808081908081908, 0x080808190808192b, 0x0808081908082b19, 0x0808081908190808,
-    0x080808190819082b, 0x0808081908191919, 0x0808081908192b08, 0x0808081908192b2b,
-    0x08080819082b0819, 0x08080819082b1908, 0x0808081919080808, 0x080808191908082b,
-    0x0808081919081919, 0x0808081919082b08, 0x0808081919190819, 0x0808081919191908,
-    0x08080819192b0808, 0x08080819192b2b08, 0x080808192b080819, 0x080808192b081908,
-    0x080808192b190808, 0x0808082b08080808, 0x0808082b0808082b, 0x0808082b08081919,
-    0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908, 0x0808082b082b0808,
-    0x0808082b19080819, 0x0808082b19081908, 0x0808082b19190808, 0x0808082b19191919,
-    0x0808082b2b080808, 0x0808082b2b082b2b, 0x0808190808080819, 0x0808190808081908,
-    0x080819080808192b, 0x0808190808082b19, 0x0808190808190808, 0x080819080819082b,
-    0x0808190808191919, 0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908,
-    0x0808190819080808, 0x080819081908082b, 0x0808190819081919, 0x0808190819082b08,
-    0x0808190819190819, 0x0808190819191908, 0x080819081919192b, 0x08081908192b0808,
-    0x080819082b080819, 0x080819082b081908, 0x080819082b190808, 0x0808191908080808,
-    0x080819190808082b, 0x0808191908081919, 0x0808191908082b08, 0x0808191908190819,
-    0x0808191908191908, 0x08081919082b0808, 0x0808191919080819, 0x0808191919081908,
-    0x0808191919190808, 0x08081919192b0819, 0x080819192b080808, 0x0808192b08080819,
-    0x0808192b08081908, 0x0808192b08190808, 0x0808192b082b192b, 0x0808192b19080808,
-    0x0808192b1908082b, 0x0808192b2b081908, 0x08082b0808080808, 0x08082b080808082b,
-    0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808082b2b, 0x08082b0808190819,
-    0x08082b0808191908, 0x08082b08082b0808, 0x08082b08082b1919, 0x08082b0819080819,
-    0x08082b0819081908, 0x08082b0819190808, 0x08082b0819192b08, 0x08082b082b080808,
-    0x08082b082b2b0808, 0x08082b082b2b2b2b, 0x08082b1908080819, 0x08082b1908081908,
-    0x08082b1908190808, 0x08082b1919080808, 0x08082b192b080819, 0x08082b192b082b19,
-    0x08082b2b08080808, 0x08082b2b082b0808, 0x08082b2b082b2b08, 0x08082b2b2b19192b,
-    0x08082b2b2b2b0808, 0x0819080808080819, 0x0819080808081908, 0x081908080808192b,
-    0x0819080808082b19, 0x0819080808190808, 0x081908080819082b, 0x0819080808191919,
-    0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908, 0x0819080819080808,
-    0x081908081908082b, 0x0819080819081919, 0x0819080819082b08, 0x0819080819190819,
-    0x0819080819191908, 0x08190808192b0808, 0x08190808192b2b2b, 0x081908082b080819,
-    0x081908082b081908, 0x081908082b190808, 0x0819081908080808, 0x081908190808082b,
-    0x0819081908081919, 0x0819081908082b08, 0x0819081908190819, 0x0819081908191908,
-    0x08190819082b0808, 0x0819081919080819, 0x0819081919081908, 0x0819081919190808,
-    0x081908192b080808, 0x081908192b191908, 0x081908192b19192b, 0x0819082b08080819,
-    0x0819082b08081908, 0x0819082b0808192b, 0x0819082b08190808, 0x0819082b19080808,
-    0x0819082b192b0808, 0x0819190808080808, 0x081919080808082b, 0x0819190808081919,
-    0x0819190808082b08, 0x0819190808190819, 0x0819190808191908, 0x08191908082b0808,
-    0x0819190819080819, 0x0819190819081908, 0x0819190819082b19, 0x0819190819190808,
-    0x08191908192b1908, 0x081919082b080808, 0x0819191908080819, 0x0819191908081908,
-    0x0819191908190808, 0x0819191919080808, 0x0819192b08080808, 0x0819192b08191908,
-    0x0819192b19082b19, 0x08192b0808080819, 0x08192b0808081908, 0x08192b0808190808,
-    0x08192b080819082b, 0x08192b0819080808, 0x08192b0819191908, 0x08192b082b08192b,
-    0x08192b1908080808, 0x08192b1908081919, 0x08192b19192b192b, 0x08192b2b19190819,
-    0x08192b2b2b2b2b19, 0x082b080808080808, 0x082b08080808082b, 0x082b080808081919,
-    0x082b080808082b08, 0x082b080808082b2b, 0x082b080808190819, 0x082b080808191908,
-    0x082b0808082b0808, 0x082b080819080819, 0x082b080819081908, 0x082b080819190808,
-    0x082b08082b080808, 0x082b08082b2b0808, 0x082b081908080819, 0x082b081908081908,
-    0x082b081908190808, 0x082b081919080808, 0x082b081919082b08, 0x082b0819192b1919,
-    0x082b082b08080808, 0x082b082b082b082b, 0x082b082b2b080808, 0x082b082b2b2b2b08,
-    0x082b190808080819, 0x082b190808081908, 0x082b190808190808, 0x082b1908082b2b19,
-    0x082b190819080808, 0x082b191908080808, 0x082b191919080819, 0x082b19191919082b,
-    0x082b19192b192b19, 0x082b192b08080819, 0x082b192b08192b2b, 0x082b192b2b2b192b,
-    0x082b2b0808080808, 0x082b2b0808082b08, 0x082b2b0808082b2b, 0x082b2b08082b0808,
-    0x082b2b0819191919, 0x082b2b082b082b08, 0x082b2b082b2b082b, 0x082b2b19192b2b08,
-    0x082b2b192b190808, 0x082b2b2b08082b08, 0x082b2b2b082b0808, 0x082b2b2b2b08082b,
-    0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819, 0x1908080808081908,
-    0x190808080808192b, 0x1908080808082b19, 0x1908080808190808, 0x190808080819082b,
-    0x1908080808191919, 0x1908080808192b08, 0x19080808082b0819, 0x19080808082b1908,
-    0x1908080819080808, 0x190808081908082b, 0x1908080819081919, 0x1908080819082b08,
-    0x1908080819082b2b, 0x1908080819190819, 0x1908080819191908, 0x19080808192b0808,
-    0x19080808192b1919, 0x190808082b080819, 0x190808082b081908, 0x190808082b190808,
-    0x1908081908080808, 0x190808190808082b, 0x1908081908081919, 0x1908081908082b08,
-    0x1908081908190819, 0x1908081908191908, 0x19080819082b0808, 0x1908081919080819,
-    0x1908081919081908, 0x1908081919190808, 0x190808192b080808, 0x190808192b081919,
-    0x190808192b2b082b, 0x1908082b08080819, 0x1908082b08081908, 0x1908082b08190808,
-    0x1908082b0819082b, 0x1908082b082b2b19, 0x1908082b19080808, 0x1908190808080808,
-    0x190819080808082b, 0x1908190808081919, 0x1908190808082b08, 0x1908190808190819,
-    0x1908190808191908, 0x1908190808192b19, 0x19081908082b0808, 0x1908190819080819,
-    0x1908190819081908, 0x1908190819190808, 0x190819082b080808, 0x190819082b191908,
-    0x1908191908080819, 0x1908191908081908, 0x1908191908190808, 0x19081919082b1908,
-    0x1908191919080808, 0x190819192b192b2b, 0x1908192b08080808, 0x1908192b08082b2b,
-    0x1908192b19081908, 0x1908192b19190808, 0x19082b0808080819, 0x19082b0808081908,
-    0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919, 0x19082b0819191908,
-    0x19082b08192b082b, 0x19082b1908080808, 0x19082b1908190819, 0x19082b1919081908,
-    0x19082b1919190808, 0x19082b19192b2b19, 0x19082b2b08081908, 0x1919080808080808,
-    0x191908080808082b, 0x1919080808081919, 0x1919080808082b08, 0x1919080808190819,
-    0x1919080808191908, 0x19190808082b0808, 0x19190808082b2b08, 0x1919080819080819,
-    0x1919080819081908, 0x1919080819190808, 0x191908082b080808, 0x1919081908080819,
-    0x1919081908081908, 0x1919081908190808, 0x1919081908191919, 0x1919081919080808,
-    0x191908191908082b, 0x1919082b08080808, 0x1919082b19081908, 0x1919082b2b2b2b2b,
-    0x1919190808080819, 0x1919190808081908, 0x1919190808190808, 0x19191908082b0819,
-    0x1919190819080808, 0x19191908192b0808, 0x191919082b080819, 0x191919082b2b0819,
-    0x1919191908080808, 0x1919191908082b08, 0x191919192b080808, 0x191919192b082b08,
-    0x1919192b082b0819, 0x1919192b192b2b08, 0x1919192b2b2b0819, 0x19192b0808080808,
-    0x19192b0808191908, 0x19192b0819080819, 0x19192b0819190808, 0x19192b082b192b19,
-    0x19192b1908192b2b, 0x19192b1919080808, 0x19192b191908082b, 0x19192b2b2b081919,
-    0x192b080808080819, 0x192b080808081908, 0x192b080808190808, 0x192b080819080808,
-    0x192b080819191908, 0x192b0808192b082b, 0x192b08082b08192b, 0x192b08082b2b2b19,
-    0x192b081908080808, 0x192b082b082b1908, 0x192b082b19082b2b, 0x192b082b2b19082b,
-    0x192b190808080808, 0x192b19080819192b, 0x192b191908190808, 0x192b191919080808,
-    0x192b191919081919, 0x192b19192b2b1908, 0x192b2b0808080819, 0x192b2b08192b2b2b,
-    0x192b2b19082b1919, 0x192b2b2b0808192b, 0x192b2b2b19191908, 0x192b2b2b192b082b,
-    0x2b08080808080808, 0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08,
-    0x2b08080808190819, 0x2b08080808191908, 0x2b080808082b0808, 0x2b080808082b2b2b,
-    0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808082b080808,
-    0x2b0808082b08082b, 0x2b0808082b2b2b08, 0x2b0808082b2b2b2b, 0x2b08081908080819,
-    0x2b08081908081908, 0x2b0808190808192b, 0x2b08081908190808, 0x2b08081919080808,
-    0x2b08081919190819, 0x2b08081919192b19, 0x2b08082b08080808, 0x2b08082b082b0808,
-    0x2b08082b2b080808, 0x2b08082b2b08082b, 0x2b08082b2b2b0808, 0x2b08082b2b2b2b08,
-    0x2b08190808080819, 0x2b08190808081908, 0x2b08190808190808, 0x2b0819080819082b,
-    0x2b08190808191919, 0x2b08190819080808, 0x2b081908192b0808, 0x2b0819082b082b19,
-    0x2b08191908080808, 0x2b08191919081908, 0x2b0819192b2b1919, 0x2b08192b08192b08,
-    0x2b08192b192b2b2b, 0x2b082b0808080808, 0x2b082b0808082b08, 0x2b082b08082b1919,
-    0x2b082b0819192b2b, 0x2b082b082b080808, 0x2b082b082b08082b, 0x2b082b082b2b2b08,
-    0x2b082b190808192b, 0x2b082b2b082b082b, 0x2b082b2b2b080808, 0x2b082b2b2b082b08,
-    0x2b082b2b2b19192b, 0x2b082b2b2b2b2b08, 0x2b19080808080819, 0x2b19080808081908,
-    0x2b19080808190808, 0x2b19080819080808, 0x2b1908081919192b, 0x2b1908082b081908,
-    0x2b19081908080808, 0x2b190819082b082b, 0x2b190819192b1908, 0x2b19082b1919192b,
-    0x2b19082b2b082b19, 0x2b19190808080808, 0x2b19190808081919, 0x2b19190819081908,
-    0x2b19190819190808, 0x2b19190819192b08, 0x2b191919082b2b19, 0x2b1919192b190808,
-    0x2b1919192b19082b, 0x2b19192b19080819, 0x2b192b0819190819, 0x2b192b082b2b192b,
-    0x2b192b1919082b19, 0x2b192b2b08191919, 0x2b192b2b192b0808, 0x2b2b080808080808,
-    0x2b2b08080808082b, 0x2b2b080808082b08, 0x2b2b080808082b2b, 0x2b2b0808082b0808,
-    0x2b2b0808082b2b2b, 0x2b2b08082b2b0808, 0x2b2b081919190819, 0x2b2b081919192b19,
-    0x2b2b08192b2b192b, 0x2b2b082b08080808, 0x2b2b082b0808082b, 0x2b2b082b08082b08,
-    0x2b2b082b082b2b2b, 0x2b2b082b2b080808, 0x2b2b082b2b2b0808, 0x2b2b190819080808,
-    0x2b2b19082b191919, 0x2b2b192b192b1919, 0x2b2b192b2b192b08, 0x2b2b2b0808082b2b,
-    0x2b2b2b08082b0808, 0x2b2b2b08082b082b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b0808,
-    0x2b2b2b082b2b2b08, 0x2b2b2b1908081908, 0x2b2b2b192b081908, 0x2b2b2b192b08192b,
-    0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
-};
-
-static const uint8_t ksigns_iq2xs[128] = {
-      0, 129, 130,   3, 132,   5,   6, 135, 136,   9,  10, 139,  12, 141, 142,  15,
-    144,  17,  18, 147,  20, 149, 150,  23,  24, 153, 154,  27, 156,  29,  30, 159,
-    160,  33,  34, 163,  36, 165, 166,  39,  40, 169, 170,  43, 172,  45,  46, 175,
-     48, 177, 178,  51, 180,  53,  54, 183, 184,  57,  58, 187,  60, 189, 190,  63,
-    192,  65,  66, 195,  68, 197, 198,  71,  72, 201, 202,  75, 204,  77,  78, 207,
-     80, 209, 210,  83, 212,  85,  86, 215, 216,  89,  90, 219,  92, 221, 222,  95,
-     96, 225, 226,  99, 228, 101, 102, 231, 232, 105, 106, 235, 108, 237, 238, 111,
-    240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255,
-};
-
-static const uint8_t kmask_iq2xs[8] = {1, 2, 4, 8, 16, 32, 64, 128};
-
-void quantize_row_iq2_xxs_reference(const float * restrict x, block_iq2_xxs * restrict y, int k) {
-    (void)x;
-    (void)y;
-    (void)k;
-    assert(k % QK_K == 0);
-    //fprintf(stderr, "=========================== %s: not implemented\n", __func__);
-}
-
-void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k) {
-    assert(k % QK_K == 0);
-    const int nb = k / QK_K;
-
-    uint32_t aux32[2];
-    const uint8_t * aux8 = (const uint8_t *)aux32;
-
-    for (int i = 0; i < nb; i++) {
-
-        const float d = GGML_FP16_TO_FP32(x[i].d);
-
-        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
-            memcpy(aux32, x[i].qs + 4*ib32, 2*sizeof(uint32_t));
-            const float db = d * (0.5f + (aux32[1] >> 28)) * 0.25f;
-            for (int l = 0; l < 4; ++l) {
-                const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
-                const uint8_t  signs = ksigns_iq2xs[(aux32[1] >> 7*l) & 127];
-                for (int j = 0; j < 8; ++j) {
-                    y[j] = db * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
-                }
-                y += 8;
-            }
-        }
-    }
-}
-
-void quantize_row_iq2_xxs(const float * restrict x, void * restrict vy, int k) {
-    assert(k % QK_K == 0);
-    block_iq2_xxs * restrict y = vy;
-    quantize_row_iq2_xxs_reference(x, y, k);
-}
-
-size_t ggml_quantize_iq2_xxs(const float * src, void * dst, int n, int k, int64_t * hist) {
-    assert(k % QK_K == 0);
-    (void)hist; // TODO: collect histograms
-
-    for (int j = 0; j < n; j += k) {
-        block_iq2_xxs * restrict y = (block_iq2_xxs *)dst + j/QK_K;
-        quantize_row_iq2_xxs_reference(src + j, y, k);
-    }
-    return (n/QK_K*sizeof(block_iq2_xxs));
-}
-
-// ====================== 2.3125 bpw (de)-quantization
-
-void quantize_row_iq2_xs_reference(const float * restrict x, block_iq2_xs * restrict y, int k) {
-    (void)x;
-    (void)y;
-    (void)k;
-    assert(k % QK_K == 0);
-    //fprintf(stderr, "=========================== %s: not implemented\n", __func__);
-}
-
-void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y, int k) {
-    assert(k % QK_K == 0);
-    const int nb = k / QK_K;
-
-    float db[2];
-
-    for (int i = 0; i < nb; i++) {
-
-        const float d = GGML_FP16_TO_FP32(x[i].d);
-
-        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
-            db[0] = d * (0.5f + (x[i].scales[ib32] & 0xf)) * 0.25f;
-            db[1] = d * (0.5f + (x[i].scales[ib32] >>  4)) * 0.25f;
-            for (int l = 0; l < 4; ++l) {
-                const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (x[i].qs[4*ib32 + l] & 511));
-                const uint8_t  signs = ksigns_iq2xs[x[i].qs[4*ib32 + l] >> 9];
-                for (int j = 0; j < 8; ++j) {
-                    y[j] = db[l/2] * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
-                }
-                y += 8;
-            }
-        }
-    }
-}
-
-void quantize_row_iq2_xs(const float * restrict x, void * restrict vy, int k) {
-    assert(k % QK_K == 0);
-    block_iq2_xs * restrict y = vy;
-    quantize_row_iq2_xs_reference(x, y, k);
-}
-
-size_t ggml_quantize_iq2_xs(const float * src, void * dst, int n, int k, int64_t * hist) {
-    assert(k % QK_K == 0);
-    (void)hist; // TODO: collect histograms
-
-    for (int j = 0; j < n; j += k) {
-        block_iq2_xs * restrict y = (block_iq2_xs *)dst + j/QK_K;
-        quantize_row_iq2_xs_reference(src + j, y, k);
-    }
-    return (n/QK_K*sizeof(block_iq2_xs));
-}
-
-//===================================== Q8_K ==============================================
-
-void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k) {
-    assert(k % QK_K == 0);
-    const int nb = k / QK_K;
-
-    for (int i = 0; i < nb; i++) {
-
-        float max = 0;
-        float amax = 0;
-        for (int j = 0; j < QK_K; ++j) {
-            float ax = fabsf(x[j]);
-            if (ax > amax) {
-                amax = ax; max = x[j];
-            }
-        }
-        if (!amax) {
-            y[i].d = 0;
-            memset(y[i].qs, 0, QK_K);
-            x += QK_K;
-            continue;
-        }
-        //const float iscale = -128.f/max;
-        // We need this change for IQ2_XXS, else the AVX implementation becomes very awkward
-        const float iscale = -127.f/max;
-        for (int j = 0; j < QK_K; ++j) {
-            int v = nearest_int(iscale*x[j]);
-            y[i].qs[j] = MIN(127, v);
-        }
-        for (int j = 0; j < QK_K/16; ++j) {
-            int sum = 0;
-            for (int ii = 0; ii < 16; ++ii) {
-                sum += y[i].qs[j*16 + ii];
-            }
-            y[i].bsums[j] = sum;
-        }
-        y[i].d = 1/iscale;
-        x += QK_K;
-    }
-}
-
-void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k) {
-    assert(k % QK_K == 0);
-    const int nb = k / QK_K;
-
-    for (int i = 0; i < nb; i++) {
-        for (int j = 0; j < QK_K; ++j) {
-            *y++ = x[i].d * x[i].qs[j];
-        }
-    }
-}
-
-void quantize_row_q8_K(const float * restrict x, void * restrict y, int k) {
-    quantize_row_q8_K_reference(x, y, k);
-}
-
-//===================================== Dot ptoducts =================================
-
-//
-// Helper functions
-//
-#if __AVX__ || __AVX2__ || __AVX512F__
-
-// shuffles to pick the required scales in dot products
-static inline __m256i get_scale_shuffle_q3k(int i) {
-    static const uint8_t k_shuffle[128] = {
-         0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,     2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
-         4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5,     6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7,
-         8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9,    10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,
-        12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,    14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,
-    };
-    return _mm256_loadu_si256((const __m256i*)k_shuffle + i);
-}
-static inline __m256i get_scale_shuffle_k4(int i) {
-    static const uint8_t k_shuffle[256] = {
-         0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
-         2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
-         4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5,
-         6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7,
-         8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9,
-        10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,
-        12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,
-        14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15
-    };
-    return _mm256_loadu_si256((const __m256i*)k_shuffle + i);
-}
-static inline __m128i get_scale_shuffle(int i) {
-    static const uint8_t k_shuffle[128] = {
-         0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
-         2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
-         4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5,
-         6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7,
-         8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9,
-        10,10,10,10,10,10,10,10, 11,11,11,11,11,11,11,11,
-        12,12,12,12,12,12,12,12, 13,13,13,13,13,13,13,13,
-        14,14,14,14,14,14,14,14, 15,15,15,15,15,15,15,15
-    };
-    return _mm_loadu_si128((const __m128i*)k_shuffle + i);
-}
-#endif
-
-void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
-    const int qk = QK8_0;
-    const int nb = n / qk;
-
-    assert(n % qk == 0);
-
-    const block_q4_0 * restrict x = vx;
-    const block_q8_0 * restrict y = vy;
-
-#if defined(__ARM_NEON)
-    float32x4_t sumv0 = vdupq_n_f32(0.0f);
-    float32x4_t sumv1 = vdupq_n_f32(0.0f);
-
-    assert(nb % 2 == 0); // TODO: handle odd nb
-
-    for (int i = 0; i < nb; i += 2) {
-        const block_q4_0 * restrict x0 = &x[i + 0];
-        const block_q4_0 * restrict x1 = &x[i + 1];
-        const block_q8_0 * restrict y0 = &y[i + 0];
-        const block_q8_0 * restrict y1 = &y[i + 1];
-
-        const uint8x16_t m4b = vdupq_n_u8(0x0F);
-        const int8x16_t  s8b = vdupq_n_s8(0x8);
-
-        const uint8x16_t v0_0 = vld1q_u8(x0->qs);
-        const uint8x16_t v0_1 = vld1q_u8(x1->qs);
-
-        // 4-bit -> 8-bit
-        const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8  (v0_0, m4b));
-        const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
-        const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8  (v0_1, m4b));
-        const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
-
-        // sub 8
-        const int8x16_t v0_0ls = vsubq_s8(v0_0l, s8b);
-        const int8x16_t v0_0hs = vsubq_s8(v0_0h, s8b);
-        const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b);
-        const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b);
-
-        // load y
-        const int8x16_t v1_0l = vld1q_s8(y0->qs);
-        const int8x16_t v1_0h = vld1q_s8(y0->qs + 16);
-        const int8x16_t v1_1l = vld1q_s8(y1->qs);
-        const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
-
-        // dot product into int32x4_t
-        const int32x4_t p_0 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h);
-        const int32x4_t p_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h);
-
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
-    }
-
-    *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
-#elif defined(__AVX2__)
-    // Initialize accumulator with zeros
-    __m256 acc = _mm256_setzero_ps();
-
-    // Main loop
-    for (int i = 0; i < nb; ++i) {
-        /* Compute combined scale for the block */
-        const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) );
-
-        __m256i bx = bytes_from_nibbles_32(x[i].qs);
-
-        // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
-        const __m256i off = _mm256_set1_epi8( 8 );
-        bx = _mm256_sub_epi8( bx, off );
-
-        __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
-
-        const __m256 q = mul_sum_i8_pairs_float(bx, by);
-
-        /* Multiply q with scale and accumulate */
-        acc = _mm256_fmadd_ps( d, q, acc );
-    }
-
-    *s = hsum_float_8(acc);
-#elif defined(__AVX__)
-    // Initialize accumulator with zeros
-    __m256 acc = _mm256_setzero_ps();
-
-    // Main loop
-    for (int i = 0; i < nb; ++i) {
-        // Compute combined scale for the block
-        const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) );
-
-        const __m128i lowMask = _mm_set1_epi8(0xF);
-        const __m128i off = _mm_set1_epi8(8);
-
-        const __m128i tmp = _mm_loadu_si128((const __m128i *)x[i].qs);
-
-        __m128i bx = _mm_and_si128(lowMask, tmp);
-        __m128i by = _mm_loadu_si128((const __m128i *)y[i].qs);
-        bx = _mm_sub_epi8(bx, off);
-        const __m128i i32_0 = mul_sum_i8_pairs(bx, by);
-
-        bx = _mm_and_si128(lowMask, _mm_srli_epi64(tmp, 4));
-        by = _mm_loadu_si128((const __m128i *)(y[i].qs + 16));
-        bx = _mm_sub_epi8(bx, off);
-        const __m128i i32_1 = mul_sum_i8_pairs(bx, by);
-
-        // Convert int32_t to float
-        __m256 p = _mm256_cvtepi32_ps(MM256_SET_M128I(i32_0, i32_1));
-
-        // Apply the scale, and accumulate
-        acc = _mm256_add_ps(_mm256_mul_ps( d, p ), acc);
-    }
-
-    *s = hsum_float_8(acc);
-#elif defined(__SSSE3__)
-    // set constants
-    const __m128i lowMask = _mm_set1_epi8(0xF);
-    const __m128i off = _mm_set1_epi8(8);
-
-    // Initialize accumulator with zeros
-    __m128 acc_0 = _mm_setzero_ps();
-    __m128 acc_1 = _mm_setzero_ps();
-    __m128 acc_2 = _mm_setzero_ps();
-    __m128 acc_3 = _mm_setzero_ps();
-
-    // First round without accumulation
-    {
-        _mm_prefetch(&x[0] + sizeof(block_q4_0), _MM_HINT_T0);
-        _mm_prefetch(&y[0] + sizeof(block_q8_0), _MM_HINT_T0);
-
-        // Compute combined scale for the block 0 and 1
-        const __m128 d_0_1 = _mm_set1_ps( GGML_FP16_TO_FP32(x[0].d) * GGML_FP16_TO_FP32(y[0].d) );
-
-        const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[0].qs);
-
-        __m128i bx_0 = _mm_and_si128(lowMask, tmp_0_1);
-        __m128i by_0 = _mm_loadu_si128((const __m128i *)y[0].qs);
-        bx_0 = _mm_sub_epi8(bx_0, off);
-        const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0);
-
-        __m128i bx_1 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_0_1, 4));
-        __m128i by_1 = _mm_loadu_si128((const __m128i *)(y[0].qs + 16));
-        bx_1 = _mm_sub_epi8(bx_1, off);
-        const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1);
-
-        _mm_prefetch(&x[1] + sizeof(block_q4_0), _MM_HINT_T0);
-        _mm_prefetch(&y[1] + sizeof(block_q8_0), _MM_HINT_T0);
-
-        // Compute combined scale for the block 2 and 3
-        const __m128 d_2_3 = _mm_set1_ps( GGML_FP16_TO_FP32(x[1].d) * GGML_FP16_TO_FP32(y[1].d) );
-
-        const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[1].qs);
-
-        __m128i bx_2 = _mm_and_si128(lowMask, tmp_2_3);
-        __m128i by_2 = _mm_loadu_si128((const __m128i *)y[1].qs);
-        bx_2 = _mm_sub_epi8(bx_2, off);
-        const __m128i i32_2 = mul_sum_i8_pairs(bx_2, by_2);
-
-        __m128i bx_3 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_2_3, 4));
-        __m128i by_3 = _mm_loadu_si128((const __m128i *)(y[1].qs + 16));
-        bx_3 = _mm_sub_epi8(bx_3, off);
-        const __m128i i32_3 = mul_sum_i8_pairs(bx_3, by_3);
-
-        // Convert int32_t to float
-        __m128 p0 = _mm_cvtepi32_ps(i32_0);
-        __m128 p1 = _mm_cvtepi32_ps(i32_1);
-        __m128 p2 = _mm_cvtepi32_ps(i32_2);
-        __m128 p3 = _mm_cvtepi32_ps(i32_3);
-
-        // Apply the scale
-        acc_0 = _mm_mul_ps( d_0_1, p0 );
-        acc_1 = _mm_mul_ps( d_0_1, p1 );
-        acc_2 = _mm_mul_ps( d_2_3, p2 );
-        acc_3 = _mm_mul_ps( d_2_3, p3 );
-    }
-
-    assert(nb % 2 == 0); // TODO: handle odd nb
-
-    // Main loop
-    for (int i = 2; i < nb; i+=2) {
-        _mm_prefetch(&x[i] + sizeof(block_q4_0), _MM_HINT_T0);
-        _mm_prefetch(&y[i] + sizeof(block_q8_0), _MM_HINT_T0);
-
-        // Compute combined scale for the block 0 and 1
-        const __m128 d_0_1 = _mm_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) );
-
-        const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[i].qs);
-
-        __m128i bx_0 = _mm_and_si128(lowMask, tmp_0_1);
-        __m128i by_0 = _mm_loadu_si128((const __m128i *)y[i].qs);
-        bx_0 = _mm_sub_epi8(bx_0, off);
-        const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0);
-
-        __m128i bx_1 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_0_1, 4));
-        __m128i by_1 = _mm_loadu_si128((const __m128i *)(y[i].qs + 16));
-        bx_1 = _mm_sub_epi8(bx_1, off);
-        const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1);
-
-        _mm_prefetch(&x[i] + 2 * sizeof(block_q4_0), _MM_HINT_T0);
-        _mm_prefetch(&y[i] + 2 * sizeof(block_q8_0), _MM_HINT_T0);
-
-        // Compute combined scale for the block 2 and 3
-        const __m128 d_2_3 = _mm_set1_ps( GGML_FP16_TO_FP32(x[i + 1].d) * GGML_FP16_TO_FP32(y[i + 1].d) );
-
-        const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[i + 1].qs);
-
-        __m128i bx_2 = _mm_and_si128(lowMask, tmp_2_3);
-        __m128i by_2 = _mm_loadu_si128((const __m128i *)y[i + 1].qs);
-        bx_2 = _mm_sub_epi8(bx_2, off);
-        const __m128i i32_2 = mul_sum_i8_pairs(bx_2, by_2);
-
-        __m128i bx_3 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_2_3, 4));
-        __m128i by_3 = _mm_loadu_si128((const __m128i *)(y[i + 1].qs + 16));
-        bx_3 = _mm_sub_epi8(bx_3, off);
-        const __m128i i32_3 = mul_sum_i8_pairs(bx_3, by_3);
-
-        // Convert int32_t to float
-        __m128 p0 = _mm_cvtepi32_ps(i32_0);
-        __m128 p1 = _mm_cvtepi32_ps(i32_1);
-        __m128 p2 = _mm_cvtepi32_ps(i32_2);
-        __m128 p3 = _mm_cvtepi32_ps(i32_3);
-
-        // Apply the scale
-        __m128 p0_d = _mm_mul_ps( d_0_1, p0 );
-        __m128 p1_d = _mm_mul_ps( d_0_1, p1 );
-        __m128 p2_d = _mm_mul_ps( d_2_3, p2 );
-        __m128 p3_d = _mm_mul_ps( d_2_3, p3 );
-
-        // Acummulate
-        acc_0 = _mm_add_ps(p0_d, acc_0);
-        acc_1 = _mm_add_ps(p1_d, acc_1);
-        acc_2 = _mm_add_ps(p2_d, acc_2);
-        acc_3 = _mm_add_ps(p3_d, acc_3);
-    }
-
-    *s = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
-#elif defined(__riscv_v_intrinsic)
-    float sumf = 0.0;
-
-    size_t vl = __riscv_vsetvl_e8m1(qk/2);
-
-    for (int i = 0; i < nb; i++) {
-        // load elements
-        vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[i].qs, vl);
-
-        vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[i].qs, vl);
-        vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[i].qs+16, vl);
-
-        // mask and store lower part of x, and then upper part
-        vuint8mf2_t x_a = __riscv_vand_vx_u8mf2(tx, 0x0F, vl);
-        vuint8mf2_t x_l = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl);
-
-        vint8mf2_t x_ai = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a);
-        vint8mf2_t x_li = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l);
-
-        // subtract offset
-        vint8mf2_t v0 = __riscv_vsub_vx_i8mf2(x_ai, 8, vl);
-        vint8mf2_t v1 = __riscv_vsub_vx_i8mf2(x_li, 8, vl);
-
-        vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl);
-        vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl);
-
-        vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
-
-        vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl);
-        vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl);
-
-        int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
-
-        sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d);
-    }
-
-    *s = sumf;
-#else
-    // scalar
-    float sumf = 0.0;
-
-    for (int i = 0; i < nb; i++) {
-        int sumi = 0;
-
-        for (int j = 0; j < qk/2; ++j) {
-            const int v0 = (x[i].qs[j] & 0x0F) - 8;
-            const int v1 = (x[i].qs[j] >>   4) - 8;
-
-            sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]);
-        }
-
-        sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d);
-    }
-
-    *s = sumf;
-#endif
-}
-
-void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
-    const int qk = QK8_1;
-    const int nb = n / qk;
-
-    assert(n % qk == 0);
-
-    const block_q4_1 * restrict x = vx;
-    const block_q8_1 * restrict y = vy;
-
-    // TODO: add WASM SIMD
-#if defined(__ARM_NEON)
-    float32x4_t sumv0 = vdupq_n_f32(0.0f);
-    float32x4_t sumv1 = vdupq_n_f32(0.0f);
-
-    float summs = 0;
-
-    assert(nb % 2 == 0); // TODO: handle odd nb
-
-    for (int i = 0; i < nb; i += 2) {
-        const block_q4_1 * restrict x0 = &x[i + 0];
-        const block_q4_1 * restrict x1 = &x[i + 1];
-        const block_q8_1 * restrict y0 = &y[i + 0];
-        const block_q8_1 * restrict y1 = &y[i + 1];
-
-        summs += GGML_FP16_TO_FP32(x0->m) * y0->s + GGML_FP16_TO_FP32(x1->m) * y1->s;
-
-        const uint8x16_t m4b = vdupq_n_u8(0x0F);
-
-        const uint8x16_t v0_0 = vld1q_u8(x0->qs);
-        const uint8x16_t v0_1 = vld1q_u8(x1->qs);
-
-        // 4-bit -> 8-bit
-        const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8  (v0_0, m4b));
-        const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
-        const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8  (v0_1, m4b));
-        const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
-
-        // load y
-        const int8x16_t v1_0l = vld1q_s8(y0->qs);
-        const int8x16_t v1_0h = vld1q_s8(y0->qs + 16);
-        const int8x16_t v1_1l = vld1q_s8(y1->qs);
-        const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
-
-        // dot product into int32x4_t
-        const int32x4_t p_0 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h);
-        const int32x4_t p_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h);
-
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*y0->d);
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*y1->d);
-    }
-
-    *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs;
-#elif defined(__AVX2__) || defined(__AVX__)
-    // Initialize accumulator with zeros
-    __m256 acc = _mm256_setzero_ps();
-
-    float summs = 0;
-
-    // Main loop
-    for (int i = 0; i < nb; ++i) {
-        const float d0 = GGML_FP16_TO_FP32(x[i].d);
-        const float d1 = y[i].d;
-
-        summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s;
-
-        const __m256 d0v = _mm256_set1_ps( d0 );
-        const __m256 d1v = _mm256_set1_ps( d1 );
-
-        // Compute combined scales
-        const __m256 d0d1 = _mm256_mul_ps( d0v, d1v );
-
-        // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes
-        const __m256i bx = bytes_from_nibbles_32(x[i].qs);
-        const __m256i by = _mm256_loadu_si256( (const __m256i *)y[i].qs );
-
-        const __m256 xy = mul_sum_us8_pairs_float(bx, by);
-
-        // Accumulate d0*d1*x*y
-#if defined(__AVX2__)
-        acc = _mm256_fmadd_ps( d0d1, xy, acc );
-#else
-        acc = _mm256_add_ps( _mm256_mul_ps( d0d1, xy ), acc );
-#endif
-    }
-
-    *s = hsum_float_8(acc) + summs;
-#elif defined(__riscv_v_intrinsic)
-    float sumf = 0.0;
-
-    size_t vl = __riscv_vsetvl_e8m1(qk/2);
-
-    for (int i = 0; i < nb; i++) {
-        // load elements
-        vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[i].qs, vl);
-
-        vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[i].qs, vl);
-        vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[i].qs+16, vl);
-
-        // mask and store lower part of x, and then upper part
-        vuint8mf2_t x_a = __riscv_vand_vx_u8mf2(tx, 0x0F, vl);
-        vuint8mf2_t x_l = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl);
-
-        vint8mf2_t v0 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a);
-        vint8mf2_t v1 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l);
-
-        vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl);
-        vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl);
-
-        vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
-
-        vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl);
-        vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl);
-
-        int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
-
-        sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
-    }
-
-    *s = sumf;
-#else
-    // scalar
-    float sumf = 0.0;
-
-    for (int i = 0; i < nb; i++) {
-        int sumi = 0;
-
-        for (int j = 0; j < qk/2; ++j) {
-            const int v0 = (x[i].qs[j] & 0x0F);
-            const int v1 = (x[i].qs[j] >>   4);
-
-            sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]);
-        }
-
-        sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
-    }
-
-    *s = sumf;
-#endif
-}
-
-void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
-    const int qk = QK8_0;
-    const int nb = n / qk;
-
-    assert(n % qk == 0);
-    assert(qk == QK5_0);
-
-    const block_q5_0 * restrict x = vx;
-    const block_q8_0 * restrict y = vy;
-
-#if defined(__ARM_NEON)
-    float32x4_t sumv0 = vdupq_n_f32(0.0f);
-    float32x4_t sumv1 = vdupq_n_f32(0.0f);
-
-    uint32_t qh0;
-    uint32_t qh1;
-
-    uint64_t tmp0[4];
-    uint64_t tmp1[4];
-
-    assert(nb % 2 == 0); // TODO: handle odd nb
-
-    for (int i = 0; i < nb; i += 2) {
-        const block_q5_0 * restrict x0 = &x[i];
-        const block_q5_0 * restrict x1 = &x[i + 1];
-        const block_q8_0 * restrict y0 = &y[i];
-        const block_q8_0 * restrict y1 = &y[i + 1];
-
-        const uint8x16_t m4b = vdupq_n_u8(0x0F);
-
-        // extract the 5th bit via lookup table ((!b) << 4)
-        memcpy(&qh0, x0->qh, sizeof(qh0));
-        memcpy(&qh1, x1->qh, sizeof(qh1));
-
-        tmp0[0] = table_b2b_1[(qh0 >>  0) & 0xFF];
-        tmp0[1] = table_b2b_1[(qh0 >>  8) & 0xFF];
-        tmp0[2] = table_b2b_1[(qh0 >> 16) & 0xFF];
-        tmp0[3] = table_b2b_1[(qh0 >> 24)       ];
-
-        tmp1[0] = table_b2b_1[(qh1 >>  0) & 0xFF];
-        tmp1[1] = table_b2b_1[(qh1 >>  8) & 0xFF];
-        tmp1[2] = table_b2b_1[(qh1 >> 16) & 0xFF];
-        tmp1[3] = table_b2b_1[(qh1 >> 24)       ];
-
-        const int8x16_t qhl0 = vld1q_s8((const int8_t *)(tmp0 + 0));
-        const int8x16_t qhh0 = vld1q_s8((const int8_t *)(tmp0 + 2));
-        const int8x16_t qhl1 = vld1q_s8((const int8_t *)(tmp1 + 0));
-        const int8x16_t qhh1 = vld1q_s8((const int8_t *)(tmp1 + 2));
-
-        const uint8x16_t v0_0 = vld1q_u8(x0->qs);
-        const uint8x16_t v0_1 = vld1q_u8(x1->qs);
-
-        // 4-bit -> 8-bit
-        int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8  (v0_0, m4b));
-        int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
-        int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8  (v0_1, m4b));
-        int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
-
-        // add high bit and sub 16 (equivalent to sub 0x10 when bit is zero)
-        const int8x16_t v0_0lf = vsubq_s8(v0_0l, qhl0);
-        const int8x16_t v0_0hf = vsubq_s8(v0_0h, qhh0);
-        const int8x16_t v0_1lf = vsubq_s8(v0_1l, qhl1);
-        const int8x16_t v0_1hf = vsubq_s8(v0_1h, qhh1);
-
-        // load y
-        const int8x16_t v1_0l = vld1q_s8(y0->qs);
-        const int8x16_t v1_0h = vld1q_s8(y0->qs + 16);
-        const int8x16_t v1_1l = vld1q_s8(y1->qs);
-        const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
-
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
-                        ggml_vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
-                        ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
-                        ggml_vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
-                        ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
-    }
-
-    *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
-#elif defined(__wasm_simd128__)
-    v128_t sumv = wasm_f32x4_splat(0.0f);
-
-    uint32_t qh;
-    uint64_t tmp[4];
-
-    // TODO: check if unrolling this is better
-    for (int i = 0; i < nb; ++i) {
-        const block_q5_0 * restrict x0 = &x[i];
-        const block_q8_0 * restrict y0 = &y[i];
-
-        const v128_t m4b  = wasm_i8x16_splat(0x0F);
-
-        // extract the 5th bit
-        memcpy(&qh, x0->qh, sizeof(qh));
-
-        tmp[0] = table_b2b_1[(qh >>  0) & 0xFF];
-        tmp[1] = table_b2b_1[(qh >>  8) & 0xFF];
-        tmp[2] = table_b2b_1[(qh >> 16) & 0xFF];
-        tmp[3] = table_b2b_1[(qh >> 24)       ];
-
-        const v128_t qhl = wasm_v128_load(tmp + 0);
-        const v128_t qhh = wasm_v128_load(tmp + 2);
-
-        const v128_t v0 = wasm_v128_load(x0->qs);
-
-        // 4-bit -> 8-bit
-        const v128_t v0l = wasm_v128_and (v0, m4b);
-        const v128_t v0h = wasm_u8x16_shr(v0, 4);
-
-        // add high bit and sub 16 (equivalent to sub 0x10 when bit is zero)
-        const v128_t v0lf = wasm_i8x16_sub(v0l, qhl);
-        const v128_t v0hf = wasm_i8x16_sub(v0h, qhh);
-
-        // load y
-        const v128_t v1l = wasm_v128_load(y0->qs);
-        const v128_t v1h = wasm_v128_load(y0->qs + 16);
-
-        // int8x16 -> int16x8
-        const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf);
-        const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf);
-        const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf);
-        const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf);
-
-        const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l);
-        const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l);
-        const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
-        const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
-
-        // dot product
-        sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(
-                        wasm_i32x4_add(
-                            wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
-                                           wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
-                            wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
-                                           wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
-                    wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d))));
-    }
-
-    *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
-         wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3);
-#elif defined(__AVX2__)
-    // Initialize accumulator with zeros
-    __m256 acc = _mm256_setzero_ps();
-
-    // Main loop
-    for (int i = 0; i < nb; i++) {
-        /* Compute combined scale for the block */
-        const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));
-
-        __m256i bx = bytes_from_nibbles_32(x[i].qs);
-        __m256i bxhi = bytes_from_bits_32(x[i].qh);
-        bxhi = _mm256_andnot_si256(bxhi, _mm256_set1_epi8((char)0xF0));
-        bx = _mm256_or_si256(bx, bxhi);
-
-        __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
-
-        const __m256 q = mul_sum_i8_pairs_float(bx, by);
-
-        /* Multiply q with scale and accumulate */
-        acc = _mm256_fmadd_ps(d, q, acc);
-    }
-
-    *s = hsum_float_8(acc);
-#elif defined(__AVX__)
-    // Initialize accumulator with zeros
-    __m256 acc = _mm256_setzero_ps();
-    __m128i mask = _mm_set1_epi8((char)0xF0);
-
-    // Main loop
-    for (int i = 0; i < nb; i++) {
-        /* Compute combined scale for the block */
-        const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));
-
-        __m256i bx = bytes_from_nibbles_32(x[i].qs);
-        const __m256i bxhi = bytes_from_bits_32(x[i].qh);
-        __m128i bxhil = _mm256_castsi256_si128(bxhi);
-        __m128i bxhih = _mm256_extractf128_si256(bxhi, 1);
-        bxhil = _mm_andnot_si128(bxhil, mask);
-        bxhih = _mm_andnot_si128(bxhih, mask);
-        __m128i bxl = _mm256_castsi256_si128(bx);
-        __m128i bxh = _mm256_extractf128_si256(bx, 1);
-        bxl = _mm_or_si128(bxl, bxhil);
-        bxh = _mm_or_si128(bxh, bxhih);
-        bx = MM256_SET_M128I(bxh, bxl);
-
-        const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
-
-        const __m256 q = mul_sum_i8_pairs_float(bx, by);
-
-        /* Multiply q with scale and accumulate */
-        acc = _mm256_add_ps(_mm256_mul_ps(d, q), acc);
-    }
-
-    *s = hsum_float_8(acc);
-#elif defined(__riscv_v_intrinsic)
-    float sumf = 0.0;
-
-    uint32_t qh;
-
-    size_t vl = __riscv_vsetvl_e8m1(qk/2);
-
-    // These temporary registers are for masking and shift operations
-    vuint32m2_t vt_1 = __riscv_vid_v_u32m2(vl);
-    vuint32m2_t vt_2 = __riscv_vsll_vv_u32m2(__riscv_vmv_v_x_u32m2(1, vl), vt_1, vl);
-
-    vuint32m2_t vt_3 = __riscv_vsll_vx_u32m2(vt_2, 16, vl);
-    vuint32m2_t vt_4 = __riscv_vadd_vx_u32m2(vt_1, 12, vl);
-
-    for (int i = 0; i < nb; i++) {
-        memcpy(&qh, x[i].qh, sizeof(uint32_t));
-
-        // ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
-        vuint32m2_t xha_0 = __riscv_vand_vx_u32m2(vt_2, qh, vl);
-        vuint32m2_t xhr_0 = __riscv_vsrl_vv_u32m2(xha_0, vt_1, vl);
-        vuint32m2_t xhl_0 = __riscv_vsll_vx_u32m2(xhr_0, 4, vl);
-
-        // ((qh & (1u << (j + 16))) >> (j + 12));
-        vuint32m2_t xha_1 = __riscv_vand_vx_u32m2(vt_3, qh, vl);
-        vuint32m2_t xhl_1 = __riscv_vsrl_vv_u32m2(xha_1, vt_4, vl);
-
-        // narrowing
-        vuint16m1_t xhc_0 = __riscv_vncvt_x_x_w_u16m1(xhl_0, vl);
-        vuint8mf2_t xh_0 = __riscv_vncvt_x_x_w_u8mf2(xhc_0, vl);
-
-        vuint16m1_t xhc_1 = __riscv_vncvt_x_x_w_u16m1(xhl_1, vl);
-        vuint8mf2_t xh_1 = __riscv_vncvt_x_x_w_u8mf2(xhc_1, vl);
-
-        // load
-        vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[i].qs, vl);
-
-        vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[i].qs, vl);
-        vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[i].qs+16, vl);
-
-        vuint8mf2_t x_at = __riscv_vand_vx_u8mf2(tx, 0x0F, vl);
-        vuint8mf2_t x_lt = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl);
-
-        vuint8mf2_t x_a = __riscv_vor_vv_u8mf2(x_at, xh_0, vl);
-        vuint8mf2_t x_l = __riscv_vor_vv_u8mf2(x_lt, xh_1, vl);
-
-        vint8mf2_t x_ai = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a);
-        vint8mf2_t x_li = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l);
-
-        vint8mf2_t v0 = __riscv_vsub_vx_i8mf2(x_ai, 16, vl);
-        vint8mf2_t v1 = __riscv_vsub_vx_i8mf2(x_li, 16, vl);
-
-        vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl);
-        vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl);
-
-        vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
-
-        vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl);
-        vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl);
-
-        int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
-
-        sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)) * sumi;
-    }
-
-    *s = sumf;
-#else
-    // scalar
-    float sumf = 0.0;
-
-    for (int i = 0; i < nb; i++) {
-        uint32_t qh;
-        memcpy(&qh, x[i].qh, sizeof(qh));
-
-        int sumi = 0;
-
-        for (int j = 0; j < qk/2; ++j) {
-            const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
-            const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12));
-
-            const int32_t x0 = ((x[i].qs[j] & 0x0F) | xh_0) - 16;
-            const int32_t x1 = ((x[i].qs[j] >>   4) | xh_1) - 16;
-
-            sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
-        }
-
-        sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)) * sumi;
-    }
-
-    *s = sumf;
-#endif
-}
-
-void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
-    const int qk = QK8_1;
-    const int nb = n / qk;
-
-    assert(n % qk == 0);
-    assert(qk == QK5_1);
-
-    const block_q5_1 * restrict x = vx;
-    const block_q8_1 * restrict y = vy;
-
-#if defined(__ARM_NEON)
-    float32x4_t sumv0 = vdupq_n_f32(0.0f);
-    float32x4_t sumv1 = vdupq_n_f32(0.0f);
-
-    float summs0 = 0.0f;
-    float summs1 = 0.0f;
-
-    uint32_t qh0;
-    uint32_t qh1;
-
-    uint64_t tmp0[4];
-    uint64_t tmp1[4];
-
-    assert(nb % 2 == 0); // TODO: handle odd nb
-
-    for (int i = 0; i < nb; i += 2) {
-        const block_q5_1 * restrict x0 = &x[i];
-        const block_q5_1 * restrict x1 = &x[i + 1];
-        const block_q8_1 * restrict y0 = &y[i];
-        const block_q8_1 * restrict y1 = &y[i + 1];
-
-        const uint8x16_t m4b = vdupq_n_u8(0x0F);
-
-        summs0 += GGML_FP16_TO_FP32(x0->m) * y0->s;
-        summs1 += GGML_FP16_TO_FP32(x1->m) * y1->s;
-
-        // extract the 5th bit via lookup table ((b) << 4)
-        memcpy(&qh0, x0->qh, sizeof(qh0));
-        memcpy(&qh1, x1->qh, sizeof(qh1));
-
-        tmp0[0] = table_b2b_0[(qh0 >>  0) & 0xFF];
-        tmp0[1] = table_b2b_0[(qh0 >>  8) & 0xFF];
-        tmp0[2] = table_b2b_0[(qh0 >> 16) & 0xFF];
-        tmp0[3] = table_b2b_0[(qh0 >> 24)       ];
-
-        tmp1[0] = table_b2b_0[(qh1 >>  0) & 0xFF];
-        tmp1[1] = table_b2b_0[(qh1 >>  8) & 0xFF];
-        tmp1[2] = table_b2b_0[(qh1 >> 16) & 0xFF];
-        tmp1[3] = table_b2b_0[(qh1 >> 24)       ];
-
-        const int8x16_t qhl0 = vld1q_s8((const int8_t *)(tmp0 + 0));
-        const int8x16_t qhh0 = vld1q_s8((const int8_t *)(tmp0 + 2));
-        const int8x16_t qhl1 = vld1q_s8((const int8_t *)(tmp1 + 0));
-        const int8x16_t qhh1 = vld1q_s8((const int8_t *)(tmp1 + 2));
-
-        const uint8x16_t v0_0 = vld1q_u8(x0->qs);
-        const uint8x16_t v0_1 = vld1q_u8(x1->qs);
-
-        // 4-bit -> 8-bit
-        const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8  (v0_0, m4b));
-        const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
-        const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8  (v0_1, m4b));
-        const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
-
-        // add high bit
-        const int8x16_t v0_0lf = vorrq_s8(v0_0l, qhl0);
-        const int8x16_t v0_0hf = vorrq_s8(v0_0h, qhh0);
-        const int8x16_t v0_1lf = vorrq_s8(v0_1l, qhl1);
-        const int8x16_t v0_1hf = vorrq_s8(v0_1h, qhh1);
-
-        // load y
-        const int8x16_t v1_0l = vld1q_s8(y0->qs);
-        const int8x16_t v1_0h = vld1q_s8(y0->qs + 16);
-        const int8x16_t v1_1l = vld1q_s8(y1->qs);
-        const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
-
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
-                        ggml_vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
-                        ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*y0->d);
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
-                        ggml_vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
-                        ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*y1->d);
-    }
-
-    *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs0 + summs1;
-#elif defined(__wasm_simd128__)
-    v128_t sumv = wasm_f32x4_splat(0.0f);
-
-    float summs = 0.0f;
-
-    uint32_t qh;
-    uint64_t tmp[4];
-
-    // TODO: check if unrolling this is better
-    for (int i = 0; i < nb; ++i) {
-        const block_q5_1 * restrict x0 = &x[i];
-        const block_q8_1 * restrict y0 = &y[i];
-
-        summs += GGML_FP16_TO_FP32(x0->m) * y0->s;
-
-        const v128_t m4b = wasm_i8x16_splat(0x0F);
-
-        // extract the 5th bit
-        memcpy(&qh, x0->qh, sizeof(qh));
-
-        tmp[0] = table_b2b_0[(qh >>  0) & 0xFF];
-        tmp[1] = table_b2b_0[(qh >>  8) & 0xFF];
-        tmp[2] = table_b2b_0[(qh >> 16) & 0xFF];
-        tmp[3] = table_b2b_0[(qh >> 24)       ];
-
-        const v128_t qhl = wasm_v128_load(tmp + 0);
-        const v128_t qhh = wasm_v128_load(tmp + 2);
-
-        const v128_t v0 = wasm_v128_load(x0->qs);
-
-        // 4-bit -> 8-bit
-        const v128_t v0l = wasm_v128_and (v0, m4b);
-        const v128_t v0h = wasm_u8x16_shr(v0, 4);
-
-        // add high bit
-        const v128_t v0lf = wasm_v128_or(v0l, qhl);
-        const v128_t v0hf = wasm_v128_or(v0h, qhh);
-
-        // load y
-        const v128_t v1l = wasm_v128_load(y0->qs);
-        const v128_t v1h = wasm_v128_load(y0->qs + 16);
-
-        // int8x16 -> int16x8
-        const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf);
-        const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf);
-        const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf);
-        const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf);
-
-        const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l);
-        const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l);
-        const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
-        const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
-
-        // dot product
-        sumv = wasm_f32x4_add(sumv,
-                wasm_f32x4_mul(wasm_f32x4_convert_i32x4(wasm_i32x4_add(
-                            wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
-                                           wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
-                            wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
-                                           wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
-                    wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * y0->d)));
-    }
-
-    *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
-         wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3) + summs;
-#elif defined(__AVX2__)
-    // Initialize accumulator with zeros
-    __m256 acc = _mm256_setzero_ps();
-
-    float summs = 0.0f;
-
-    // Main loop
-    for (int i = 0; i < nb; i++) {
-        const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d));
-
-        summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s;
-
-        __m256i bx = bytes_from_nibbles_32(x[i].qs);
-        __m256i bxhi = bytes_from_bits_32(x[i].qh);
-        bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10));
-        bx = _mm256_or_si256(bx, bxhi);
-
-        const __m256 dy = _mm256_set1_ps(y[i].d);
-        const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
-
-        const __m256 q = mul_sum_us8_pairs_float(bx, by);
-
-        acc = _mm256_fmadd_ps(q, _mm256_mul_ps(dx, dy), acc);
-    }
-
-    *s = hsum_float_8(acc) + summs;
-#elif defined(__AVX__)
-    // Initialize accumulator with zeros
-    __m256 acc = _mm256_setzero_ps();
-    __m128i mask = _mm_set1_epi8(0x10);
-
-    float summs = 0.0f;
-
-    // Main loop
-    for (int i = 0; i < nb; i++) {
-        const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d));
-
-        summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s;
-
-        __m256i bx = bytes_from_nibbles_32(x[i].qs);
-        const __m256i bxhi = bytes_from_bits_32(x[i].qh);
-        __m128i bxhil = _mm256_castsi256_si128(bxhi);
-        __m128i bxhih = _mm256_extractf128_si256(bxhi, 1);
-        bxhil = _mm_and_si128(bxhil, mask);
-        bxhih = _mm_and_si128(bxhih, mask);
-        __m128i bxl = _mm256_castsi256_si128(bx);
-        __m128i bxh = _mm256_extractf128_si256(bx, 1);
-        bxl = _mm_or_si128(bxl, bxhil);
-        bxh = _mm_or_si128(bxh, bxhih);
-        bx = MM256_SET_M128I(bxh, bxl);
-
-        const __m256 dy = _mm256_set1_ps(y[i].d);
-        const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
-
-        const __m256 q = mul_sum_us8_pairs_float(bx, by);
-
-        acc = _mm256_add_ps(_mm256_mul_ps(q, _mm256_mul_ps(dx, dy)), acc);
-    }
-
-    *s = hsum_float_8(acc) + summs;
-#elif defined(__riscv_v_intrinsic)
-    float sumf = 0.0;
-
-    uint32_t qh;
-
-    size_t vl = __riscv_vsetvl_e8m1(qk/2);
-
-    // temporary registers for shift operations
-    vuint32m2_t vt_1 = __riscv_vid_v_u32m2(vl);
-    vuint32m2_t vt_2 = __riscv_vadd_vx_u32m2(vt_1, 12, vl);
-
-    for (int i = 0; i < nb; i++) {
-        memcpy(&qh, x[i].qh, sizeof(uint32_t));
-
-        // load qh
-        vuint32m2_t vqh = __riscv_vmv_v_x_u32m2(qh, vl);
-
-        // ((qh >> (j +  0)) << 4) & 0x10;
-        vuint32m2_t xhr_0 = __riscv_vsrl_vv_u32m2(vqh, vt_1, vl);
-        vuint32m2_t xhl_0 = __riscv_vsll_vx_u32m2(xhr_0, 4, vl);
-        vuint32m2_t xha_0 = __riscv_vand_vx_u32m2(xhl_0, 0x10, vl);
-
-        // ((qh >> (j + 12))     ) & 0x10;
-        vuint32m2_t xhr_1 = __riscv_vsrl_vv_u32m2(vqh, vt_2, vl);
-        vuint32m2_t xha_1 = __riscv_vand_vx_u32m2(xhr_1, 0x10, vl);
-
-        // narrowing
-        vuint16m1_t xhc_0 = __riscv_vncvt_x_x_w_u16m1(xha_0, vl);
-        vuint8mf2_t xh_0 = __riscv_vncvt_x_x_w_u8mf2(xhc_0, vl);
-
-        vuint16m1_t xhc_1 = __riscv_vncvt_x_x_w_u16m1(xha_1, vl);
-        vuint8mf2_t xh_1 = __riscv_vncvt_x_x_w_u8mf2(xhc_1, vl);
-
-        // load
-        vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[i].qs, vl);
-
-        vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[i].qs, vl);
-        vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[i].qs+16, vl);
-
-        vuint8mf2_t x_at = __riscv_vand_vx_u8mf2(tx, 0x0F, vl);
-        vuint8mf2_t x_lt = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl);
-
-        vuint8mf2_t x_a = __riscv_vor_vv_u8mf2(x_at, xh_0, vl);
-        vuint8mf2_t x_l = __riscv_vor_vv_u8mf2(x_lt, xh_1, vl);
-
-        vint8mf2_t v0 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a);
-        vint8mf2_t v1 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l);
-
-        vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl);
-        vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl);
-
-        vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
-
-        vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl);
-        vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl);
-
-        int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
-
-        sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
-    }
-
-    *s = sumf;
-#else
-    // scalar
-    float sumf = 0.0;
-
-    for (int i = 0; i < nb; i++) {
-        uint32_t qh;
-        memcpy(&qh, x[i].qh, sizeof(qh));
-
-        int sumi = 0;
-
-        for (int j = 0; j < qk/2; ++j) {
-            const uint8_t xh_0 = ((qh >> (j +  0)) << 4) & 0x10;
-            const uint8_t xh_1 = ((qh >> (j + 12))     ) & 0x10;
-
-            const int32_t x0 = (x[i].qs[j] & 0xF) | xh_0;
-            const int32_t x1 = (x[i].qs[j] >>  4) | xh_1;
-
-            sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
-        }
-
-        sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
-    }
-
-    *s = sumf;
-#endif
-}
-
-void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
-    const int qk = QK8_0;
-    const int nb = n / qk;
-
-    assert(n % qk == 0);
-
-    const block_q8_0 * restrict x = vx;
-    const block_q8_0 * restrict y = vy;
-
-#if defined(__ARM_NEON)
-    float32x4_t sumv0 = vdupq_n_f32(0.0f);
-    float32x4_t sumv1 = vdupq_n_f32(0.0f);
-
-    assert(nb % 2 == 0); // TODO: handle odd nb
-
-    for (int i = 0; i < nb; i += 2) {
-        const block_q8_0 * restrict x0 = &x[i + 0];
-        const block_q8_0 * restrict x1 = &x[i + 1];
-        const block_q8_0 * restrict y0 = &y[i + 0];
-        const block_q8_0 * restrict y1 = &y[i + 1];
-
-        const int8x16_t x0_0 = vld1q_s8(x0->qs);
-        const int8x16_t x0_1 = vld1q_s8(x0->qs + 16);
-        const int8x16_t x1_0 = vld1q_s8(x1->qs);
-        const int8x16_t x1_1 = vld1q_s8(x1->qs + 16);
-
-        // load y
-        const int8x16_t y0_0 = vld1q_s8(y0->qs);
-        const int8x16_t y0_1 = vld1q_s8(y0->qs + 16);
-        const int8x16_t y1_0 = vld1q_s8(y1->qs);
-        const int8x16_t y1_1 = vld1q_s8(y1->qs + 16);
-
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
-                        ggml_vdotq_s32(vdupq_n_s32(0), x0_0, y0_0),
-                        ggml_vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
-
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
-                        ggml_vdotq_s32(vdupq_n_s32(0), x1_0, y1_0),
-                        ggml_vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
-    }
-
-    *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
-#elif defined(__AVX2__) || defined(__AVX__)
-    // Initialize accumulator with zeros
-    __m256 acc = _mm256_setzero_ps();
-
-    // Main loop
-    for (int i = 0; i < nb; ++i) {
-        // Compute combined scale for the block
-        const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));
-        __m256i bx = _mm256_loadu_si256((const __m256i *)x[i].qs);
-        __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
-
-        const __m256 q = mul_sum_i8_pairs_float(bx, by);
-
-        // Multiply q with scale and accumulate
-#if defined(__AVX2__)
-        acc = _mm256_fmadd_ps( d, q, acc );
-#else
-        acc = _mm256_add_ps( _mm256_mul_ps( d, q ), acc );
-#endif
-    }
-
-    *s = hsum_float_8(acc);
-#elif defined(__riscv_v_intrinsic)
-    float sumf = 0.0;
-    size_t vl = __riscv_vsetvl_e8m1(qk);
-
-    for (int i = 0; i < nb; i++) {
-        // load elements
-        vint8m1_t bx = __riscv_vle8_v_i8m1(x[i].qs, vl);
-        vint8m1_t by = __riscv_vle8_v_i8m1(y[i].qs, vl);
-
-        vint16m2_t vw_mul = __riscv_vwmul_vv_i16m2(bx, by, vl);
-
-        vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, vl);
-        vint32m1_t v_sum = __riscv_vwredsum_vs_i16m2_i32m1(vw_mul, v_zero, vl);
-
-        int sumi = __riscv_vmv_x_s_i32m1_i32(v_sum);
-
-        sumf += sumi*(GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d));
-    }
-
-    *s = sumf;
-#else
-    // scalar
-    float sumf = 0.0;
-
-    for (int i = 0; i < nb; i++) {
-        int sumi = 0;
-
-        for (int j = 0; j < qk; j++) {
-            sumi += x[i].qs[j]*y[i].qs[j];
-        }
-
-        sumf += sumi*(GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d));
-    }
-
-    *s = sumf;
-#endif
-}
-
-#if QK_K == 256
-void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
-
-    const block_q2_K * restrict x = vx;
-    const block_q8_K * restrict y = vy;
-
-    const int nb = n / QK_K;
-
-#ifdef __ARM_NEON
-    const uint8x16_t m3 = vdupq_n_u8(0x3);
-    const uint8x16_t m4 = vdupq_n_u8(0xF);
-
-    const int32x4_t vzero = vdupq_n_s32(0);
-
-    ggml_int8x16x2_t q2bytes;
-    uint8_t aux[16];
-
-    float sum = 0;
-
-    for (int i = 0; i < nb; ++i) {
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
-
-        const uint8_t * restrict q2 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-        const uint8_t * restrict sc = x[i].scales;
-
-        const uint8x16_t mins_and_scales = vld1q_u8(sc);
-        const uint8x16_t scales = vandq_u8(mins_and_scales, m4);
-        vst1q_u8(aux, scales);
-
-        const uint8x16_t mins = vshrq_n_u8(mins_and_scales, 4);
-        const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums);
-        const ggml_int16x8x2_t mins16 = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mins))), vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mins)))}};
-        const int32x4_t s0 = vaddq_s32(vmull_s16(vget_low_s16 (mins16.val[0]), vget_low_s16 (q8sums.val[0])),
-                                       vmull_s16(vget_high_s16(mins16.val[0]), vget_high_s16(q8sums.val[0])));
-        const int32x4_t s1 = vaddq_s32(vmull_s16(vget_low_s16 (mins16.val[1]), vget_low_s16 (q8sums.val[1])),
-                                       vmull_s16(vget_high_s16(mins16.val[1]), vget_high_s16(q8sums.val[1])));
-        sum += dmin * vaddvq_s32(vaddq_s32(s0, s1));
-
-        int isum = 0;
-        int is = 0;
-
-// We use this macro instead of a function call because for some reason
-// the code runs 2-3% slower, even if the function is declared inline
-#define MULTIPLY_ACCUM_WITH_SCALE(index)\
-        isum += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * aux[is+(index)];\
-        isum += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * aux[is+1+(index)];
-
-#define SHIFT_MULTIPLY_ACCUM_WITH_SCALE(shift, index)\
-        q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;\
-        q2bytes.val[0] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits.val[0], (shift)), m3));\
-        q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits.val[1], (shift)), m3));\
-        MULTIPLY_ACCUM_WITH_SCALE((index));
-
-        for (int j = 0; j < QK_K/128; ++j) {
-            const ggml_uint8x16x2_t q2bits = ggml_vld1q_u8_x2(q2); q2 += 32;
-
-            ggml_int8x16x2_t q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
-            q2bytes.val[0] = vreinterpretq_s8_u8(vandq_u8(q2bits.val[0], m3));
-            q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(q2bits.val[1], m3));
-
-            MULTIPLY_ACCUM_WITH_SCALE(0);
-
-            SHIFT_MULTIPLY_ACCUM_WITH_SCALE(2, 2);
-            SHIFT_MULTIPLY_ACCUM_WITH_SCALE(4, 4);
-            SHIFT_MULTIPLY_ACCUM_WITH_SCALE(6, 6);
-
-            is += 8;
-        }
-
-        sum += d * isum;
-    }
-
-    *s = sum;
-
-#elif defined __AVX2__
-
-    const __m256i m3 = _mm256_set1_epi8(3);
-    const __m128i m4 = _mm_set1_epi8(0xF);
-
-    __m256 acc = _mm256_setzero_ps();
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
-
-        const uint8_t * restrict q2 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales);
-        const __m128i scales8 = _mm_and_si128(mins_and_scales, m4);
-        const __m128i mins8 = _mm_and_si128(_mm_srli_epi16(mins_and_scales, 4), m4);
-        const __m256i mins = _mm256_cvtepi8_epi16(mins8);
-        const __m256i prod = _mm256_madd_epi16(mins, _mm256_loadu_si256((const __m256i*)y[i].bsums));
-
-        acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(prod), acc);
-
-        const __m256i all_scales = _mm256_cvtepi8_epi16(scales8);
-        const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0);
-        const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1);
-        const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)};
-
-        __m256i sumi = _mm256_setzero_si256();
-
-        for (int j = 0; j < QK_K/128; ++j) {
-
-            const __m256i q2bits = _mm256_loadu_si256((const __m256i*)q2); q2 += 32;
-
-            const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-            const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-            const __m256i q8_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-            const __m256i q8_3 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-
-            const __m256i q2_0 = _mm256_and_si256(q2bits, m3);
-            const __m256i q2_1 = _mm256_and_si256(_mm256_srli_epi16(q2bits, 2), m3);
-            const __m256i q2_2 = _mm256_and_si256(_mm256_srli_epi16(q2bits, 4), m3);
-            const __m256i q2_3 = _mm256_and_si256(_mm256_srli_epi16(q2bits, 6), m3);
-
-            __m256i p0 = _mm256_maddubs_epi16(q2_0, q8_0);
-            __m256i p1 = _mm256_maddubs_epi16(q2_1, q8_1);
-            __m256i p2 = _mm256_maddubs_epi16(q2_2, q8_2);
-            __m256i p3 = _mm256_maddubs_epi16(q2_3, q8_3);
-
-            p0 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(0)), p0);
-            p1 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(1)), p1);
-            p2 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(2)), p2);
-            p3 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(3)), p3);
-
-            p0 = _mm256_add_epi32(p0, p1);
-            p2 = _mm256_add_epi32(p2, p3);
-
-            sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p0, p2));
-        }
-
-        acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi), acc);
-
-    }
-
-    *s = hsum_float_8(acc);
-
-#elif defined __AVX__
-
-    const __m128i m3 = _mm_set1_epi8(0x3);
-    const __m128i m4 = _mm_set1_epi8(0xF);
-    const __m128i m2 = _mm_set1_epi8(0x2);
-
-    __m256 acc = _mm256_setzero_ps();
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
-
-        const uint8_t * restrict q2 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        // load mins and scales from block_q2_K.scales[QK_K/16]
-        const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales);
-        const __m128i scales16 = _mm_and_si128(mins_and_scales, m4);
-        const __m128i mins16 = _mm_and_si128(_mm_srli_epi16(mins_and_scales, 4), m4);
-        const __m128i mins_0 = _mm_cvtepi8_epi16(mins16);
-        const __m128i mins_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(mins16, mins16));
-
-        // summs = y[i].bsums * (x[i].scales >> 4) in 16bits*8*2 to 32bits*4*2
-        const __m128i summs_0 = _mm_madd_epi16(mins_0, _mm_loadu_si128((const __m128i*)&y[i].bsums[0]));
-        const __m128i summs_1 = _mm_madd_epi16(mins_1, _mm_loadu_si128((const __m128i*)&y[i].bsums[8]));
-
-        // sumf += -dmin * summs in 32bits*8
-        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(MM256_SET_M128I(summs_1, summs_0))), acc);
-
-        const __m128i scales_0 = _mm_cvtepi8_epi16(scales16);
-        const __m128i scales_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(scales16, scales16));
-        const __m128i scales[2] = { scales_0, scales_1 };
-
-        __m128i sumi_0 = _mm_setzero_si128();
-        __m128i sumi_1 = _mm_setzero_si128();
-
-        for (int j = 0; j < QK_K/128; ++j) {
-
-            // load Q8 quants int8*16*8 from block_q8_K.qs[QK_K]
-            const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_2 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_3 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_4 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_5 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_6 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_7 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-
-            // load 2bits*16*8 from block_q2_K.qs[QK_K/4]
-            __m128i q2bits = _mm_loadu_si128((const __m128i*)q2); q2 += 16;
-            const __m128i q2_0 = _mm_and_si128(q2bits, m3);
-            const __m128i q2_2 = _mm_and_si128(_mm_srli_epi16(q2bits, 2), m3);
-            const __m128i q2_4 = _mm_and_si128(_mm_srli_epi16(q2bits, 4), m3);
-            const __m128i q2_6 = _mm_and_si128(_mm_srli_epi16(q2bits, 6), m3);
-            q2bits = _mm_loadu_si128((const __m128i*)q2); q2 += 16;
-            const __m128i q2_1 = _mm_and_si128(q2bits, m3);
-            const __m128i q2_3 = _mm_and_si128(_mm_srli_epi16(q2bits, 2), m3);
-            const __m128i q2_5 = _mm_and_si128(_mm_srli_epi16(q2bits, 4), m3);
-            const __m128i q2_7 = _mm_and_si128(_mm_srli_epi16(q2bits, 6), m3);
-
-            // isuml = q8[l] * ((q2[l] >> shift) & 3) in 8bits*16*8 to 16bits*8*8
-            __m128i p0 = _mm_maddubs_epi16(q2_0, q8_0);
-            __m128i p1 = _mm_maddubs_epi16(q2_1, q8_1);
-            __m128i p2 = _mm_maddubs_epi16(q2_2, q8_2);
-            __m128i p3 = _mm_maddubs_epi16(q2_3, q8_3);
-            __m128i p4 = _mm_maddubs_epi16(q2_4, q8_4);
-            __m128i p5 = _mm_maddubs_epi16(q2_5, q8_5);
-            __m128i p6 = _mm_maddubs_epi16(q2_6, q8_6);
-            __m128i p7 = _mm_maddubs_epi16(q2_7, q8_7);
-
-            // isum += (x[i].scales[is++] & 0xF) * isuml in 16bits*8*8 to 32bits*4*8
-            __m128i shuffle = _mm_set1_epi16(0x0100);
-            p0 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p0);
-            shuffle = _mm_add_epi16(shuffle, m2);
-            p1 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p1);
-            shuffle = _mm_add_epi16(shuffle, m2);
-            p2 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p2);
-            shuffle = _mm_add_epi16(shuffle, m2);
-            p3 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p3);
-            shuffle = _mm_add_epi16(shuffle, m2);
-            p4 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p4);
-            shuffle = _mm_add_epi16(shuffle, m2);
-            p5 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p5);
-            shuffle = _mm_add_epi16(shuffle, m2);
-            p6 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p6);
-            shuffle = _mm_add_epi16(shuffle, m2);
-            p7 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p7);
-
-            p0 = _mm_add_epi32(p0, p1);
-            p2 = _mm_add_epi32(p2, p3);
-            p4 = _mm_add_epi32(p4, p5);
-            p6 = _mm_add_epi32(p6, p7);
-
-            // isum in 32bits*4*2
-            sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p0, p2));
-            sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p4, p6));
-        }
-
-        // sumf += dall * isum - dmin * summs in 32bits
-        __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
-        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dall), _mm256_cvtepi32_ps(sumi)), acc);
-    }
-
-    *s = hsum_float_8(acc);
-
-#elif defined __riscv_v_intrinsic
-
-    float sumf = 0;
-    uint8_t temp_01[32] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
-
-    for (int i = 0; i < nb; ++i) {
-
-        const uint8_t * q2 = x[i].qs;
-        const  int8_t * q8 = y[i].qs;
-        const uint8_t * sc = x[i].scales;
-
-        const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
-
-        size_t vl = 16;
-
-        vuint8m1_t scales = __riscv_vle8_v_u8m1(sc, vl);
-        vuint8m1_t aux = __riscv_vand_vx_u8m1(scales, 0x0F, vl);
-
-        vint16m1_t q8sums = __riscv_vle16_v_i16m1(y[i].bsums, vl);
-
-        vuint8mf2_t scales_2 = __riscv_vle8_v_u8mf2(sc, vl);
-        vuint8mf2_t mins8 = __riscv_vsrl_vx_u8mf2(scales_2, 0x4, vl);
-        vint16m1_t mins = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(mins8, vl));
-        vint32m2_t prod = __riscv_vwmul_vv_i32m2(q8sums, mins, vl);
-        vint32m1_t vsums = __riscv_vredsum_vs_i32m2_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl);
-
-        sumf  += dmin * __riscv_vmv_x_s_i32m1_i32(vsums);
-
-        vl = 32;
-
-        vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
-        vuint8m1_t v_b = __riscv_vle8_v_u8m1(temp_01, vl);
-
-        uint8_t is=0;
-        int isum=0;
-
-        for (int j = 0; j < QK_K/128; ++j) {
-            // load Q2
-            vuint8m1_t q2_x = __riscv_vle8_v_u8m1(q2, vl);
-
-            vuint8m1_t q2_0 = __riscv_vand_vx_u8m1(q2_x, 0x03, vl);
-            vuint8m1_t q2_1 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q2_x, 0x2, vl), 0x03 , vl);
-            vuint8m1_t q2_2 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q2_x, 0x4, vl), 0x03 , vl);
-            vuint8m1_t q2_3 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q2_x, 0x6, vl), 0x03 , vl);
-
-            // duplicate scale elements for product
-            vuint8m1_t sc0 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 0+is, vl), vl);
-            vuint8m1_t sc1 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 2+is, vl), vl);
-            vuint8m1_t sc2 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 4+is, vl), vl);
-            vuint8m1_t sc3 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 6+is, vl), vl);
-
-            vint16m2_t p0 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_0, sc0, vl));
-            vint16m2_t p1 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_1, sc1, vl));
-            vint16m2_t p2 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_2, sc2, vl));
-            vint16m2_t p3 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_3, sc3, vl));
-
-            // load Q8
-            vint8m1_t q8_0 = __riscv_vle8_v_i8m1(q8, vl);
-            vint8m1_t q8_1 = __riscv_vle8_v_i8m1(q8+32, vl);
-            vint8m1_t q8_2 = __riscv_vle8_v_i8m1(q8+64, vl);
-            vint8m1_t q8_3 = __riscv_vle8_v_i8m1(q8+96, vl);
-
-            vint32m4_t s0 = __riscv_vwmul_vv_i32m4(p0, __riscv_vwcvt_x_x_v_i16m2(q8_0, vl), vl);
-            vint32m4_t s1 = __riscv_vwmul_vv_i32m4(p1, __riscv_vwcvt_x_x_v_i16m2(q8_1, vl), vl);
-            vint32m4_t s2 = __riscv_vwmul_vv_i32m4(p2, __riscv_vwcvt_x_x_v_i16m2(q8_2, vl), vl);
-            vint32m4_t s3 = __riscv_vwmul_vv_i32m4(p3, __riscv_vwcvt_x_x_v_i16m2(q8_3, vl), vl);
-
-            vint32m1_t isum0 = __riscv_vredsum_vs_i32m4_i32m1(__riscv_vadd_vv_i32m4(s0, s1, vl), vzero, vl);
-            vint32m1_t isum1 = __riscv_vredsum_vs_i32m4_i32m1(__riscv_vadd_vv_i32m4(s2, s3, vl), isum0, vl);
-
-            isum += __riscv_vmv_x_s_i32m1_i32(isum1);
-
-            q2+=32;  q8+=128;  is=8;
-
-        }
-
-        sumf += dall * isum;
-
-    }
-
-    *s = sumf;
-
-#else
-
-    float sumf = 0;
-
-    for (int i = 0; i < nb; ++i) {
-
-        const uint8_t * q2 = x[i].qs;
-        const  int8_t * q8 = y[i].qs;
-        const uint8_t * sc = x[i].scales;
-
-        int summs = 0;
-        for (int j = 0; j < 16; ++j) {
-            summs += y[i].bsums[j] * (sc[j] >> 4);
-        }
-
-        const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
-
-        int isum = 0;
-        int is = 0;
-        int d;
-        for (int k = 0; k < QK_K/128; ++k) {
-            int shift = 0;
-            for (int j = 0; j < 4; ++j) {
-                d = sc[is++] & 0xF;
-                int isuml = 0;
-                for (int l =  0; l < 16; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
-                isum += d * isuml;
-                d = sc[is++] & 0xF;
-                isuml = 0;
-                for (int l = 16; l < 32; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
-                isum += d * isuml;
-                shift += 2;
-                q8 += 32;
-            }
-            q2 += 32;
-        }
-        sumf += dall * isum - dmin * summs;
-    }
-    *s = sumf;
-#endif
-}
-
-#else
-
-void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
-
-    const block_q2_K * restrict x = vx;
-    const block_q8_K * restrict y = vy;
-
-    const int nb = n / QK_K;
-
-#ifdef __ARM_NEON
-    const uint8x16_t m3 = vdupq_n_u8(0x3);
-
-    const int32x4_t vzero = vdupq_n_s32(0);
-
-    ggml_int8x16x4_t q2bytes;
-
-    uint32_t aux32[2];
-    const uint8_t * scales = (const uint8_t *)aux32;
-
-    float sum = 0;
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d = y[i].d * (float)x[i].d;
-        const float dmin = -y[i].d * (float)x[i].dmin;
-
-        const uint8_t * restrict q2 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-        const uint32_t * restrict sc = (const uint32_t *)x[i].scales;
-
-        aux32[0] = sc[0] & 0x0f0f0f0f;
-        aux32[1] = (sc[0] >> 4) & 0x0f0f0f0f;
-
-        sum += dmin * (scales[4] * y[i].bsums[0] + scales[5] * y[i].bsums[1] + scales[6] * y[i].bsums[2] + scales[7] * y[i].bsums[3]);
-
-        int isum1 = 0, isum2 = 0;
-
-        const uint8x16_t q2bits = vld1q_u8(q2);
-
-        const ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(q8);
-
-        q2bytes.val[0] = vreinterpretq_s8_u8(vandq_u8(q2bits, m3));
-        q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits, 2), m3));
-        q2bytes.val[2] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits, 4), m3));
-        q2bytes.val[3] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits, 6), m3));
-
-        isum1 += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * scales[0];
-        isum2 += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * scales[1];
-        isum1 += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[2], q8bytes.val[2])) * scales[2];
-        isum2 += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[3], q8bytes.val[3])) * scales[3];
-
-        sum += d * (isum1 + isum2);
-    }
-
-    *s = sum;
-
-#elif defined __AVX2__
-
-    const __m256i m3 = _mm256_set1_epi8(3);
-
-    __m256 acc = _mm256_setzero_ps();
-
-    uint32_t ud, um;
-    const uint8_t * restrict db = (const uint8_t *)&ud;
-    const uint8_t * restrict mb = (const uint8_t *)&um;
-
-    float summs = 0;
-
-    // TODO: optimize this
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
-
-        const uint8_t * restrict q2 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        const uint32_t * restrict sc = (const uint32_t *)x[i].scales;
-        ud = (sc[0] >> 0) & 0x0f0f0f0f;
-        um = (sc[0] >> 4) & 0x0f0f0f0f;
-
-        int32_t smin = mb[0] * y[i].bsums[0] + mb[1] * y[i].bsums[1] + mb[2] * y[i].bsums[2] + mb[3] * y[i].bsums[3];
-        summs += dmin * smin;
-
-        const __m128i q2bits = _mm_loadu_si128((const __m128i*)q2);
-        const __m256i q2_0 = _mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q2bits, 2), q2bits), m3);
-        const __m256i q2_1 = _mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q2bits, 6), _mm_srli_epi16(q2bits, 4)), m3);
-
-        const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
-        const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
-
-        const __m256i p0 = _mm256_maddubs_epi16(q2_0, q8_0);
-        const __m256i p1 = _mm256_maddubs_epi16(q2_1, q8_1);
-
-        const __m256i p_0 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(p0, 0));
-        const __m256i p_1 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(p0, 1));
-        const __m256i p_2 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(p1, 0));
-        const __m256i p_3 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(p1, 1));
-
-        acc = _mm256_fmadd_ps(_mm256_set1_ps(d * db[0]), _mm256_cvtepi32_ps(p_0), acc);
-        acc = _mm256_fmadd_ps(_mm256_set1_ps(d * db[1]), _mm256_cvtepi32_ps(p_1), acc);
-        acc = _mm256_fmadd_ps(_mm256_set1_ps(d * db[2]), _mm256_cvtepi32_ps(p_2), acc);
-        acc = _mm256_fmadd_ps(_mm256_set1_ps(d * db[3]), _mm256_cvtepi32_ps(p_3), acc);
-    }
-
-    *s = hsum_float_8(acc) + summs;
-
-#elif defined __AVX__
-
-    const __m128i m3 = _mm_set1_epi8(3);
-
-    __m256 acc = _mm256_setzero_ps();
-
-    uint32_t ud, um;
-    const uint8_t * restrict db = (const uint8_t *)&ud;
-    const uint8_t * restrict mb = (const uint8_t *)&um;
-
-    float summs = 0;
-
-    // TODO: optimize this
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
-
-        const uint8_t * restrict q2 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        const uint32_t * restrict sc = (const uint32_t *)x[i].scales;
-        ud = (sc[0] >> 0) & 0x0f0f0f0f;
-        um = (sc[0] >> 4) & 0x0f0f0f0f;
-
-        int32_t smin = mb[0] * y[i].bsums[0] + mb[1] * y[i].bsums[1] + mb[2] * y[i].bsums[2] + mb[3] * y[i].bsums[3];
-        summs += dmin * smin;
-
-        const __m128i q2bits = _mm_loadu_si128((const __m128i*)q2);
-        const __m128i q2_0 = _mm_and_si128(q2bits, m3);
-        const __m128i q2_1 = _mm_and_si128(_mm_srli_epi16(q2bits, 2), m3);
-        const __m128i q2_2 = _mm_and_si128(_mm_srli_epi16(q2bits, 4), m3);
-        const __m128i q2_3 = _mm_and_si128(_mm_srli_epi16(q2bits, 6), m3);
-
-        const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
-        const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
-
-        const __m128i p0 = _mm_maddubs_epi16(q2_0, _mm256_extractf128_si256(q8_0, 0));
-        const __m128i p1 = _mm_maddubs_epi16(q2_1, _mm256_extractf128_si256(q8_0, 1));
-        const __m128i p2 = _mm_maddubs_epi16(q2_2, _mm256_extractf128_si256(q8_1, 0));
-        const __m128i p3 = _mm_maddubs_epi16(q2_3, _mm256_extractf128_si256(q8_1, 1));
-
-        const __m256i p_0 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p0, p0)), _mm_cvtepi16_epi32(p0));
-        const __m256i p_1 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p1, p1)), _mm_cvtepi16_epi32(p1));
-        const __m256i p_2 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p2, p2)), _mm_cvtepi16_epi32(p2));
-        const __m256i p_3 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p3, p3)), _mm_cvtepi16_epi32(p3));
-
-        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[0]), _mm256_cvtepi32_ps(p_0)), acc);
-        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[1]), _mm256_cvtepi32_ps(p_1)), acc);
-        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[2]), _mm256_cvtepi32_ps(p_2)), acc);
-        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[3]), _mm256_cvtepi32_ps(p_3)), acc);
-    }
-
-    *s = hsum_float_8(acc) + summs;
-
-#elif defined __riscv_v_intrinsic
-
-    uint32_t aux32[2];
-    const uint8_t * scales = (const uint8_t *)aux32;
-
-    float sumf = 0;
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d = y[i].d * (float)x[i].d;
-        const float dmin = -y[i].d * (float)x[i].dmin;
-
-        const uint8_t * restrict q2 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-        const uint32_t * restrict sc = (const uint32_t *)x[i].scales;
-
-        aux32[0] = sc[0] & 0x0f0f0f0f;
-        aux32[1] = (sc[0] >> 4) & 0x0f0f0f0f;
-
-        sumf += dmin * (scales[4] * y[i].bsums[0] + scales[5] * y[i].bsums[1] + scales[6] * y[i].bsums[2] + scales[7] * y[i].bsums[3]);
-
-        int isum1 = 0;
-        int isum2 = 0;
-
-        size_t vl = 16;
-
-        vint16m1_t vzero = __riscv_vmv_v_x_i16m1(0, 1);
-
-        // load Q2
-        vuint8mf2_t q2_x = __riscv_vle8_v_u8mf2(q2, vl);
-
-        vint8mf2_t q2_0 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(q2_x, 0x03, vl));
-        vint8mf2_t q2_1 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(q2_x, 0x2, vl), 0x03 , vl));
-        vint8mf2_t q2_2 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(q2_x, 0x4, vl), 0x03 , vl));
-        vint8mf2_t q2_3 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(q2_x, 0x6, vl), 0x03 , vl));
-
-        // load Q8, and take product with Q2
-        vint16m1_t p0 = __riscv_vwmul_vv_i16m1(q2_0, __riscv_vle8_v_i8mf2(q8, vl), vl);
-        vint16m1_t p1 = __riscv_vwmul_vv_i16m1(q2_1, __riscv_vle8_v_i8mf2(q8+16, vl), vl);
-        vint16m1_t p2 = __riscv_vwmul_vv_i16m1(q2_2, __riscv_vle8_v_i8mf2(q8+32, vl), vl);
-        vint16m1_t p3 = __riscv_vwmul_vv_i16m1(q2_3, __riscv_vle8_v_i8mf2(q8+48, vl), vl);
-
-        vint16m1_t vs_0 = __riscv_vredsum_vs_i16m1_i16m1(p0, vzero, vl);
-        vint16m1_t vs_1 = __riscv_vredsum_vs_i16m1_i16m1(p1, vzero, vl);
-        vint16m1_t vs_2 = __riscv_vredsum_vs_i16m1_i16m1(p2, vzero, vl);
-        vint16m1_t vs_3 = __riscv_vredsum_vs_i16m1_i16m1(p3, vzero, vl);
-
-        isum1 += __riscv_vmv_x_s_i16m1_i16(vs_0) * scales[0];
-        isum2 += __riscv_vmv_x_s_i16m1_i16(vs_1) * scales[1];
-        isum1 += __riscv_vmv_x_s_i16m1_i16(vs_2) * scales[2];
-        isum2 += __riscv_vmv_x_s_i16m1_i16(vs_3) * scales[3];
-
-        sumf += d * (isum1 + isum2);
-
-    }
-
-    *s = sumf;
-
-#else
-
-    float sumf = 0;
-
-    int isum[4];
-
-    for (int i = 0; i < nb; ++i) {
-
-        const uint8_t * q2 = x[i].qs;
-        const  int8_t * q8 = y[i].qs;
-        const uint8_t * sc = x[i].scales;
-
-        int summs = 0;
-        for (int j = 0; j < QK_K/16; ++j) {
-            summs += y[i].bsums[j] * (sc[j] >> 4);
-        }
-
-        const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
-
-        isum[0] = isum[1] = isum[2] = isum[3] = 0;
-        for (int l =  0; l < 16; ++l) {
-            isum[0] += q8[l+ 0] * ((q2[l] >> 0) & 3);
-            isum[1] += q8[l+16] * ((q2[l] >> 2) & 3);
-            isum[2] += q8[l+32] * ((q2[l] >> 4) & 3);
-            isum[3] += q8[l+48] * ((q2[l] >> 6) & 3);
-        }
-        for (int l = 0; l < 4; ++l) {
-            isum[l] *= (sc[l] & 0xF);
-        }
-        sumf += dall * (isum[0] + isum[1] + isum[2] + isum[3]) - dmin * summs;
-    }
-    *s = sumf;
-#endif
-}
-#endif
-
-#if QK_K == 256
-void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
-    assert(n % QK_K == 0);
-
-    const uint32_t kmask1 = 0x03030303;
-    const uint32_t kmask2 = 0x0f0f0f0f;
-
-    const block_q3_K * restrict x = vx;
-    const block_q8_K * restrict y = vy;
-
-    const int nb = n / QK_K;
-
-#ifdef __ARM_NEON
-
-    uint32_t aux[3];
-    uint32_t utmp[4];
-
-    const uint8x16_t m3b = vdupq_n_u8(0x3);
-    const int32x4_t  vzero = vdupq_n_s32(0);
-
-    const uint8x16_t m0 = vdupq_n_u8(1);
-    const uint8x16_t m1 = vshlq_n_u8(m0, 1);
-    const uint8x16_t m2 = vshlq_n_u8(m0, 2);
-    const uint8x16_t m3 = vshlq_n_u8(m0, 3);
-    const int8_t m32 = 32;
-
-    ggml_int8x16x4_t q3bytes;
-
-    float sum = 0;
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-
-        const uint8_t * restrict q3 = x[i].qs;
-        const uint8_t * restrict qh = x[i].hmask;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh);
-
-        ggml_uint8x16x4_t q3h;
-
-        int32_t isum = 0;
-
-        // Set up scales
-        memcpy(aux, x[i].scales, 12);
-        utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
-        utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
-        utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
-        utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
-
-        int8_t * scale = (int8_t *)utmp;
-        for (int j = 0; j < 16; ++j) scale[j] -= m32;
-
-        for (int j = 0; j < QK_K/128; ++j) {
-
-            const ggml_uint8x16x2_t q3bits = ggml_vld1q_u8_x2(q3); q3 += 32;
-            const ggml_int8x16x4_t q8bytes_1 = ggml_vld1q_s8_x4(q8); q8 += 64;
-            const ggml_int8x16x4_t q8bytes_2 = ggml_vld1q_s8_x4(q8); q8 += 64;
-
-            q3h.val[0] = vshlq_n_u8(vbicq_u8(m0, qhbits.val[0]), 2);
-            q3h.val[1] = vshlq_n_u8(vbicq_u8(m0, qhbits.val[1]), 2);
-            q3h.val[2] = vshlq_n_u8(vbicq_u8(m1, qhbits.val[0]), 1);
-            q3h.val[3] = vshlq_n_u8(vbicq_u8(m1, qhbits.val[1]), 1);
-
-            q3bytes.val[0] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(q3bits.val[0], m3b)), vreinterpretq_s8_u8(q3h.val[0]));
-            q3bytes.val[1] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(q3bits.val[1], m3b)), vreinterpretq_s8_u8(q3h.val[1]));
-            q3bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 2), m3b)), vreinterpretq_s8_u8(q3h.val[2]));
-            q3bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 2), m3b)), vreinterpretq_s8_u8(q3h.val[3]));
-
-            isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[0], q8bytes_1.val[0])) * scale[0];
-            isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[1], q8bytes_1.val[1])) * scale[1];
-            isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[2], q8bytes_1.val[2])) * scale[2];
-            isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[3], q8bytes_1.val[3])) * scale[3];
-
-            scale += 4;
-
-            q3h.val[0] = vbicq_u8(m2, qhbits.val[0]);
-            q3h.val[1] = vbicq_u8(m2, qhbits.val[1]);
-            q3h.val[2] = vshrq_n_u8(vbicq_u8(m3, qhbits.val[0]), 1);
-            q3h.val[3] = vshrq_n_u8(vbicq_u8(m3, qhbits.val[1]), 1);
-
-            q3bytes.val[0] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 4), m3b)), vreinterpretq_s8_u8(q3h.val[0]));
-            q3bytes.val[1] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 4), m3b)), vreinterpretq_s8_u8(q3h.val[1]));
-            q3bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 6), m3b)), vreinterpretq_s8_u8(q3h.val[2]));
-            q3bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 6), m3b)), vreinterpretq_s8_u8(q3h.val[3]));
-
-            isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[0], q8bytes_2.val[0])) * scale[0];
-            isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[1], q8bytes_2.val[1])) * scale[1];
-            isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[2], q8bytes_2.val[2])) * scale[2];
-            isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[3], q8bytes_2.val[3])) * scale[3];
-
-            scale += 4;
-
-            if (j == 0) {
-                qhbits.val[0] = vshrq_n_u8(qhbits.val[0], 4);
-                qhbits.val[1] = vshrq_n_u8(qhbits.val[1], 4);
-            }
-
-        }
-        sum += d * isum;
-
-    }
-
-    *s = sum;
-
-#elif defined __AVX2__
-
-    const __m256i m3 = _mm256_set1_epi8(3);
-    const __m256i mone = _mm256_set1_epi8(1);
-    const __m128i m32 = _mm_set1_epi8(32);
-
-    __m256 acc = _mm256_setzero_ps();
-
-    uint32_t aux[3];
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-
-        const uint8_t * restrict q3 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        // Set up scales
-        memcpy(aux, x[i].scales, 12);
-        __m128i scales128 = _mm_set_epi32(
-                ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4),
-                ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4),
-                (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4),
-                (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4));
-        scales128 = _mm_sub_epi8(scales128, m32);
-        const __m256i all_scales = _mm256_cvtepi8_epi16(scales128);
-        const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0);
-        const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1);
-        const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)};
-
-        // high bit
-        const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].hmask);
-
-        // integer accumulator
-        __m256i sumi = _mm256_setzero_si256();
-
-        int bit = 0;
-        int is  = 0;
-
-        for (int j = 0; j < QK_K/128; ++j) {
-            // load low 2 bits
-            const __m256i q3bits = _mm256_loadu_si256((const __m256i*)q3); q3 += 32;
-
-            // prepare low and high bits
-            const __m256i q3l_0 = _mm256_and_si256(q3bits, m3);
-            const __m256i q3h_0 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, bit)), bit), 2);
-            ++bit;
-
-            const __m256i q3l_1 = _mm256_and_si256(_mm256_srli_epi16(q3bits, 2), m3);
-            const __m256i q3h_1 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, bit)), bit), 2);
-            ++bit;
-
-            const __m256i q3l_2 = _mm256_and_si256(_mm256_srli_epi16(q3bits, 4), m3);
-            const __m256i q3h_2 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, bit)), bit), 2);
-            ++bit;
-
-            const __m256i q3l_3 = _mm256_and_si256(_mm256_srli_epi16(q3bits, 6), m3);
-            const __m256i q3h_3 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, bit)), bit), 2);
-            ++bit;
-
-            // load Q8 quants
-            const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-            const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-            const __m256i q8_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-            const __m256i q8_3 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-
-            // Dot product: we multiply the 2 low bits and 1 high bit part separately, so we can use _mm256_maddubs_epi16,
-            // and then subtract. The high bit part has the 2 already subtracted (and so, it is zero if the high bit was not set,
-            // and 2 if the high bit was set)
-            __m256i q8s_0 = _mm256_maddubs_epi16(q3h_0, q8_0);
-            __m256i q8s_1 = _mm256_maddubs_epi16(q3h_1, q8_1);
-            __m256i q8s_2 = _mm256_maddubs_epi16(q3h_2, q8_2);
-            __m256i q8s_3 = _mm256_maddubs_epi16(q3h_3, q8_3);
-
-            __m256i p16_0 = _mm256_maddubs_epi16(q3l_0, q8_0);
-            __m256i p16_1 = _mm256_maddubs_epi16(q3l_1, q8_1);
-            __m256i p16_2 = _mm256_maddubs_epi16(q3l_2, q8_2);
-            __m256i p16_3 = _mm256_maddubs_epi16(q3l_3, q8_3);
-
-            p16_0 = _mm256_sub_epi16(p16_0, q8s_0);
-            p16_1 = _mm256_sub_epi16(p16_1, q8s_1);
-            p16_2 = _mm256_sub_epi16(p16_2, q8s_2);
-            p16_3 = _mm256_sub_epi16(p16_3, q8s_3);
-
-            // multiply with scales
-            p16_0 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 0)), p16_0);
-            p16_1 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 1)), p16_1);
-            p16_2 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 2)), p16_2);
-            p16_3 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 3)), p16_3);
-
-            // accumulate
-            p16_0 = _mm256_add_epi32(p16_0, p16_1);
-            p16_2 = _mm256_add_epi32(p16_2, p16_3);
-            sumi  = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_0, p16_2));
-
-        }
-
-        // multiply with block scale and accumulate
-        acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi), acc);
-
-    }
-
-    *s = hsum_float_8(acc);
-
-#elif defined __AVX__
-
-    const __m128i m3 = _mm_set1_epi8(3);
-    const __m128i mone = _mm_set1_epi8(1);
-    const __m128i m32 = _mm_set1_epi8(32);
-    const __m128i m2 = _mm_set1_epi8(2);
-
-    __m256 acc = _mm256_setzero_ps();
-
-    const uint32_t *aux;
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-
-        const uint8_t * restrict q3 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        // Set up scales
-        aux = (const uint32_t *)x[i].scales;
-        __m128i scales128 = _mm_set_epi32(
-                ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4),
-                ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4),
-                (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4),
-                (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4));
-        scales128 = _mm_sub_epi8(scales128, m32);
-        const __m128i scales_0 = _mm_cvtepi8_epi16(scales128);
-        const __m128i scales_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(scales128, scales128));
-        const __m128i scales[2] = { scales_0, scales_1 };
-
-        // high bit *128*2 from block_q3_K.hmask[QK_K/8]
-        const __m128i hbits_0 = _mm_loadu_si128((const __m128i*)&x[i].hmask[0]);
-        const __m128i hbits_1 = _mm_loadu_si128((const __m128i*)&x[i].hmask[16]);
-
-        // integer accumulator
-        __m128i sumi_0 = _mm_setzero_si128();
-        __m128i sumi_1 = _mm_setzero_si128();
-
-        for (int j = 0; j < QK_K/128; ++j) {
-            // load low 2 bits *64*2 from block_q3_K.qs[QK_K/4]
-            const __m128i q3bits_0 = _mm_loadu_si128((const __m128i*)q3); q3 += 16;
-            const __m128i q3bits_1 = _mm_loadu_si128((const __m128i*)q3); q3 += 16;
-
-            // prepare low and high bits
-            const int bit = j << 2;
-
-            const __m128i q3l_0 = _mm_and_si128(q3bits_0, m3);
-            const __m128i q3l_1 = _mm_and_si128(q3bits_1, m3);
-            const __m128i q3h_0 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit)), bit), 2);
-            const __m128i q3h_1 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit)), bit), 2);
-
-            const __m128i q3l_2 = _mm_and_si128(_mm_srli_epi16(q3bits_0, 2), m3);
-            const __m128i q3l_3 = _mm_and_si128(_mm_srli_epi16(q3bits_1, 2), m3);
-            const __m128i q3h_2 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit+1)), bit+1), 2);
-            const __m128i q3h_3 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit+1)), bit+1), 2);
-
-            const __m128i q3l_4 = _mm_and_si128(_mm_srli_epi16(q3bits_0, 4), m3);
-            const __m128i q3l_5 = _mm_and_si128(_mm_srli_epi16(q3bits_1, 4), m3);
-            const __m128i q3h_4 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit+2)), bit+2), 2);
-            const __m128i q3h_5 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit+2)), bit+2), 2);
-
-            const __m128i q3l_6 = _mm_and_si128(_mm_srli_epi16(q3bits_0, 6), m3);
-            const __m128i q3l_7 = _mm_and_si128(_mm_srli_epi16(q3bits_1, 6), m3);
-            const __m128i q3h_6 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit+3)), bit+3), 2);
-            const __m128i q3h_7 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit+3)), bit+3), 2);
-
-            // load Q8 quants from block_q8_K.qs[QK_K]
-            const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_2 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_3 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_4 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_5 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_6 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_7 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-
-            // Dot product: we multiply the 2 low bits and 1 high bit part separately, so we can use _mm256_maddubs_epi16,
-            // and then subtract. The high bit part has the 2 already subtracted (and so, it is zero if the high bit was not set,
-            // and 2 if the high bit was set)
-            __m128i q8s_0 = _mm_maddubs_epi16(q3h_0, q8_0);
-            __m128i q8s_1 = _mm_maddubs_epi16(q3h_1, q8_1);
-            __m128i q8s_2 = _mm_maddubs_epi16(q3h_2, q8_2);
-            __m128i q8s_3 = _mm_maddubs_epi16(q3h_3, q8_3);
-            __m128i q8s_4 = _mm_maddubs_epi16(q3h_4, q8_4);
-            __m128i q8s_5 = _mm_maddubs_epi16(q3h_5, q8_5);
-            __m128i q8s_6 = _mm_maddubs_epi16(q3h_6, q8_6);
-            __m128i q8s_7 = _mm_maddubs_epi16(q3h_7, q8_7);
-
-            __m128i p16_0 = _mm_maddubs_epi16(q3l_0, q8_0);
-            __m128i p16_1 = _mm_maddubs_epi16(q3l_1, q8_1);
-            __m128i p16_2 = _mm_maddubs_epi16(q3l_2, q8_2);
-            __m128i p16_3 = _mm_maddubs_epi16(q3l_3, q8_3);
-            __m128i p16_4 = _mm_maddubs_epi16(q3l_4, q8_4);
-            __m128i p16_5 = _mm_maddubs_epi16(q3l_5, q8_5);
-            __m128i p16_6 = _mm_maddubs_epi16(q3l_6, q8_6);
-            __m128i p16_7 = _mm_maddubs_epi16(q3l_7, q8_7);
-
-            p16_0 = _mm_sub_epi16(p16_0, q8s_0);
-            p16_1 = _mm_sub_epi16(p16_1, q8s_1);
-            p16_2 = _mm_sub_epi16(p16_2, q8s_2);
-            p16_3 = _mm_sub_epi16(p16_3, q8s_3);
-            p16_4 = _mm_sub_epi16(p16_4, q8s_4);
-            p16_5 = _mm_sub_epi16(p16_5, q8s_5);
-            p16_6 = _mm_sub_epi16(p16_6, q8s_6);
-            p16_7 = _mm_sub_epi16(p16_7, q8s_7);
-
-            // multiply with scales
-            __m128i shuffle = _mm_set1_epi16(0x0100);
-            p16_0 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_0);
-            shuffle = _mm_add_epi16(shuffle, m2);
-            p16_1 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_1);
-            shuffle = _mm_add_epi16(shuffle, m2);
-            p16_2 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_2);
-            shuffle = _mm_add_epi16(shuffle, m2);
-            p16_3 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_3);
-            shuffle = _mm_add_epi16(shuffle, m2);
-            p16_4 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_4);
-            shuffle = _mm_add_epi16(shuffle, m2);
-            p16_5 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_5);
-            shuffle = _mm_add_epi16(shuffle, m2);
-            p16_6 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_6);
-            shuffle = _mm_add_epi16(shuffle, m2);
-            p16_7 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_7);
-
-            // accumulate
-            p16_0 = _mm_add_epi32(p16_0, p16_1);
-            p16_2 = _mm_add_epi32(p16_2, p16_3);
-            p16_4 = _mm_add_epi32(p16_4, p16_5);
-            p16_6 = _mm_add_epi32(p16_6, p16_7);
-            sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2));
-            sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_4, p16_6));
-
-        }
-
-        // multiply with block scale and accumulate
-        __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
-        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc);
-
-    }
-
-    *s = hsum_float_8(acc);
-
-#elif defined __riscv_v_intrinsic
-
-    uint32_t aux[3];
-    uint32_t utmp[4];
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-
-        const uint8_t * restrict q3 = x[i].qs;
-        const uint8_t * restrict qh = x[i].hmask;
-        const  int8_t * restrict q8 = y[i].qs;
-
-        memcpy(aux, x[i].scales, 12);
-        utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
-        utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
-        utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
-        utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
-
-        int8_t * scale = (int8_t *)utmp;
-        for (int j = 0; j < 16; ++j) scale[j] -= 32;
-
-
-        size_t vl = 32;
-        uint8_t m =  1;
-
-        vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
-        vuint8m1_t vqh = __riscv_vle8_v_u8m1(qh, vl);
-
-        int sum_t = 0;
-
-        for (int j = 0; j < QK_K; j += 128) {
-
-            vl = 32;
-
-            // load Q3
-            vuint8m1_t q3_x = __riscv_vle8_v_u8m1(q3, vl);
-
-            vint8m1_t q3_0 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q3_x, 0x03, vl));
-            vint8m1_t q3_1 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q3_x, 0x2, vl), 0x03 , vl));
-            vint8m1_t q3_2 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q3_x, 0x4, vl), 0x03 , vl));
-            vint8m1_t q3_3 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q3_x, 0x6, vl), 0x03 , vl));
-
-            // compute mask for subtraction
-            vuint8m1_t qh_m0 = __riscv_vand_vx_u8m1(vqh, m, vl);
-            vbool8_t vmask_0 = __riscv_vmseq_vx_u8m1_b8(qh_m0, 0, vl);
-            vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_m(vmask_0, q3_0, 0x4, vl);
-            m <<= 1;
-
-            vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
-            vbool8_t vmask_1 = __riscv_vmseq_vx_u8m1_b8(qh_m1, 0, vl);
-            vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_m(vmask_1, q3_1, 0x4, vl);
-            m <<= 1;
-
-            vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
-            vbool8_t vmask_2 = __riscv_vmseq_vx_u8m1_b8(qh_m2, 0, vl);
-            vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_m(vmask_2, q3_2, 0x4, vl);
-            m <<= 1;
-
-            vuint8m1_t qh_m3 = __riscv_vand_vx_u8m1(vqh, m, vl);
-            vbool8_t vmask_3 = __riscv_vmseq_vx_u8m1_b8(qh_m3, 0, vl);
-            vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_m(vmask_3, q3_3, 0x4, vl);
-            m <<= 1;
-
-            // load Q8 and take product with Q3
-            vint16m2_t a0 = __riscv_vwmul_vv_i16m2(q3_m0, __riscv_vle8_v_i8m1(q8, vl), vl);
-            vint16m2_t a1 = __riscv_vwmul_vv_i16m2(q3_m1, __riscv_vle8_v_i8m1(q8+32, vl), vl);
-            vint16m2_t a2 = __riscv_vwmul_vv_i16m2(q3_m2, __riscv_vle8_v_i8m1(q8+64, vl), vl);
-            vint16m2_t a3 = __riscv_vwmul_vv_i16m2(q3_m3, __riscv_vle8_v_i8m1(q8+96, vl), vl);
-
-            vl = 16;
-
-            // retrieve lane to multiply with scale
-            vint32m2_t aux0_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a0, 0), (scale[0]), vl);
-            vint32m2_t aux0_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a0, 1), (scale[1]), vl);
-            vint32m2_t aux1_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a1, 0), (scale[2]), vl);
-            vint32m2_t aux1_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a1, 1), (scale[3]), vl);
-            vint32m2_t aux2_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a2, 0), (scale[4]), vl);
-            vint32m2_t aux2_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a2, 1), (scale[5]), vl);
-            vint32m2_t aux3_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a3, 0), (scale[6]), vl);
-            vint32m2_t aux3_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a3, 1), (scale[7]), vl);
-
-            vint32m1_t isum0 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux0_0, aux0_1, vl), vzero, vl);
-            vint32m1_t isum1 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux1_0, aux1_1, vl), isum0, vl);
-            vint32m1_t isum2 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux2_0, aux2_1, vl), isum1, vl);
-            vint32m1_t isum3 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux3_0, aux3_1, vl), isum2, vl);
-
-            sum_t +=  __riscv_vmv_x_s_i32m1_i32(isum3);
-
-            q3 += 32;    q8 += 128;   scale += 8;
-
-        }
-
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-
-        sumf += d*sum_t;
-
-    }
-
-    *s = sumf;
-
-#else
-    // scalar version
-    // This function is written like this so the compiler can manage to vectorize most of it
-    // Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the
-    // manually vectorized version above. Every other version I tried would run at least 4 times slower.
-    // The ideal situation would be if we could just write the code once, and the compiler would
-    // automatically produce the best possible set of machine instructions, instead of us having to manually
-    // write vectorized versions for AVX, ARM_NEON, etc.
-
-    int8_t  aux8[QK_K];
-    int16_t aux16[8];
-    float   sums [8];
-    int32_t aux32[8];
-    memset(sums, 0, 8*sizeof(float));
-
-    uint32_t auxs[4];
-    const int8_t * scales = (const int8_t*)auxs;
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const uint8_t * restrict q3 = x[i].qs;
-        const uint8_t * restrict hm = x[i].hmask;
-        const  int8_t * restrict q8 = y[i].qs;
-        memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t * restrict a = aux8;
-        uint8_t m = 1;
-        for (int j = 0; j < QK_K; j += 128) {
-            for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
-            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
-            a += 32; m <<= 1;
-            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3;
-            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
-            a += 32; m <<= 1;
-            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3;
-            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
-            a += 32; m <<= 1;
-            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3;
-            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
-            a += 32; m <<= 1;
-            q3 += 32;
-        }
-        a = aux8;
-
-        memcpy(auxs, x[i].scales, 12);
-        uint32_t tmp = auxs[2];
-        auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
-        auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
-        auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
-        auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
-        for (int j = 0; j < QK_K/16; ++j) {
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
-            q8 += 8; a += 8;
-        }
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-    }
-    for (int l = 0; l < 8; ++l) sumf += sums[l];
-    *s = sumf;
-
-#endif
-
-}
-
-#else
-
-void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
-    assert(n % QK_K == 0);
-
-    const block_q3_K * restrict x = vx;
-    const block_q8_K * restrict y = vy;
-
-    const int nb = n / QK_K;
-
-#ifdef __ARM_NEON
-    const int32x4_t vzero = vdupq_n_s32(0);
-
-    const uint8x16_t m3b = vdupq_n_u8(0x3);
-    const uint8x16_t mh  = vdupq_n_u8(4);
-
-    ggml_int8x16x4_t q3bytes;
-
-    uint16_t aux16[2];
-    int8_t * scales = (int8_t *)aux16;
-
-    float sum = 0;
-
-    for (int i = 0; i < nb; ++i) {
-
-        ggml_uint8x16x4_t q3h;
-
-        const uint8x8_t  hbits    = vld1_u8(x[i].hmask);
-        const uint8x16_t q3bits   = vld1q_u8(x[i].qs);
-        const ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(y[i].qs);
-
-        const uint16_t a = *(const uint16_t *)x[i].scales;
-        aux16[0] = a & 0x0f0f;
-        aux16[1] = (a >> 4) & 0x0f0f;
-
-        for (int j = 0; j < 4; ++j) scales[j] -= 8;
-
-        int32_t isum = -4*(scales[0] * y[i].bsums[0] + scales[2] * y[i].bsums[1] + scales[1] * y[i].bsums[2] + scales[3] * y[i].bsums[3]);
-
-        const float d = y[i].d * (float)x[i].d;
-
-        const uint8x16_t htmp = vcombine_u8(hbits, vshr_n_u8(hbits, 1));
-        q3h.val[0] = vandq_u8(mh, vshlq_n_u8(htmp, 2));
-        q3h.val[1] = vandq_u8(mh, htmp);
-        q3h.val[2] = vandq_u8(mh, vshrq_n_u8(htmp, 2));
-        q3h.val[3] = vandq_u8(mh, vshrq_n_u8(htmp, 4));
-
-        q3bytes.val[0] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q3bits, m3b),                q3h.val[0]));
-        q3bytes.val[1] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(vshrq_n_u8(q3bits, 2), m3b), q3h.val[1]));
-        q3bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(vshrq_n_u8(q3bits, 4), m3b), q3h.val[2]));
-        q3bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q3bits, 6),                q3h.val[3]));
-
-        isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[0], q8bytes.val[0])) * scales[0];
-        isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[1], q8bytes.val[1])) * scales[2];
-        isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[2], q8bytes.val[2])) * scales[1];
-        isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[3], q8bytes.val[3])) * scales[3];
-
-        sum += d * isum;
-
-    }
-
-    *s = sum;
-
-#elif defined __AVX2__
-
-    const __m256i m3 = _mm256_set1_epi8(3);
-    const __m256i m1 = _mm256_set1_epi8(1);
-
-    __m256 acc = _mm256_setzero_ps();
-
-    uint64_t aux64;
-
-    uint16_t aux16[2];
-    const int8_t * aux8 = (const int8_t *)aux16;
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-
-        const uint8_t * restrict q3 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        const uint16_t a = *(const uint16_t *)x[i].scales;
-        aux16[0] = a & 0x0f0f;
-        aux16[1] = (a >> 4) & 0x0f0f;
-
-        const __m256i scale_0 = MM256_SET_M128I(_mm_set1_epi16(aux8[2] - 8), _mm_set1_epi16(aux8[0] - 8));
-        const __m256i scale_1 = MM256_SET_M128I(_mm_set1_epi16(aux8[3] - 8), _mm_set1_epi16(aux8[1] - 8));
-
-        memcpy(&aux64, x[i].hmask, 8);
-
-        const __m128i haux = _mm_set_epi64x(aux64 >> 1, aux64 >> 0);
-        __m256i q3h_0 = MM256_SET_M128I(_mm_srli_epi16(haux, 2), haux);
-        __m256i q3h_1 = _mm256_srli_epi16(q3h_0, 4);
-        q3h_0 = _mm256_slli_epi16(_mm256_andnot_si256(q3h_0, m1), 2);
-        q3h_1 = _mm256_slli_epi16(_mm256_andnot_si256(q3h_1, m1), 2);
-
-        // load low 2 bits
-        const __m128i q3bits = _mm_loadu_si128((const __m128i*)q3);
-
-        // prepare low and high bits
-        const __m256i q3aux  = MM256_SET_M128I(_mm_srli_epi16(q3bits, 2), q3bits);
-        const __m256i q3l_0 = _mm256_and_si256(q3aux, m3);
-        const __m256i q3l_1 = _mm256_and_si256(_mm256_srli_epi16(q3aux, 4), m3);
-
-        // load Q8 quants
-        const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
-        const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
-
-        // Dot product: we multiply the 2 low bits and 1 high bit part separately, so we can use _mm256_maddubs_epi16,
-        // and then subtract. The high bit part has the 2 already subtracted (and so, it is zero if the high bit was not set,
-        // and 2 if the high bit was set)
-        const __m256i q8s_0 = _mm256_maddubs_epi16(q3h_0, q8_0);
-        const __m256i q8s_1 = _mm256_maddubs_epi16(q3h_1, q8_1);
-
-        __m256i p16_0 = _mm256_maddubs_epi16(q3l_0, q8_0);
-        __m256i p16_1 = _mm256_maddubs_epi16(q3l_1, q8_1);
-
-        p16_0 = _mm256_sub_epi16(p16_0, q8s_0);
-        p16_1 = _mm256_sub_epi16(p16_1, q8s_1);
-
-        // multiply with scales
-        p16_0 = _mm256_madd_epi16(scale_0, p16_0);
-        p16_1 = _mm256_madd_epi16(scale_1, p16_1);
-
-        p16_0 = _mm256_add_epi32(p16_0, p16_1);
-
-        // multiply with block scale and accumulate
-        acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(p16_0), acc);
-
-    }
-
-    *s = hsum_float_8(acc);
-
-#elif defined __AVX__
-
-    const __m128i m3 = _mm_set1_epi8(3);
-    const __m128i m1 = _mm_set1_epi8(1);
-
-    __m256 acc = _mm256_setzero_ps();
-
-    uint64_t aux64;
-
-    uint16_t aux16[2];
-    const int8_t * aux8 = (const int8_t *)aux16;
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-
-        const uint8_t * restrict q3 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        const uint16_t a = *(const uint16_t *)x[i].scales;
-        aux16[0] = a & 0x0f0f;
-        aux16[1] = (a >> 4) & 0x0f0f;
-
-        const __m128i scale_0 = _mm_set1_epi16(aux8[0] - 8);
-        const __m128i scale_1 = _mm_set1_epi16(aux8[2] - 8);
-        const __m128i scale_2 = _mm_set1_epi16(aux8[1] - 8);
-        const __m128i scale_3 = _mm_set1_epi16(aux8[3] - 8);
-
-        memcpy(&aux64, x[i].hmask, 8);
-
-        __m128i q3h_0 = _mm_set_epi64x(aux64 >> 1, aux64 >> 0);
-        __m128i q3h_1 = _mm_srli_epi16(q3h_0, 2);
-        __m128i q3h_2 = _mm_srli_epi16(q3h_0, 4);
-        __m128i q3h_3 = _mm_srli_epi16(q3h_0, 6);
-        q3h_0 = _mm_slli_epi16(_mm_andnot_si128(q3h_0, m1), 2);
-        q3h_1 = _mm_slli_epi16(_mm_andnot_si128(q3h_1, m1), 2);
-        q3h_2 = _mm_slli_epi16(_mm_andnot_si128(q3h_2, m1), 2);
-        q3h_3 = _mm_slli_epi16(_mm_andnot_si128(q3h_3, m1), 2);
-
-        // load low 2 bits
-        const __m128i q3bits = _mm_loadu_si128((const __m128i*)q3);
-
-        // prepare low and high bits
-        const __m128i q3l_0 = _mm_and_si128(q3bits, m3);
-        const __m128i q3l_1 = _mm_and_si128(_mm_srli_epi16(q3bits, 2), m3);
-        const __m128i q3l_2 = _mm_and_si128(_mm_srli_epi16(q3bits, 4), m3);
-        const __m128i q3l_3 = _mm_and_si128(_mm_srli_epi16(q3bits, 6), m3);
-
-        // load Q8 quants
-        const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
-        const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
-
-        // Dot product: we multiply the 2 low bits and 1 high bit part separately, so we can use _mm_maddubs_epi16,
-        // and then subtract. The high bit part has the 2 already subtracted (and so, it is zero if the high bit was not set,
-        // and 2 if the high bit was set)
-        const __m128i q8s_0 = _mm_maddubs_epi16(q3h_0, _mm256_extractf128_si256(q8_0, 0));
-        const __m128i q8s_1 = _mm_maddubs_epi16(q3h_1, _mm256_extractf128_si256(q8_0, 1));
-        const __m128i q8s_2 = _mm_maddubs_epi16(q3h_2, _mm256_extractf128_si256(q8_1, 0));
-        const __m128i q8s_3 = _mm_maddubs_epi16(q3h_3, _mm256_extractf128_si256(q8_1, 1));
-
-        __m128i p16_0 = _mm_maddubs_epi16(q3l_0, _mm256_extractf128_si256(q8_0, 0));
-        __m128i p16_1 = _mm_maddubs_epi16(q3l_1, _mm256_extractf128_si256(q8_0, 1));
-        __m128i p16_2 = _mm_maddubs_epi16(q3l_2, _mm256_extractf128_si256(q8_1, 0));
-        __m128i p16_3 = _mm_maddubs_epi16(q3l_3, _mm256_extractf128_si256(q8_1, 1));
-
-        p16_0 = _mm_sub_epi16(p16_0, q8s_0);
-        p16_1 = _mm_sub_epi16(p16_1, q8s_1);
-        p16_2 = _mm_sub_epi16(p16_2, q8s_2);
-        p16_3 = _mm_sub_epi16(p16_3, q8s_3);
-
-        // multiply with scales
-        p16_0 = _mm_madd_epi16(scale_0, p16_0);
-        p16_1 = _mm_madd_epi16(scale_1, p16_1);
-        p16_2 = _mm_madd_epi16(scale_2, p16_2);
-        p16_3 = _mm_madd_epi16(scale_3, p16_3);
-
-        p16_0 = _mm_add_epi32(p16_0, p16_2);
-        p16_1 = _mm_add_epi32(p16_1, p16_3);
-        __m256i p16 = MM256_SET_M128I(p16_1, p16_0);
-
-        // multiply with block scale and accumulate
-        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(p16)), acc);
-
-    }
-
-    *s = hsum_float_8(acc);
-
-#elif defined __riscv_v_intrinsic
-
-    uint16_t aux16[2];
-    int8_t * scales = (int8_t *)aux16;
-
-    float sumf = 0;
-
-    for (int i = 0; i < nb; ++i) {
-
-        const uint8_t * restrict q3 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        const uint16_t a = *(const uint16_t *)x[i].scales;
-        aux16[0] = a & 0x0f0f;
-        aux16[1] = (a >> 4) & 0x0f0f;
-
-        for (int j = 0; j < 4; ++j) scales[j] -= 8;
-
-        int32_t isum = -4*(scales[0] * y[i].bsums[0] + scales[2] * y[i].bsums[1] + scales[1] * y[i].bsums[2] + scales[3] * y[i].bsums[3]);
-
-        const float d = y[i].d * (float)x[i].d;
-
-        vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
-
-        // load qh
-        vuint8mf4_t qh_x1   = __riscv_vle8_v_u8mf4(x[i].hmask, 8);
-        vuint8mf2_t qh_x2   = __riscv_vlmul_ext_v_u8mf4_u8mf2(__riscv_vsrl_vx_u8mf4(qh_x1, 1, 8));
-
-        size_t vl = 16;
-
-        // extend and combine both qh_x1 and qh_x2
-        vuint8mf2_t qh_x = __riscv_vslideup_vx_u8mf2(__riscv_vlmul_ext_v_u8mf4_u8mf2(qh_x1), qh_x2, vl/2, vl);
-
-        vuint8mf2_t qh_0 = __riscv_vand_vx_u8mf2(__riscv_vsll_vx_u8mf2(qh_x, 0x2, vl), 0x4, vl);
-        vuint8mf2_t qh_1 = __riscv_vand_vx_u8mf2(qh_x, 0x4, vl);
-        vuint8mf2_t qh_2 = __riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(qh_x, 0x2, vl), 0x4, vl);
-        vuint8mf2_t qh_3 = __riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(qh_x, 0x4, vl), 0x4, vl);
-
-        // load Q3
-        vuint8mf2_t q3_x  = __riscv_vle8_v_u8mf2(q3, vl);
-
-        vuint8mf2_t q3h_0 = __riscv_vor_vv_u8mf2(__riscv_vand_vx_u8mf2(q3_x, 0x3, vl), qh_0, vl);
-        vuint8mf2_t q3h_1 = __riscv_vor_vv_u8mf2(__riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(q3_x, 2, vl), 0x3, vl), qh_1, vl);
-        vuint8mf2_t q3h_2 = __riscv_vor_vv_u8mf2(__riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(q3_x, 4, vl), 0x3, vl), qh_2, vl);
-        vuint8mf2_t q3h_3 = __riscv_vor_vv_u8mf2(__riscv_vsrl_vx_u8mf2(q3_x, 0x6, vl), qh_3, vl);
-
-        vint8mf2_t q3_0 = __riscv_vreinterpret_v_u8mf2_i8mf2(q3h_0);
-        vint8mf2_t q3_1 = __riscv_vreinterpret_v_u8mf2_i8mf2(q3h_1);
-        vint8mf2_t q3_2 = __riscv_vreinterpret_v_u8mf2_i8mf2(q3h_2);
-        vint8mf2_t q3_3 = __riscv_vreinterpret_v_u8mf2_i8mf2(q3h_3);
-
-        // load Q8 and take product with Q3
-        vint16m1_t p0 = __riscv_vwmul_vv_i16m1(q3_0, __riscv_vle8_v_i8mf2(q8, vl), vl);
-        vint16m1_t p1 = __riscv_vwmul_vv_i16m1(q3_1, __riscv_vle8_v_i8mf2(q8+16, vl), vl);
-        vint16m1_t p2 = __riscv_vwmul_vv_i16m1(q3_2, __riscv_vle8_v_i8mf2(q8+32, vl), vl);
-        vint16m1_t p3 = __riscv_vwmul_vv_i16m1(q3_3, __riscv_vle8_v_i8mf2(q8+48, vl), vl);
-
-        vint32m1_t vs_0 = __riscv_vwredsum_vs_i16m1_i32m1(p0, vzero, vl);
-        vint32m1_t vs_1 = __riscv_vwredsum_vs_i16m1_i32m1(p1, vzero, vl);
-        vint32m1_t vs_2 = __riscv_vwredsum_vs_i16m1_i32m1(p2, vzero, vl);
-        vint32m1_t vs_3 = __riscv_vwredsum_vs_i16m1_i32m1(p3, vzero, vl);
-
-        isum += __riscv_vmv_x_s_i32m1_i32(vs_0) * scales[0];
-        isum += __riscv_vmv_x_s_i32m1_i32(vs_1) * scales[2];
-        isum += __riscv_vmv_x_s_i32m1_i32(vs_2) * scales[1];
-        isum += __riscv_vmv_x_s_i32m1_i32(vs_3) * scales[3];
-
-        sumf += d * isum;
-
-    }
-
-    *s = sumf;
-
-#else
-
-    int8_t  aux8[QK_K];
-    int16_t aux16[8];
-    float   sums [8];
-    int32_t aux32[8];
-    int32_t scales[4];
-    memset(sums, 0, 8*sizeof(float));
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const uint8_t * restrict q3 = x[i].qs;
-        const uint8_t * restrict hm = x[i].hmask;
-        const  int8_t * restrict q8 = y[i].qs;
-        int8_t * restrict a = aux8;
-        for (int l = 0; l < 8; ++l) {
-            a[l+ 0] = (int8_t)((q3[l+0] >> 0) & 3) - (hm[l] & 0x01 ? 0 : 4);
-            a[l+ 8] = (int8_t)((q3[l+8] >> 0) & 3) - (hm[l] & 0x02 ? 0 : 4);
-            a[l+16] = (int8_t)((q3[l+0] >> 2) & 3) - (hm[l] & 0x04 ? 0 : 4);
-            a[l+24] = (int8_t)((q3[l+8] >> 2) & 3) - (hm[l] & 0x08 ? 0 : 4);
-            a[l+32] = (int8_t)((q3[l+0] >> 4) & 3) - (hm[l] & 0x10 ? 0 : 4);
-            a[l+40] = (int8_t)((q3[l+8] >> 4) & 3) - (hm[l] & 0x20 ? 0 : 4);
-            a[l+48] = (int8_t)((q3[l+0] >> 6) & 3) - (hm[l] & 0x40 ? 0 : 4);
-            a[l+56] = (int8_t)((q3[l+8] >> 6) & 3) - (hm[l] & 0x80 ? 0 : 4);
-        }
-
-        scales[0] = (x[i].scales[0] & 0xF) - 8;
-        scales[1] = (x[i].scales[0] >>  4) - 8;
-        scales[2] = (x[i].scales[1] & 0xF) - 8;
-        scales[3] = (x[i].scales[1] >>  4) - 8;
-
-        memset(aux32, 0, 8*sizeof(int32_t));
-        for (int j = 0; j < QK_K/16; ++j) {
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] += q8[l] * a[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux32[l] += scales[j] * aux16[l];
-        }
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-    }
-    for (int l = 0; l < 8; ++l) sumf += sums[l];
-    *s = sumf;
-
-#endif
-
-}
-#endif
-
-#if QK_K == 256
-void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
-    assert(n % QK_K == 0);
-
-    const block_q4_K * restrict x = vx;
-    const block_q8_K * restrict y = vy;
-
-    const int nb = n / QK_K;
-
-    static const uint32_t kmask1 = 0x3f3f3f3f;
-    static const uint32_t kmask2 = 0x0f0f0f0f;
-    static const uint32_t kmask3 = 0x03030303;
-
-    uint32_t utmp[4];
-
-#ifdef __ARM_NEON
-    const uint8x16_t m4b = vdupq_n_u8(0xf);
-    const int32x4_t mzero = vdupq_n_s32(0);
-
-    ggml_int8x16x2_t q4bytes;
-    ggml_int8x16x2_t q8bytes;
-
-    float sumf = 0;
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
-
-        const int16x8_t q8sums = vpaddq_s16(vld1q_s16(y[i].bsums), vld1q_s16(y[i].bsums + 8));
-
-        memcpy(utmp, x[i].scales, 12);
-
-        uint32x2_t mins8 = { 0 };
-        mins8 = vset_lane_u32(utmp[1] & kmask1, mins8, 0);
-        mins8 = vset_lane_u32(((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4), mins8, 1);
-
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[0] &= kmask1;
-
-        const int16x8_t mins = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(mins8)));
-        const int32x4_t prod = vaddq_s32(vmull_s16(vget_low_s16 (q8sums), vget_low_s16 (mins)),
-                                         vmull_s16(vget_high_s16(q8sums), vget_high_s16(mins)));
-        sumf -= dmin * vaddvq_s32(prod);
-
-        const uint8_t * scales = (const uint8_t *)utmp;
-
-        const uint8_t * restrict q4 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        int32_t sumi1 = 0;
-        int32_t sumi2 = 0;
-
-        for (int j = 0; j < QK_K/64; ++j) {
-            const ggml_uint8x16x2_t q4bits = ggml_vld1q_u8_x2(q4); q4 += 32;
-
-            q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
-            q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8  (q4bits.val[0], m4b));
-            q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8  (q4bits.val[1], m4b));
-
-            const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
-            sumi1 += vaddvq_s32(p1) * scales[2*j+0];
-
-            q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
-            q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4));
-            q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4));
-
-            const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
-
-            sumi2 += vaddvq_s32(p2) * scales[2*j+1];
-        }
-
-        sumf += d * (sumi1 + sumi2);
-
-    }
-
-    *s = sumf;
-
-#elif defined __AVX2__
-
-    const __m256i m4 = _mm256_set1_epi8(0xF);
-
-    __m256 acc = _mm256_setzero_ps();
-    __m128 acc_m = _mm_setzero_ps();
-
-   for (int i = 0; i < nb; ++i) {
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
-
-        memcpy(utmp, x[i].scales, 12);
-        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
-        const uint32_t uaux = utmp[1] & kmask1;
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[2] = uaux;
-        utmp[0] &= kmask1;
-
-        const uint8_t * restrict q4 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        const __m256i mins_and_scales = _mm256_cvtepu8_epi16(_mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]));
-
-        const __m256i q8sums = _mm256_loadu_si256((const __m256i*)y[i].bsums);
-        const __m128i q8s = _mm_hadd_epi16(_mm256_extracti128_si256(q8sums, 0), _mm256_extracti128_si256(q8sums, 1));
-        const __m128i prod = _mm_madd_epi16(_mm256_extracti128_si256(mins_and_scales, 1), q8s);
-        acc_m = _mm_fmadd_ps(_mm_set1_ps(dmin), _mm_cvtepi32_ps(prod), acc_m);
-
-        const __m128i sc128  = _mm256_extracti128_si256(mins_and_scales, 0);
-        const __m256i scales = MM256_SET_M128I(sc128, sc128);
-
-        __m256i sumi = _mm256_setzero_si256();
-
-        for (int j = 0; j < QK_K/64; ++j) {
-
-            const __m256i scale_l = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2*j+0));
-            const __m256i scale_h = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2*j+1));
-
-            const __m256i q4bits = _mm256_loadu_si256((const __m256i*)q4); q4 += 32;
-            const __m256i q4l = _mm256_and_si256(q4bits, m4);
-            const __m256i q4h = _mm256_and_si256(_mm256_srli_epi16(q4bits, 4), m4);
-
-            const __m256i q8l = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-            __m256i p16l = _mm256_maddubs_epi16(q4l, q8l);
-            p16l = _mm256_madd_epi16(scale_l, p16l);
-
-            const __m256i q8h = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-            __m256i p16h = _mm256_maddubs_epi16(q4h, q8h);
-            p16h = _mm256_madd_epi16(scale_h, p16h);
-            const __m256i sumj = _mm256_add_epi32(p16l, p16h);
-
-            sumi = _mm256_add_epi32(sumi, sumj);
-        }
-
-        __m256 vd = _mm256_set1_ps(d);
-        acc = _mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(sumi), acc);
-
-    }
-
-    acc_m = _mm_add_ps(acc_m, _mm_movehl_ps(acc_m, acc_m));
-    acc_m = _mm_add_ss(acc_m, _mm_movehdup_ps(acc_m));
-
-    *s = hsum_float_8(acc) + _mm_cvtss_f32(acc_m);
-
-#elif defined __AVX__
-
-    const __m128i m4 = _mm_set1_epi8(0xF);
-    const __m128i m2 = _mm_set1_epi8(0x2);
-
-    __m256 acc = _mm256_setzero_ps();
-    __m128 acc_m = _mm_setzero_ps();
-
-   for (int i = 0; i < nb; ++i) {
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
-
-        const uint8_t * restrict q4 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        memcpy(utmp, x[i].scales, 12);
-        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
-        const uint32_t uaux = utmp[1] & kmask1;
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[2] = uaux;
-        utmp[0] &= kmask1;
-
-        const __m128i utmps = _mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]);
-        const __m128i scales = _mm_cvtepu8_epi16(utmps);
-        const __m128i mins = _mm_cvtepu8_epi16(_mm_unpackhi_epi64(utmps, utmps));
-
-        const __m128i q8sums_0 = _mm_loadu_si128((const __m128i*)&y[i].bsums[0]);
-        const __m128i q8sums_1 = _mm_loadu_si128((const __m128i*)&y[i].bsums[8]);
-        const __m128i q8s = _mm_hadd_epi16(q8sums_0, q8sums_1);
-        const __m128i prod = _mm_madd_epi16(mins, q8s);
-        acc_m = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(dmin), _mm_cvtepi32_ps(prod)), acc_m);
-
-        __m128i sumi_0 = _mm_setzero_si128();
-        __m128i sumi_1 = _mm_setzero_si128();
-
-        __m128i shuffle = _mm_set1_epi16(0x0100);
-        for (int j = 0; j < QK_K/64; ++j) {
-
-            const __m128i scale_l = _mm_shuffle_epi8(scales, shuffle);
-            shuffle = _mm_add_epi16(shuffle, m2);
-            const __m128i scale_h = _mm_shuffle_epi8(scales, shuffle);
-            shuffle = _mm_add_epi16(shuffle, m2);
-
-            __m128i q4bits = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
-            const __m128i q4l_0 = _mm_and_si128(q4bits, m4);
-            const __m128i q4h_0 = _mm_and_si128(_mm_srli_epi16(q4bits, 4), m4);
-            q4bits = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
-            const __m128i q4l_1 = _mm_and_si128(q4bits, m4);
-            const __m128i q4h_1 = _mm_and_si128(_mm_srli_epi16(q4bits, 4), m4);
-
-            const __m128i q8l_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            __m128i p16l = _mm_maddubs_epi16(q4l_0, q8l_0);
-            p16l = _mm_madd_epi16(scale_l, p16l);
-            sumi_0 = _mm_add_epi32(sumi_0, p16l);
-            const __m128i q8l_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            p16l = _mm_maddubs_epi16(q4l_1, q8l_1);
-            p16l = _mm_madd_epi16(scale_l, p16l);
-            sumi_1 = _mm_add_epi32(sumi_1, p16l);
-
-            const __m128i q8h_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            __m128i p16h = _mm_maddubs_epi16(q4h_0, q8h_0);
-            p16h = _mm_madd_epi16(scale_h, p16h);
-            sumi_0 = _mm_add_epi32(sumi_0, p16h);
-            const __m128i q8h_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            p16h = _mm_maddubs_epi16(q4h_1, q8h_1);
-            p16h = _mm_madd_epi16(scale_h, p16h);
-            sumi_1 = _mm_add_epi32(sumi_1, p16h);
-
-        }
-
-        __m256 vd = _mm256_set1_ps(d);
-        __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
-        acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc);
-
-    }
-
-    acc_m = _mm_add_ps(acc_m, _mm_movehl_ps(acc_m, acc_m));
-    acc_m = _mm_add_ss(acc_m, _mm_movehdup_ps(acc_m));
-
-    *s = hsum_float_8(acc) + _mm_cvtss_f32(acc_m);
-
-#elif defined __riscv_v_intrinsic
-
-    const uint8_t * scales = (const uint8_t*)&utmp[0];
-    const uint8_t * mins   = (const uint8_t*)&utmp[2];
-
-    float sumf = 0;
-
-    for (int i = 0; i < nb; ++i) {
-
-        size_t vl = 8;
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
-
-        vint16mf2_t q8sums_0 = __riscv_vlse16_v_i16mf2(y[i].bsums, 4, vl);
-        vint16mf2_t q8sums_1 = __riscv_vlse16_v_i16mf2(y[i].bsums+1, 4, vl);
-        vint16mf2_t q8sums   = __riscv_vadd_vv_i16mf2(q8sums_0, q8sums_1, vl);
-
-        memcpy(utmp, x[i].scales, 12);
-        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
-        const uint32_t uaux = utmp[1] & kmask1;
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[2] = uaux;
-        utmp[0] &= kmask1;
-
-        vuint8mf4_t mins8  = __riscv_vle8_v_u8mf4(mins, vl);
-        vint16mf2_t v_mins = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vzext_vf2_u16mf2(mins8, vl));
-        vint32m1_t  prod   = __riscv_vwmul_vv_i32m1(q8sums, v_mins, vl);
-
-        vint32m1_t sumi = __riscv_vredsum_vs_i32m1_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl);
-        sumf -= dmin * __riscv_vmv_x_s_i32m1_i32(sumi);
-
-        const uint8_t * restrict q4 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        vl = 32;
-
-        int32_t sum_1 = 0;
-        int32_t sum_2 = 0;
-
-        vint16m1_t vzero = __riscv_vmv_v_x_i16m1(0, 1);
-
-        for (int j = 0; j < QK_K/64; ++j) {
-            // load Q4
-            vuint8m1_t q4_x = __riscv_vle8_v_u8m1(q4, vl);
-
-            // load Q8 and multiply it with lower Q4 nibble
-            vint8m1_t  q8_0 = __riscv_vle8_v_i8m1(q8, vl);
-            vint8m1_t  q4_0 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q4_x, 0x0F, vl));
-            vint16m2_t qv_0 = __riscv_vwmul_vv_i16m2(q4_0, q8_0, vl);
-            vint16m1_t vs_0 = __riscv_vredsum_vs_i16m2_i16m1(qv_0, vzero, vl);
-
-            sum_1 += __riscv_vmv_x_s_i16m1_i16(vs_0) * scales[2*j+0];
-
-            // load Q8 and multiply it with upper Q4 nibble
-            vint8m1_t  q8_1 = __riscv_vle8_v_i8m1(q8+32, vl);
-            vint8m1_t  q4_1 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q4_x, 0x04, vl));
-            vint16m2_t qv_1 = __riscv_vwmul_vv_i16m2(q4_1, q8_1, vl);
-            vint16m1_t vs_1 = __riscv_vredsum_vs_i16m2_i16m1(qv_1, vzero, vl);
-
-            sum_2 += __riscv_vmv_x_s_i16m1_i16(vs_1) * scales[2*j+1];
-
-            q4 += 32;    q8 += 64;
-
-        }
-
-        sumf += d*(sum_1 + sum_2);
-
-    }
-
-    *s = sumf;
-
-#else
-
-
-    const uint8_t * scales = (const uint8_t*)&utmp[0];
-    const uint8_t * mins   = (const uint8_t*)&utmp[2];
-
-    int8_t  aux8[QK_K];
-    int16_t aux16[8];
-    float   sums [8];
-    int32_t aux32[8];
-    memset(sums, 0, 8*sizeof(float));
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const uint8_t * restrict q4 = x[i].qs;
-        const  int8_t * restrict q8 = y[i].qs;
-        memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t * restrict a = aux8;
-        for (int j = 0; j < QK_K/64; ++j) {
-            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
-            a += 32;
-            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l]  >> 4);
-            a += 32; q4 += 32;
-        }
-        memcpy(utmp, x[i].scales, 12);
-        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
-        const uint32_t uaux = utmp[1] & kmask1;
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[2] = uaux;
-        utmp[0] &= kmask1;
-
-        int sumi = 0;
-        for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
-        a = aux8;
-        int is = 0;
-        for (int j = 0; j < QK_K/32; ++j) {
-            int32_t scale = scales[is++];
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-        }
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-        const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
-        sumf -= dmin * sumi;
-    }
-    for (int l = 0; l < 8; ++l) sumf += sums[l];
-    *s = sumf;
-#endif
-}
-#else
-void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
-    assert(n % QK_K == 0);
-
-    const block_q4_K * restrict x = vx;
-    const block_q8_K * restrict y = vy;
-
-    const int nb = n / QK_K;
-
-#ifdef __ARM_NEON
-    const uint8x16_t m4b = vdupq_n_u8(0xf);
-
-    const int32x4_t mzero = vdupq_n_s32(0);
-
-    float sumf = 0;
-
-    ggml_int8x16x2_t q4bytes;
-    ggml_int8x16x4_t q8bytes;
-
-    float sum_mins = 0.f;
-
-    uint16_t aux16[2];
-    const uint8_t * restrict scales = (const uint8_t *)aux16;
-
-    for (int i = 0; i < nb; ++i) {
-
-        const uint8_t * restrict q4 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        const uint16_t * restrict a = (const uint16_t *)x[i].scales;
-        aux16[0] = a[0] & 0x0f0f;
-        aux16[1] = (a[0] >> 4) & 0x0f0f;
-
-        const int32_t summi = scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3]);
-        sum_mins += y[i].d * (float)x[i].d[1] * summi;
-
-        const float d = y[i].d * (float)x[i].d[0];
-
-        const ggml_uint8x16x2_t q4bits = ggml_vld1q_u8_x2(q4);
-
-        q8bytes = ggml_vld1q_s8_x4(q8);
-        q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8  (q4bits.val[0], m4b));
-        q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8  (q4bits.val[1], m4b));
-
-        const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
-        const int32_t sumi1 = vaddvq_s32(p1) * scales[0];
-
-        q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4));
-        q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4));
-
-        const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[2]), q4bytes.val[1], q8bytes.val[3]);
-        const int32_t sumi2 = vaddvq_s32(p2) * scales[1];
-
-        sumf += d * (sumi1 + sumi2);
-    }
-
-    *s = sumf - sum_mins;
-
-#elif defined __AVX2__
-
-    const __m256i m4 = _mm256_set1_epi8(0xF);
-
-    __m256 acc = _mm256_setzero_ps();
-
-    float summs = 0;
-
-    uint16_t aux16[2];
-    const uint8_t * scales = (const uint8_t *)aux16;
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d = GGML_FP16_TO_FP32(x[i].d[0]) * y[i].d;
-        const float m = GGML_FP16_TO_FP32(x[i].d[1]) * y[i].d;
-        const __m256 vd = _mm256_set1_ps(d);
-
-        const uint16_t * a = (const uint16_t *)x[i].scales;
-        aux16[0] = a[0] & 0x0f0f;
-        aux16[1] = (a[0] >> 4) & 0x0f0f;
-
-        summs += m * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3]));
-
-        const uint8_t * restrict q4 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        const __m256i q4bits = _mm256_loadu_si256((const __m256i*)q4);
-        const __m256i q4l = _mm256_and_si256(q4bits, m4);
-        const __m256i q4h = _mm256_and_si256(_mm256_srli_epi16(q4bits, 4), m4);
-
-        const __m256i q8l = _mm256_loadu_si256((const __m256i*)(q8+ 0));
-        const __m256i q8h = _mm256_loadu_si256((const __m256i*)(q8+32));
-
-        const __m256i p16l = _mm256_maddubs_epi16(q4l, q8l);
-        const __m256i p16h = _mm256_maddubs_epi16(q4h, q8h);
-
-        const __m256i p32l = _mm256_madd_epi16(_mm256_set1_epi16(scales[0]), p16l);
-        acc = _mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(p32l), acc);
-
-        const __m256i p32h = _mm256_madd_epi16(_mm256_set1_epi16(scales[1]), p16h);
-        acc = _mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(p32h), acc);
-
-    }
-
-    *s = hsum_float_8(acc) - summs;
-
-#elif defined __AVX__
-
-    const __m128i m4 = _mm_set1_epi8(0xF);
-
-    __m256 acc = _mm256_setzero_ps();
-
-    float summs = 0;
-
-    uint16_t aux16[2];
-    const uint8_t * scales = (const uint8_t *)aux16;
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d = GGML_FP16_TO_FP32(x[i].d[0]) * y[i].d;
-        const float m = GGML_FP16_TO_FP32(x[i].d[1]) * y[i].d;
-        const __m256 vd = _mm256_set1_ps(d);
-
-        const uint16_t * a = (const uint16_t *)x[i].scales;
-        aux16[0] = a[0] & 0x0f0f;
-        aux16[1] = (a[0] >> 4) & 0x0f0f;
-
-        summs += m * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3]));
-
-        const uint8_t * restrict q4 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        const __m256i q4bits = _mm256_loadu_si256((const __m256i*)q4);
-        const __m128i q4bits_0 = _mm256_extractf128_si256(q4bits, 0);
-        const __m128i q4bits_1 = _mm256_extractf128_si256(q4bits, 1);
-        const __m128i q4_0 = _mm_and_si128(q4bits_0, m4);
-        const __m128i q4_1 = _mm_and_si128(q4bits_1, m4);
-        const __m128i q4_2 = _mm_and_si128(_mm_srli_epi16(q4bits_0, 4), m4);
-        const __m128i q4_3 = _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4);
-
-        const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
-        const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
-
-        const __m128i p16_0 = _mm_maddubs_epi16(q4_0, _mm256_extractf128_si256(q8_0, 0));
-        const __m128i p16_1 = _mm_maddubs_epi16(q4_1, _mm256_extractf128_si256(q8_0, 1));
-        const __m128i p16_2 = _mm_maddubs_epi16(q4_2, _mm256_extractf128_si256(q8_1, 0));
-        const __m128i p16_3 = _mm_maddubs_epi16(q4_3, _mm256_extractf128_si256(q8_1, 1));
-
-        const __m128i p32_0 = _mm_madd_epi16(_mm_set1_epi16(scales[0]), p16_0);
-        const __m128i p32_1 = _mm_madd_epi16(_mm_set1_epi16(scales[0]), p16_1);
-        acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(MM256_SET_M128I(p32_1, p32_0))), acc);
-
-        const __m128i p32_2 = _mm_madd_epi16(_mm_set1_epi16(scales[1]), p16_2);
-        const __m128i p32_3 = _mm_madd_epi16(_mm_set1_epi16(scales[1]), p16_3);
-        acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(MM256_SET_M128I(p32_3, p32_2))), acc);
-
-    }
-
-    *s = hsum_float_8(acc) - summs;
-
-#elif defined __riscv_v_intrinsic
-
-    uint16_t s16[2];
-    const uint8_t * restrict scales = (const uint8_t *)s16;
-
-    float sumf = 0;
-
-    for (int i = 0; i < nb; ++i) {
-
-        const uint8_t * restrict q4 = x[i].qs;
-        const  int8_t * restrict q8 = y[i].qs;
-
-        const uint16_t * restrict b = (const uint16_t *)x[i].scales;
-        s16[0] = b[0] & 0x0f0f;
-        s16[1] = (b[0] >> 4) & 0x0f0f;
-
-        sumf -= y[i].d * GGML_FP16_TO_FP32(x[i].d[1]) * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3]));
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d[0]);
-
-        size_t vl = 32;
-
-        vint16m1_t vzero = __riscv_vmv_v_x_i16m1(0, 1);
-
-        // load Q4
-        vuint8m1_t q4_x = __riscv_vle8_v_u8m1(q4, vl);
-
-        // load Q8 and multiply it with lower Q4 nibble
-        vint8m1_t  q4_a = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q4_x, 0x0F, vl));
-        vint16m2_t va_0 = __riscv_vwmul_vv_i16m2(q4_a, __riscv_vle8_v_i8m1(q8, vl), vl);
-        vint16m1_t aux1 = __riscv_vredsum_vs_i16m2_i16m1(va_0, vzero, vl);
-
-        sumf += d*scales[0]*__riscv_vmv_x_s_i16m1_i16(aux1);
-
-        // load Q8 and multiply it with upper Q4 nibble
-        vint8m1_t  q4_s = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q4_x, 0x04, vl));
-        vint16m2_t va_1 = __riscv_vwmul_vv_i16m2(q4_s, __riscv_vle8_v_i8m1(q8+32, vl), vl);
-        vint16m1_t aux2 = __riscv_vredsum_vs_i16m2_i16m1(va_1, vzero, vl);
-
-        sumf += d*scales[1]*__riscv_vmv_x_s_i16m1_i16(aux2);
-
-    }
-
-    *s = sumf;
-
-#else
-
-    uint8_t aux8[QK_K];
-    int16_t aux16[16];
-    float   sums [8];
-    memset(sums, 0, 8*sizeof(float));
-
-    uint16_t s16[2];
-    const uint8_t * restrict scales = (const uint8_t *)s16;
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const uint8_t * restrict q4 = x[i].qs;
-        const  int8_t * restrict q8 = y[i].qs;
-        uint8_t * restrict a = aux8;
-        for (int l = 0; l < 32; ++l) a[l+ 0] = q4[l] & 0xF;
-        for (int l = 0; l < 32; ++l) a[l+32] = q4[l]  >> 4;
-
-        const uint16_t * restrict b = (const uint16_t *)x[i].scales;
-        s16[0] = b[0] & 0x0f0f;
-        s16[1] = (b[0] >> 4) & 0x0f0f;
-
-        sumf -= y[i].d * GGML_FP16_TO_FP32(x[i].d[1]) * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3]));
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d[0]);
-
-        for (int j = 0; j < QK_K/32; ++j) {
-            for (int l = 0; l < 16; ++l) aux16[l] = q8[l] * a[l];
-            q8 += 16; a += 16;
-            for (int l = 0; l < 16; ++l) aux16[l] += q8[l] * a[l];
-            q8 += 16; a += 16;
-            const float dl = d * scales[j];
-            for (int l = 0; l < 8; ++l) sums[l] += dl * (aux16[l] + aux16[l+8]);
-        }
-    }
-    for (int l = 0; l < 8; ++l) sumf += sums[l];
-    *s = sumf;
-#endif
-}
-#endif
-
-#if QK_K == 256
-void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
-    assert(n % QK_K == 0);
-
-    const block_q5_K * restrict x = vx;
-    const block_q8_K * restrict y = vy;
-
-    const int nb = n / QK_K;
-
-    static const uint32_t kmask1 = 0x3f3f3f3f;
-    static const uint32_t kmask2 = 0x0f0f0f0f;
-    static const uint32_t kmask3 = 0x03030303;
-
-    uint32_t utmp[4];
-
-#ifdef __ARM_NEON
-    const uint8x16_t m4b = vdupq_n_u8(0xf);
-    const uint8x16_t mone = vdupq_n_u8(1);
-    const uint8x16_t mtwo = vdupq_n_u8(2);
-    const int32x4_t mzero = vdupq_n_s32(0);
-
-    ggml_int8x16x4_t q5bytes;
-
-    float sumf = 0;
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
-
-        const int16x8_t q8sums = vpaddq_s16(vld1q_s16(y[i].bsums), vld1q_s16(y[i].bsums + 8));
-
-        memcpy(utmp, x[i].scales, 12);
-        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
-        const uint32_t uaux = utmp[1] & kmask1;
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[2] = uaux;
-        utmp[0] &= kmask1;
-
-        const uint8x8_t mins8 = vld1_u8((const uint8_t*)utmp + 8);
-        const int16x8_t mins = vreinterpretq_s16_u16(vmovl_u8(mins8));
-        const int32x4_t prod = vaddq_s32(vmull_s16(vget_low_s16 (q8sums), vget_low_s16 (mins)),
-                                         vmull_s16(vget_high_s16(q8sums), vget_high_s16(mins)));
-        int32_t sumi_mins = vaddvq_s32(prod);
-
-        const uint8_t * scales = (const uint8_t *)utmp;
-
-        const uint8_t * restrict q5 = x[i].qs;
-        const uint8_t * restrict qh = x[i].qh;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh);
-
-        ggml_uint8x16x4_t q5h;
-
-        int32_t sumi = 0;
-
-        for (int j = 0; j < QK_K/64; ++j) {
-
-            const ggml_uint8x16x2_t q5bits = ggml_vld1q_u8_x2(q5); q5 += 32;
-            const ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(q8); q8 += 64;
-
-            q5h.val[0] = vshlq_n_u8(vandq_u8(mone, qhbits.val[0]), 4);
-            q5h.val[1] = vshlq_n_u8(vandq_u8(mone, qhbits.val[1]), 4);
-            q5h.val[2] = vshlq_n_u8(vandq_u8(mtwo, qhbits.val[0]), 3);
-            q5h.val[3] = vshlq_n_u8(vandq_u8(mtwo, qhbits.val[1]), 3);
-            qhbits.val[0] = vshrq_n_u8(qhbits.val[0], 2);
-            qhbits.val[1] = vshrq_n_u8(qhbits.val[1], 2);
-
-            q5bytes.val[0] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q5bits.val[0], m4b), q5h.val[0]));
-            q5bytes.val[1] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q5bits.val[1], m4b), q5h.val[1]));
-            q5bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.val[0], 4), q5h.val[2]));
-            q5bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.val[1], 4), q5h.val[3]));
-
-            sumi += vaddvq_s32(ggml_vdotq_s32(ggml_vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0]), q5bytes.val[1], q8bytes.val[1])) * *scales++;
-            sumi += vaddvq_s32(ggml_vdotq_s32(ggml_vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2]), q5bytes.val[3], q8bytes.val[3])) * *scales++;
-        }
-
-        sumf += d * sumi - dmin * sumi_mins;
-    }
-
-    *s = sumf;
-
-#elif defined __AVX2__
-
-    const __m256i m4 = _mm256_set1_epi8(0xF);
-    const __m128i mzero = _mm_setzero_si128();
-    const __m256i mone  = _mm256_set1_epi8(1);
-
-    __m256 acc = _mm256_setzero_ps();
-
-    float summs = 0.f;
-
-   for (int i = 0; i < nb; ++i) {
-
-        const uint8_t * restrict q5 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-
-#if QK_K == 256
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
-
-        memcpy(utmp, x[i].scales, 12);
-        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
-        const uint32_t uaux = utmp[1] & kmask1;
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[2] = uaux;
-        utmp[0] &= kmask1;
-#else
-        // TODO
-        const float d = 0, dmin = 0;
-#endif
-
-        const __m256i mins_and_scales = _mm256_cvtepu8_epi16(_mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]));
-
-        const __m256i q8sums = _mm256_loadu_si256((const __m256i*)y[i].bsums);
-        const __m128i q8s = _mm_hadd_epi16(_mm256_extracti128_si256(q8sums, 0), _mm256_extracti128_si256(q8sums, 1));
-        const __m128i prod = _mm_madd_epi16(_mm256_extracti128_si256(mins_and_scales, 1), q8s);
-        const __m128i hsum = _mm_hadd_epi32(_mm_hadd_epi32(prod, mzero), mzero);
-        summs += dmin * _mm_extract_epi32(hsum, 0);
-
-        const __m128i sc128  = _mm256_extracti128_si256(mins_and_scales, 0);
-        const __m256i scales = MM256_SET_M128I(sc128, sc128);
-
-        const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].qh);
-        __m256i hmask = mone;
-
-        __m256i sumi = _mm256_setzero_si256();
-
-        int bit = 0;
-
-        for (int j = 0; j < QK_K/64; ++j) {
-
-            const __m256i scale_0 = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2*j+0));
-            const __m256i scale_1 = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2*j+1));
-
-            const __m256i q5bits = _mm256_loadu_si256((const __m256i*)q5); q5 += 32;
-
-            const __m256i q5l_0 = _mm256_and_si256(q5bits, m4);
-            const __m256i q5h_0 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_and_si256(hbits, hmask), bit++), 4);
-            const __m256i q5_0  = _mm256_add_epi8(q5l_0, q5h_0);
-            hmask = _mm256_slli_epi16(hmask, 1);
-
-            const __m256i q5l_1 = _mm256_and_si256(_mm256_srli_epi16(q5bits, 4), m4);
-            const __m256i q5h_1 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_and_si256(hbits, hmask), bit++), 4);
-            const __m256i q5_1  = _mm256_add_epi8(q5l_1, q5h_1);
-            hmask = _mm256_slli_epi16(hmask, 1);
-
-            const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-            const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-
-            __m256i p16_0 = _mm256_maddubs_epi16(q5_0, q8_0);
-            __m256i p16_1 = _mm256_maddubs_epi16(q5_1, q8_1);
-
-            p16_0 = _mm256_madd_epi16(scale_0, p16_0);
-            p16_1 = _mm256_madd_epi16(scale_1, p16_1);
-
-            sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_0, p16_1));
-
-        }
-
-        __m256 vd = _mm256_set1_ps(d);
-        acc = _mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(sumi), acc);
-
-    }
-
-    *s = hsum_float_8(acc) + summs;
-
-#elif defined __AVX__
-
-    const __m128i m4 = _mm_set1_epi8(0xF);
-    const __m128i mzero = _mm_setzero_si128();
-    const __m128i mone  = _mm_set1_epi8(1);
-    const __m128i m2 = _mm_set1_epi8(2);
-
-    __m256 acc = _mm256_setzero_ps();
-
-    float summs = 0.f;
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
-
-        const uint8_t * restrict q5 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        memcpy(utmp, x[i].scales, 12);
-        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
-        const uint32_t uaux = utmp[1] & kmask1;
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[2] = uaux;
-        utmp[0] &= kmask1;
-
-        const __m128i utmps = _mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]);
-        const __m128i scales = _mm_cvtepu8_epi16(utmps);
-        const __m128i mins = _mm_cvtepu8_epi16(_mm_unpackhi_epi64(utmps, utmps));
-
-        const __m128i q8sums_0 = _mm_loadu_si128((const __m128i*)&y[i].bsums[0]);
-        const __m128i q8sums_1 = _mm_loadu_si128((const __m128i*)&y[i].bsums[8]);
-        const __m128i q8s = _mm_hadd_epi16(q8sums_0, q8sums_1);
-        const __m128i prod = _mm_madd_epi16(mins, q8s);
-        const __m128i hsum = _mm_hadd_epi32(_mm_hadd_epi32(prod, mzero), mzero);
-        summs += dmin * _mm_extract_epi32(hsum, 0);
-
-        const __m128i hbits_0 = _mm_loadu_si128((const __m128i*)&x[i].qh[0]);
-        const __m128i hbits_1 = _mm_loadu_si128((const __m128i*)&x[i].qh[16]);
-        __m128i hmask = mone;
-
-        __m128i sumi_0 = _mm_setzero_si128();
-        __m128i sumi_1 = _mm_setzero_si128();
-
-        int bit = 0;
-
-        __m128i shuffle = _mm_set1_epi16(0x0100);
-        for (int j = 0; j < QK_K/64; ++j) {
-
-            const __m128i scale_0 = _mm_shuffle_epi8(scales, shuffle);
-            shuffle = _mm_add_epi16(shuffle, m2);
-            const __m128i scale_1 = _mm_shuffle_epi8(scales, shuffle);
-            shuffle = _mm_add_epi16(shuffle, m2);
-
-            const __m128i q5bits_0 = _mm_loadu_si128((const __m128i*)q5); q5 += 16;
-            const __m128i q5bits_1 = _mm_loadu_si128((const __m128i*)q5); q5 += 16;
-
-            __m128i q5l_0 = _mm_and_si128(q5bits_0, m4);
-            __m128i q5l_1 = _mm_and_si128(q5bits_1, m4);
-            __m128i q5h_0 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_0, hmask), bit), 4);
-            __m128i q5h_1 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_1, hmask), bit++), 4);
-            __m128i q5_0  = _mm_add_epi8(q5l_0, q5h_0);
-            __m128i q5_1  = _mm_add_epi8(q5l_1, q5h_1);
-            hmask = _mm_slli_epi16(hmask, 1);
-
-            __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            __m128i p16_0 = _mm_maddubs_epi16(q5_0, q8_0);
-            __m128i p16_1 = _mm_maddubs_epi16(q5_1, q8_1);
-            p16_0 = _mm_madd_epi16(scale_0, p16_0);
-            p16_1 = _mm_madd_epi16(scale_0, p16_1);
-
-            q5l_0 = _mm_and_si128(_mm_srli_epi16(q5bits_0, 4), m4);
-            q5l_1 = _mm_and_si128(_mm_srli_epi16(q5bits_1, 4), m4);
-            q5h_0 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_0, hmask), bit), 4);
-            q5h_1 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_1, hmask), bit++), 4);
-            q5_0  = _mm_add_epi8(q5l_0, q5h_0);
-            q5_1  = _mm_add_epi8(q5l_1, q5h_1);
-            hmask = _mm_slli_epi16(hmask, 1);
-
-            q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            __m128i p16_2 = _mm_maddubs_epi16(q5_0, q8_0);
-            __m128i p16_3 = _mm_maddubs_epi16(q5_1, q8_1);
-            p16_2 = _mm_madd_epi16(scale_1, p16_2);
-            p16_3 = _mm_madd_epi16(scale_1, p16_3);
-
-            sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2));
-            sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3));
-
-        }
-
-        __m256 vd = _mm256_set1_ps(d);
-        __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
-        acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc);
-
-    }
-
-    *s = hsum_float_8(acc) + summs;
-
-#elif defined __riscv_v_intrinsic
-
-    const uint8_t * scales = (const uint8_t*)&utmp[0];
-    const uint8_t * mins   = (const uint8_t*)&utmp[2];
-
-    float sumf = 0;
-    float sums = 0.0;
-
-    size_t vl;
-
-    for (int i = 0; i < nb; ++i) {
-
-        vl = 8;
-
-        const uint8_t * restrict q5 = x[i].qs;
-        const uint8_t * restrict hm = x[i].qh;
-        const  int8_t * restrict q8 = y[i].qs;
-
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
-
-        vint16mf2_t q8sums_0 = __riscv_vlse16_v_i16mf2(y[i].bsums, 4, vl);
-        vint16mf2_t q8sums_1 = __riscv_vlse16_v_i16mf2(y[i].bsums+1, 4, vl);
-        vint16mf2_t q8sums = __riscv_vadd_vv_i16mf2(q8sums_0, q8sums_1, vl);
-
-        memcpy(utmp, x[i].scales, 12);
-        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
-        const uint32_t uaux = utmp[1] & kmask1;
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[2] = uaux;
-        utmp[0] &= kmask1;
-
-        vuint8mf4_t mins8 = __riscv_vle8_v_u8mf4(mins, vl);
-        vint16mf2_t v_mins = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vzext_vf2_u16mf2(mins8, vl));
-        vint32m1_t prod = __riscv_vwmul_vv_i32m1(q8sums, v_mins, vl);
-
-        vint32m1_t sumi = __riscv_vredsum_vs_i32m1_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl);
-        sumf -= dmin * __riscv_vmv_x_s_i32m1_i32(sumi);
-
-        vl = 32;
-        int32_t aux32 = 0;
-        int is = 0;
-
-        uint8_t m = 1;
-        vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
-        vuint8m1_t vqh = __riscv_vle8_v_u8m1(hm, vl);
-
-        for (int j = 0; j < QK_K/64; ++j) {
-            // load Q5 and Q8
-            vuint8m1_t q5_x = __riscv_vle8_v_u8m1(q5, vl);
-            vint8m1_t  q8_y1 = __riscv_vle8_v_i8m1(q8, vl);
-            vint8m1_t  q8_y2 = __riscv_vle8_v_i8m1(q8+32, vl);
-
-            // compute mask for addition
-            vint8m1_t q5_a = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q5_x, 0x0F, vl));
-            vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
-            vbool8_t vmask_1 = __riscv_vmsne_vx_u8m1_b8(qh_m1, 0, vl);
-            vint8m1_t q5_m1 = __riscv_vadd_vx_i8m1_m(vmask_1, q5_a, 16, vl);
-            m <<= 1;
-
-            vint8m1_t q5_l = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q5_x, 0x04, vl));
-            vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
-            vbool8_t vmask_2 = __riscv_vmsne_vx_u8m1_b8(qh_m2, 0, vl);
-            vint8m1_t q5_m2 = __riscv_vadd_vx_i8m1_m(vmask_2, q5_l, 16, vl);
-            m <<= 1;
-
-            vint16m2_t v0 = __riscv_vwmul_vv_i16m2(q5_m1, q8_y1, vl);
-            vint16m2_t v1 = __riscv_vwmul_vv_i16m2(q5_m2, q8_y2, vl);
-
-            vint32m4_t vs1 = __riscv_vwmul_vx_i32m4(v0, scales[is++], vl);
-            vint32m4_t vs2 = __riscv_vwmul_vx_i32m4(v1, scales[is++], vl);
-
-            vint32m1_t vacc1 = __riscv_vredsum_vs_i32m4_i32m1(vs1, vzero, vl);
-            vint32m1_t vacc2 = __riscv_vredsum_vs_i32m4_i32m1(vs2, vzero, vl);
-
-            aux32 += __riscv_vmv_x_s_i32m1_i32(vacc1) + __riscv_vmv_x_s_i32m1_i32(vacc2);
-            q5 += 32;    q8 += 64;
-
-        }
-
-        vfloat32m1_t vaux = __riscv_vfmul_vf_f32m1(__riscv_vfmv_v_f_f32m1(aux32, 1), d, 1);
-        sums += __riscv_vfmv_f_s_f32m1_f32(vaux);
-
-    }
-
-    *s = sumf+sums;
-
-#else
-
-    const uint8_t * scales = (const uint8_t*)&utmp[0];
-    const uint8_t * mins   = (const uint8_t*)&utmp[2];
-
-    int8_t  aux8[QK_K];
-    int16_t aux16[8];
-    float   sums [8];
-    int32_t aux32[8];
-    memset(sums, 0, 8*sizeof(float));
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const uint8_t * restrict q4 = x[i].qs;
-        const uint8_t * restrict hm = x[i].qh;
-        const  int8_t * restrict q8 = y[i].qs;
-        memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t * restrict a = aux8;
-        uint8_t m = 1;
-        for (int j = 0; j < QK_K/64; ++j) {
-            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
-            for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
-            a += 32; m <<= 1;
-            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l]  >> 4);
-            for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
-            a += 32; m <<= 1;
-            q4 += 32;
-        }
-        memcpy(utmp, x[i].scales, 12);
-        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
-        const uint32_t uaux = utmp[1] & kmask1;
-        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
-        utmp[2] = uaux;
-        utmp[0] &= kmask1;
-
-        int sumi = 0;
-        for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
-        a = aux8;
-        int is = 0;
-        for (int j = 0; j < QK_K/32; ++j) {
-            int32_t scale = scales[is++];
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-        }
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-        const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
-        sumf -= dmin * sumi;
-    }
-    for (int l = 0; l < 8; ++l) sumf += sums[l];
-    *s = sumf;
-#endif
-}
-
-#else
-
-void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
-    assert(n % QK_K == 0);
-
-    const block_q5_K * restrict x = vx;
-    const block_q8_K * restrict y = vy;
-
-    const int nb = n / QK_K;
-
-#ifdef __ARM_NEON
-    const uint8x16_t m4b = vdupq_n_u8(0xf);
-    const uint8x16_t mh = vdupq_n_u8(16);
-    const int32x4_t mzero = vdupq_n_s32(0);
-
-    ggml_int8x16x4_t q5bytes;
-    ggml_uint8x16x4_t q5h;
-
-    float sumf = 0;
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d = y[i].d * (float)x[i].d;
-        const int8_t * sc = x[i].scales;
-
-        const uint8_t * restrict q5 = x[i].qs;
-        const uint8_t * restrict qh = x[i].qh;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        const uint8x8_t qhbits = vld1_u8(qh);
-
-        const ggml_uint8x16x2_t q5bits = ggml_vld1q_u8_x2(q5);
-        const ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(q8);
-
-        const uint8x16_t htmp = vcombine_u8(qhbits, vshr_n_u8(qhbits, 1));
-        q5h.val[0] = vbicq_u8(mh, vshlq_n_u8(htmp, 4));
-        q5h.val[1] = vbicq_u8(mh, vshlq_n_u8(htmp, 2));
-        q5h.val[2] = vbicq_u8(mh, htmp);
-        q5h.val[3] = vbicq_u8(mh, vshrq_n_u8(htmp, 2));
-
-        q5bytes.val[0] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(q5bits.val[0], m4b)), vreinterpretq_s8_u8(q5h.val[0]));
-        q5bytes.val[1] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(q5bits.val[1], m4b)), vreinterpretq_s8_u8(q5h.val[1]));
-        q5bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vshrq_n_u8(q5bits.val[0], 4)), vreinterpretq_s8_u8(q5h.val[2]));
-        q5bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vshrq_n_u8(q5bits.val[1], 4)), vreinterpretq_s8_u8(q5h.val[3]));
-
-        int32_t sumi1 = sc[0] * vaddvq_s32(ggml_vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0]));
-        int32_t sumi2 = sc[1] * vaddvq_s32(ggml_vdotq_s32(mzero, q5bytes.val[1], q8bytes.val[1]));
-        int32_t sumi3 = sc[2] * vaddvq_s32(ggml_vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2]));
-        int32_t sumi4 = sc[3] * vaddvq_s32(ggml_vdotq_s32(mzero, q5bytes.val[3], q8bytes.val[3]));
-
-        sumf += d * (sumi1 + sumi2 + sumi3 + sumi4);
-    }
-
-    *s = sumf;
-
-#elif defined __AVX2__
-
-    const __m256i m4 = _mm256_set1_epi8(0xF);
-    const __m256i mone  = _mm256_set1_epi8(1);
-
-    __m256 acc = _mm256_setzero_ps();
-
-    for (int i = 0; i < nb; ++i) {
-
-        const uint8_t * restrict q5 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-
-        const __m256i q5bits = _mm256_loadu_si256((const __m256i*)q5);
-
-        const __m256i scale_l = MM256_SET_M128I(_mm_set1_epi16(x[i].scales[1]), _mm_set1_epi16(x[i].scales[0]));
-        const __m256i scale_h = MM256_SET_M128I(_mm_set1_epi16(x[i].scales[3]), _mm_set1_epi16(x[i].scales[2]));
-
-        int64_t aux64;
-        memcpy(&aux64, x[i].qh, 8);
-        const __m128i haux128 = _mm_set_epi64x(aux64 >> 1, aux64);
-        const __m256i haux256 = MM256_SET_M128I(_mm_srli_epi16(haux128, 2), haux128);
-
-        const __m256i q5h_0 = _mm256_slli_epi16(_mm256_andnot_si256(haux256, mone), 4);
-        const __m256i q5h_1 = _mm256_slli_epi16(_mm256_andnot_si256(_mm256_srli_epi16(haux256, 4), mone), 4);
-
-        const __m256i q5l_0 = _mm256_and_si256(q5bits, m4);
-        const __m256i q5l_1 = _mm256_and_si256(_mm256_srli_epi16(q5bits, 4), m4);
-
-        const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
-        const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
-
-        const __m256i p16_0 = _mm256_madd_epi16(scale_l, _mm256_maddubs_epi16(q5l_0, q8_0));
-        const __m256i p16_1 = _mm256_madd_epi16(scale_h, _mm256_maddubs_epi16(q5l_1, q8_1));
-        const __m256i s16_0 = _mm256_madd_epi16(scale_l, _mm256_maddubs_epi16(q5h_0, q8_0));
-        const __m256i s16_1 = _mm256_madd_epi16(scale_h, _mm256_maddubs_epi16(q5h_1, q8_1));
-
-        const __m256i dot = _mm256_sub_epi32(_mm256_add_epi32(p16_0, p16_1), _mm256_add_epi32(s16_0, s16_1));
-
-        acc = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(dot), acc);
-
-    }
-
-    *s = hsum_float_8(acc);
-
-#elif defined __AVX__
-
-    const __m128i m4 = _mm_set1_epi8(0xF);
-    const __m128i mone  = _mm_set1_epi8(1);
-
-    __m256 acc = _mm256_setzero_ps();
-
-    for (int i = 0; i < nb; ++i) {
-
-        const uint8_t * restrict q5 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-
-        const __m256i q5bits = _mm256_loadu_si256((const __m256i*)q5);
-
-        const __m128i scale_0 = _mm_set1_epi16(x[i].scales[0]);
-        const __m128i scale_1 = _mm_set1_epi16(x[i].scales[1]);
-        const __m128i scale_2 = _mm_set1_epi16(x[i].scales[2]);
-        const __m128i scale_3 = _mm_set1_epi16(x[i].scales[3]);
-
-        int64_t aux64;
-        memcpy(&aux64, x[i].qh, 8);
-        const __m128i haux128_0 = _mm_set_epi64x(aux64 >> 1, aux64);
-        const __m128i haux128_1 = _mm_srli_epi16(haux128_0, 2);
-
-        const __m128i q5h_0 = _mm_slli_epi16(_mm_andnot_si128(haux128_0, mone), 4);
-        const __m128i q5h_1 = _mm_slli_epi16(_mm_andnot_si128(haux128_1, mone), 4);
-        const __m128i q5h_2 = _mm_slli_epi16(_mm_andnot_si128(_mm_srli_epi16(haux128_0, 4), mone), 4);
-        const __m128i q5h_3 = _mm_slli_epi16(_mm_andnot_si128(_mm_srli_epi16(haux128_1, 4), mone), 4);
-
-        const __m128i q5l_0 = _mm_and_si128(_mm256_extractf128_si256(q5bits, 0), m4);
-        const __m128i q5l_1 = _mm_and_si128(_mm256_extractf128_si256(q5bits, 1), m4);
-        const __m128i q5l_2 = _mm_and_si128(_mm_srli_epi16(_mm256_extractf128_si256(q5bits, 0), 4), m4);
-        const __m128i q5l_3 = _mm_and_si128(_mm_srli_epi16(_mm256_extractf128_si256(q5bits, 1), 4), m4);
-
-        const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
-        const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
-
-        const __m128i p16_0 = _mm_madd_epi16(scale_0, _mm_maddubs_epi16(q5l_0, _mm256_extractf128_si256(q8_0, 0)));
-        const __m128i p16_1 = _mm_madd_epi16(scale_1, _mm_maddubs_epi16(q5l_1, _mm256_extractf128_si256(q8_0, 1)));
-        const __m128i p16_2 = _mm_madd_epi16(scale_2, _mm_maddubs_epi16(q5l_2, _mm256_extractf128_si256(q8_1, 0)));
-        const __m128i p16_3 = _mm_madd_epi16(scale_3, _mm_maddubs_epi16(q5l_3, _mm256_extractf128_si256(q8_1, 1)));
-        const __m128i s16_0 = _mm_madd_epi16(scale_0, _mm_maddubs_epi16(q5h_0, _mm256_extractf128_si256(q8_0, 0)));
-        const __m128i s16_1 = _mm_madd_epi16(scale_1, _mm_maddubs_epi16(q5h_1, _mm256_extractf128_si256(q8_0, 1)));
-        const __m128i s16_2 = _mm_madd_epi16(scale_2, _mm_maddubs_epi16(q5h_2, _mm256_extractf128_si256(q8_1, 0)));
-        const __m128i s16_3 = _mm_madd_epi16(scale_3, _mm_maddubs_epi16(q5h_3, _mm256_extractf128_si256(q8_1, 1)));
-
-        const __m128i dot_0 = _mm_sub_epi32(_mm_add_epi32(p16_0, p16_2), _mm_add_epi32(s16_0, s16_2));
-        const __m128i dot_1 = _mm_sub_epi32(_mm_add_epi32(p16_1, p16_3), _mm_add_epi32(s16_1, s16_3));
-
-        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(dot_1, dot_0))), acc);
-
-    }
-
-    *s = hsum_float_8(acc);
-
-#elif defined __riscv_v_intrinsic
-
-    float sumf = 0;
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d = y[i].d * (float)x[i].d;
-        const int8_t * sc = x[i].scales;
-
-        const uint8_t * restrict q5 = x[i].qs;
-        const uint8_t * restrict qh = x[i].qh;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
-
-        // load qh
-        vuint8mf4_t qh_x1   = __riscv_vle8_v_u8mf4(qh, 8);
-        vuint8mf2_t qh_x2   = __riscv_vlmul_ext_v_u8mf4_u8mf2(__riscv_vsrl_vx_u8mf4(qh_x1, 1, 8));
-
-        size_t vl = 16;
-
-        // combine both qh_1 and qh_2
-        vuint8mf2_t qh_x = __riscv_vslideup_vx_u8mf2(__riscv_vlmul_ext_v_u8mf4_u8mf2(qh_x1), qh_x2, vl/2, vl);
-
-        vuint8mf2_t qh_h0 = __riscv_vand_vx_u8mf2(__riscv_vnot_v_u8mf2(__riscv_vsll_vx_u8mf2(qh_x, 0x4, vl), vl), 16, vl);
-        vuint8mf2_t qh_h1 = __riscv_vand_vx_u8mf2(__riscv_vnot_v_u8mf2(__riscv_vsll_vx_u8mf2(qh_x, 0x2, vl), vl), 16, vl);
-        vuint8mf2_t qh_h2 = __riscv_vand_vx_u8mf2(__riscv_vnot_v_u8mf2(qh_x, vl), 16, vl);
-        vuint8mf2_t qh_h3 = __riscv_vand_vx_u8mf2(__riscv_vnot_v_u8mf2(__riscv_vsrl_vx_u8mf2(qh_x, 0x4, vl), vl), 16, vl);
-
-        vint8mf2_t qh_0 = __riscv_vreinterpret_v_u8mf2_i8mf2(qh_h0);
-        vint8mf2_t qh_1 = __riscv_vreinterpret_v_u8mf2_i8mf2(qh_h1);
-        vint8mf2_t qh_2 = __riscv_vreinterpret_v_u8mf2_i8mf2(qh_h2);
-        vint8mf2_t qh_3 = __riscv_vreinterpret_v_u8mf2_i8mf2(qh_h3);
-
-        // load q5
-        vuint8mf2_t q5_x1  = __riscv_vle8_v_u8mf2(q5, vl);
-        vuint8mf2_t q5_x2  = __riscv_vle8_v_u8mf2(q5+16, vl);
-
-        vint8mf2_t q5s_0 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(q5_x1, 0xF, vl));
-        vint8mf2_t q5s_1 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(q5_x2, 0xF, vl));
-        vint8mf2_t q5s_2 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vsrl_vx_u8mf2(q5_x1, 0x4, vl));
-        vint8mf2_t q5s_3 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vsrl_vx_u8mf2(q5_x2, 0x4, vl));
-
-        vint8mf2_t q5_0 = __riscv_vsub_vv_i8mf2(q5s_0, qh_0, vl);
-        vint8mf2_t q5_1 = __riscv_vsub_vv_i8mf2(q5s_1, qh_1, vl);
-        vint8mf2_t q5_2 = __riscv_vsub_vv_i8mf2(q5s_2, qh_2, vl);
-        vint8mf2_t q5_3 = __riscv_vsub_vv_i8mf2(q5s_3, qh_3, vl);
-
-        // load Q8 and multiply it with Q5
-        vint16m1_t p0 = __riscv_vwmul_vv_i16m1(q5_0, __riscv_vle8_v_i8mf2(q8, vl), vl);
-        vint16m1_t p1 = __riscv_vwmul_vv_i16m1(q5_1, __riscv_vle8_v_i8mf2(q8+16, vl), vl);
-        vint16m1_t p2 = __riscv_vwmul_vv_i16m1(q5_2, __riscv_vle8_v_i8mf2(q8+32, vl), vl);
-        vint16m1_t p3 = __riscv_vwmul_vv_i16m1(q5_3, __riscv_vle8_v_i8mf2(q8+48, vl), vl);
-
-        vint32m1_t vs_0 = __riscv_vwredsum_vs_i16m1_i32m1(p0, vzero, vl);
-        vint32m1_t vs_1 = __riscv_vwredsum_vs_i16m1_i32m1(p1, vzero, vl);
-        vint32m1_t vs_2 = __riscv_vwredsum_vs_i16m1_i32m1(p2, vzero, vl);
-        vint32m1_t vs_3 = __riscv_vwredsum_vs_i16m1_i32m1(p3, vzero, vl);
-
-        int32_t sumi1 = sc[0] * __riscv_vmv_x_s_i32m1_i32(vs_0);
-        int32_t sumi2 = sc[1] * __riscv_vmv_x_s_i32m1_i32(vs_1);
-        int32_t sumi3 = sc[2] * __riscv_vmv_x_s_i32m1_i32(vs_2);
-        int32_t sumi4 = sc[3] * __riscv_vmv_x_s_i32m1_i32(vs_3);
-
-        sumf += d * (sumi1 + sumi2 + sumi3 + sumi4);
-
-    }
-
-    *s = sumf;
-
-#else
-
-    int8_t aux8[QK_K];
-    int16_t aux16[16];
-    float   sums [8];
-    memset(sums, 0, 8*sizeof(float));
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const uint8_t * restrict q4 = x[i].qs;
-        const uint8_t * restrict hm = x[i].qh;
-        const  int8_t * restrict q8 = y[i].qs;
-        int8_t * restrict a = aux8;
-        for (int l = 0; l < 32; ++l) {
-            a[l+ 0] = q4[l] & 0xF;
-            a[l+32] = q4[l]  >> 4;
-        }
-        for (int is = 0; is < 8; ++is) {
-            uint8_t m = 1 << is;
-            for (int l = 0; l < 8; ++l) a[8*is + l] -= (hm[l] & m ? 0 : 16);
-        }
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const int8_t * restrict sc = x[i].scales;
-
-        for (int j = 0; j < QK_K/16; ++j) {
-            const float dl = d * sc[j];
-            for (int l = 0; l < 16; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l <  8; ++l) sums[l] += dl * (aux16[l] + aux16[8+l]);
-            q8 += 16; a += 16;
-        }
-    }
-    for (int l = 0; l < 8; ++l) sumf += sums[l];
-    *s = sumf;
-#endif
-}
-#endif
-
-
-#if QK_K == 256
-void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
-    assert(n % QK_K == 0);
-
-    const block_q6_K * restrict x = vx;
-    const block_q8_K * restrict y = vy;
-
-    const int nb = n / QK_K;
-
-#ifdef __ARM_NEON
-    float sum = 0;
-
-    const uint8x16_t m4b = vdupq_n_u8(0xF);
-    const int32x4_t  vzero = vdupq_n_s32(0);
-    //const int8x16_t  m32s = vdupq_n_s8(32);
-
-    const uint8x16_t mone = vdupq_n_u8(3);
-
-    ggml_int8x16x4_t q6bytes;
-    ggml_uint8x16x4_t q6h;
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d_all = GGML_FP16_TO_FP32(x[i].d);
-
-        const uint8_t * restrict q6 = x[i].ql;
-        const uint8_t * restrict qh = x[i].qh;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        const int8_t * restrict scale = x[i].scales;
-
-        const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums);
-        const int8x16_t scales = vld1q_s8(scale);
-        const ggml_int16x8x2_t q6scales = {{vmovl_s8(vget_low_s8(scales)), vmovl_s8(vget_high_s8(scales))}};
-
-        const int32x4_t prod = vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums.val[0]), vget_low_s16 (q6scales.val[0])),
-                                                   vmull_s16(vget_high_s16(q8sums.val[0]), vget_high_s16(q6scales.val[0]))),
-                                         vaddq_s32(vmull_s16(vget_low_s16 (q8sums.val[1]), vget_low_s16 (q6scales.val[1])),
-                                                   vmull_s16(vget_high_s16(q8sums.val[1]), vget_high_s16(q6scales.val[1]))));
-        int32_t isum_mins = vaddvq_s32(prod);
-
-        int32_t isum = 0;
-
-        for (int j = 0; j < QK_K/128; ++j) {
-
-            ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh); qh += 32;
-            ggml_uint8x16x4_t q6bits = ggml_vld1q_u8_x4(q6); q6 += 64;
-            ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(q8); q8 += 64;
-
-            q6h.val[0] = vshlq_n_u8(vandq_u8(mone, qhbits.val[0]), 4);
-            q6h.val[1] = vshlq_n_u8(vandq_u8(mone, qhbits.val[1]), 4);
-            uint8x16_t shifted = vshrq_n_u8(qhbits.val[0], 2);
-            q6h.val[2] = vshlq_n_u8(vandq_u8(mone, shifted), 4);
-            shifted = vshrq_n_u8(qhbits.val[1], 2);
-            q6h.val[3] = vshlq_n_u8(vandq_u8(mone, shifted), 4);
-
-            //q6bytes.val[0] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[0], m4b), q6h.val[0])), m32s);
-            //q6bytes.val[1] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[1], m4b), q6h.val[1])), m32s);
-            //q6bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[2], m4b), q6h.val[2])), m32s);
-            //q6bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[3], m4b), q6h.val[3])), m32s);
-            q6bytes.val[0] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[0], m4b), q6h.val[0]));
-            q6bytes.val[1] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[1], m4b), q6h.val[1]));
-            q6bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[2], m4b), q6h.val[2]));
-            q6bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[3], m4b), q6h.val[3]));
-
-            isum += vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
-                    vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
-                    vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
-                    vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
-
-            scale += 4;
-
-            q8bytes = ggml_vld1q_s8_x4(q8); q8 += 64;
-
-            shifted = vshrq_n_u8(qhbits.val[0], 4);
-            q6h.val[0] = vshlq_n_u8(vandq_u8(mone, shifted), 4);
-            shifted = vshrq_n_u8(qhbits.val[1], 4);
-            q6h.val[1] = vshlq_n_u8(vandq_u8(mone, shifted), 4);
-            shifted = vshrq_n_u8(qhbits.val[0], 6);
-            q6h.val[2] = vshlq_n_u8(vandq_u8(mone, shifted), 4);
-            shifted = vshrq_n_u8(qhbits.val[1], 6);
-            q6h.val[3] = vshlq_n_u8(vandq_u8(mone, shifted), 4);
-
-            //q6bytes.val[0] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[0], 4), q6h.val[0])), m32s);
-            //q6bytes.val[1] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[1], 4), q6h.val[1])), m32s);
-            //q6bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[2], 4), q6h.val[2])), m32s);
-            //q6bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[3], 4), q6h.val[3])), m32s);
-            q6bytes.val[0] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[0], 4), q6h.val[0]));
-            q6bytes.val[1] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[1], 4), q6h.val[1]));
-            q6bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[2], 4), q6h.val[2]));
-            q6bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[3], 4), q6h.val[3]));
-
-            isum += vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
-                    vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
-                    vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
-                    vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
-            scale += 4;
-        }
-        //sum += isum * d_all * y[i].d;
-        sum += d_all * y[i].d * (isum - 32 * isum_mins);
-
-    }
-    *s = sum;
-
-#elif defined __AVX2__
-
-    const __m256i m4 = _mm256_set1_epi8(0xF);
-    const __m256i m2 = _mm256_set1_epi8(3);
-    const __m256i m32s = _mm256_set1_epi8(32);
-
-    __m256 acc = _mm256_setzero_ps();
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-
-        const uint8_t * restrict q4 = x[i].ql;
-        const uint8_t * restrict qh = x[i].qh;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales);
-
-        __m256i sumi = _mm256_setzero_si256();
-
-        int is = 0;
-
-        for (int j = 0; j < QK_K/128; ++j) {
-
-            const __m128i scale_0 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 0));
-            const __m128i scale_1 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 1));
-            const __m128i scale_2 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 2));
-            const __m128i scale_3 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 3));
-            is += 4;
-
-            const __m256i q4bits1 = _mm256_loadu_si256((const __m256i*)q4); q4 += 32;
-            const __m256i q4bits2 = _mm256_loadu_si256((const __m256i*)q4); q4 += 32;
-            const __m256i q4bitsH = _mm256_loadu_si256((const __m256i*)qh); qh += 32;
-
-            const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(q4bitsH, m2), 4);
-            const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bitsH, 2), m2), 4);
-            const __m256i q4h_2 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bitsH, 4), m2), 4);
-            const __m256i q4h_3 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bitsH, 6), m2), 4);
-
-            const __m256i q4_0 = _mm256_or_si256(_mm256_and_si256(q4bits1, m4), q4h_0);
-            const __m256i q4_1 = _mm256_or_si256(_mm256_and_si256(q4bits2, m4), q4h_1);
-            const __m256i q4_2 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits1, 4), m4), q4h_2);
-            const __m256i q4_3 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits2, 4), m4), q4h_3);
-
-            const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-            const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-            const __m256i q8_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-            const __m256i q8_3 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-
-            __m256i q8s_0 = _mm256_maddubs_epi16(m32s, q8_0);
-            __m256i q8s_1 = _mm256_maddubs_epi16(m32s, q8_1);
-            __m256i q8s_2 = _mm256_maddubs_epi16(m32s, q8_2);
-            __m256i q8s_3 = _mm256_maddubs_epi16(m32s, q8_3);
-
-            __m256i p16_0 = _mm256_maddubs_epi16(q4_0, q8_0);
-            __m256i p16_1 = _mm256_maddubs_epi16(q4_1, q8_1);
-            __m256i p16_2 = _mm256_maddubs_epi16(q4_2, q8_2);
-            __m256i p16_3 = _mm256_maddubs_epi16(q4_3, q8_3);
-
-            p16_0 = _mm256_sub_epi16(p16_0, q8s_0);
-            p16_1 = _mm256_sub_epi16(p16_1, q8s_1);
-            p16_2 = _mm256_sub_epi16(p16_2, q8s_2);
-            p16_3 = _mm256_sub_epi16(p16_3, q8s_3);
-
-            p16_0 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_0), p16_0);
-            p16_1 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_1), p16_1);
-            p16_2 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_2), p16_2);
-            p16_3 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_3), p16_3);
-
-            sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_0, p16_1));
-            sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_2, p16_3));
-
-        }
-
-        acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi), acc);
-    }
-
-    *s = hsum_float_8(acc);
-
-#elif defined __AVX__
-
-    const __m128i m4 = _mm_set1_epi8(0xF);
-    const __m128i m3 = _mm_set1_epi8(3);
-    const __m128i m32s = _mm_set1_epi8(32);
-    const __m128i m2 = _mm_set1_epi8(2);
-
-    __m256 acc = _mm256_setzero_ps();
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-
-        const uint8_t * restrict q4 = x[i].ql;
-        const uint8_t * restrict qh = x[i].qh;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales);
-
-        __m128i sumi_0 = _mm_setzero_si128();
-        __m128i sumi_1 = _mm_setzero_si128();
-
-        __m128i shuffle = _mm_set_epi64x(0x0101010101010101, 0x0000000000000000);
-        for (int j = 0; j < QK_K/128; ++j) {
-
-            const __m128i q4bitsH_0 = _mm_loadu_si128((const __m128i*)qh); qh += 16;
-            const __m128i q4bitsH_1 = _mm_loadu_si128((const __m128i*)qh); qh += 16;
-
-            const __m128i q4h_0 = _mm_slli_epi16(_mm_and_si128(q4bitsH_0, m3), 4);
-            const __m128i q4h_1 = _mm_slli_epi16(_mm_and_si128(q4bitsH_1, m3), 4);
-            const __m128i q4h_2 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_0, 2), m3), 4);
-            const __m128i q4h_3 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_1, 2), m3), 4);
-            const __m128i q4h_4 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_0, 4), m3), 4);
-            const __m128i q4h_5 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_1, 4), m3), 4);
-            const __m128i q4h_6 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_0, 6), m3), 4);
-            const __m128i q4h_7 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_1, 6), m3), 4);
-
-            const __m128i q4bits1_0 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
-            const __m128i q4bits1_1 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
-            const __m128i q4bits2_0 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
-            const __m128i q4bits2_1 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
-
-            const __m128i q4_0 = _mm_or_si128(_mm_and_si128(q4bits1_0, m4), q4h_0);
-            const __m128i q4_1 = _mm_or_si128(_mm_and_si128(q4bits1_1, m4), q4h_1);
-            const __m128i q4_2 = _mm_or_si128(_mm_and_si128(q4bits2_0, m4), q4h_2);
-            const __m128i q4_3 = _mm_or_si128(_mm_and_si128(q4bits2_1, m4), q4h_3);
-            const __m128i q4_4 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_0, 4), m4), q4h_4);
-            const __m128i q4_5 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_1, 4), m4), q4h_5);
-            const __m128i q4_6 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_0, 4), m4), q4h_6);
-            const __m128i q4_7 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_1, 4), m4), q4h_7);
-
-            const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_2 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_3 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_4 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_5 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_6 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_7 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-
-            __m128i q8s_0 = _mm_maddubs_epi16(m32s, q8_0);
-            __m128i q8s_1 = _mm_maddubs_epi16(m32s, q8_1);
-            __m128i q8s_2 = _mm_maddubs_epi16(m32s, q8_2);
-            __m128i q8s_3 = _mm_maddubs_epi16(m32s, q8_3);
-            __m128i q8s_4 = _mm_maddubs_epi16(m32s, q8_4);
-            __m128i q8s_5 = _mm_maddubs_epi16(m32s, q8_5);
-            __m128i q8s_6 = _mm_maddubs_epi16(m32s, q8_6);
-            __m128i q8s_7 = _mm_maddubs_epi16(m32s, q8_7);
-
-            __m128i p16_0 = _mm_maddubs_epi16(q4_0, q8_0);
-            __m128i p16_1 = _mm_maddubs_epi16(q4_1, q8_1);
-            __m128i p16_2 = _mm_maddubs_epi16(q4_2, q8_2);
-            __m128i p16_3 = _mm_maddubs_epi16(q4_3, q8_3);
-            __m128i p16_4 = _mm_maddubs_epi16(q4_4, q8_4);
-            __m128i p16_5 = _mm_maddubs_epi16(q4_5, q8_5);
-            __m128i p16_6 = _mm_maddubs_epi16(q4_6, q8_6);
-            __m128i p16_7 = _mm_maddubs_epi16(q4_7, q8_7);
-
-            p16_0 = _mm_sub_epi16(p16_0, q8s_0);
-            p16_1 = _mm_sub_epi16(p16_1, q8s_1);
-            p16_2 = _mm_sub_epi16(p16_2, q8s_2);
-            p16_3 = _mm_sub_epi16(p16_3, q8s_3);
-            p16_4 = _mm_sub_epi16(p16_4, q8s_4);
-            p16_5 = _mm_sub_epi16(p16_5, q8s_5);
-            p16_6 = _mm_sub_epi16(p16_6, q8s_6);
-            p16_7 = _mm_sub_epi16(p16_7, q8s_7);
-
-            const __m128i scale_0 = _mm_shuffle_epi8(scales, shuffle);
-            shuffle = _mm_add_epi8(shuffle, m2);
-            const __m128i scale_1 = _mm_shuffle_epi8(scales, shuffle);
-            shuffle = _mm_add_epi8(shuffle, m2);
-            const __m128i scale_2 = _mm_shuffle_epi8(scales, shuffle);
-            shuffle = _mm_add_epi8(shuffle, m2);
-            const __m128i scale_3 = _mm_shuffle_epi8(scales, shuffle);
-            shuffle = _mm_add_epi8(shuffle, m2);
-
-            p16_0 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_0), p16_0);
-            p16_1 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_0, scale_0)), p16_1);
-            p16_2 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_1), p16_2);
-            p16_3 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_1, scale_1)), p16_3);
-            p16_4 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_2), p16_4);
-            p16_5 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_2, scale_2)), p16_5);
-            p16_6 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_3), p16_6);
-            p16_7 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_3, scale_3)), p16_7);
-
-            sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2));
-            sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3));
-            sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_4, p16_6));
-            sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_5, p16_7));
-
-        }
-
-        __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
-        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc);
-    }
-
-    *s = hsum_float_8(acc);
-
-#elif defined __riscv_v_intrinsic
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-
-        const uint8_t * restrict q6 = x[i].ql;
-        const uint8_t * restrict qh = x[i].qh;
-        const  int8_t * restrict q8 = y[i].qs;
-
-        const int8_t * restrict scale = x[i].scales;
-
-        size_t vl;
-
-        vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
-
-        int sum_t = 0;
-        int is = 0;
-
-        for (int j = 0; j < QK_K/128; ++j) {
-
-            vl = 32;
-
-            // load qh
-            vuint8m1_t qh_x = __riscv_vle8_v_u8m1(qh, vl);
-
-            // load Q6
-            vuint8m1_t q6_0 = __riscv_vle8_v_u8m1(q6, vl);
-            vuint8m1_t q6_1 = __riscv_vle8_v_u8m1(q6+32, vl);
-
-            vuint8m1_t q6a_0 = __riscv_vand_vx_u8m1(q6_0, 0x0F, vl);
-            vuint8m1_t q6a_1 = __riscv_vand_vx_u8m1(q6_1, 0x0F, vl);
-            vuint8m1_t q6s_0 = __riscv_vsrl_vx_u8m1(q6_0, 0x04, vl);
-            vuint8m1_t q6s_1 = __riscv_vsrl_vx_u8m1(q6_1, 0x04, vl);
-
-            vuint8m1_t qh_0 = __riscv_vand_vx_u8m1(qh_x, 0x03, vl);
-            vuint8m1_t qh_1 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x2, vl), 0x03 , vl);
-            vuint8m1_t qh_2 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x4, vl), 0x03 , vl);
-            vuint8m1_t qh_3 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x6, vl), 0x03 , vl);
-
-            vuint8m1_t qhi_0 = __riscv_vor_vv_u8m1(q6a_0, __riscv_vsll_vx_u8m1(qh_0, 0x04, vl), vl);
-            vuint8m1_t qhi_1 = __riscv_vor_vv_u8m1(q6a_1, __riscv_vsll_vx_u8m1(qh_1, 0x04, vl), vl);
-            vuint8m1_t qhi_2 = __riscv_vor_vv_u8m1(q6s_0, __riscv_vsll_vx_u8m1(qh_2, 0x04, vl), vl);
-            vuint8m1_t qhi_3 = __riscv_vor_vv_u8m1(q6s_1, __riscv_vsll_vx_u8m1(qh_3, 0x04, vl), vl);
-
-            vint8m1_t a_0 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_0), 32, vl);
-            vint8m1_t a_1 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_1), 32, vl);
-            vint8m1_t a_2 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_2), 32, vl);
-            vint8m1_t a_3 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_3), 32, vl);
-
-            // load Q8 and take product
-            vint16m2_t va_q_0 = __riscv_vwmul_vv_i16m2(a_0, __riscv_vle8_v_i8m1(q8, vl), vl);
-            vint16m2_t va_q_1 = __riscv_vwmul_vv_i16m2(a_1, __riscv_vle8_v_i8m1(q8+32, vl), vl);
-            vint16m2_t va_q_2 = __riscv_vwmul_vv_i16m2(a_2, __riscv_vle8_v_i8m1(q8+64, vl), vl);
-            vint16m2_t va_q_3 = __riscv_vwmul_vv_i16m2(a_3, __riscv_vle8_v_i8m1(q8+96, vl), vl);
-
-            vl = 16;
-
-            vint32m2_t vaux_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_0, 0), scale[is+0], vl);
-            vint32m2_t vaux_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_0, 1), scale[is+1], vl);
-            vint32m2_t vaux_2 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_1, 0), scale[is+2], vl);
-            vint32m2_t vaux_3 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_1, 1), scale[is+3], vl);
-            vint32m2_t vaux_4 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_2, 0), scale[is+4], vl);
-            vint32m2_t vaux_5 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_2, 1), scale[is+5], vl);
-            vint32m2_t vaux_6 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_3, 0), scale[is+6], vl);
-            vint32m2_t vaux_7 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_3, 1), scale[is+7], vl);
-
-            vint32m1_t isum0 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_0, vaux_1, vl), vzero, vl);
-            vint32m1_t isum1 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_2, vaux_3, vl), isum0, vl);
-            vint32m1_t isum2 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_4, vaux_5, vl), isum1, vl);
-            vint32m1_t isum3 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_6, vaux_7, vl), isum2, vl);
-
-            sum_t += __riscv_vmv_x_s_i32m1_i32(isum3);
-
-            q6 += 64;   qh += 32;   q8 += 128;   is=8;
-
-        }
-
-        sumf += d * sum_t;
-
-    }
-
-    *s = sumf;
-
-#else
-
-    int8_t  aux8[QK_K];
-    int16_t aux16[8];
-    float   sums [8];
-    int32_t aux32[8];
-    memset(sums, 0, 8*sizeof(float));
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const uint8_t * restrict q4 = x[i].ql;
-        const uint8_t * restrict qh = x[i].qh;
-        const  int8_t * restrict q8 = y[i].qs;
-        memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t * restrict a = aux8;
-        for (int j = 0; j < QK_K; j += 128) {
-            for (int l = 0; l < 32; ++l) {
-                a[l +  0] = (int8_t)((q4[l +  0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
-                a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
-                a[l + 64] = (int8_t)((q4[l +  0] >>  4) | (((qh[l] >> 4) & 3) << 4)) - 32;
-                a[l + 96] = (int8_t)((q4[l + 32] >>  4) | (((qh[l] >> 6) & 3) << 4)) - 32;
-            }
-            a  += 128;
-            q4 += 64;
-            qh += 32;
-        }
-        a = aux8;
-        int is = 0;
-        for (int j = 0; j < QK_K/16; ++j) {
-            int scale = x[i].scales[is++];
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-        }
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-    }
-    for (int l = 0; l < 8; ++l) sumf += sums[l];
-    *s = sumf;
-#endif
-}
-
-#else
-
-void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
-    assert(n % QK_K == 0);
-
-    const block_q6_K * restrict x = vx;
-    const block_q8_K * restrict y = vy;
-
-    const int nb = n / QK_K;
-
-#ifdef __ARM_NEON
-    float sum = 0;
-
-    const uint8x16_t m4b = vdupq_n_u8(0xF);
-    const int8x16_t  m32s = vdupq_n_s8(32);
-    const int32x4_t  vzero = vdupq_n_s32(0);
-
-    const uint8x16_t mone = vdupq_n_u8(3);
-
-    ggml_int8x16x4_t q6bytes;
-    ggml_uint8x16x4_t q6h;
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d_all = (float)x[i].d;
-
-        const uint8_t * restrict q6 = x[i].ql;
-        const uint8_t * restrict qh = x[i].qh;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        const int8_t * restrict scale = x[i].scales;
-
-        int32_t isum = 0;
-
-        uint8x16_t qhbits = vld1q_u8(qh);
-        ggml_uint8x16x2_t q6bits = ggml_vld1q_u8_x2(q6);
-        ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(q8);
-
-        q6h.val[0] = vshlq_n_u8(vandq_u8(mone, qhbits), 4);
-        uint8x16_t shifted = vshrq_n_u8(qhbits, 2);
-        q6h.val[1] = vshlq_n_u8(vandq_u8(mone, shifted), 4);
-        shifted = vshrq_n_u8(qhbits, 4);
-        q6h.val[2] = vshlq_n_u8(vandq_u8(mone, shifted), 4);
-        shifted = vshrq_n_u8(qhbits, 6);
-        q6h.val[3] = vshlq_n_u8(vandq_u8(mone, shifted), 4);
-
-        q6bytes.val[0] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[0], m4b), q6h.val[0])), m32s);
-        q6bytes.val[1] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[1], m4b), q6h.val[1])), m32s);
-        q6bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[0], 4), q6h.val[2])), m32s);
-        q6bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[1], 4), q6h.val[3])), m32s);
-
-        isum += vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
-                vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
-                vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
-                vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
-
-        sum += isum * d_all * y[i].d;
-
-    }
-    *s = sum;
-
-#elif defined __AVX2__
-
-    const __m256i m4 = _mm256_set1_epi8(0xF);
-    const __m256i m2 = _mm256_set1_epi8(3);
-    const __m256i m32s = _mm256_set1_epi8(32);
-
-    __m256 acc = _mm256_setzero_ps();
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-
-        const uint8_t * restrict q4 = x[i].ql;
-        const uint8_t * restrict qh = x[i].qh;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        const __m64 scales_1 = _mm_set1_pi8(x[i].scales[0]);
-        const __m64 scales_2 = _mm_set1_pi8(x[i].scales[1]);
-        const __m64 scales_3 = _mm_set1_pi8(x[i].scales[2]);
-        const __m64 scales_4 = _mm_set1_pi8(x[i].scales[3]);
-
-        __m256i sumi = _mm256_setzero_si256();
-
-        const __m128i scale_0 = _mm_set_epi64(scales_2, scales_1);
-        const __m128i scale_1 = _mm_set_epi64(scales_4, scales_3);
-
-        const __m256i q4bits1 = _mm256_loadu_si256((const __m256i*)q4);
-        const __m128i q4bitsH = _mm_loadu_si128((const __m128i*)qh);
-
-        const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q4bitsH, 2), q4bitsH), m2), 4);
-        const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q4bitsH, 6), _mm_srli_epi16(q4bitsH, 4)), m2), 4);
-
-        const __m256i q4_0 = _mm256_or_si256(_mm256_and_si256(q4bits1, m4), q4h_0);
-        const __m256i q4_1 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits1, 4), m4), q4h_1);
-
-        const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
-        const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
-
-        __m256i q8s_0 = _mm256_maddubs_epi16(m32s, q8_0);
-        __m256i q8s_1 = _mm256_maddubs_epi16(m32s, q8_1);
-
-        __m256i p16_0 = _mm256_maddubs_epi16(q4_0, q8_0);
-        __m256i p16_1 = _mm256_maddubs_epi16(q4_1, q8_1);
-
-        p16_0 = _mm256_sub_epi16(p16_0, q8s_0);
-        p16_1 = _mm256_sub_epi16(p16_1, q8s_1);
-
-        p16_0 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_0), p16_0);
-        p16_1 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_1), p16_1);
-
-        sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_0, p16_1));
-
-        acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi), acc);
-    }
-
-    *s = hsum_float_8(acc);
-
-#elif defined __AVX__
-
-    const __m128i m4 = _mm_set1_epi8(0xF);
-    const __m128i m2 = _mm_set1_epi8(3);
-    const __m128i m32s = _mm_set1_epi8(32);
-
-    __m256 acc = _mm256_setzero_ps();
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-
-        const uint8_t * restrict q4 = x[i].ql;
-        const uint8_t * restrict qh = x[i].qh;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        const __m64 scales_1 = _mm_set1_pi8(x[i].scales[0]);
-        const __m64 scales_2 = _mm_set1_pi8(x[i].scales[1]);
-        const __m64 scales_3 = _mm_set1_pi8(x[i].scales[2]);
-        const __m64 scales_4 = _mm_set1_pi8(x[i].scales[3]);
-
-        __m128i sumi_0 = _mm_setzero_si128();
-        __m128i sumi_1 = _mm_setzero_si128();
-
-        const __m128i scale_0 = _mm_set_epi64(scales_2, scales_1);
-        const __m128i scale_1 = _mm_set_epi64(scales_4, scales_3);
-
-        const __m256i q4bits1 = _mm256_loadu_si256((const __m256i*)q4);
-        const __m128i q4bitsH = _mm_loadu_si128((const __m128i*)qh);
-
-        const __m128i q4h_0 = _mm_slli_epi16(_mm_and_si128(q4bitsH, m2), 4);
-        const __m128i q4h_1 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH, 2), m2), 4);
-        const __m128i q4h_2 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH, 4), m2), 4);
-        const __m128i q4h_3 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH, 6), m2), 4);
-
-        const __m128i q4_0 = _mm_or_si128(_mm_and_si128(_mm256_extractf128_si256(q4bits1, 0), m4), q4h_0);
-        const __m128i q4_1 = _mm_or_si128(_mm_and_si128(_mm256_extractf128_si256(q4bits1, 1), m4), q4h_1);
-        const __m128i q4_2 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(_mm256_extractf128_si256(q4bits1, 0), 4), m4), q4h_2);
-        const __m128i q4_3 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(_mm256_extractf128_si256(q4bits1, 1), 4), m4), q4h_3);
-
-        const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
-        const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
-
-        __m128i q8s_0 = _mm_maddubs_epi16(m32s, _mm256_extractf128_si256(q8_0, 0));
-        __m128i q8s_1 = _mm_maddubs_epi16(m32s, _mm256_extractf128_si256(q8_0, 1));
-        __m128i q8s_2 = _mm_maddubs_epi16(m32s, _mm256_extractf128_si256(q8_1, 0));
-        __m128i q8s_3 = _mm_maddubs_epi16(m32s, _mm256_extractf128_si256(q8_1, 1));
-
-        __m128i p16_0 = _mm_maddubs_epi16(q4_0, _mm256_extractf128_si256(q8_0, 0));
-        __m128i p16_1 = _mm_maddubs_epi16(q4_1, _mm256_extractf128_si256(q8_0, 1));
-        __m128i p16_2 = _mm_maddubs_epi16(q4_2, _mm256_extractf128_si256(q8_1, 0));
-        __m128i p16_3 = _mm_maddubs_epi16(q4_3, _mm256_extractf128_si256(q8_1, 1));
-
-        p16_0 = _mm_sub_epi16(p16_0, q8s_0);
-        p16_1 = _mm_sub_epi16(p16_1, q8s_1);
-        p16_2 = _mm_sub_epi16(p16_2, q8s_2);
-        p16_3 = _mm_sub_epi16(p16_3, q8s_3);
-
-        p16_0 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_0), p16_0);
-        p16_1 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_0, scale_0)), p16_1);
-        p16_2 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_1), p16_2);
-        p16_3 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_1, scale_1)), p16_3);
-
-        sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2));
-        sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3));
-
-        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(MM256_SET_M128I(sumi_1, sumi_0))), acc);
-    }
-
-    *s = hsum_float_8(acc);
-
-#elif defined __riscv_v_intrinsic
-
-    float sumf = 0;
-
-    for (int i = 0; i < nb; ++i) {
-
-        const float d_all = (float)x[i].d;
-
-        const uint8_t * restrict q6 = x[i].ql;
-        const uint8_t * restrict qh = x[i].qh;
-        const int8_t  * restrict q8 = y[i].qs;
-
-        const int8_t * restrict scale = x[i].scales;
-
-        int32_t isum = 0;
-
-        size_t vl = 16;
-
-        vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
-
-        // load Q6
-        vuint8mf2_t q6_0 = __riscv_vle8_v_u8mf2(q6, vl);
-        vuint8mf2_t q6_1 = __riscv_vle8_v_u8mf2(q6+16, vl);
-
-        // load qh
-        vuint8mf2_t qh_x = __riscv_vle8_v_u8mf2(qh, vl);
-
-        vuint8mf2_t qh0 = __riscv_vsll_vx_u8mf2(__riscv_vand_vx_u8mf2(qh_x, 0x3, vl), 0x4, vl);
-        qh_x = __riscv_vsrl_vx_u8mf2(qh_x, 0x2, vl);
-        vuint8mf2_t qh1 = __riscv_vsll_vx_u8mf2(__riscv_vand_vx_u8mf2(qh_x, 0x3, vl), 0x4, vl);
-        qh_x = __riscv_vsrl_vx_u8mf2(qh_x, 0x2, vl);
-        vuint8mf2_t qh2 = __riscv_vsll_vx_u8mf2(__riscv_vand_vx_u8mf2(qh_x, 0x3, vl), 0x4, vl);
-        qh_x = __riscv_vsrl_vx_u8mf2(qh_x, 0x2, vl);
-        vuint8mf2_t qh3 = __riscv_vsll_vx_u8mf2(__riscv_vand_vx_u8mf2(qh_x, 0x3, vl), 0x4, vl);
-
-        vuint8mf2_t q6h_0 = __riscv_vor_vv_u8mf2(__riscv_vand_vx_u8mf2(q6_0, 0xF, vl), qh0, vl);
-        vuint8mf2_t q6h_1 = __riscv_vor_vv_u8mf2(__riscv_vand_vx_u8mf2(q6_1, 0xF, vl), qh1, vl);
-        vuint8mf2_t q6h_2 = __riscv_vor_vv_u8mf2(__riscv_vsrl_vx_u8mf2(q6_0, 0x4, vl), qh2, vl);
-        vuint8mf2_t q6h_3 = __riscv_vor_vv_u8mf2(__riscv_vsrl_vx_u8mf2(q6_1, 0x4, vl), qh3, vl);
-
-        vint8mf2_t q6v_0 = __riscv_vsub_vx_i8mf2(__riscv_vreinterpret_v_u8mf2_i8mf2(q6h_0), 32, vl);
-        vint8mf2_t q6v_1 = __riscv_vsub_vx_i8mf2(__riscv_vreinterpret_v_u8mf2_i8mf2(q6h_1), 32, vl);
-        vint8mf2_t q6v_2 = __riscv_vsub_vx_i8mf2(__riscv_vreinterpret_v_u8mf2_i8mf2(q6h_2), 32, vl);
-        vint8mf2_t q6v_3 = __riscv_vsub_vx_i8mf2(__riscv_vreinterpret_v_u8mf2_i8mf2(q6h_3), 32, vl);
-
-        // load Q8 and take product
-        vint16m1_t p0 = __riscv_vwmul_vv_i16m1(q6v_0, __riscv_vle8_v_i8mf2(q8, vl), vl);
-        vint16m1_t p1 = __riscv_vwmul_vv_i16m1(q6v_1, __riscv_vle8_v_i8mf2(q8+16, vl), vl);
-        vint16m1_t p2 = __riscv_vwmul_vv_i16m1(q6v_2, __riscv_vle8_v_i8mf2(q8+32, vl), vl);
-        vint16m1_t p3 = __riscv_vwmul_vv_i16m1(q6v_3, __riscv_vle8_v_i8mf2(q8+48, vl), vl);
-
-        vint32m1_t vs_0 = __riscv_vwredsum_vs_i16m1_i32m1(p0, vzero, vl);
-        vint32m1_t vs_1 = __riscv_vwredsum_vs_i16m1_i32m1(p1, vzero, vl);
-        vint32m1_t vs_2 = __riscv_vwredsum_vs_i16m1_i32m1(p2, vzero, vl);
-        vint32m1_t vs_3 = __riscv_vwredsum_vs_i16m1_i32m1(p3, vzero, vl);
-
-        isum += __riscv_vmv_x_s_i32m1_i32(vs_0) * scale[0];
-        isum += __riscv_vmv_x_s_i32m1_i32(vs_1) * scale[1];
-        isum += __riscv_vmv_x_s_i32m1_i32(vs_2) * scale[2];
-        isum += __riscv_vmv_x_s_i32m1_i32(vs_3) * scale[3];
-
-        sumf += isum * d_all * y[i].d;
-
-    }
-
-    *s = sumf;
-
-#else
-
-    int8_t  aux8[QK_K];
-    int16_t aux16[8];
-    float   sums [8];
-    int32_t aux32[8];
-    memset(sums, 0, 8*sizeof(float));
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const uint8_t * restrict q4 = x[i].ql;
-        const uint8_t * restrict qh = x[i].qh;
-        const  int8_t * restrict q8 = y[i].qs;
-        memset(aux32, 0, 8*sizeof(int32_t));
-        int8_t * restrict a = aux8;
-        for (int l = 0; l < 16; ++l) {
-            a[l+ 0] = (int8_t)((q4[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
-            a[l+16] = (int8_t)((q4[l+16] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
-            a[l+32] = (int8_t)((q4[l+ 0] >>  4) | (((qh[l] >> 4) & 3) << 4)) - 32;
-            a[l+48] = (int8_t)((q4[l+16] >>  4) | (((qh[l] >> 6) & 3) << 4)) - 32;
-        }
-        int is = 0;
-        for (int j = 0; j < QK_K/16; ++j) {
-            int scale = x[i].scales[is++];
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
-            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
-            q8 += 8; a += 8;
-        }
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-    }
-    for (int l = 0; l < 8; ++l) sumf += sums[l];
-    *s = sumf;
-#endif
-}
-
-#endif
-
-static const int8_t keven_signs_q2xs[1024] = {
-     1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1, -1,  1, -1,  1,  1,  1,  1,  1, -1, -1, -1,  1,  1,  1,  1,  1,  1,
-     1,  1, -1,  1,  1,  1,  1, -1, -1,  1, -1,  1,  1,  1,  1,  1,  1, -1, -1,  1,  1,  1,  1,  1, -1, -1, -1,  1,  1,  1,  1, -1,
-     1,  1,  1, -1,  1,  1,  1, -1, -1,  1,  1, -1,  1,  1,  1,  1,  1, -1,  1, -1,  1,  1,  1,  1, -1, -1,  1, -1,  1,  1,  1, -1,
-     1,  1, -1, -1,  1,  1,  1,  1, -1,  1, -1, -1,  1,  1,  1, -1,  1, -1, -1, -1,  1,  1,  1, -1, -1, -1, -1, -1,  1,  1,  1,  1,
-     1,  1,  1,  1, -1,  1,  1, -1, -1,  1,  1,  1, -1,  1,  1,  1,  1, -1,  1,  1, -1,  1,  1,  1, -1, -1,  1,  1, -1,  1,  1, -1,
-     1,  1, -1,  1, -1,  1,  1,  1, -1,  1, -1,  1, -1,  1,  1, -1,  1, -1, -1,  1, -1,  1,  1, -1, -1, -1, -1,  1, -1,  1,  1,  1,
-     1,  1,  1, -1, -1,  1,  1,  1, -1,  1,  1, -1, -1,  1,  1, -1,  1, -1,  1, -1, -1,  1,  1, -1, -1, -1,  1, -1, -1,  1,  1,  1,
-     1,  1, -1, -1, -1,  1,  1, -1, -1,  1, -1, -1, -1,  1,  1,  1,  1, -1, -1, -1, -1,  1,  1,  1, -1, -1, -1, -1, -1,  1,  1, -1,
-     1,  1,  1,  1,  1, -1,  1, -1, -1,  1,  1,  1,  1, -1,  1,  1,  1, -1,  1,  1,  1, -1,  1,  1, -1, -1,  1,  1,  1, -1,  1, -1,
-     1,  1, -1,  1,  1, -1,  1,  1, -1,  1, -1,  1,  1, -1,  1, -1,  1, -1, -1,  1,  1, -1,  1, -1, -1, -1, -1,  1,  1, -1,  1,  1,
-     1,  1,  1, -1,  1, -1,  1,  1, -1,  1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1, -1, -1,  1, -1,  1, -1,  1,  1,
-     1,  1, -1, -1,  1, -1,  1, -1, -1,  1, -1, -1,  1, -1,  1,  1,  1, -1, -1, -1,  1, -1,  1,  1, -1, -1, -1, -1,  1, -1,  1, -1,
-     1,  1,  1,  1, -1, -1,  1,  1, -1,  1,  1,  1, -1, -1,  1, -1,  1, -1,  1,  1, -1, -1,  1, -1, -1, -1,  1,  1, -1, -1,  1,  1,
-     1,  1, -1,  1, -1, -1,  1, -1, -1,  1, -1,  1, -1, -1,  1,  1,  1, -1, -1,  1, -1, -1,  1,  1, -1, -1, -1,  1, -1, -1,  1, -1,
-     1,  1,  1, -1, -1, -1,  1, -1, -1,  1,  1, -1, -1, -1,  1,  1,  1, -1,  1, -1, -1, -1,  1,  1, -1, -1,  1, -1, -1, -1,  1, -1,
-     1,  1, -1, -1, -1, -1,  1,  1, -1,  1, -1, -1, -1, -1,  1, -1,  1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1,  1,  1,
-     1,  1,  1,  1,  1,  1, -1, -1, -1,  1,  1,  1,  1,  1, -1,  1,  1, -1,  1,  1,  1,  1, -1,  1, -1, -1,  1,  1,  1,  1, -1, -1,
-     1,  1, -1,  1,  1,  1, -1,  1, -1,  1, -1,  1,  1,  1, -1, -1,  1, -1, -1,  1,  1,  1, -1, -1, -1, -1, -1,  1,  1,  1, -1,  1,
-     1,  1,  1, -1,  1,  1, -1,  1, -1,  1,  1, -1,  1,  1, -1, -1,  1, -1,  1, -1,  1,  1, -1, -1, -1, -1,  1, -1,  1,  1, -1,  1,
-     1,  1, -1, -1,  1,  1, -1, -1, -1,  1, -1, -1,  1,  1, -1,  1,  1, -1, -1, -1,  1,  1, -1,  1, -1, -1, -1, -1,  1,  1, -1, -1,
-     1,  1,  1,  1, -1,  1, -1,  1, -1,  1,  1,  1, -1,  1, -1, -1,  1, -1,  1,  1, -1,  1, -1, -1, -1, -1,  1,  1, -1,  1, -1,  1,
-     1,  1, -1,  1, -1,  1, -1, -1, -1,  1, -1,  1, -1,  1, -1,  1,  1, -1, -1,  1, -1,  1, -1,  1, -1, -1, -1,  1, -1,  1, -1, -1,
-     1,  1,  1, -1, -1,  1, -1, -1, -1,  1,  1, -1, -1,  1, -1,  1,  1, -1,  1, -1, -1,  1, -1,  1, -1, -1,  1, -1, -1,  1, -1, -1,
-     1,  1, -1, -1, -1,  1, -1,  1, -1,  1, -1, -1, -1,  1, -1, -1,  1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1,  1,
-     1,  1,  1,  1,  1, -1, -1,  1, -1,  1,  1,  1,  1, -1, -1, -1,  1, -1,  1,  1,  1, -1, -1, -1, -1, -1,  1,  1,  1, -1, -1,  1,
-     1,  1, -1,  1,  1, -1, -1, -1, -1,  1, -1,  1,  1, -1, -1,  1,  1, -1, -1,  1,  1, -1, -1,  1, -1, -1, -1,  1,  1, -1, -1, -1,
-     1,  1,  1, -1,  1, -1, -1, -1, -1,  1,  1, -1,  1, -1, -1,  1,  1, -1,  1, -1,  1, -1, -1,  1, -1, -1,  1, -1,  1, -1, -1, -1,
-     1,  1, -1, -1,  1, -1, -1,  1, -1,  1, -1, -1,  1, -1, -1, -1,  1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1,  1,
-     1,  1,  1,  1, -1, -1, -1, -1, -1,  1,  1,  1, -1, -1, -1,  1,  1, -1,  1,  1, -1, -1, -1,  1, -1, -1,  1,  1, -1, -1, -1, -1,
-     1,  1, -1,  1, -1, -1, -1,  1, -1,  1, -1,  1, -1, -1, -1, -1,  1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1,  1,
-     1,  1,  1, -1, -1, -1, -1,  1, -1,  1,  1, -1, -1, -1, -1, -1,  1, -1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1,  1,
-     1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1,  1,  1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1, -1,
-};
-
-void ggml_vec_dot_iq2_xxs_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
-    assert(n % QK_K == 0);
-
-    const block_iq2_xxs * restrict x = vx;
-    const block_q8_K    * restrict y = vy;
-
-    const int nb = n / QK_K;
-
-#if defined(__ARM_NEON)
-
-    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
-
-    uint32_t aux32[4];
-    const uint8_t * aux8 = (const uint8_t *)aux32;
-
-    ggml_int8x16x4_t q2u;
-    ggml_int8x16x4_t q2s;
-    ggml_int8x16x4_t q8b;
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint16_t * restrict q2 = x[i].qs;
-        const int8_t   * restrict q8 = y[i].qs;
-        float sumf1 = 0, sumf2 = 0;
-        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
-            q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
-            memcpy(aux32, q2, 4*sizeof(uint32_t)); q2 += 8;
-            q2u.val[0] = vcombine_s8(vld1_s8((const void *)(iq2xxs_grid + aux8[ 0])), vld1_s8((const void *)(iq2xxs_grid + aux8[ 1])));
-            q2u.val[1] = vcombine_s8(vld1_s8((const void *)(iq2xxs_grid + aux8[ 2])), vld1_s8((const void *)(iq2xxs_grid + aux8[ 3])));
-            q2u.val[2] = vcombine_s8(vld1_s8((const void *)(iq2xxs_grid + aux8[ 8])), vld1_s8((const void *)(iq2xxs_grid + aux8[ 9])));
-            q2u.val[3] = vcombine_s8(vld1_s8((const void *)(iq2xxs_grid + aux8[10])), vld1_s8((const void *)(iq2xxs_grid + aux8[11])));
-            q2s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >>  0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >>  7) & 127))));
-            q2s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >> 21) & 127))));
-            q2s.val[2] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[3] >>  0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[3] >>  7) & 127))));
-            q2s.val[3] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[3] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[3] >> 21) & 127))));
-            q2u.val[0] = vmulq_s8(q2u.val[0], q2s.val[0]);
-            q2u.val[1] = vmulq_s8(q2u.val[1], q2s.val[1]);
-            q2u.val[2] = vmulq_s8(q2u.val[2], q2s.val[2]);
-            q2u.val[3] = vmulq_s8(q2u.val[3], q2s.val[3]);
-            const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[0], q8b.val[0]), q2u.val[1], q8b.val[1]);
-            const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[2], q8b.val[2]), q2u.val[3], q8b.val[3]);
-            sumf1 += vaddvq_s32(p1) * (0.5f + (aux32[1] >> 28));
-            sumf2 += vaddvq_s32(p2) * (0.5f + (aux32[3] >> 28));
-        }
-        sumf += d*(sumf1 + sumf2);
-    }
-    *s = 0.25f * sumf;
-
-#elif defined(__AVX2__)
-
-    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
-
-    uint32_t aux32[4];
-    const uint8_t * aux8 = (const uint8_t *)aux32;
-
-    __m256 accumf = _mm256_setzero_ps();
-    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint16_t * restrict q2 = x[i].qs;
-        const int8_t   * restrict q8 = y[i].qs;
-        __m256i sumi1 = _mm256_setzero_si256();
-        __m256i sumi2 = _mm256_setzero_si256();
-        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
-            const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
-            const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
-            memcpy(aux32, q2, 4*sizeof(uint32_t)); q2 += 8;
-            const __m256i q2_1 = _mm256_set_epi64x(iq2xxs_grid[aux8[ 3]], iq2xxs_grid[aux8[ 2]], iq2xxs_grid[aux8[1]], iq2xxs_grid[aux8[0]]);
-            const __m256i q2_2 = _mm256_set_epi64x(iq2xxs_grid[aux8[11]], iq2xxs_grid[aux8[10]], iq2xxs_grid[aux8[9]], iq2xxs_grid[aux8[8]]);
-            const __m256i s2_1 = _mm256_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127],
-                                                   signs64[(aux32[1] >>  7) & 127], signs64[(aux32[1] >>  0) & 127]);
-            const __m256i s2_2 = _mm256_set_epi64x(signs64[(aux32[3] >> 21) & 127], signs64[(aux32[3] >> 14) & 127],
-                                                   signs64[(aux32[3] >>  7) & 127], signs64[(aux32[3] >>  0) & 127]);
-            const __m256i q8s_1 = _mm256_sign_epi8(q8_1, s2_1);
-            const __m256i q8s_2 = _mm256_sign_epi8(q8_2, s2_2);
-            const __m256i dot1  = _mm256_maddubs_epi16(q2_1, q8s_1);
-            const __m256i dot2  = _mm256_maddubs_epi16(q2_2, q8s_2);
-            const uint16_t ls1 = aux32[1] >> 28;
-            const uint16_t ls2 = aux32[3] >> 28;
-            const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1));
-            const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1));
-            sumi1 = _mm256_add_epi32(sumi1, p1);
-            sumi2 = _mm256_add_epi32(sumi2, p2);
-        }
-
-        accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);
-
-    }
-
-    *s = 0.125f * hsum_float_8(accumf);
-
-#else
-
-    uint32_t aux32[2];
-    const uint8_t * aux8 = (const uint8_t *)aux32;
-
-    float sumf = 0.f;
-    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint16_t * restrict q2 = x[i].qs;
-        const int8_t   * restrict q8 = y[i].qs;
-        int32_t bsum = 0;
-        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
-            memcpy(aux32, q2, 2*sizeof(uint32_t));
-            q2 += 4;
-            const uint32_t ls = 2*(aux32[1] >> 28) + 1;
-            int32_t sumi = 0;
-            for (int l = 0; l < 4; ++l) {
-                const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
-                const uint8_t  signs = ksigns_iq2xs[(aux32[1] >> 7*l) & 127];
-                for (int j = 0; j < 8; ++j) {
-                    sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
-                }
-                q8 += 8;
-            }
-            bsum += sumi * ls;
-        }
-        sumf += d * bsum;
-    }
-    *s = 0.125f * sumf;
-#endif
-}
-
-void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
-    assert(n % QK_K == 0);
-
-    const block_iq2_xs * restrict x = vx;
-    const block_q8_K   * restrict y = vy;
-
-    const int nb = n / QK_K;
-
-#if defined(__ARM_NEON)
-
-    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
-
-    ggml_int8x16x4_t q2u;
-    ggml_int8x16x4_t q2s;
-    ggml_int8x16x4_t q8b;
-
-    int32x4x4_t scales32;
-
-    float sumf = 0;
-    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint16_t * restrict q2 = x[i].qs;
-        const int8_t   * restrict q8 = y[i].qs;
-        const uint8x8_t scales8 = vld1_u8(x[i].scales);
-        const uint8x8_t scales_l = vand_u8(scales8, vdup_n_u8(0xf));
-        const uint8x8_t scales_h = vshr_n_u8(scales8, 4);
-        uint8x16_t scales = vcombine_u8(vzip1_u8(scales_l, scales_h), vzip2_u8(scales_l, scales_h));
-        scales = vaddq_u8(vshlq_n_u8(scales, 1), vdupq_n_u8(1));
-        const uint16x8_t scales1 = vmovl_u8(vget_low_u8(scales));
-        const uint16x8_t scales2 = vmovl_u8(vget_high_u8(scales));
-        scales32.val[0] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(scales1)));
-        scales32.val[1] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(scales1)));
-        scales32.val[2] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(scales2)));
-        scales32.val[3] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(scales2)));
-        int32x4_t sumi = vdupq_n_s32(0);
-        for (int ib64 = 0; ib64 < QK_K/64; ++ib64) {
-            q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
-            q2u.val[0] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[0] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[1] & 511))));
-            q2u.val[1] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[2] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[3] & 511))));
-            q2u.val[2] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[4] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[5] & 511))));
-            q2u.val[3] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[6] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[7] & 511))));
-            q2s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[0] >> 9))), vld1_s8((const void *)(signs64 + (q2[1] >> 9))));
-            q2s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[2] >> 9))), vld1_s8((const void *)(signs64 + (q2[3] >> 9))));
-            q2s.val[2] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[4] >> 9))), vld1_s8((const void *)(signs64 + (q2[5] >> 9))));
-            q2s.val[3] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[6] >> 9))), vld1_s8((const void *)(signs64 + (q2[7] >> 9))));
-            q2u.val[0] = vmulq_s8(q2u.val[0], q2s.val[0]);
-            q2u.val[1] = vmulq_s8(q2u.val[1], q2s.val[1]);
-            q2u.val[2] = vmulq_s8(q2u.val[2], q2s.val[2]);
-            q2u.val[3] = vmulq_s8(q2u.val[3], q2s.val[3]);
-            const int32x4_t p1 = ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[0], q8b.val[0]);
-            const int32x4_t p2 = ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[1], q8b.val[1]);
-            const int32x4_t p3 = ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[2], q8b.val[2]);
-            const int32x4_t p4 = ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[3], q8b.val[3]);
-            const int32x4_t p = vpaddq_s32(vpaddq_s32(p1, p2), vpaddq_s32(p3, p4));
-            sumi = vmlaq_s32(sumi, p, scales32.val[ib64]);
-            q2 += 8;
-        }
-        sumf += d*vaddvq_s32(sumi);
-    }
-    *s = 0.125f * sumf;
-
-#elif defined(__AVX2__)
-
-    const __m128i m4 = _mm_set1_epi8(0xf);
-    const __m128i m1 = _mm_set1_epi8(1);
-    const __m128i m511 = _mm_set1_epi16(511);
-    const __m128i m127 = _mm_set1_epi16(127);
-
-    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
-
-    uint64_t aux64;
-
-    // somewhat hacky, but gives a significant boost in performance
-    __m128i aux_gindex, aux_sindex;
-    const uint16_t * gindex = (const uint16_t *)&aux_gindex;
-    const uint16_t * sindex = (const uint16_t *)&aux_sindex;
-
-    __m256 accumf = _mm256_setzero_ps();
-    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint16_t * restrict q2 = x[i].qs;
-        const int8_t   * restrict q8 = y[i].qs;
-
-        memcpy(&aux64, x[i].scales, 8);
-        __m128i stmp = _mm_set1_epi64x(aux64);
-        stmp = _mm_unpacklo_epi8(_mm_and_si128(stmp, m4), _mm_and_si128(_mm_srli_epi16(stmp, 4), m4));
-        const __m128i scales = _mm_add_epi8(_mm_slli_epi16(stmp, 1), m1);
-
-        __m256i sumi1 = _mm256_setzero_si256();
-        __m256i sumi2 = _mm256_setzero_si256();
-        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
-            const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
-            const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
-            const __m128i q2_data = _mm_loadu_si128((const __m128i*)q2);  q2 +=  8;
-            aux_gindex = _mm_and_si128(q2_data, m511);
-            aux_sindex = _mm_and_si128(_mm_srli_epi16(q2_data, 9), m127);
-            const __m256i q2_1 = _mm256_set_epi64x(iq2xs_grid[gindex[3]], iq2xs_grid[gindex[2]], iq2xs_grid[gindex[1]], iq2xs_grid[gindex[0]]);
-            const __m256i q2_2 = _mm256_set_epi64x(iq2xs_grid[gindex[7]], iq2xs_grid[gindex[6]], iq2xs_grid[gindex[5]], iq2xs_grid[gindex[4]]);
-            const __m256i s2_1 = _mm256_set_epi64x(signs64[sindex[3]], signs64[sindex[2]], signs64[sindex[1]], signs64[sindex[0]]);
-            const __m256i s2_2 = _mm256_set_epi64x(signs64[sindex[7]], signs64[sindex[6]], signs64[sindex[5]], signs64[sindex[4]]);
-            const __m256i q8s_1 = _mm256_sign_epi8(q8_1, s2_1);
-            const __m256i q8s_2 = _mm256_sign_epi8(q8_2, s2_2);
-            const __m256i dot1  = _mm256_maddubs_epi16(q2_1, q8s_1);
-            const __m256i dot2  = _mm256_maddubs_epi16(q2_2, q8s_2);
-
-            const __m256i sc1 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+0)));
-            const __m256i sc2 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+1)));
-
-            sumi1 = _mm256_add_epi32(sumi1, _mm256_madd_epi16(dot1, sc1));
-            sumi2 = _mm256_add_epi32(sumi2, _mm256_madd_epi16(dot2, sc2));
-        }
-
-        accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);
-
-    }
-
-    *s = 0.125f * hsum_float_8(accumf);
-
-#else
-
-    float sumf = 0.f;
-    for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-        const uint16_t * restrict q2 = x[i].qs;
-        const uint8_t  * restrict sc = x[i].scales;
-        const int8_t   * restrict q8 = y[i].qs;
-        int32_t bsum = 0;
-        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
-            const uint16_t ls1 = 2*(sc[ib32] & 0xf) + 1;
-            const uint16_t ls2 = 2*(sc[ib32] >>  4) + 1;
-            int32_t sumi = 0;
-            for (int l = 0; l < 2; ++l) {
-                const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
-                const uint8_t  signs = ksigns_iq2xs[q2[l] >> 9];
-                for (int j = 0; j < 8; ++j) {
-                    sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
-                }
-                q8 += 8;
-            }
-            bsum += sumi * ls1;
-            sumi = 0;
-            for (int l = 2; l < 4; ++l) {
-                const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
-                const uint8_t  signs = ksigns_iq2xs[q2[l] >> 9];
-                for (int j = 0; j < 8; ++j) {
-                    sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
-                }
-                q8 += 8;
-            }
-            bsum += sumi * ls2;
-            q2 += 4;
-        }
-        sumf += d * bsum;
-    }
-    *s = 0.125f * sumf;
-#endif
-}
diff --git a/ggml/src/ggml-quants.h b/ggml/src/ggml-quants.h
deleted file mode 100644
index df5e7ae..0000000
--- a/ggml/src/ggml-quants.h
+++ /dev/null
@@ -1,248 +0,0 @@
-#pragma once
-
-#include "ggml-impl.h"
-
-// GGML internal header
-
-#include <stdint.h>
-#include <stddef.h>
-
-#define QK4_0 32
-typedef struct {
-    ggml_fp16_t d;          // delta
-    uint8_t qs[QK4_0 / 2];  // nibbles / quants
-} block_q4_0;
-static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding");
-
-#define QK4_1 32
-typedef struct {
-    ggml_fp16_t d;          // delta
-    ggml_fp16_t m;          // min
-    uint8_t qs[QK4_1 / 2];  // nibbles / quants
-} block_q4_1;
-static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_fp16_t) + QK4_1 / 2, "wrong q4_1 block size/padding");
-
-#define QK5_0 32
-typedef struct {
-    ggml_fp16_t d;         // delta
-    uint8_t qh[4];         // 5-th bit of quants
-    uint8_t qs[QK5_0 / 2]; // nibbles / quants
-} block_q5_0;
-static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");
-
-#define QK5_1 32
-typedef struct {
-    ggml_fp16_t d;         // delta
-    ggml_fp16_t m;         // min
-    uint8_t qh[4];         // 5-th bit of quants
-    uint8_t qs[QK5_1 / 2]; // nibbles / quants
-} block_q5_1;
-static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
-
-#define QK8_0 32
-typedef struct {
-    ggml_fp16_t d;         // delta
-    int8_t  qs[QK8_0];     // quants
-} block_q8_0;
-static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");
-
-#define QK8_1 32
-typedef struct {
-    float d;               // delta
-    float s;               // d * sum(qs[i])
-    int8_t  qs[QK8_1];     // quants
-} block_q8_1;
-static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block size/padding");
-
-//
-// Super-block quantization structures
-//
-
-// Super-block size
-#ifdef GGML_QKK_64
-#define QK_K 64
-#define K_SCALE_SIZE 4
-#else
-#define QK_K 256
-#define K_SCALE_SIZE 12
-#endif
-
-// 2-bit quantization
-// weight is represented as x = a * q + b
-// 16 blocks of 16 elements each
-// Effectively 2.625 bits per weight
-typedef struct {
-    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
-    uint8_t qs[QK_K/4];      // quants
-    ggml_fp16_t d;           // super-block scale for quantized scales
-    ggml_fp16_t dmin;        // super-block scale for quantized mins
-} block_q2_K;
-static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
-
-// 3-bit quantization
-// weight is represented as x = a * q
-// 16 blocks of 16 elements each
-// Effectively 3.4375 bits per weight
-#ifdef GGML_QKK_64
-typedef struct {
-    uint8_t hmask[QK_K/8];     // quants - high bit
-    uint8_t qs[QK_K/4];        // quants - low 2 bits
-    uint8_t scales[2];
-    ggml_fp16_t d;             // super-block scale
-} block_q3_K;
-static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding");
-#else
-typedef struct {
-    uint8_t hmask[QK_K/8];     // quants - high bit
-    uint8_t qs[QK_K/4];        // quants - low 2 bits
-    uint8_t scales[12];        // scales, quantized with 6 bits
-    ggml_fp16_t d;             // super-block scale
-} block_q3_K;
-static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding");
-#endif
-
-// 4-bit quantization
-// 8 blocks of 32 elements each
-// weight is represented as x = a * q + b
-// Effectively 4.5 bits per weight
-#ifdef GGML_QKK_64
-typedef struct {
-    ggml_fp16_t d[2];          // super-block scales/mins
-    uint8_t scales[2];         // 4-bit block scales/mins
-    uint8_t qs[QK_K/2];        // 4--bit quants
-} block_q4_K;
-static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding");
-#else
-typedef struct {
-    ggml_fp16_t d;             // super-block scale for quantized scales
-    ggml_fp16_t dmin;          // super-block scale for quantized mins
-    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
-    uint8_t qs[QK_K/2];        // 4--bit quants
-} block_q4_K;
-static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");
-#endif
-
-// 5-bit quantization
-// 8 blocks of 32 elements each
-// weight is represented as x = a * q + b
-// Effectively 5.5 bits per weight
-#ifdef GGML_QKK_64
-typedef struct {
-    ggml_fp16_t d;               // super-block scale
-    int8_t  scales[QK_K/16];     // 8-bit block scales
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-} block_q5_K;
-static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
-#else
-typedef struct {
-    ggml_fp16_t d;               // super-block scale for quantized scales
-    ggml_fp16_t dmin;            // super-block scale for quantized mins
-    uint8_t scales[K_SCALE_SIZE];   // scales and mins, quantized with 6 bits
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-} block_q5_K;
-static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
-#endif
-
-// 6-bit quantization
-// weight is represented as x = a * q
-// 16 blocks of 16 elements each
-// Effectively 6.5625 bits per weight
-typedef struct {
-    uint8_t ql[QK_K/2];      // quants, lower 4 bits
-    uint8_t qh[QK_K/4];      // quants, upper 2 bits
-    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
-    ggml_fp16_t d;           // super-block scale
-} block_q6_K;
-static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + QK_K / 16 + 3*QK_K/4, "wrong q6_K block size/padding");
-
-// This is only used for intermediate quantization and dot products
-typedef struct {
-    float   d;              // delta
-    int8_t  qs[QK_K];       // quants
-    int16_t bsums[QK_K/16]; // sum of quants in groups of 16
-} block_q8_K;
-static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_t), "wrong q8_K block size/padding");
-
-// (Almost) "true" 2-bit quantization.
-// Due to the need to use blocks as per ggml dsign, it ends up using
-// 2.0625 bpw because of the 16-bit scale for each block of 256.
-typedef struct {
-    ggml_fp16_t d;
-    uint16_t qs[QK_K/8];
-} block_iq2_xxs;
-static_assert(sizeof(block_iq2_xxs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t), "wrong iq2_xxs block size/padding");
-
-// 2.3125 bpw quants
-typedef struct {
-    ggml_fp16_t d;
-    uint16_t qs[QK_K/8];
-    uint8_t  scales[QK_K/32];
-} block_iq2_xs;
-static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding");
-
-// Quantization
-void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k);
-void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k);
-void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k);
-void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k);
-void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k);
-void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k);
-
-void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k);
-void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k);
-void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k);
-void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k);
-void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);
-void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);
-void quantize_row_iq2_xxs_reference(const float * restrict x, block_iq2_xxs * restrict y, int k);
-void quantize_row_iq2_xs_reference (const float * restrict x, block_iq2_xs  * restrict y, int k);
-
-void quantize_row_q4_0(const float * restrict x, void * restrict y, int k);
-void quantize_row_q4_1(const float * restrict x, void * restrict y, int k);
-void quantize_row_q5_0(const float * restrict x, void * restrict y, int k);
-void quantize_row_q5_1(const float * restrict x, void * restrict y, int k);
-void quantize_row_q8_0(const float * restrict x, void * restrict y, int k);
-void quantize_row_q8_1(const float * restrict x, void * restrict y, int k);
-
-void quantize_row_q2_K(const float * restrict x, void * restrict y, int k);
-void quantize_row_q3_K(const float * restrict x, void * restrict y, int k);
-void quantize_row_q4_K(const float * restrict x, void * restrict y, int k);
-void quantize_row_q5_K(const float * restrict x, void * restrict y, int k);
-void quantize_row_q6_K(const float * restrict x, void * restrict y, int k);
-void quantize_row_q8_K(const float * restrict x, void * restrict y, int k);
-void quantize_row_iq2_xxs(const float * restrict x, void * restrict y, int k);
-void quantize_row_iq2_xs (const float * restrict x, void * restrict y, int k);
-
-// Dequantization
-void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k);
-void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k);
-void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k);
-void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k);
-void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int k);
-//void dequantize_row_q8_1(const block_q8_1 * restrict x, float * restrict y, int k);
-
-void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k);
-void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k);
-void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k);
-void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k);
-void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k);
-void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k);
-void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k);
-void dequantize_row_iq2_xs (const block_iq2_xs  * restrict x, float * restrict y, int k);
-
-// Dot product
-void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-
-void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-void ggml_vec_dot_iq2_xs_q8_K (int n, float * restrict s, const void * restrict vx, const void * restrict vy);
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
deleted file mode 100644
index 2dfe430..0000000
--- a/ggml/src/ggml.c
+++ /dev/null
@@ -1,20026 +0,0 @@
-#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnings on Windows
-#define _USE_MATH_DEFINES // For M_PI on MSVC
-
-#include "ggml-impl.h"
-#include "ggml-quants.h"
-
-#if defined(_MSC_VER) || defined(__MINGW32__)
-#include <malloc.h> // using malloc.h with MSC/MINGW
-#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
-#include <alloca.h>
-#endif
-
-#include <assert.h>
-#include <errno.h>
-#include <time.h>
-#include <math.h>
-#include <stdlib.h>
-#include <string.h>
-#include <stdint.h>
-#include <inttypes.h>
-#include <stdio.h>
-#include <float.h>
-#include <limits.h>
-#include <stdarg.h>
-#include <signal.h>
-
-#ifdef GGML_USE_METAL
-#include <unistd.h>
-#endif
-
-#if defined(_MSC_VER)
-// disable "possible loss of data" to avoid hundreds of casts
-// we should just be careful :)
-#pragma warning(disable: 4244 4267)
-
-// disable POSIX deprecation warnings
-// these functions are never going away, anyway
-#pragma warning(disable: 4996)
-#endif
-
-#if defined(_WIN32)
-
-#include <windows.h>
-
-typedef volatile LONG atomic_int;
-typedef atomic_int atomic_bool;
-
-static void atomic_store(atomic_int * ptr, LONG val) {
-    InterlockedExchange(ptr, val);
-}
-static LONG atomic_load(atomic_int * ptr) {
-    return InterlockedCompareExchange(ptr, 0, 0);
-}
-static LONG atomic_fetch_add(atomic_int * ptr, LONG inc) {
-    return InterlockedExchangeAdd(ptr, inc);
-}
-static LONG atomic_fetch_sub(atomic_int * ptr, LONG dec) {
-    return atomic_fetch_add(ptr, -(dec));
-}
-
-typedef HANDLE pthread_t;
-
-typedef DWORD thread_ret_t;
-static int pthread_create(pthread_t * out, void * unused, thread_ret_t(*func)(void *), void * arg) {
-    (void) unused;
-    HANDLE handle = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, arg, 0, NULL);
-    if (handle == NULL)
-    {
-        return EAGAIN;
-    }
-
-    *out = handle;
-    return 0;
-}
-
-static int pthread_join(pthread_t thread, void * unused) {
-    (void) unused;
-    int ret = (int) WaitForSingleObject(thread, INFINITE);
-    CloseHandle(thread);
-    return ret;
-}
-
-static int sched_yield (void) {
-    Sleep (0);
-    return 0;
-}
-#else
-#include <pthread.h>
-#include <stdatomic.h>
-
-typedef void * thread_ret_t;
-
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <unistd.h>
-
-#endif
-
-#ifdef GGML_USE_CPU_HBM
-#include <hbwmalloc.h>
-#endif
-
-#if defined(__APPLE__)
-#include <TargetConditionals.h>
-#endif
-
-#if (defined(__linux__) || defined(__APPLE__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__)) && \
-    (!defined(TARGET_OS_TV) && !defined(TARGET_OS_WATCH))
-
-#include <sys/wait.h>
-
-void ggml_print_backtrace(void) {
-    /*
-    #include <execinfo.h>
-    #include <dlfcn.h>
-
-    void * trace[100];
-
-    int nptrs = backtrace(trace, sizeof(trace)/sizeof(trace[0]));
-
-    backtrace_symbols_fd(trace, nptrs, STDERR_FILENO);
-    */
-
-    // backtrack_symbols does not show line numbers, use gdb instead
-    char attach[32];
-    snprintf(attach, sizeof(attach), "attach %d", getpid());
-    int pid = fork();
-    if (pid == 0) {
-        execlp("gdb", "gdb", "--batch",
-            "-ex", "set style enabled on",
-            "-ex", attach,
-            "-ex", "bt -frame-info source-and-location",
-            "-ex", "detach",
-            "-ex", "quit",
-            (char *) NULL);
-    } else {
-        waitpid(pid, NULL, 0);
-    }
-}
-#else
-void ggml_print_backtrace(void) {
-    // platform not supported
-}
-#endif
-
-/*#define GGML_PERF*/
-#define GGML_DEBUG 0
-#define GGML_GELU_FP16
-#define GGML_GELU_QUICK_FP16
-#define GGML_SILU_FP16
-// #define GGML_CROSS_ENTROPY_EXP_FP16
-// #define GGML_FLASH_ATTN_EXP_FP16
-
-#define GGML_SOFT_MAX_UNROLL 4
-#define GGML_VEC_DOT_UNROLL  2
-#define GGML_VEC_MAD_UNROLL  32
-
-//
-// logging
-//
-
-#if (GGML_DEBUG >= 1)
-#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG(...)
-#endif
-
-#if (GGML_DEBUG >= 5)
-#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG_5(...)
-#endif
-
-#if (GGML_DEBUG >= 10)
-#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG_10(...)
-#endif
-
-#define GGML_PRINT(...) printf(__VA_ARGS__)
-
-//
-// end of logging block
-//
-
-#ifdef GGML_USE_ACCELERATE
-// uncomment to use vDSP for soft max computation
-// note: not sure if it is actually faster
-//#define GGML_SOFT_MAX_ACCELERATE
-#endif
-
-#if defined(_MSC_VER) || defined(__MINGW32__)
-#define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
-#define GGML_ALIGNED_FREE(ptr)    _aligned_free(ptr)
-#else
-inline static void * ggml_aligned_malloc(size_t size) {
-    if (size == 0) {
-        GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n");
-        return NULL;
-    }
-    void * aligned_memory = NULL;
-#ifdef GGML_USE_CPU_HBM
-    int result = hbw_posix_memalign(&aligned_memory, 16, size);
-#elif GGML_USE_METAL
-    int result = posix_memalign(&aligned_memory, sysconf(_SC_PAGESIZE), size);
-#else
-    int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size);
-#endif
-    if (result != 0) {
-        // Handle allocation failure
-        const char *error_desc = "unknown allocation error";
-        switch (result) {
-            case EINVAL:
-                error_desc = "invalid alignment value";
-                break;
-            case ENOMEM:
-                error_desc = "insufficient memory";
-                break;
-        }
-        GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n", __func__, error_desc, size/(1024.0*1024.0));
-        return NULL;
-    }
-    return aligned_memory;
-}
-#define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
-#ifdef GGML_USE_CPU_HBM
-#define GGML_ALIGNED_FREE(ptr)    if(NULL != ptr) hbw_free(ptr)
-#else
-#define GGML_ALIGNED_FREE(ptr)    free(ptr)
-#endif
-#endif
-
-#define UNUSED GGML_UNUSED
-#define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
-
-#if defined(GGML_USE_ACCELERATE)
-#include <Accelerate/Accelerate.h>
-#if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
-#include "ggml-opencl.h"
-#endif
-#elif defined(GGML_USE_OPENBLAS)
-#if defined(GGML_BLAS_USE_MKL)
-#include <mkl.h>
-#else
-#include <cblas.h>
-#endif
-#elif defined(GGML_USE_CUBLAS)
-#include "ggml-cuda.h"
-#elif defined(GGML_USE_CLBLAST)
-#include "ggml-opencl.h"
-#endif
-
-// floating point type used to accumulate sums
-typedef double ggml_float;
-
-#undef MIN
-#undef MAX
-
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
-#define MAX(a, b) ((a) > (b) ? (a) : (b))
-
-//
-// global data
-//
-
-// precomputed gelu table for f16 (128 KB)
-static ggml_fp16_t ggml_table_gelu_f16[1 << 16];
-
-// precomputed quick gelu table for f16 (128 KB)
-static ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16];
-
-// precomputed silu table for f16 (128 KB)
-static ggml_fp16_t ggml_table_silu_f16[1 << 16];
-
-// precomputed exp table for f16 (128 KB)
-static ggml_fp16_t ggml_table_exp_f16[1 << 16];
-
-// precomputed f32 table for f16 (256 KB) (ggml-impl.h)
-float ggml_table_f32_f16[1 << 16];
-
-// note: do not use these inside ggml.c
-// these are meant to be used via the ggml.h API
-float ggml_fp16_to_fp32(ggml_fp16_t x) {
-    return (float) GGML_FP16_TO_FP32(x);
-}
-
-ggml_fp16_t ggml_fp32_to_fp16(float x) {
-    return GGML_FP32_TO_FP16(x);
-}
-
-void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n) {
-    for (int i = 0; i < n; i++) {
-        y[i] = GGML_FP16_TO_FP32(x[i]);
-    }
-}
-
-void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n) {
-    int i = 0;
-#if defined(__F16C__)
-    for (; i + 7 < n; i += 8) {
-        __m256 x_vec = _mm256_loadu_ps(x + i);
-        __m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
-        _mm_storeu_si128((__m128i *)(y + i), y_vec);
-    }
-    for(; i + 3 < n; i += 4) {
-        __m128 x_vec = _mm_loadu_ps(x + i);
-        __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
-        _mm_storel_epi64((__m128i *)(y + i), y_vec);
-    }
-#endif
-    for (; i < n; i++) {
-        y[i] = GGML_FP32_TO_FP16(x[i]);
-    }
-}
-
-//
-// timing
-//
-
-#if defined(_MSC_VER) || defined(__MINGW32__)
-static int64_t timer_freq, timer_start;
-void ggml_time_init(void) {
-    LARGE_INTEGER t;
-    QueryPerformanceFrequency(&t);
-    timer_freq = t.QuadPart;
-
-    // The multiplication by 1000 or 1000000 below can cause an overflow if timer_freq
-    // and the uptime is high enough.
-    // We subtract the program start time to reduce the likelihood of that happening.
-    QueryPerformanceCounter(&t);
-    timer_start = t.QuadPart;
-}
-int64_t ggml_time_ms(void) {
-    LARGE_INTEGER t;
-    QueryPerformanceCounter(&t);
-    return ((t.QuadPart-timer_start) * 1000) / timer_freq;
-}
-int64_t ggml_time_us(void) {
-    LARGE_INTEGER t;
-    QueryPerformanceCounter(&t);
-    return ((t.QuadPart-timer_start) * 1000000) / timer_freq;
-}
-#else
-void ggml_time_init(void) {}
-int64_t ggml_time_ms(void) {
-    struct timespec ts;
-    clock_gettime(CLOCK_MONOTONIC, &ts);
-    return (int64_t)ts.tv_sec*1000 + (int64_t)ts.tv_nsec/1000000;
-}
-
-int64_t ggml_time_us(void) {
-    struct timespec ts;
-    clock_gettime(CLOCK_MONOTONIC, &ts);
-    return (int64_t)ts.tv_sec*1000000 + (int64_t)ts.tv_nsec/1000;
-}
-#endif
-
-int64_t ggml_cycles(void) {
-    return clock();
-}
-
-int64_t ggml_cycles_per_ms(void) {
-    return CLOCKS_PER_SEC/1000;
-}
-
-#ifdef GGML_PERF
-#define ggml_perf_time_ms()       ggml_time_ms()
-#define ggml_perf_time_us()       ggml_time_us()
-#define ggml_perf_cycles()        ggml_cycles()
-#define ggml_perf_cycles_per_ms() ggml_cycles_per_ms()
-#else
-#define ggml_perf_time_ms()       0
-#define ggml_perf_time_us()       0
-#define ggml_perf_cycles()        0
-#define ggml_perf_cycles_per_ms() 0
-#endif
-
-//
-// cache line
-//
-
-#if defined(__cpp_lib_hardware_interference_size)
-#define CACHE_LINE_SIZE hardware_destructive_interference_size
-#else
-#if defined(__POWER9_VECTOR__)
-#define CACHE_LINE_SIZE 128
-#else
-#define CACHE_LINE_SIZE 64
-#endif
-#endif
-
-static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
-
-static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y);
-static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y);
-
-ggml_collect_imatrix_t g_imatrix_collect = NULL;
-
-void ggml_set_imatrix_collection(ggml_collect_imatrix_t imatrix_collect) {
-    g_imatrix_collect = imatrix_collect;
-}
-
-static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
-    [GGML_TYPE_I8] = {
-        .type_name                = "i8",
-        .blck_size                = 1,
-        .type_size                = sizeof(int8_t),
-        .is_quantized             = false,
-    },
-    [GGML_TYPE_I16] = {
-        .type_name                = "i16",
-        .blck_size                = 1,
-        .type_size                = sizeof(int16_t),
-        .is_quantized             = false,
-    },
-    [GGML_TYPE_I32] = {
-        .type_name                = "i32",
-        .blck_size                = 1,
-        .type_size                = sizeof(int32_t),
-        .is_quantized             = false,
-    },
-    [GGML_TYPE_F32] = {
-        .type_name                = "f32",
-        .blck_size                = 1,
-        .type_size                = sizeof(float),
-        .is_quantized             = false,
-        .vec_dot                  = (ggml_vec_dot_t) ggml_vec_dot_f32,
-        .vec_dot_type             = GGML_TYPE_F32,
-    },
-    [GGML_TYPE_F16] = {
-        .type_name                = "f16",
-        .blck_size                = 1,
-        .type_size                = sizeof(ggml_fp16_t),
-        .is_quantized             = false,
-        .to_float                 = (ggml_to_float_t) ggml_fp16_to_fp32_row,
-        .from_float               = (ggml_from_float_t) ggml_fp32_to_fp16_row,
-        .from_float_reference     = (ggml_from_float_t) ggml_fp32_to_fp16_row,
-        .vec_dot                  = (ggml_vec_dot_t) ggml_vec_dot_f16,
-        .vec_dot_type             = GGML_TYPE_F16,
-    },
-    [GGML_TYPE_Q4_0] = {
-        .type_name                = "q4_0",
-        .blck_size                = QK4_0,
-        .type_size                = sizeof(block_q4_0),
-        .is_quantized             = true,
-        .to_float                 = (ggml_to_float_t) dequantize_row_q4_0,
-        .from_float               = quantize_row_q4_0,
-        .from_float_reference     = (ggml_from_float_t) quantize_row_q4_0_reference,
-        .vec_dot                  = ggml_vec_dot_q4_0_q8_0,
-        .vec_dot_type             = GGML_TYPE_Q8_0,
-    },
-    [GGML_TYPE_Q4_1] = {
-        .type_name                = "q4_1",
-        .blck_size                = QK4_1,
-        .type_size                = sizeof(block_q4_1),
-        .is_quantized             = true,
-        .to_float                 = (ggml_to_float_t) dequantize_row_q4_1,
-        .from_float               = quantize_row_q4_1,
-        .from_float_reference     = (ggml_from_float_t) quantize_row_q4_1_reference,
-        .vec_dot                  = ggml_vec_dot_q4_1_q8_1,
-        .vec_dot_type             = GGML_TYPE_Q8_1,
-    },
-    [4] = { // GGML_TYPE_Q4_2
-        .type_name                = "DEPRECATED",
-        .blck_size                = 0,
-        .type_size                = 0,
-        .is_quantized             = false,
-        .to_float                 = NULL,
-        .from_float               = NULL,
-        .from_float_reference     = NULL,
-        .vec_dot                  = NULL,
-        .vec_dot_type             = GGML_TYPE_COUNT,
-    },
-    [5] = { // GGML_TYPE_Q4_3
-        .type_name                = "DEPRECATED",
-        .blck_size                = 0,
-        .type_size                = 0,
-        .is_quantized             = false,
-        .to_float                 = NULL,
-        .from_float               = NULL,
-        .from_float_reference     = NULL,
-        .vec_dot                  = NULL,
-        .vec_dot_type             = GGML_TYPE_COUNT,
-    },
-    [GGML_TYPE_Q5_0] = {
-        .type_name                = "q5_0",
-        .blck_size                = QK5_0,
-        .type_size                = sizeof(block_q5_0),
-        .is_quantized             = true,
-        .to_float                 = (ggml_to_float_t) dequantize_row_q5_0,
-        .from_float               = quantize_row_q5_0,
-        .from_float_reference     = (ggml_from_float_t) quantize_row_q5_0_reference,
-        .vec_dot                  = ggml_vec_dot_q5_0_q8_0,
-        .vec_dot_type             = GGML_TYPE_Q8_0,
-    },
-    [GGML_TYPE_Q5_1] = {
-        .type_name                = "q5_1",
-        .blck_size                = QK5_1,
-        .type_size                = sizeof(block_q5_1),
-        .is_quantized             = true,
-        .to_float                 = (ggml_to_float_t) dequantize_row_q5_1,
-        .from_float               = quantize_row_q5_1,
-        .from_float_reference     = (ggml_from_float_t) quantize_row_q5_1_reference,
-        .vec_dot                  = ggml_vec_dot_q5_1_q8_1,
-        .vec_dot_type             = GGML_TYPE_Q8_1,
-    },
-    [GGML_TYPE_Q8_0] = {
-        .type_name                = "q8_0",
-        .blck_size                = QK8_0,
-        .type_size                = sizeof(block_q8_0),
-        .is_quantized             = true,
-        .to_float                 = (ggml_to_float_t) dequantize_row_q8_0,
-        .from_float               = quantize_row_q8_0,
-        .from_float_reference     = (ggml_from_float_t) quantize_row_q8_0_reference,
-        .vec_dot                  = ggml_vec_dot_q8_0_q8_0,
-        .vec_dot_type             = GGML_TYPE_Q8_0,
-    },
-    [GGML_TYPE_Q8_1] = {
-        .type_name                = "q8_1",
-        .blck_size                = QK8_1,
-        .type_size                = sizeof(block_q8_1),
-        .is_quantized             = true,
-        .from_float               = quantize_row_q8_1,
-        .from_float_reference     = (ggml_from_float_t) quantize_row_q8_1_reference,
-        .vec_dot_type             = GGML_TYPE_Q8_1,
-    },
-    [GGML_TYPE_Q2_K] = {
-        .type_name                = "q2_K",
-        .blck_size                = QK_K,
-        .type_size                = sizeof(block_q2_K),
-        .is_quantized             = true,
-        .to_float                 = (ggml_to_float_t) dequantize_row_q2_K,
-        .from_float               = quantize_row_q2_K,
-        .from_float_reference     = (ggml_from_float_t) quantize_row_q2_K_reference,
-        .vec_dot                  = ggml_vec_dot_q2_K_q8_K,
-        .vec_dot_type             = GGML_TYPE_Q8_K,
-    },
-    [GGML_TYPE_Q3_K] = {
-        .type_name                = "q3_K",
-        .blck_size                = QK_K,
-        .type_size                = sizeof(block_q3_K),
-        .is_quantized             = true,
-        .to_float                 = (ggml_to_float_t) dequantize_row_q3_K,
-        .from_float               = quantize_row_q3_K,
-        .from_float_reference     = (ggml_from_float_t) quantize_row_q3_K_reference,
-        .vec_dot                  = ggml_vec_dot_q3_K_q8_K,
-        .vec_dot_type             = GGML_TYPE_Q8_K,
-    },
-    [GGML_TYPE_Q4_K] = {
-        .type_name                = "q4_K",
-        .blck_size                = QK_K,
-        .type_size                = sizeof(block_q4_K),
-        .is_quantized             = true,
-        .to_float                 = (ggml_to_float_t) dequantize_row_q4_K,
-        .from_float               = quantize_row_q4_K,
-        .from_float_reference     = (ggml_from_float_t) quantize_row_q4_K_reference,
-        .vec_dot                  = ggml_vec_dot_q4_K_q8_K,
-        .vec_dot_type             = GGML_TYPE_Q8_K,
-    },
-    [GGML_TYPE_Q5_K] = {
-        .type_name                = "q5_K",
-        .blck_size                = QK_K,
-        .type_size                = sizeof(block_q5_K),
-        .is_quantized             = true,
-        .to_float                 = (ggml_to_float_t) dequantize_row_q5_K,
-        .from_float               = quantize_row_q5_K,
-        .from_float_reference     = (ggml_from_float_t) quantize_row_q5_K_reference,
-        .vec_dot                  = ggml_vec_dot_q5_K_q8_K,
-        .vec_dot_type             = GGML_TYPE_Q8_K,
-    },
-    [GGML_TYPE_Q6_K] = {
-        .type_name                = "q6_K",
-        .blck_size                = QK_K,
-        .type_size                = sizeof(block_q6_K),
-        .is_quantized             = true,
-        .to_float                 = (ggml_to_float_t) dequantize_row_q6_K,
-        .from_float               = quantize_row_q6_K,
-        .from_float_reference     = (ggml_from_float_t) quantize_row_q6_K_reference,
-        .vec_dot                  = ggml_vec_dot_q6_K_q8_K,
-        .vec_dot_type             = GGML_TYPE_Q8_K,
-    },
-    [GGML_TYPE_IQ2_XXS] = {
-        .type_name                = "iq2_xxs",
-        .blck_size                = QK_K,
-        .type_size                = sizeof(block_iq2_xxs),
-        .is_quantized             = true,
-        .to_float                 = (ggml_to_float_t) dequantize_row_iq2_xxs,
-        .from_float               = quantize_row_iq2_xxs,
-        .from_float_reference     = (ggml_from_float_t) quantize_row_iq2_xxs_reference,
-        .vec_dot                  = ggml_vec_dot_iq2_xxs_q8_K,
-        .vec_dot_type             = GGML_TYPE_Q8_K,
-    },
-    [GGML_TYPE_IQ2_XS] = {
-        .type_name                = "iq2_xs",
-        .blck_size                = QK_K,
-        .type_size                = sizeof(block_iq2_xs),
-        .is_quantized             = true,
-        .to_float                 = (ggml_to_float_t) dequantize_row_iq2_xs,
-        .from_float               = quantize_row_iq2_xs,
-        .from_float_reference     = (ggml_from_float_t) quantize_row_iq2_xs_reference,
-        .vec_dot                  = ggml_vec_dot_iq2_xs_q8_K,
-        .vec_dot_type             = GGML_TYPE_Q8_K,
-    },
-    [GGML_TYPE_Q8_K] = {
-        .type_name                = "q8_K",
-        .blck_size                = QK_K,
-        .type_size                = sizeof(block_q8_K),
-        .is_quantized             = true,
-        .from_float               = quantize_row_q8_K,
-    }
-};
-
-// For internal test use
-ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
-    GGML_ASSERT(type < GGML_TYPE_COUNT);
-    return type_traits[type];
-}
-
-//
-// simd mappings
-//
-
-#if defined(__ARM_NEON)
-#if !defined(__aarch64__)
-
-// 64-bit compatibility
-
-inline static float vaddvq_f32(float32x4_t v) {
-    return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
-}
-
-#endif
-#endif
-
-// we define a common set of C macros which map to specific intrinsics based on the current architecture
-// we then implement the fundamental computation operations below using only these macros
-// adding support for new architectures requires to define the corresponding SIMD macros
-//
-// GGML_F32_STEP / GGML_F16_STEP
-//   number of elements to process in a single step
-//
-// GGML_F32_EPR / GGML_F16_EPR
-//   number of elements to fit in a single register
-//
-
-#if defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA)
-
-#define GGML_SIMD
-
-// F32 NEON
-
-#define GGML_F32_STEP 16
-#define GGML_F32_EPR  4
-
-#define GGML_F32x4              float32x4_t
-#define GGML_F32x4_ZERO         vdupq_n_f32(0.0f)
-#define GGML_F32x4_SET1(x)      vdupq_n_f32(x)
-#define GGML_F32x4_LOAD         vld1q_f32
-#define GGML_F32x4_STORE        vst1q_f32
-#define GGML_F32x4_FMA(a, b, c) vfmaq_f32(a, b, c)
-#define GGML_F32x4_ADD          vaddq_f32
-#define GGML_F32x4_MUL          vmulq_f32
-#define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
-#define GGML_F32x4_REDUCE(res, x)              \
-{                                              \
-    int offset = GGML_F32_ARR >> 1;            \
-    for (int i = 0; i < offset; ++i) {         \
-        x[i] = vaddq_f32(x[i], x[offset+i]);   \
-    }                                          \
-    offset >>= 1;                              \
-    for (int i = 0; i < offset; ++i) {         \
-        x[i] = vaddq_f32(x[i], x[offset+i]);   \
-    }                                          \
-    offset >>= 1;                              \
-    for (int i = 0; i < offset; ++i) {         \
-        x[i] = vaddq_f32(x[i], x[offset+i]);   \
-    }                                          \
-    res = GGML_F32x4_REDUCE_ONE(x[0]);         \
-}
-
-#define GGML_F32_VEC        GGML_F32x4
-#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
-#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
-#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
-#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
-#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
-#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
-#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
-#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
-
-// F16 NEON
-
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-    #define GGML_F16_STEP 32
-    #define GGML_F16_EPR  8
-
-    #define GGML_F16x8              float16x8_t
-    #define GGML_F16x8_ZERO         vdupq_n_f16(0.0f)
-    #define GGML_F16x8_SET1(x)      vdupq_n_f16(x)
-    #define GGML_F16x8_LOAD         vld1q_f16
-    #define GGML_F16x8_STORE        vst1q_f16
-    #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
-    #define GGML_F16x8_ADD          vaddq_f16
-    #define GGML_F16x8_MUL          vmulq_f16
-    #define GGML_F16x8_REDUCE(res, x)                             \
-    do {                                                          \
-        int offset = GGML_F16_ARR >> 1;                           \
-        for (int i = 0; i < offset; ++i) {                        \
-            x[i] = vaddq_f16(x[i], x[offset+i]);                  \
-        }                                                         \
-        offset >>= 1;                                             \
-        for (int i = 0; i < offset; ++i) {                        \
-            x[i] = vaddq_f16(x[i], x[offset+i]);                  \
-        }                                                         \
-        offset >>= 1;                                             \
-        for (int i = 0; i < offset; ++i) {                        \
-            x[i] = vaddq_f16(x[i], x[offset+i]);                  \
-        }                                                         \
-        const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 (x[0])); \
-        const float32x4_t t1 = vcvt_f32_f16(vget_high_f16(x[0])); \
-        res = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1));         \
-    } while (0)
-
-    #define GGML_F16_VEC                GGML_F16x8
-    #define GGML_F16_VEC_ZERO           GGML_F16x8_ZERO
-    #define GGML_F16_VEC_SET1           GGML_F16x8_SET1
-    #define GGML_F16_VEC_LOAD(p, i)     GGML_F16x8_LOAD(p)
-    #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE(p, r[i])
-    #define GGML_F16_VEC_FMA            GGML_F16x8_FMA
-    #define GGML_F16_VEC_ADD            GGML_F16x8_ADD
-    #define GGML_F16_VEC_MUL            GGML_F16x8_MUL
-    #define GGML_F16_VEC_REDUCE         GGML_F16x8_REDUCE
-#else
-    // if FP16 vector arithmetic is not supported, we use FP32 instead
-    // and take advantage of the vcvt_ functions to convert to/from FP16
-
-    #define GGML_F16_STEP 16
-    #define GGML_F16_EPR  4
-
-    #define GGML_F32Cx4              float32x4_t
-    #define GGML_F32Cx4_ZERO         vdupq_n_f32(0.0f)
-    #define GGML_F32Cx4_SET1(x)      vdupq_n_f32(x)
-    #define GGML_F32Cx4_LOAD(x)      vcvt_f32_f16(vld1_f16(x))
-    #define GGML_F32Cx4_STORE(x, y)  vst1_f16(x, vcvt_f16_f32(y))
-    #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
-    #define GGML_F32Cx4_ADD          vaddq_f32
-    #define GGML_F32Cx4_MUL          vmulq_f32
-    #define GGML_F32Cx4_REDUCE       GGML_F32x4_REDUCE
-
-    #define GGML_F16_VEC                GGML_F32Cx4
-    #define GGML_F16_VEC_ZERO           GGML_F32Cx4_ZERO
-    #define GGML_F16_VEC_SET1           GGML_F32Cx4_SET1
-    #define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx4_LOAD(p)
-    #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE(p, r[i])
-    #define GGML_F16_VEC_FMA            GGML_F32Cx4_FMA
-    #define GGML_F16_VEC_ADD            GGML_F32Cx4_ADD
-    #define GGML_F16_VEC_MUL            GGML_F32Cx4_MUL
-    #define GGML_F16_VEC_REDUCE         GGML_F32Cx4_REDUCE
-#endif
-
-#elif defined(__AVX__)
-
-#define GGML_SIMD
-
-// F32 AVX
-
-#define GGML_F32_STEP 32
-#define GGML_F32_EPR  8
-
-#define GGML_F32x8         __m256
-#define GGML_F32x8_ZERO    _mm256_setzero_ps()
-#define GGML_F32x8_SET1(x) _mm256_set1_ps(x)
-#define GGML_F32x8_LOAD    _mm256_loadu_ps
-#define GGML_F32x8_STORE   _mm256_storeu_ps
-#if defined(__FMA__)
-    #define GGML_F32x8_FMA(a, b, c) _mm256_fmadd_ps(b, c, a)
-#else
-    #define GGML_F32x8_FMA(a, b, c) _mm256_add_ps(_mm256_mul_ps(b, c), a)
-#endif
-#define GGML_F32x8_ADD     _mm256_add_ps
-#define GGML_F32x8_MUL     _mm256_mul_ps
-#define GGML_F32x8_REDUCE(res, x)                                 \
-do {                                                              \
-    int offset = GGML_F32_ARR >> 1;                               \
-    for (int i = 0; i < offset; ++i) {                            \
-        x[i] = _mm256_add_ps(x[i], x[offset+i]);                  \
-    }                                                             \
-    offset >>= 1;                                                 \
-    for (int i = 0; i < offset; ++i) {                            \
-        x[i] = _mm256_add_ps(x[i], x[offset+i]);                  \
-    }                                                             \
-    offset >>= 1;                                                 \
-    for (int i = 0; i < offset; ++i) {                            \
-        x[i] = _mm256_add_ps(x[i], x[offset+i]);                  \
-    }                                                             \
-    const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]),    \
-                                 _mm256_extractf128_ps(x[0], 1)); \
-    const __m128 t1 = _mm_hadd_ps(t0, t0);                        \
-    res = _mm_cvtss_f32(_mm_hadd_ps(t1, t1));                     \
-} while (0)
-// TODO: is this optimal ?
-
-#define GGML_F32_VEC        GGML_F32x8
-#define GGML_F32_VEC_ZERO   GGML_F32x8_ZERO
-#define GGML_F32_VEC_SET1   GGML_F32x8_SET1
-#define GGML_F32_VEC_LOAD   GGML_F32x8_LOAD
-#define GGML_F32_VEC_STORE  GGML_F32x8_STORE
-#define GGML_F32_VEC_FMA    GGML_F32x8_FMA
-#define GGML_F32_VEC_ADD    GGML_F32x8_ADD
-#define GGML_F32_VEC_MUL    GGML_F32x8_MUL
-#define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE
-
-// F16 AVX
-
-#define GGML_F16_STEP 32
-#define GGML_F16_EPR  8
-
-// F16 arithmetic is not supported by AVX, so we use F32 instead
-
-#define GGML_F32Cx8             __m256
-#define GGML_F32Cx8_ZERO        _mm256_setzero_ps()
-#define GGML_F32Cx8_SET1(x)     _mm256_set1_ps(x)
-
-#if defined(__F16C__)
-// the  _mm256_cvt intrinsics require F16C
-#define GGML_F32Cx8_LOAD(x)     _mm256_cvtph_ps(_mm_loadu_si128((__m128i *)(x)))
-#define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0))
-#else
-static inline __m256 __avx_f32cx8_load(ggml_fp16_t *x) {
-    float tmp[8];
-
-    for (int i = 0; i < 8; i++) {
-        tmp[i] = GGML_FP16_TO_FP32(x[i]);
-    }
-
-    return _mm256_loadu_ps(tmp);
-}
-static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
-    float arr[8];
-
-    _mm256_storeu_ps(arr, y);
-
-    for (int i = 0; i < 8; i++)
-        x[i] = GGML_FP32_TO_FP16(arr[i]);
-}
-#define GGML_F32Cx8_LOAD(x)     __avx_f32cx8_load(x)
-#define GGML_F32Cx8_STORE(x, y) __avx_f32cx8_store(x, y)
-#endif
-
-#define GGML_F32Cx8_FMA         GGML_F32x8_FMA
-#define GGML_F32Cx8_ADD         _mm256_add_ps
-#define GGML_F32Cx8_MUL         _mm256_mul_ps
-#define GGML_F32Cx8_REDUCE      GGML_F32x8_REDUCE
-
-#define GGML_F16_VEC                GGML_F32Cx8
-#define GGML_F16_VEC_ZERO           GGML_F32Cx8_ZERO
-#define GGML_F16_VEC_SET1           GGML_F32Cx8_SET1
-#define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx8_LOAD(p)
-#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx8_STORE(p, r[i])
-#define GGML_F16_VEC_FMA            GGML_F32Cx8_FMA
-#define GGML_F16_VEC_ADD            GGML_F32Cx8_ADD
-#define GGML_F16_VEC_MUL            GGML_F32Cx8_MUL
-#define GGML_F16_VEC_REDUCE         GGML_F32Cx8_REDUCE
-
-#elif defined(__POWER9_VECTOR__)
-
-#define GGML_SIMD
-
-// F32 POWER9
-
-#define GGML_F32_STEP 32
-#define GGML_F32_EPR  4
-
-#define GGML_F32x4              vector float
-#define GGML_F32x4_ZERO         0.0f
-#define GGML_F32x4_SET1         vec_splats
-#define GGML_F32x4_LOAD(p)      vec_xl(0, p)
-#define GGML_F32x4_STORE(p, r)  vec_xst(r, 0, p)
-#define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a)
-#define GGML_F32x4_ADD          vec_add
-#define GGML_F32x4_MUL          vec_mul
-#define GGML_F32x4_REDUCE(res, x)              \
-{                                              \
-    int offset = GGML_F32_ARR >> 1;            \
-    for (int i = 0; i < offset; ++i) {         \
-        x[i] = vec_add(x[i], x[offset+i]);     \
-    }                                          \
-    offset >>= 1;                              \
-    for (int i = 0; i < offset; ++i) {         \
-        x[i] = vec_add(x[i], x[offset+i]);     \
-    }                                          \
-    offset >>= 1;                              \
-    for (int i = 0; i < offset; ++i) {         \
-        x[i] = vec_add(x[i], x[offset+i]);     \
-    }                                          \
-    res = vec_extract(x[0], 0) +               \
-          vec_extract(x[0], 1) +               \
-          vec_extract(x[0], 2) +               \
-          vec_extract(x[0], 3);                \
-}
-
-#define GGML_F32_VEC        GGML_F32x4
-#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
-#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
-#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
-#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
-#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
-#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
-#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
-#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
-
-// F16 POWER9
-#define GGML_F16_STEP       GGML_F32_STEP
-#define GGML_F16_EPR        GGML_F32_EPR
-#define GGML_F16_VEC        GGML_F32x4
-#define GGML_F16_VEC_ZERO   GGML_F32x4_ZERO
-#define GGML_F16_VEC_SET1   GGML_F32x4_SET1
-#define GGML_F16_VEC_FMA    GGML_F32x4_FMA
-#define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE
-// Use vec_xl, not vec_ld, in case the load address is not aligned.
-#define GGML_F16_VEC_LOAD(p, i) (i & 0x1) ?                   \
-  vec_extract_fp32_from_shorth(vec_xl(0, p - GGML_F16_EPR)) : \
-  vec_extract_fp32_from_shortl(vec_xl(0, p))
-#define GGML_ENDIAN_BYTE(i) ((unsigned char *)&(uint16_t){1})[i]
-#define GGML_F16_VEC_STORE(p, r, i)                             \
-  if (i & 0x1)                                                  \
-    vec_xst(vec_pack_to_short_fp32(r[i - GGML_ENDIAN_BYTE(1)],  \
-                                   r[i - GGML_ENDIAN_BYTE(0)]), \
-            0, p - GGML_F16_EPR)
-
-#elif defined(__wasm_simd128__)
-
-#define GGML_SIMD
-
-// F32 WASM
-
-#define GGML_F32_STEP 16
-#define GGML_F32_EPR  4
-
-#define GGML_F32x4              v128_t
-#define GGML_F32x4_ZERO         wasm_f32x4_splat(0.0f)
-#define GGML_F32x4_SET1(x)      wasm_f32x4_splat(x)
-#define GGML_F32x4_LOAD         wasm_v128_load
-#define GGML_F32x4_STORE        wasm_v128_store
-#define GGML_F32x4_FMA(a, b, c) wasm_f32x4_add(wasm_f32x4_mul(b, c), a)
-#define GGML_F32x4_ADD          wasm_f32x4_add
-#define GGML_F32x4_MUL          wasm_f32x4_mul
-#define GGML_F32x4_REDUCE(res, x)                  \
-{                                                  \
-    int offset = GGML_F32_ARR >> 1;                \
-    for (int i = 0; i < offset; ++i) {             \
-        x[i] = wasm_f32x4_add(x[i], x[offset+i]);  \
-    }                                              \
-    offset >>= 1;                                  \
-    for (int i = 0; i < offset; ++i) {             \
-        x[i] = wasm_f32x4_add(x[i], x[offset+i]);  \
-    }                                              \
-    offset >>= 1;                                  \
-    for (int i = 0; i < offset; ++i) {             \
-        x[i] = wasm_f32x4_add(x[i], x[offset+i]);  \
-    }                                              \
-    res = wasm_f32x4_extract_lane(x[0], 0) +       \
-          wasm_f32x4_extract_lane(x[0], 1) +       \
-          wasm_f32x4_extract_lane(x[0], 2) +       \
-          wasm_f32x4_extract_lane(x[0], 3);        \
-}
-
-#define GGML_F32_VEC        GGML_F32x4
-#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
-#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
-#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
-#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
-#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
-#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
-#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
-#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
-
-// F16 WASM
-
-#define GGML_F16_STEP 16
-#define GGML_F16_EPR  4
-
-inline static v128_t __wasm_f16x4_load(const ggml_fp16_t * p) {
-    float tmp[4];
-
-    tmp[0] = GGML_FP16_TO_FP32(p[0]);
-    tmp[1] = GGML_FP16_TO_FP32(p[1]);
-    tmp[2] = GGML_FP16_TO_FP32(p[2]);
-    tmp[3] = GGML_FP16_TO_FP32(p[3]);
-
-    return wasm_v128_load(tmp);
-}
-
-inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
-    float tmp[4];
-
-    wasm_v128_store(tmp, x);
-
-    p[0] = GGML_FP32_TO_FP16(tmp[0]);
-    p[1] = GGML_FP32_TO_FP16(tmp[1]);
-    p[2] = GGML_FP32_TO_FP16(tmp[2]);
-    p[3] = GGML_FP32_TO_FP16(tmp[3]);
-}
-
-#define GGML_F16x4             v128_t
-#define GGML_F16x4_ZERO        wasm_f32x4_splat(0.0f)
-#define GGML_F16x4_SET1(x)     wasm_f32x4_splat(x)
-#define GGML_F16x4_LOAD(x)     __wasm_f16x4_load(x)
-#define GGML_F16x4_STORE(x, y) __wasm_f16x4_store(x, y)
-#define GGML_F16x4_FMA         GGML_F32x4_FMA
-#define GGML_F16x4_ADD         wasm_f32x4_add
-#define GGML_F16x4_MUL         wasm_f32x4_mul
-#define GGML_F16x4_REDUCE(res, x)                  \
-{                                                  \
-    int offset = GGML_F16_ARR >> 1;                \
-    for (int i = 0; i < offset; ++i) {             \
-        x[i] = wasm_f32x4_add(x[i], x[offset+i]);  \
-    }                                              \
-    offset >>= 1;                                  \
-    for (int i = 0; i < offset; ++i) {             \
-        x[i] = wasm_f32x4_add(x[i], x[offset+i]);  \
-    }                                              \
-    offset >>= 1;                                  \
-    for (int i = 0; i < offset; ++i) {             \
-        x[i] = wasm_f32x4_add(x[i], x[offset+i]);  \
-    }                                              \
-    res = wasm_f32x4_extract_lane(x[0], 0) +       \
-          wasm_f32x4_extract_lane(x[0], 1) +       \
-          wasm_f32x4_extract_lane(x[0], 2) +       \
-          wasm_f32x4_extract_lane(x[0], 3);        \
-}
-
-#define GGML_F16_VEC                GGML_F16x4
-#define GGML_F16_VEC_ZERO           GGML_F16x4_ZERO
-#define GGML_F16_VEC_SET1           GGML_F16x4_SET1
-#define GGML_F16_VEC_LOAD(p, i)     GGML_F16x4_LOAD(p)
-#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x4_STORE(p, r[i])
-#define GGML_F16_VEC_FMA            GGML_F16x4_FMA
-#define GGML_F16_VEC_ADD            GGML_F16x4_ADD
-#define GGML_F16_VEC_MUL            GGML_F16x4_MUL
-#define GGML_F16_VEC_REDUCE         GGML_F16x4_REDUCE
-
-#elif defined(__SSE3__)
-
-#define GGML_SIMD
-
-// F32 SSE
-
-#define GGML_F32_STEP 32
-#define GGML_F32_EPR  4
-
-#define GGML_F32x4         __m128
-#define GGML_F32x4_ZERO    _mm_setzero_ps()
-#define GGML_F32x4_SET1(x) _mm_set1_ps(x)
-#define GGML_F32x4_LOAD    _mm_loadu_ps
-#define GGML_F32x4_STORE   _mm_storeu_ps
-#if defined(__FMA__)
-    // TODO: Does this work?
-    #define GGML_F32x4_FMA(a, b, c) _mm_fmadd_ps(b, c, a)
-#else
-    #define GGML_F32x4_FMA(a, b, c) _mm_add_ps(_mm_mul_ps(b, c), a)
-#endif
-#define GGML_F32x4_ADD     _mm_add_ps
-#define GGML_F32x4_MUL     _mm_mul_ps
-#define GGML_F32x4_REDUCE(res, x)                                 \
-{                                                                 \
-    int offset = GGML_F32_ARR >> 1;                               \
-    for (int i = 0; i < offset; ++i) {                            \
-        x[i] = _mm_add_ps(x[i], x[offset+i]);                     \
-    }                                                             \
-    offset >>= 1;                                                 \
-    for (int i = 0; i < offset; ++i) {                            \
-        x[i] = _mm_add_ps(x[i], x[offset+i]);                     \
-    }                                                             \
-    offset >>= 1;                                                 \
-    for (int i = 0; i < offset; ++i) {                            \
-        x[i] = _mm_add_ps(x[i], x[offset+i]);                     \
-    }                                                             \
-    const __m128 t0 = _mm_hadd_ps(x[0], x[0]);                    \
-    res = _mm_cvtss_f32(_mm_hadd_ps(t0, t0));                     \
-}
-// TODO: is this optimal ?
-
-#define GGML_F32_VEC        GGML_F32x4
-#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
-#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
-#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
-#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
-#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
-#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
-#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
-#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
-
-// F16 SSE
-
-#define GGML_F16_STEP 32
-#define GGML_F16_EPR  4
-
-static inline __m128 __sse_f16x4_load(ggml_fp16_t *x) {
-    float tmp[4];
-
-    tmp[0] = GGML_FP16_TO_FP32(x[0]);
-    tmp[1] = GGML_FP16_TO_FP32(x[1]);
-    tmp[2] = GGML_FP16_TO_FP32(x[2]);
-    tmp[3] = GGML_FP16_TO_FP32(x[3]);
-
-    return _mm_loadu_ps(tmp);
-}
-
-static inline void __sse_f16x4_store(ggml_fp16_t *x, __m128 y) {
-    float arr[4];
-
-    _mm_storeu_ps(arr, y);
-
-    x[0] = GGML_FP32_TO_FP16(arr[0]);
-    x[1] = GGML_FP32_TO_FP16(arr[1]);
-    x[2] = GGML_FP32_TO_FP16(arr[2]);
-    x[3] = GGML_FP32_TO_FP16(arr[3]);
-}
-
-#define GGML_F32Cx4             __m128
-#define GGML_F32Cx4_ZERO        _mm_setzero_ps()
-#define GGML_F32Cx4_SET1(x)     _mm_set1_ps(x)
-#define GGML_F32Cx4_LOAD(x)     __sse_f16x4_load(x)
-#define GGML_F32Cx4_STORE(x, y) __sse_f16x4_store(x, y)
-#define GGML_F32Cx4_FMA         GGML_F32x4_FMA
-#define GGML_F32Cx4_ADD         _mm_add_ps
-#define GGML_F32Cx4_MUL         _mm_mul_ps
-#define GGML_F32Cx4_REDUCE      GGML_F32x4_REDUCE
-
-#define GGML_F16_VEC                 GGML_F32Cx4
-#define GGML_F16_VEC_ZERO            GGML_F32Cx4_ZERO
-#define GGML_F16_VEC_SET1            GGML_F32Cx4_SET1
-#define GGML_F16_VEC_LOAD(p, i)      GGML_F32Cx4_LOAD(p)
-#define GGML_F16_VEC_STORE(p, r, i)  GGML_F32Cx4_STORE(p, r[i])
-#define GGML_F16_VEC_FMA             GGML_F32Cx4_FMA
-#define GGML_F16_VEC_ADD             GGML_F32Cx4_ADD
-#define GGML_F16_VEC_MUL             GGML_F32Cx4_MUL
-#define GGML_F16_VEC_REDUCE          GGML_F32Cx4_REDUCE
-
-#endif
-
-// GGML_F32_ARR / GGML_F16_ARR
-//   number of registers to use per step
-#ifdef GGML_SIMD
-#define GGML_F32_ARR (GGML_F32_STEP/GGML_F32_EPR)
-#define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR)
-#endif
-
-//
-// fundamental operations
-//
-
-inline static void ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
-
-inline static void ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
-
-inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
-
-inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
-
-inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i]  = x[i] + y[i]; }
-inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float   v) { for (int i = 0; i < n; ++i) z[i]  = x[i] + v;    }
-inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x)                  { for (int i = 0; i < n; ++i) y[i] += x[i];        }
-inline static void ggml_vec_acc1_f32(const int n, float * y, const float   v)                  { for (int i = 0; i < n; ++i) y[i] += v;           }
-inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i]  = x[i] - y[i]; }
-inline static void ggml_vec_set_f32 (const int n, float * x, const float   v)                  { for (int i = 0; i < n; ++i) x[i]  = v;           }
-inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x)                  { for (int i = 0; i < n; ++i) y[i]  = x[i];        }
-inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x)                  { for (int i = 0; i < n; ++i) y[i]  = -x[i];       }
-inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i]  = x[i]*y[i];   }
-inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i]  = x[i]/y[i];   }
-
-static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) {
-#ifdef GGML_SIMD
-    float sumf = 0.0f;
-    const int np = (n & ~(GGML_F32_STEP - 1));
-
-    GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
-
-    GGML_F32_VEC ax[GGML_F32_ARR];
-    GGML_F32_VEC ay[GGML_F32_ARR];
-
-    for (int i = 0; i < np; i += GGML_F32_STEP) {
-        for (int j = 0; j < GGML_F32_ARR; j++) {
-            ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
-            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
-
-            sum[j] = GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]);
-        }
-    }
-
-    // reduce sum0..sum3 to sum0
-    GGML_F32_VEC_REDUCE(sumf, sum);
-
-    // leftovers
-    for (int i = np; i < n; ++i) {
-        sumf += x[i]*y[i];
-    }
-#else
-    // scalar
-    ggml_float sumf = 0.0;
-    for (int i = 0; i < n; ++i) {
-        sumf += (ggml_float)(x[i]*y[i]);
-    }
-#endif
-
-    *s = sumf;
-}
-
-static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) {
-    ggml_float sumf = 0.0;
-
-#if defined(GGML_SIMD)
-    const int np = (n & ~(GGML_F16_STEP - 1));
-
-    GGML_F16_VEC sum[GGML_F16_ARR] = { GGML_F16_VEC_ZERO };
-
-    GGML_F16_VEC ax[GGML_F16_ARR];
-    GGML_F16_VEC ay[GGML_F16_ARR];
-
-    for (int i = 0; i < np; i += GGML_F16_STEP) {
-        for (int j = 0; j < GGML_F16_ARR; j++) {
-            ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
-            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
-
-            sum[j] = GGML_F16_VEC_FMA(sum[j], ax[j], ay[j]);
-        }
-    }
-
-    // reduce sum0..sum3 to sum0
-    GGML_F16_VEC_REDUCE(sumf, sum);
-
-    // leftovers
-    for (int i = np; i < n; ++i) {
-        sumf += (ggml_float)(GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]));
-    }
-#else
-    for (int i = 0; i < n; ++i) {
-        sumf += (ggml_float)(GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]));
-    }
-#endif
-
-    *s = sumf;
-}
-
-// compute GGML_VEC_DOT_UNROLL dot products at once
-// xs - x row stride in bytes
-inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * restrict s, void * restrict xv, ggml_fp16_t * restrict y) {
-    ggml_float sumf[GGML_VEC_DOT_UNROLL] = { 0.0 };
-
-    ggml_fp16_t * restrict x[GGML_VEC_DOT_UNROLL];
-
-    for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) {
-        x[i] = (ggml_fp16_t *) ((char *) xv + i*xs);
-    }
-
-#if defined(GGML_SIMD)
-    const int np = (n & ~(GGML_F16_STEP - 1));
-
-    GGML_F16_VEC sum[GGML_VEC_DOT_UNROLL][GGML_F16_ARR] = { { GGML_F16_VEC_ZERO } };
-
-    GGML_F16_VEC ax[GGML_F16_ARR];
-    GGML_F16_VEC ay[GGML_F16_ARR];
-
-    for (int i = 0; i < np; i += GGML_F16_STEP) {
-        for (int j = 0; j < GGML_F16_ARR; j++) {
-            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
-
-            for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
-                ax[j] = GGML_F16_VEC_LOAD(x[k] + i + j*GGML_F16_EPR, j);
-
-                sum[k][j] = GGML_F16_VEC_FMA(sum[k][j], ax[j], ay[j]);
-            }
-        }
-    }
-
-    // reduce sum0..sum3 to sum0
-    for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
-        GGML_F16_VEC_REDUCE(sumf[k], sum[k]);
-    }
-
-    // leftovers
-    for (int i = np; i < n; ++i) {
-        for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
-            sumf[j] += (ggml_float)(GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i]));
-        }
-    }
-#else
-    for (int i = 0; i < n; ++i) {
-        for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
-            sumf[j] += (ggml_float)(GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i]));
-        }
-    }
-#endif
-
-    for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) {
-        s[i] = sumf[i];
-    }
-}
-
-inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float * restrict x, const float v) {
-#if defined(GGML_SIMD)
-    const int np = (n & ~(GGML_F32_STEP - 1));
-
-    GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
-
-    GGML_F32_VEC ax[GGML_F32_ARR];
-    GGML_F32_VEC ay[GGML_F32_ARR];
-
-    for (int i = 0; i < np; i += GGML_F32_STEP) {
-        for (int j = 0; j < GGML_F32_ARR; j++) {
-            ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
-            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
-            ay[j] = GGML_F32_VEC_FMA(ay[j], ax[j], vx);
-
-            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
-        }
-    }
-
-    // leftovers
-    for (int i = np; i < n; ++i) {
-        y[i] += x[i]*v;
-    }
-#else
-    // scalar
-    for (int i = 0; i < n; ++i) {
-        y[i] += x[i]*v;
-    }
-#endif
-}
-
-// xs and vs are byte strides of x and v
-inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * restrict y, const float * restrict xv, const float * restrict vv) {
-
-    const float * restrict x[GGML_VEC_MAD_UNROLL];
-    const float * restrict v[GGML_VEC_MAD_UNROLL];
-
-    for (int i = 0; i < GGML_VEC_MAD_UNROLL; ++i) {
-        x[i] = (const float *) ((const char *) xv + i*xs);
-        v[i] = (const float *) ((const char *) vv + i*vs);
-    }
-
-#if defined(GGML_SIMD)
-    const int np = (n & ~(GGML_F32_STEP - 1));
-
-    GGML_F32_VEC vx[GGML_VEC_MAD_UNROLL];
-
-    for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
-        vx[k] = GGML_F32_VEC_SET1(v[k][0]);
-    }
-
-    GGML_F32_VEC ax[GGML_VEC_MAD_UNROLL][GGML_F32_ARR];
-    GGML_F32_VEC ay[GGML_F32_ARR];
-
-    for (int i = 0; i < np; i += GGML_F32_STEP) {
-        for (int j = 0; j < GGML_F32_ARR; j++) {
-            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
-
-            for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
-                ax[k][j] = GGML_F32_VEC_LOAD(x[k] + i + j*GGML_F32_EPR);
-                ay[j] = GGML_F32_VEC_FMA(ay[j], ax[k][j], vx[k]);
-            }
-
-            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
-        }
-    }
-
-    // leftovers
-    for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
-        for (int i = np; i < n; ++i) {
-            y[i] += x[k][i]*v[k][0];
-        }
-    }
-#else
-    // scalar
-    for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
-        for (int i = 0; i < n; ++i) {
-            y[i] += x[k][i]*v[k][0];
-        }
-    }
-#endif
-}
-
-//inline static void ggml_vec_scale_f32(const int n, float * y, const float   v) { for (int i = 0; i < n; ++i) y[i] *= v;          }
-inline static void ggml_vec_scale_f32(const int n, float * y, const float   v) {
-#if defined(GGML_USE_ACCELERATE)
-    vDSP_vsmul(y, 1, &v, y, 1, n);
-#elif defined(GGML_SIMD)
-    const int np = (n & ~(GGML_F32_STEP - 1));
-
-    GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
-
-    GGML_F32_VEC ay[GGML_F32_ARR];
-
-    for (int i = 0; i < np; i += GGML_F32_STEP) {
-        for (int j = 0; j < GGML_F32_ARR; j++) {
-            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
-            ay[j] = GGML_F32_VEC_MUL(ay[j], vx);
-
-            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
-        }
-    }
-
-    // leftovers
-    for (int i = np; i < n; ++i) {
-        y[i] *= v;
-    }
-#else
-    // scalar
-    for (int i = 0; i < n; ++i) {
-        y[i] *= v;
-    }
-#endif
-}
-
-inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, x, x); *s = sqrtf(*s);   }
-inline static void ggml_vec_sqr_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i];   }
-inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); }
-inline static void ggml_vec_log_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]);   }
-inline static void ggml_vec_abs_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); }
-inline static void ggml_vec_sgn_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); }
-inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; }
-inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]);  }
-inline static void ggml_vec_elu_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
-inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
-inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
-
-static const float GELU_COEF_A     = 0.044715f;
-static const float GELU_QUICK_COEF = -1.702f;
-static const float SQRT_2_OVER_PI  = 0.79788456080286535587989211986876f;
-
-inline static float ggml_gelu_f32(float x) {
-    return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
-}
-
-inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
-    const uint16_t * i16 = (const uint16_t *) x;
-    for (int i = 0; i < n; ++i) {
-        y[i] = ggml_table_gelu_f16[i16[i]];
-    }
-}
-
-#ifdef GGML_GELU_FP16
-inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
-    uint16_t t;
-    for (int i = 0; i < n; ++i) {
-        ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
-        memcpy(&t, &fp16, sizeof(uint16_t));
-        y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_f16[t]);
-    }
-}
-#else
-inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
-    for (int i = 0; i < n; ++i) {
-        y[i] = ggml_gelu_f32(x[i]);
-    }
-}
-#endif
-
-inline static float ggml_gelu_quick_f32(float x) {
-    return x*(1.0f/(1.0f+expf(GELU_QUICK_COEF*x)));
-}
-
-//inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
-//    const uint16_t * i16 = (const uint16_t *) x;
-//    for (int i = 0; i < n; ++i) {
-//        y[i] = ggml_table_gelu_quick_f16[i16[i]];
-//    }
-//}
-
-#ifdef GGML_GELU_QUICK_FP16
-inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
-    uint16_t t;
-    for (int i = 0; i < n; ++i) {
-        ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
-        memcpy(&t, &fp16, sizeof(uint16_t));
-        y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]);
-    }
-}
-#else
-inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
-    for (int i = 0; i < n; ++i) {
-        y[i] = ggml_gelu_quick_f32(x[i]);
-    }
-}
-#endif
-
-// Sigmoid Linear Unit (SiLU) function
-inline static float ggml_silu_f32(float x) {
-    return x/(1.0f + expf(-x));
-}
-
-//inline static void ggml_vec_silu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
-//    const uint16_t * i16 = (const uint16_t *) x;
-//    for (int i = 0; i < n; ++i) {
-//        y[i] = ggml_table_silu_f16[i16[i]];
-//    }
-//}
-
-#ifdef GGML_SILU_FP16
-inline static void ggml_vec_silu_f32(const int n, float * y, const float * x) {
-    uint16_t t;
-    for (int i = 0; i < n; ++i) {
-        ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
-        memcpy(&t, &fp16, sizeof(uint16_t));
-        y[i] = GGML_FP16_TO_FP32(ggml_table_silu_f16[t]);
-    }
-}
-#else
-inline static void ggml_vec_silu_f32(const int n, float * y, const float * x) {
-    for (int i = 0; i < n; ++i) {
-        y[i] = ggml_silu_f32(x[i]);
-    }
-}
-#endif
-
-inline static float ggml_silu_backward_f32(float x, float dy) {
-    const float s = 1.0f/(1.0f + expf(-x));
-    return dy*s*(1.0f + x*(1.0f - s));
-}
-
-#ifdef GGML_SILU_FP16
-inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) {
-    for (int i = 0; i < n; ++i) {
-        // we did not use x[i] to compute forward silu but its f16 equivalent
-        // take derivative at f16 of x[i]:
-        ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
-        float usedx = GGML_FP16_TO_FP32(fp16);
-        dx[i] = ggml_silu_backward_f32(usedx, dy[i]);
-    }
-}
-#else
-inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) {
-    for (int i = 0; i < n; ++i) {
-        dx[i] = ggml_silu_backward_f32(x[i], dy[i]);
-    }
-}
-#endif
-
-inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
-#ifndef GGML_USE_ACCELERATE
-    ggml_float sum = 0.0;
-    for (int i = 0; i < n; ++i) {
-        sum += (ggml_float)x[i];
-    }
-    *s = sum;
-#else
-    vDSP_sve(x, 1, s, n);
-#endif
-}
-
-inline static void ggml_vec_sum_f32_ggf(const int n, ggml_float * s, const float * x) {
-    ggml_float sum = 0.0;
-    for (int i = 0; i < n; ++i) {
-        sum += (ggml_float)x[i];
-    }
-    *s = sum;
-}
-
-inline static void ggml_vec_sum_f16_ggf(const int n, float * s, const ggml_fp16_t * x) {
-    float sum = 0.0f;
-    for (int i = 0; i < n; ++i) {
-        sum += GGML_FP16_TO_FP32(x[i]);
-    }
-    *s = sum;
-}
-
-inline static void ggml_vec_max_f32(const int n, float * s, const float * x) {
-#ifndef GGML_USE_ACCELERATE
-    float max = -INFINITY;
-    for (int i = 0; i < n; ++i) {
-        max = MAX(max, x[i]);
-    }
-    *s = max;
-#else
-    vDSP_maxv(x, 1, s, n);
-#endif
-}
-
-inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x) {
-    ggml_vec_norm_f32(n, s, x);
-    *s = 1.f/(*s);
-}
-
-inline static void ggml_vec_argmax_f32(const int n, int * s, const float * x) {
-    float max = -INFINITY;
-    int idx = 0;
-    for (int i = 0; i < n; ++i) {
-        max = MAX(max, x[i]);
-        if (max == x[i]) { idx = i; }
-    }
-    *s = idx;
-}
-
-//
-// data types
-//
-
-static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
-    "NONE",
-
-    "DUP",
-    "ADD",
-    "ADD1",
-    "ACC",
-    "SUB",
-    "MUL",
-    "DIV",
-    "SQR",
-    "SQRT",
-    "LOG",
-    "SUM",
-    "SUM_ROWS",
-    "MEAN",
-    "ARGMAX",
-    "REPEAT",
-    "REPEAT_BACK",
-    "CONCAT",
-    "SILU_BACK",
-    "NORM",
-    "RMS_NORM",
-    "RMS_NORM_BACK",
-    "GROUP_NORM",
-
-    "MUL_MAT",
-    "MUL_MAT_ID",
-    "OUT_PROD",
-
-    "SCALE",
-    "SET",
-    "CPY",
-    "CONT",
-    "RESHAPE",
-    "VIEW",
-    "PERMUTE",
-    "TRANSPOSE",
-    "GET_ROWS",
-    "GET_ROWS_BACK",
-    "DIAG",
-    "DIAG_MASK_INF",
-    "DIAG_MASK_ZERO",
-    "SOFT_MAX",
-    "SOFT_MAX_BACK",
-    "ROPE",
-    "ROPE_BACK",
-    "ALIBI",
-    "CLAMP",
-    "CONV_TRANSPOSE_1D",
-    "IM2COL",
-    "CONV_TRANSPOSE_2D",
-    "POOL_1D",
-    "POOL_2D",
-    "UPSCALE",
-    "PAD",
-    "ARGSORT",
-    "LEAKY_RELU",
-
-    "FLASH_ATTN",
-    "FLASH_FF",
-    "FLASH_ATTN_BACK",
-    "WIN_PART",
-    "WIN_UNPART",
-    "GET_REL_POS",
-    "ADD_REL_POS",
-
-    "UNARY",
-
-    "MAP_UNARY",
-    "MAP_BINARY",
-
-    "MAP_CUSTOM1_F32",
-    "MAP_CUSTOM2_F32",
-    "MAP_CUSTOM3_F32",
-
-    "MAP_CUSTOM1",
-    "MAP_CUSTOM2",
-    "MAP_CUSTOM3",
-
-    "CROSS_ENTROPY_LOSS",
-    "CROSS_ENTROPY_LOSS_BACK",
-};
-
-static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72");
-
-static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
-    "none",
-
-    "x",
-    "x+y",
-    "x+y",
-    "view(x,nb,offset)+=y->x",
-    "x-y",
-    "x*y",
-    "x/y",
-    "x^2",
-    "√x",
-    "log(x)",
-    "Σx",
-    "Σx_k",
-    "Σx/n",
-    "argmax(x)",
-    "repeat(x)",
-    "repeat_back(x)",
-    "concat(x, y)",
-    "silu_back(x)",
-    "norm(x)",
-    "rms_norm(x)",
-    "rms_norm_back(x)",
-    "group_norm(x)",
-
-    "X*Y",
-    "X[i]*Y",
-    "X*Y",
-
-    "x*v",
-    "y-\\>view(x)",
-    "x-\\>y",
-    "cont(x)",
-    "reshape(x)",
-    "view(x)",
-    "permute(x)",
-    "transpose(x)",
-    "get_rows(x)",
-    "get_rows_back(x)",
-    "diag(x)",
-    "diag_mask_inf(x)",
-    "diag_mask_zero(x)",
-    "soft_max(x)",
-    "soft_max_back(x)",
-    "rope(x)",
-    "rope_back(x)",
-    "alibi(x)",
-    "clamp(x)",
-    "conv_transpose_1d(x)",
-    "im2col(x)",
-    "conv_transpose_2d(x)",
-    "pool_1d(x)",
-    "pool_2d(x)",
-    "upscale(x)",
-    "pad(x)",
-    "argsort(x)",
-    "leaky_relu(x)",
-
-    "flash_attn(x)",
-    "flash_ff(x)",
-    "flash_attn_back(x)",
-    "win_part(x)",
-    "win_unpart(x)",
-    "get_rel_pos(x)",
-    "add_rel_pos(x)",
-
-    "unary(x)",
-
-    "f(x)",
-    "f(x,y)",
-
-    "custom_f32(x)",
-    "custom_f32(x,y)",
-    "custom_f32(x,y,z)",
-
-    "custom(x)",
-    "custom(x,y)",
-    "custom(x,y,z)",
-
-    "cross_entropy_loss(x,y)",
-    "cross_entropy_loss_back(x,y)",
-};
-
-static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72");
-
-static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
-
-
-static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
-    "ABS",
-    "SGN",
-    "NEG",
-    "STEP",
-    "TANH",
-    "ELU",
-    "RELU",
-    "GELU",
-    "GELU_QUICK",
-    "SILU",
-};
-
-static_assert(GGML_UNARY_OP_COUNT == 10, "GGML_UNARY_OP_COUNT != 10");
-
-
-static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
-static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
-
-// WARN:
-// Mis-configuration can lead to problem that's hard to reason about:
-// * At best  it crash or talks nosense.
-// * At worst it talks slightly difference but hard to perceive.
-//
-// An op has to enable INIT or FINALIZE when any of it's branch needs that pass.
-// Take care about compile options (e.g., GGML_USE_xxx).
-static bool GGML_OP_HAS_INIT    [GGML_OP_COUNT] = { 0 };
-static bool GGML_OP_HAS_FINALIZE[GGML_OP_COUNT] = { 0 };
-
-static void ggml_setup_op_has_task_pass(void) {
-    {   // INIT
-        bool * p = GGML_OP_HAS_INIT;
-
-        p[GGML_OP_ACC                    ] = true;
-        p[GGML_OP_MUL_MAT                ] = true;
-        p[GGML_OP_MUL_MAT_ID             ] = true;
-        p[GGML_OP_OUT_PROD               ] = true;
-        p[GGML_OP_SET                    ] = true;
-        p[GGML_OP_GET_ROWS_BACK          ] = true;
-        p[GGML_OP_DIAG_MASK_INF          ] = true;
-        p[GGML_OP_DIAG_MASK_ZERO         ] = true;
-        p[GGML_OP_CONV_TRANSPOSE_1D      ] = true;
-        p[GGML_OP_CONV_TRANSPOSE_2D      ] = true;
-        p[GGML_OP_FLASH_ATTN_BACK        ] = true;
-        p[GGML_OP_CROSS_ENTROPY_LOSS     ] = true;
-        p[GGML_OP_ADD_REL_POS            ] = true;
-    }
-
-    {   // FINALIZE
-        bool * p = GGML_OP_HAS_FINALIZE;
-
-        p[GGML_OP_CROSS_ENTROPY_LOSS     ] = true;
-    }
-}
-
-//
-// ggml context
-//
-
-struct ggml_context {
-    size_t mem_size;
-    void * mem_buffer;
-    bool   mem_buffer_owned;
-    bool   no_alloc;
-    bool   no_alloc_save; // this is used to save the no_alloc state when using scratch buffers
-
-    int    n_objects;
-
-    struct ggml_object * objects_begin;
-    struct ggml_object * objects_end;
-
-    struct ggml_scratch scratch;
-    struct ggml_scratch scratch_save;
-};
-
-struct ggml_context_container {
-    bool used;
-
-    struct ggml_context context;
-};
-
-//
-// NUMA support
-//
-
-#define GGML_NUMA_MAX_NODES 8
-#define GGML_NUMA_MAX_CPUS 512
-
-struct ggml_numa_node {
-    uint32_t cpus[GGML_NUMA_MAX_CPUS]; // hardware threads on this node
-    uint32_t n_cpus;
-};
-
-struct ggml_numa_nodes {
-    struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES];
-    uint32_t n_nodes;
-    uint32_t total_cpus; // hardware threads on system
-};
-
-//
-// ggml state
-//
-
-struct ggml_state {
-    struct ggml_context_container contexts[GGML_MAX_CONTEXTS];
-    struct ggml_numa_nodes numa;
-};
-
-// global state
-static struct ggml_state g_state;
-static atomic_int g_state_barrier = 0;
-
-// barrier via spin lock
-inline static void ggml_critical_section_start(void) {
-    int processing = atomic_fetch_add(&g_state_barrier, 1);
-
-    while (processing > 0) {
-        // wait for other threads to finish
-        atomic_fetch_sub(&g_state_barrier, 1);
-        sched_yield(); // TODO: reconsider this
-        processing = atomic_fetch_add(&g_state_barrier, 1);
-    }
-}
-
-// TODO: make this somehow automatically executed
-//       some sort of "sentry" mechanism
-inline static void ggml_critical_section_end(void) {
-    atomic_fetch_sub(&g_state_barrier, 1);
-}
-
-void ggml_numa_init(void) {
-    if (g_state.numa.n_nodes > 0) {
-        fprintf(stderr, "ggml_numa_init: NUMA already initialized\n");
-
-        return;
-    }
-
-#ifdef __linux__
-    struct stat st;
-    char path[256];
-    int rv;
-
-    // enumerate nodes
-    while (g_state.numa.n_nodes < GGML_NUMA_MAX_NODES) {
-        rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", g_state.numa.n_nodes);
-        GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
-        if (stat(path, &st) != 0) { break; }
-        ++g_state.numa.n_nodes;
-    }
-
-    // enumerate CPUs
-    while (g_state.numa.total_cpus < GGML_NUMA_MAX_CPUS) {
-        rv = snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%u", g_state.numa.total_cpus);
-        GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
-        if (stat(path, &st) != 0) { break; }
-        ++g_state.numa.total_cpus;
-    }
-
-    GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", g_state.numa.n_nodes, g_state.numa.total_cpus);
-
-    if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1) {
-        g_state.numa.n_nodes = 0;
-        return;
-    }
-
-    for (uint32_t n = 0; n < g_state.numa.n_nodes; ++n) {
-        struct ggml_numa_node * node = &g_state.numa.nodes[n];
-        GGML_PRINT_DEBUG("CPUs on node %u:", n);
-        node->n_cpus = 0;
-        for (uint32_t c = 0; c < g_state.numa.total_cpus; ++c) {
-            rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u/cpu%u", n, c);
-            GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
-            if (stat(path, &st) == 0) {
-                node->cpus[node->n_cpus++] = c;
-                GGML_PRINT_DEBUG(" %u", c);
-            }
-        }
-        GGML_PRINT_DEBUG("\n");
-    }
-
-    if (ggml_is_numa()) {
-        FILE *fptr = fopen("/proc/sys/kernel/numa_balancing", "r");
-        if (fptr != NULL) {
-            char buf[42];
-            if (fgets(buf, sizeof(buf), fptr) && strncmp(buf, "0\n", sizeof(buf)) != 0) {
-                GGML_PRINT("WARNING: /proc/sys/kernel/numa_balancing is enabled, this has been observed to impair performance\n");
-            }
-            fclose(fptr);
-        }
-    }
-#else
-    // TODO
-#endif
-}
-
-bool ggml_is_numa(void) {
-    return g_state.numa.n_nodes > 1;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-
-void ggml_print_object(const struct ggml_object * obj) {
-    GGML_PRINT(" - ggml_object: type = %d, offset = %zu, size = %zu, next = %p\n",
-            obj->type, obj->offs, obj->size, (const void *) obj->next);
-}
-
-void ggml_print_objects(const struct ggml_context * ctx) {
-    struct ggml_object * obj = ctx->objects_begin;
-
-    GGML_PRINT("%s: objects in context %p:\n", __func__, (const void *) ctx);
-
-    while (obj != NULL) {
-        ggml_print_object(obj);
-        obj = obj->next;
-    }
-
-    GGML_PRINT("%s: --- end ---\n", __func__);
-}
-
-int64_t ggml_nelements(const struct ggml_tensor * tensor) {
-    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
-    return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
-}
-
-int64_t ggml_nrows(const struct ggml_tensor * tensor) {
-    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
-    return tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
-}
-
-size_t ggml_nbytes(const struct ggml_tensor * tensor) {
-    size_t nbytes;
-    size_t blck_size = ggml_blck_size(tensor->type);
-    if (blck_size == 1) {
-        nbytes = ggml_type_size(tensor->type);
-        for (int i = 0; i < GGML_MAX_DIMS; ++i) {
-            nbytes += (tensor->ne[i] - 1)*tensor->nb[i];
-        }
-    }
-    else {
-        nbytes = tensor->ne[0]*tensor->nb[0]/blck_size;
-        for (int i = 1; i < GGML_MAX_DIMS; ++i) {
-            nbytes += (tensor->ne[i] - 1)*tensor->nb[i];
-        }
-    }
-
-    return nbytes;
-}
-
-size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
-    return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN);
-}
-
-int ggml_blck_size(enum ggml_type type) {
-    return type_traits[type].blck_size;
-}
-
-size_t ggml_type_size(enum ggml_type type) {
-    return type_traits[type].type_size;
-}
-
-size_t ggml_row_size(enum ggml_type type, int64_t ne) {
-    assert(ne % ggml_blck_size(type) == 0);
-    return ggml_type_size(type)*ne/ggml_blck_size(type);
-}
-
-double ggml_type_sizef(enum ggml_type type) {
-    return ((double)(type_traits[type].type_size))/type_traits[type].blck_size;
-}
-
-const char * ggml_type_name(enum ggml_type type) {
-    return type_traits[type].type_name;
-}
-
-bool ggml_is_quantized(enum ggml_type type) {
-    return type_traits[type].is_quantized;
-}
-
-const char * ggml_op_name(enum ggml_op op) {
-    return GGML_OP_NAME[op];
-}
-
-const char * ggml_op_symbol(enum ggml_op op) {
-    return GGML_OP_SYMBOL[op];
-}
-
-const char * ggml_unary_op_name(enum ggml_unary_op op) {
-    return GGML_UNARY_OP_NAME[op];
-}
-
-const char * ggml_op_desc(const struct ggml_tensor * t) {
-    if (t->op == GGML_OP_UNARY) {
-        enum ggml_unary_op uop = ggml_get_unary_op(t);
-        return ggml_unary_op_name(uop);
-    }
-    else {
-        return ggml_op_name(t->op);
-    }
-}
-
-size_t ggml_element_size(const struct ggml_tensor * tensor) {
-    return ggml_type_size(tensor->type);
-}
-
-bool ggml_is_scalar(const struct ggml_tensor * tensor) {
-    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
-    return tensor->ne[0] == 1 && tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
-}
-
-bool ggml_is_vector(const struct ggml_tensor * tensor) {
-    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
-    return tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
-}
-
-bool ggml_is_matrix(const struct ggml_tensor * tensor) {
-    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
-    return tensor->ne[2] == 1 && tensor->ne[3] == 1;
-}
-
-bool ggml_is_3d(const struct ggml_tensor * tensor) {
-    return tensor->ne[3] == 1;
-}
-
-int ggml_n_dims(const struct ggml_tensor * tensor) {
-    for (int i = GGML_MAX_DIMS - 1; i >= 1; --i) {
-        if (tensor->ne[i] > 1) {
-            return i + 1;
-        }
-    }
-    return 1;
-}
-
-static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
-    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
-    return (t0->ne[0]           == t1->ne[0])  &&
-           (t1->ne[2]%t0->ne[2] == 0)          && // verify t0 is broadcastable
-           (t1->ne[3]%t0->ne[3] == 0);
-}
-
-static inline bool ggml_can_out_prod(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
-    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
-    return (t0->ne[1] == t1->ne[1])   &&
-           (t1->ne[2]%t0->ne[2] == 0) && // verify t0 is broadcastable
-           (t1->ne[3]%t0->ne[3] == 0);
-}
-
-enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
-    enum ggml_type wtype = GGML_TYPE_COUNT;
-
-    switch (ftype) {
-        case GGML_FTYPE_ALL_F32:              wtype = GGML_TYPE_F32;   break;
-        case GGML_FTYPE_MOSTLY_F16:           wtype = GGML_TYPE_F16;   break;
-        case GGML_FTYPE_MOSTLY_Q4_0:          wtype = GGML_TYPE_Q4_0;  break;
-        case GGML_FTYPE_MOSTLY_Q4_1:          wtype = GGML_TYPE_Q4_1;  break;
-        case GGML_FTYPE_MOSTLY_Q5_0:          wtype = GGML_TYPE_Q5_0;  break;
-        case GGML_FTYPE_MOSTLY_Q5_1:          wtype = GGML_TYPE_Q5_1;  break;
-        case GGML_FTYPE_MOSTLY_Q8_0:          wtype = GGML_TYPE_Q8_0;  break;
-        case GGML_FTYPE_MOSTLY_Q2_K:          wtype = GGML_TYPE_Q2_K;  break;
-        case GGML_FTYPE_MOSTLY_Q3_K:          wtype = GGML_TYPE_Q3_K;  break;
-        case GGML_FTYPE_MOSTLY_Q4_K:          wtype = GGML_TYPE_Q4_K;  break;
-        case GGML_FTYPE_MOSTLY_Q5_K:          wtype = GGML_TYPE_Q5_K;  break;
-        case GGML_FTYPE_MOSTLY_Q6_K:          wtype = GGML_TYPE_Q6_K;  break;
-        case GGML_FTYPE_MOSTLY_IQ2_XXS:       wtype = GGML_TYPE_IQ2_XXS;  break;
-        case GGML_FTYPE_MOSTLY_IQ2_XS:        wtype = GGML_TYPE_IQ2_XS;   break;
-        case GGML_FTYPE_UNKNOWN:              wtype = GGML_TYPE_COUNT; break;
-        case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
-    }
-
-    GGML_ASSERT(wtype != GGML_TYPE_COUNT);
-
-    return wtype;
-}
-
-size_t ggml_tensor_overhead(void) {
-    return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE;
-}
-
-bool ggml_is_transposed(const struct ggml_tensor * tensor) {
-    return tensor->nb[0] > tensor->nb[1];
-}
-
-bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
-    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
-    return
-        tensor->nb[0] == ggml_type_size(tensor->type) &&
-        tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) &&
-        tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
-        tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
-}
-
-static inline bool ggml_is_contiguous_except_dim_1(const struct ggml_tensor * tensor) {
-    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
-    return
-        tensor->nb[0] == ggml_type_size(tensor->type) &&
-        tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
-        tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
-}
-
-bool ggml_is_permuted(const struct ggml_tensor * tensor) {
-    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
-    return tensor->nb[0] > tensor->nb[1] || tensor->nb[1] > tensor->nb[2] || tensor->nb[2] > tensor->nb[3];
-}
-
-static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
-    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
-    return
-        tensor->nb[0] == ggml_type_size(tensor->type) &&
-        tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
-        tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
-}
-
-bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
-    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
-    return
-        (t0->ne[0] == t1->ne[0] ) &&
-        (t0->ne[1] == t1->ne[1] ) &&
-        (t0->ne[2] == t1->ne[2] ) &&
-        (t0->ne[3] == t1->ne[3] );
-}
-
-// check if t1 can be represented as a repeatition of t0
-static inline bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
-    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
-    return
-        (t1->ne[0]%t0->ne[0] == 0) &&
-        (t1->ne[1]%t0->ne[1] == 0) &&
-        (t1->ne[2]%t0->ne[2] == 0) &&
-        (t1->ne[3]%t0->ne[3] == 0);
-}
-
-static inline bool ggml_can_repeat_rows(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
-    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
-    return (t0->ne[0] == t1->ne[0]) && ggml_can_repeat(t0, t1);
-}
-
-static inline int ggml_up32(int n) {
-    return (n + 31) & ~31;
-}
-
-//static inline int ggml_up64(int n) {
-//    return (n + 63) & ~63;
-//}
-
-static inline int ggml_up(int n, int m) {
-    // assert m is a power of 2
-    GGML_ASSERT((m & (m - 1)) == 0);
-    return (n + m - 1) & ~(m - 1);
-}
-
-// assert that pointer is aligned to GGML_MEM_ALIGN
-#define ggml_assert_aligned(ptr) \
-    GGML_ASSERT(((uintptr_t) (ptr))%GGML_MEM_ALIGN == 0)
-
-////////////////////////////////////////////////////////////////////////////////
-
-struct ggml_context * ggml_init(struct ggml_init_params params) {
-    // make this function thread safe
-    ggml_critical_section_start();
-
-    static bool is_first_call = true;
-
-    if (is_first_call) {
-        // initialize time system (required on Windows)
-        ggml_time_init();
-
-        // initialize GELU, Quick GELU, SILU and EXP F32 tables
-        {
-            const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
-
-            ggml_fp16_t ii;
-            for (int i = 0; i < (1 << 16); ++i) {
-                uint16_t ui = i;
-                memcpy(&ii, &ui, sizeof(ii));
-                const float f = ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(ii);
-                ggml_table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f));
-                ggml_table_gelu_quick_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_quick_f32(f));
-                ggml_table_silu_f16[i] = GGML_FP32_TO_FP16(ggml_silu_f32(f));
-                ggml_table_exp_f16[i]  = GGML_FP32_TO_FP16(expf(f));
-            }
-
-            const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
-
-            GGML_PRINT_DEBUG("%s: GELU, Quick GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
-        }
-
-        // initialize g_state
-        {
-            const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
-
-            g_state = (struct ggml_state) {
-                /*.contexts =*/ { { 0 } },
-                /*.numa =*/ {
-                    .n_nodes = 0,
-                    .total_cpus = 0,
-                },
-            };
-
-            for (int i = 0; i < GGML_MAX_CONTEXTS; ++i) {
-                g_state.contexts[i].used = false;
-            }
-
-            const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
-
-            GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
-        }
-
-#if defined(GGML_USE_CUBLAS)
-        ggml_init_cublas();
-#elif defined(GGML_USE_CLBLAST)
-        ggml_cl_init();
-#endif
-
-        ggml_setup_op_has_task_pass();
-
-        is_first_call = false;
-    }
-
-    // find non-used context in g_state
-    struct ggml_context * ctx = NULL;
-
-    for (int i = 0; i < GGML_MAX_CONTEXTS; i++) {
-        if (!g_state.contexts[i].used) {
-            g_state.contexts[i].used = true;
-            ctx = &g_state.contexts[i].context;
-
-            GGML_PRINT_DEBUG("%s: found unused context %d\n", __func__, i);
-            break;
-        }
-    }
-
-    if (ctx == NULL) {
-        GGML_PRINT_DEBUG("%s: no unused context found\n", __func__);
-
-        ggml_critical_section_end();
-
-        return NULL;
-    }
-
-    // allow to call ggml_init with 0 size
-    if (params.mem_size == 0) {
-        params.mem_size = GGML_MEM_ALIGN;
-    }
-
-    const size_t mem_size = params.mem_buffer ? params.mem_size : GGML_PAD(params.mem_size, GGML_MEM_ALIGN);
-
-    *ctx = (struct ggml_context) {
-        /*.mem_size           =*/ mem_size,
-        /*.mem_buffer         =*/ params.mem_buffer ? params.mem_buffer : GGML_ALIGNED_MALLOC(mem_size),
-        /*.mem_buffer_owned   =*/ params.mem_buffer ? false : true,
-        /*.no_alloc           =*/ params.no_alloc,
-        /*.no_alloc_save      =*/ params.no_alloc,
-        /*.n_objects          =*/ 0,
-        /*.objects_begin      =*/ NULL,
-        /*.objects_end        =*/ NULL,
-        /*.scratch            =*/ { 0, 0, NULL, },
-        /*.scratch_save       =*/ { 0, 0, NULL, },
-    };
-
-    GGML_ASSERT(ctx->mem_buffer != NULL);
-
-    ggml_assert_aligned(ctx->mem_buffer);
-
-    GGML_PRINT_DEBUG("%s: context initialized\n", __func__);
-
-    ggml_critical_section_end();
-
-    return ctx;
-}
-
-void ggml_free(struct ggml_context * ctx) {
-    if (ctx == NULL) {
-        return;
-    }
-
-    // make this function thread safe
-    ggml_critical_section_start();
-
-    bool found = false;
-
-    for (int i = 0; i < GGML_MAX_CONTEXTS; i++) {
-        if (&g_state.contexts[i].context == ctx) {
-            g_state.contexts[i].used = false;
-
-            GGML_PRINT_DEBUG("%s: context %d has been freed. memory used = %zu\n",
-                    __func__, i, ggml_used_mem(ctx));
-
-            if (ctx->mem_buffer_owned) {
-                GGML_ALIGNED_FREE(ctx->mem_buffer);
-            }
-
-            found = true;
-            break;
-        }
-    }
-
-    if (!found) {
-        GGML_PRINT_DEBUG("%s: context not found\n", __func__);
-    }
-
-    ggml_critical_section_end();
-}
-
-size_t ggml_used_mem(const struct ggml_context * ctx) {
-    return ctx->objects_end == NULL ? 0 : ctx->objects_end->offs + ctx->objects_end->size;
-}
-
-size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch) {
-    const size_t result = ctx->scratch.data ? ctx->scratch.offs : 0;
-
-    ctx->scratch = scratch;
-
-    return result;
-}
-
-bool ggml_get_no_alloc(struct ggml_context * ctx) {
-    return ctx->no_alloc;
-}
-
-void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc) {
-    ctx->no_alloc = no_alloc;
-}
-
-void * ggml_get_mem_buffer(const struct ggml_context * ctx) {
-    return ctx->mem_buffer;
-}
-
-size_t ggml_get_mem_size(const struct ggml_context * ctx) {
-    return ctx->mem_size;
-}
-
-size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) {
-    size_t max_size = 0;
-
-    for (struct ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor != NULL; tensor = ggml_get_next_tensor(ctx, tensor)) {
-        max_size = MAX(max_size, ggml_nbytes(tensor));
-    }
-
-    return max_size;
-}
-
-// IMPORTANT:
-// when creating "opt" tensors, always save and load the scratch buffer
-// this is an error prone process, but it is necessary to support inplace
-// operators when using scratch buffers
-// TODO: implement a better way
-static void ggml_scratch_save(struct ggml_context * ctx) {
-    // this is needed to allow opt tensors to store their data
-    // TODO: again, need to find a better way
-    ctx->no_alloc_save = ctx->no_alloc;
-    ctx->no_alloc      = false;
-
-    ctx->scratch_save = ctx->scratch;
-    ctx->scratch.data = NULL;
-}
-
-static void ggml_scratch_load(struct ggml_context * ctx) {
-    ctx->no_alloc = ctx->no_alloc_save;
-
-    ctx->scratch = ctx->scratch_save;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-
-static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml_object_type type, size_t size) {
-    // always insert objects at the end of the context's memory pool
-    struct ggml_object * obj_cur = ctx->objects_end;
-
-    const size_t cur_offs = obj_cur == NULL ? 0 : obj_cur->offs;
-    const size_t cur_size = obj_cur == NULL ? 0 : obj_cur->size;
-    const size_t cur_end  = cur_offs + cur_size;
-
-    // align to GGML_MEM_ALIGN
-    size_t size_needed = GGML_PAD(size, GGML_MEM_ALIGN);
-
-    char * const mem_buffer = ctx->mem_buffer;
-    struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end);
-
-    if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
-        GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
-                __func__, cur_end + size_needed, ctx->mem_size);
-        assert(false);
-        return NULL;
-    }
-
-    *obj_new = (struct ggml_object) {
-        .offs = cur_end + GGML_OBJECT_SIZE,
-        .size = size_needed,
-        .next = NULL,
-        .type = type,
-    };
-
-    ggml_assert_aligned(mem_buffer + obj_new->offs);
-
-    if (obj_cur != NULL) {
-        obj_cur->next = obj_new;
-    } else {
-        // this is the first object in this context
-        ctx->objects_begin = obj_new;
-    }
-
-    ctx->objects_end = obj_new;
-
-    //printf("%s: inserted new object at %zu, size = %zu\n", __func__, cur_end, obj_new->size);
-
-    return obj_new;
-}
-
-static struct ggml_tensor * ggml_new_tensor_impl(
-        struct ggml_context * ctx,
-        enum   ggml_type      type,
-        int                   n_dims,
-        const int64_t       * ne,
-        struct ggml_tensor  * view_src,
-        size_t                view_offs) {
-
-    assert(n_dims >= 1 && n_dims <= GGML_MAX_DIMS);
-
-    // find the base tensor and absolute offset
-    if (view_src != NULL && view_src->view_src != NULL) {
-        view_offs += view_src->view_offs;
-        view_src   = view_src->view_src;
-    }
-
-    size_t data_size = ggml_row_size(type, ne[0]);
-    for (int i = 1; i < n_dims; i++) {
-        data_size *= ne[i];
-    }
-
-    GGML_ASSERT(view_src == NULL || data_size + view_offs <= ggml_nbytes(view_src));
-
-    void * data = view_src != NULL ? view_src->data : NULL;
-    if (data != NULL) {
-        data = (char *) data + view_offs;
-    }
-
-    size_t obj_alloc_size = 0;
-
-    if (view_src == NULL && !ctx->no_alloc) {
-        if (ctx->scratch.data != NULL) {
-            // allocate tensor data in the scratch buffer
-            if (ctx->scratch.offs + data_size > ctx->scratch.size) {
-                GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
-                        __func__, ctx->scratch.offs + data_size, ctx->scratch.size);
-                assert(false);
-                return NULL;
-            }
-
-            data = (char * const) ctx->scratch.data + ctx->scratch.offs;
-
-            ctx->scratch.offs += data_size;
-        } else {
-            // allocate tensor data in the context's memory pool
-            obj_alloc_size = data_size;
-        }
-    }
-
-    struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TENSOR, GGML_TENSOR_SIZE + obj_alloc_size);
-
-    // TODO: for recoverable errors, we would need to free the data allocated from the scratch buffer here
-
-    struct ggml_tensor * const result = (struct ggml_tensor *)((char *)ctx->mem_buffer + obj_new->offs);
-
-    *result = (struct ggml_tensor) {
-        /*.type         =*/ type,
-        /*.backend      =*/ GGML_BACKEND_CPU,
-        /*.buffer       =*/ NULL,
-        /*.ne           =*/ { 1, 1, 1, 1 },
-        /*.nb           =*/ { 0, 0, 0, 0 },
-        /*.op           =*/ GGML_OP_NONE,
-        /*.op_params    =*/ { 0 },
-        /*.is_param     =*/ false,
-        /*.grad         =*/ NULL,
-        /*.src          =*/ { NULL },
-        /*.perf_runs    =*/ 0,
-        /*.perf_cycles  =*/ 0,
-        /*.perf_time_us =*/ 0,
-        /*.view_src     =*/ view_src,
-        /*.view_offs    =*/ view_offs,
-        /*.data         =*/ obj_alloc_size > 0 ? (void *)(result + 1) : data,
-        /*.name         =*/ { 0 },
-        /*.extra        =*/ NULL,
-        /*.padding      =*/ { 0 },
-    };
-
-    // TODO: this should not be needed as long as we don't rely on aligned SIMD loads
-    //ggml_assert_aligned(result->data);
-
-    for (int i = 0; i < n_dims; i++) {
-        result->ne[i] = ne[i];
-    }
-
-    result->nb[0] = ggml_type_size(type);
-    result->nb[1] = result->nb[0]*(result->ne[0]/ggml_blck_size(type));
-    for (int i = 2; i < GGML_MAX_DIMS; i++) {
-        result->nb[i] = result->nb[i - 1]*result->ne[i - 1];
-    }
-
-    ctx->n_objects++;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_new_tensor(
-        struct ggml_context * ctx,
-        enum   ggml_type      type,
-        int                   n_dims,
-        const int64_t       * ne) {
-    return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL, 0);
-}
-
-struct ggml_tensor * ggml_new_tensor_1d(
-        struct ggml_context * ctx,
-        enum   ggml_type      type,
-        int64_t ne0) {
-    return ggml_new_tensor(ctx, type, 1, &ne0);
-}
-
-struct ggml_tensor * ggml_new_tensor_2d(
-        struct ggml_context * ctx,
-        enum   ggml_type      type,
-        int64_t ne0,
-        int64_t ne1) {
-    const int64_t ne[2] = { ne0, ne1 };
-    return ggml_new_tensor(ctx, type, 2, ne);
-}
-
-struct ggml_tensor * ggml_new_tensor_3d(
-        struct ggml_context * ctx,
-        enum   ggml_type      type,
-        int64_t ne0,
-        int64_t ne1,
-        int64_t ne2) {
-    const int64_t ne[3] = { ne0, ne1, ne2 };
-    return ggml_new_tensor(ctx, type, 3, ne);
-}
-
-struct ggml_tensor * ggml_new_tensor_4d(
-        struct ggml_context * ctx,
-        enum   ggml_type type,
-        int64_t ne0,
-        int64_t ne1,
-        int64_t ne2,
-        int64_t ne3) {
-    const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
-    return ggml_new_tensor(ctx, type, 4, ne);
-}
-
-struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) {
-    ggml_scratch_save(ctx);
-
-    struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
-
-    ggml_scratch_load(ctx);
-
-    ggml_set_i32(result, value);
-
-    return result;
-}
-
-struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value) {
-    ggml_scratch_save(ctx);
-
-    struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
-
-    ggml_scratch_load(ctx);
-
-    ggml_set_f32(result, value);
-
-    return result;
-}
-
-struct ggml_tensor * ggml_dup_tensor(struct ggml_context * ctx, const struct ggml_tensor * src) {
-    return ggml_new_tensor(ctx, src->type, GGML_MAX_DIMS, src->ne);
-}
-
-static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
-    GGML_ASSERT(tensor != NULL); // silence -Warray-bounds warnings
-    assert(params_size <= GGML_MAX_OP_PARAMS);
-    memcpy(tensor->op_params, params, params_size);
-}
-
-static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_t i) {
-    assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
-    return ((const int32_t *)(tensor->op_params))[i];
-}
-
-static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) {
-    assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t));
-    ((int32_t *)(tensor->op_params))[i] = value;
-}
-
-struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) {
-    memset(tensor->data, 0, ggml_nbytes(tensor));
-    return tensor;
-}
-
-struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value) {
-    const int n     = ggml_nrows(tensor);
-    const int nc    = tensor->ne[0];
-    const size_t n1 = tensor->nb[1];
-
-    char * const data = tensor->data;
-
-    switch (tensor->type) {
-        case GGML_TYPE_I8:
-            {
-                assert(tensor->nb[0] == sizeof(int8_t));
-                for (int i = 0; i < n; i++) {
-                    ggml_vec_set_i8(nc, (int8_t *)(data + i*n1), value);
-                }
-            } break;
-        case GGML_TYPE_I16:
-            {
-                assert(tensor->nb[0] == sizeof(int16_t));
-                for (int i = 0; i < n; i++) {
-                    ggml_vec_set_i16(nc, (int16_t *)(data + i*n1), value);
-                }
-            } break;
-        case GGML_TYPE_I32:
-            {
-                assert(tensor->nb[0] == sizeof(int32_t));
-                for (int i = 0; i < n; i++) {
-                    ggml_vec_set_i32(nc, (int32_t *)(data + i*n1), value);
-                }
-            } break;
-        case GGML_TYPE_F16:
-            {
-                assert(tensor->nb[0] == sizeof(ggml_fp16_t));
-                for (int i = 0; i < n; i++) {
-                    ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_FP32_TO_FP16(value));
-                }
-            } break;
-        case GGML_TYPE_F32:
-            {
-                assert(tensor->nb[0] == sizeof(float));
-                for (int i = 0; i < n; i++) {
-                    ggml_vec_set_f32(nc, (float *)(data + i*n1), value);
-                }
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-
-    return tensor;
-}
-
-struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) {
-    const int n     = ggml_nrows(tensor);
-    const int nc    = tensor->ne[0];
-    const size_t n1 = tensor->nb[1];
-
-    char * const data = tensor->data;
-
-    switch (tensor->type) {
-        case GGML_TYPE_I8:
-            {
-                assert(tensor->nb[0] == sizeof(int8_t));
-                for (int i = 0; i < n; i++) {
-                    ggml_vec_set_i8(nc, (int8_t *)(data + i*n1), value);
-                }
-            } break;
-        case GGML_TYPE_I16:
-            {
-                assert(tensor->nb[0] == sizeof(int16_t));
-                for (int i = 0; i < n; i++) {
-                    ggml_vec_set_i16(nc, (int16_t *)(data + i*n1), value);
-                }
-            } break;
-        case GGML_TYPE_I32:
-            {
-                assert(tensor->nb[0] == sizeof(int32_t));
-                for (int i = 0; i < n; i++) {
-                    ggml_vec_set_i32(nc, (int32_t *)(data + i*n1), value);
-                }
-            } break;
-        case GGML_TYPE_F16:
-            {
-                assert(tensor->nb[0] == sizeof(ggml_fp16_t));
-                for (int i = 0; i < n; i++) {
-                    ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_FP32_TO_FP16(value));
-                }
-            } break;
-        case GGML_TYPE_F32:
-            {
-                assert(tensor->nb[0] == sizeof(float));
-                for (int i = 0; i < n; i++) {
-                    ggml_vec_set_f32(nc, (float *)(data + i*n1), value);
-                }
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-
-    return tensor;
-}
-
-void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3) {
-    const int64_t ne2 = tensor->ne[2];
-    const int64_t ne1 = tensor->ne[1];
-    const int64_t ne0 = tensor->ne[0];
-
-    const int64_t i3_ = (i/(ne2*ne1*ne0));
-    const int64_t i2_ = (i - i3_*ne2*ne1*ne0)/(ne1*ne0);
-    const int64_t i1_ = (i - i3_*ne2*ne1*ne0 - i2_*ne1*ne0)/ne0;
-    const int64_t i0_ = (i - i3_*ne2*ne1*ne0 - i2_*ne1*ne0 - i1_*ne0);
-
-    if (i0) {
-        * i0 = i0_;
-    }
-    if (i1) {
-        * i1 = i1_;
-    }
-    if (i2) {
-        * i2 = i2_;
-    }
-    if (i3) {
-        * i3 = i3_;
-    }
-}
-
-int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i) {
-    if (!ggml_is_contiguous(tensor)) {
-        int64_t id[4] = { 0, 0, 0, 0 };
-        ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]);
-        return ggml_get_i32_nd(tensor, id[0], id[1], id[2], id[3]);
-    }
-    switch (tensor->type) {
-        case GGML_TYPE_I8:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(int8_t));
-                return ((int8_t *)(tensor->data))[i];
-            }
-        case GGML_TYPE_I16:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(int16_t));
-                return ((int16_t *)(tensor->data))[i];
-            }
-        case GGML_TYPE_I32:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(int32_t));
-                return ((int32_t *)(tensor->data))[i];
-            }
-        case GGML_TYPE_F16:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
-                return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
-            }
-        case GGML_TYPE_F32:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(float));
-                return ((float *)(tensor->data))[i];
-            }
-        default:
-            {
-                GGML_ASSERT(false);
-            }
-    }
-
-    return 0.0f;
-}
-
-void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value) {
-    if (!ggml_is_contiguous(tensor)) {
-        int64_t id[4] = { 0, 0, 0, 0 };
-        ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]);
-        ggml_set_i32_nd(tensor, id[0], id[1], id[2], id[3], value);
-        return;
-    }
-    switch (tensor->type) {
-        case GGML_TYPE_I8:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(int8_t));
-                ((int8_t *)(tensor->data))[i] = value;
-            } break;
-        case GGML_TYPE_I16:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(int16_t));
-                ((int16_t *)(tensor->data))[i] = value;
-            } break;
-        case GGML_TYPE_I32:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(int32_t));
-                ((int32_t *)(tensor->data))[i] = value;
-            } break;
-        case GGML_TYPE_F16:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
-                ((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(float));
-                ((float *)(tensor->data))[i] = value;
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3) {
-    void * data   = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3];
-    switch (tensor->type) {
-        case GGML_TYPE_I8:
-            return ((int8_t *) data)[0];
-        case GGML_TYPE_I16:
-            return ((int16_t *) data)[0];
-        case GGML_TYPE_I32:
-            return ((int32_t *) data)[0];
-        case GGML_TYPE_F16:
-            return GGML_FP16_TO_FP32(((ggml_fp16_t *) data)[0]);
-        case GGML_TYPE_F32:
-            return ((float *) data)[0];
-        default:
-            GGML_ASSERT(false);
-    }
-
-    return 0.0f;
-}
-
-void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value) {
-    void * data   = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3];
-    switch (tensor->type) {
-        case GGML_TYPE_I8:
-            {
-                ((int8_t *)(data))[0] = value;
-            } break;
-        case GGML_TYPE_I16:
-            {
-                ((int16_t *)(data))[0] = value;
-            } break;
-        case GGML_TYPE_I32:
-            {
-                ((int32_t *)(data))[0] = value;
-            } break;
-        case GGML_TYPE_F16:
-            {
-                ((ggml_fp16_t *)(data))[0] = GGML_FP32_TO_FP16(value);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                ((float *)(data))[0] = value;
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) {
-    if (!ggml_is_contiguous(tensor)) {
-        int64_t id[4] = { 0, 0, 0, 0 };
-        ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]);
-        return ggml_get_f32_nd(tensor, id[0], id[1], id[2], id[3]);
-    }
-    switch (tensor->type) {
-        case GGML_TYPE_I8:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(int8_t));
-                return ((int8_t *)(tensor->data))[i];
-            }
-        case GGML_TYPE_I16:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(int16_t));
-                return ((int16_t *)(tensor->data))[i];
-            }
-        case GGML_TYPE_I32:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(int32_t));
-                return ((int32_t *)(tensor->data))[i];
-            }
-        case GGML_TYPE_F16:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
-                return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
-            }
-        case GGML_TYPE_F32:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(float));
-                return ((float *)(tensor->data))[i];
-            }
-        default:
-            {
-                GGML_ASSERT(false);
-            }
-    }
-
-    return 0.0f;
-}
-
-void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) {
-    if (!ggml_is_contiguous(tensor)) {
-        int64_t id[4] = { 0, 0, 0, 0 };
-        ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]);
-        ggml_set_f32_nd(tensor, id[0], id[1], id[2], id[3], value);
-        return;
-    }
-    switch (tensor->type) {
-        case GGML_TYPE_I8:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(int8_t));
-                ((int8_t *)(tensor->data))[i] = value;
-            } break;
-        case GGML_TYPE_I16:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(int16_t));
-                ((int16_t *)(tensor->data))[i] = value;
-            } break;
-        case GGML_TYPE_I32:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(int32_t));
-                ((int32_t *)(tensor->data))[i] = value;
-            } break;
-        case GGML_TYPE_F16:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
-                ((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(float));
-                ((float *)(tensor->data))[i] = value;
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3) {
-    void * data   = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3];
-    switch (tensor->type) {
-        case GGML_TYPE_I8:
-            return ((int8_t *) data)[0];
-        case GGML_TYPE_I16:
-            return ((int16_t *) data)[0];
-        case GGML_TYPE_I32:
-            return ((int32_t *) data)[0];
-        case GGML_TYPE_F16:
-            return GGML_FP16_TO_FP32(((ggml_fp16_t *) data)[0]);
-        case GGML_TYPE_F32:
-            return ((float *) data)[0];
-        default:
-            GGML_ASSERT(false);
-    }
-
-    return 0.0f;
-}
-
-void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value) {
-    void * data   = (char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3];
-    switch (tensor->type) {
-        case GGML_TYPE_I8:
-            {
-                ((int8_t *)(data))[0] = value;
-            } break;
-        case GGML_TYPE_I16:
-            {
-                ((int16_t *)(data))[0] = value;
-            } break;
-        case GGML_TYPE_I32:
-            {
-                ((int32_t *)(data))[0] = value;
-            } break;
-        case GGML_TYPE_F16:
-            {
-                ((ggml_fp16_t *)(data))[0] = GGML_FP32_TO_FP16(value);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                ((float *)(data))[0] = value;
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-void * ggml_get_data(const struct ggml_tensor * tensor) {
-    return tensor->data;
-}
-
-float * ggml_get_data_f32(const struct ggml_tensor * tensor) {
-    assert(tensor->type == GGML_TYPE_F32);
-    return (float *)(tensor->data);
-}
-
-enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) {
-    GGML_ASSERT(tensor->op == GGML_OP_UNARY);
-    return (enum ggml_unary_op) ggml_get_op_params_i32(tensor, 0);
-}
-
-const char * ggml_get_name(const struct ggml_tensor * tensor) {
-    return tensor->name;
-}
-
-struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name) {
-    strncpy(tensor->name, name, sizeof(tensor->name));
-    tensor->name[sizeof(tensor->name) - 1] = '\0';
-    return tensor;
-}
-
-struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...) {
-    va_list args;
-    va_start(args, fmt);
-    vsnprintf(tensor->name, sizeof(tensor->name), fmt, args);
-    va_end(args);
-    return tensor;
-}
-
-struct ggml_tensor * ggml_view_tensor(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * src) {
-    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, GGML_MAX_DIMS, src->ne, src, 0);
-    ggml_format_name(result, "%s (view)", src->name);
-
-    for (int i = 0; i < GGML_MAX_DIMS; i++) {
-        result->nb[i] = src->nb[i];
-    }
-
-    return result;
-}
-
-struct ggml_tensor * ggml_get_first_tensor(const struct ggml_context * ctx) {
-    struct ggml_object * obj = ctx->objects_begin;
-
-    char * const mem_buffer = ctx->mem_buffer;
-
-    while (obj != NULL) {
-        if (obj->type == GGML_OBJECT_TENSOR) {
-            return (struct ggml_tensor *)(mem_buffer + obj->offs);
-        }
-
-        obj = obj->next;
-    }
-
-    return NULL;
-}
-
-struct ggml_tensor * ggml_get_next_tensor(const struct ggml_context * ctx, struct ggml_tensor * tensor) {
-    struct ggml_object * obj = (struct ggml_object *) ((char *)tensor - GGML_OBJECT_SIZE);
-    obj = obj->next;
-
-    char * const mem_buffer = ctx->mem_buffer;
-
-    while (obj != NULL) {
-        if (obj->type == GGML_OBJECT_TENSOR) {
-            return (struct ggml_tensor *)(mem_buffer + obj->offs);
-        }
-
-        obj = obj->next;
-    }
-
-    return NULL;
-}
-
-struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name) {
-    struct ggml_object * obj = ctx->objects_begin;
-
-    char * const mem_buffer = ctx->mem_buffer;
-
-    while (obj != NULL) {
-        if (obj->type == GGML_OBJECT_TENSOR) {
-            struct ggml_tensor * cur = (struct ggml_tensor *)(mem_buffer + obj->offs);
-            if (strcmp(cur->name, name) == 0) {
-                return cur;
-            }
-        }
-
-        obj = obj->next;
-    }
-
-    return NULL;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-
-// ggml_dup
-
-static struct ggml_tensor * ggml_dup_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        bool inplace) {
-    bool is_node = false;
-
-    if (!inplace && (a->grad)) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    result->op   = GGML_OP_DUP;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_dup(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a) {
-    return ggml_dup_impl(ctx, a, false);
-}
-
-struct ggml_tensor * ggml_dup_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a) {
-    return ggml_dup_impl(ctx, a, true);
-}
-
-// ggml_add
-
-static struct ggml_tensor * ggml_add_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b,
-        bool inplace) {
-    GGML_ASSERT(ggml_can_repeat(b, a));
-
-    bool is_node = false;
-
-    if (!inplace && (a->grad || b->grad)) {
-        // TODO: support backward pass for broadcasting
-        GGML_ASSERT(ggml_are_same_shape(a, b));
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    result->op   = GGML_OP_ADD;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_add(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b) {
-    return ggml_add_impl(ctx, a, b, false);
-}
-
-struct ggml_tensor * ggml_add_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b) {
-    return ggml_add_impl(ctx, a, b, true);
-}
-
-// ggml_add_cast
-
-static struct ggml_tensor * ggml_add_cast_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b,
-        enum   ggml_type     type) {
-    // TODO: support less-strict constraint
-    //       GGML_ASSERT(ggml_can_repeat(b, a));
-    GGML_ASSERT(ggml_can_repeat_rows(b, a));
-    GGML_ASSERT(ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16); // currently only supported for quantized input and f16
-
-    bool is_node = false;
-
-    if (a->grad || b->grad) {
-        // TODO: support backward pass for broadcasting
-        GGML_ASSERT(ggml_are_same_shape(a, b));
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = ggml_new_tensor(ctx, type, GGML_MAX_DIMS, a->ne);
-
-    result->op   = GGML_OP_ADD;
-    result->grad = is_node ? ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, a->ne) : NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_add_cast(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b,
-        enum   ggml_type     type) {
-    return ggml_add_cast_impl(ctx, a, b, type);
-}
-
-// ggml_add1
-
-static struct ggml_tensor * ggml_add1_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b,
-        bool inplace) {
-    GGML_ASSERT(ggml_is_scalar(b));
-    GGML_ASSERT(ggml_is_padded_1d(a));
-
-    bool is_node = false;
-
-    if (a->grad || b->grad) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    result->op   = GGML_OP_ADD1;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_add1(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b) {
-    return ggml_add1_impl(ctx, a, b, false);
-}
-
-struct ggml_tensor * ggml_add1_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b) {
-    return ggml_add1_impl(ctx, a, b, true);
-}
-
-// ggml_acc
-
-static struct ggml_tensor * ggml_acc_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b,
-        size_t               nb1,
-        size_t               nb2,
-        size_t               nb3,
-        size_t               offset,
-        bool inplace) {
-    GGML_ASSERT(ggml_nelements(b) <= ggml_nelements(a));
-    GGML_ASSERT(ggml_is_contiguous(a));
-    GGML_ASSERT(a->type == GGML_TYPE_F32);
-    GGML_ASSERT(b->type == GGML_TYPE_F32);
-
-    bool is_node = false;
-
-    if (!inplace && (a->grad || b->grad)) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 };
-    ggml_set_op_params(result, params, sizeof(params));
-
-    result->op   = GGML_OP_ACC;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_acc(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b,
-        size_t               nb1,
-        size_t               nb2,
-        size_t               nb3,
-        size_t               offset) {
-    return ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, false);
-}
-
-struct ggml_tensor * ggml_acc_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b,
-        size_t               nb1,
-        size_t               nb2,
-        size_t               nb3,
-        size_t               offset) {
-    return ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, true);
-}
-
-// ggml_sub
-
-static struct ggml_tensor * ggml_sub_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b,
-        bool inplace) {
-    GGML_ASSERT(ggml_are_same_shape(a, b));
-
-    bool is_node = false;
-
-    if (!inplace && (a->grad || b->grad)) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    result->op   = GGML_OP_SUB;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_sub(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b) {
-    return ggml_sub_impl(ctx, a, b, false);
-}
-
-struct ggml_tensor * ggml_sub_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b) {
-    return ggml_sub_impl(ctx, a, b, true);
-}
-
-// ggml_mul
-
-static struct ggml_tensor * ggml_mul_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b,
-        bool inplace) {
-    GGML_ASSERT(ggml_can_repeat(b, a));
-
-    bool is_node = false;
-
-    if (!inplace && (a->grad || b->grad)) {
-        // TODO: support backward pass for broadcasting
-        GGML_ASSERT(ggml_are_same_shape(a, b));
-        is_node = true;
-    }
-
-    if (inplace) {
-        GGML_ASSERT(!is_node);
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    result->op   = GGML_OP_MUL;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_mul(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b) {
-    return ggml_mul_impl(ctx, a, b, false);
-}
-
-struct ggml_tensor * ggml_mul_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b) {
-    return ggml_mul_impl(ctx, a, b, true);
-}
-
-// ggml_div
-
-static struct ggml_tensor * ggml_div_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b,
-        bool inplace) {
-    GGML_ASSERT(ggml_can_repeat(b, a));
-
-    bool is_node = false;
-
-    if (!inplace && (a->grad || b->grad)) {
-        is_node = true;
-    }
-
-    if (inplace) {
-        GGML_ASSERT(!is_node);
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    result->op   = GGML_OP_DIV;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_div(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b) {
-    return ggml_div_impl(ctx, a, b, false);
-}
-
-struct ggml_tensor * ggml_div_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b) {
-    return ggml_div_impl(ctx, a, b, true);
-}
-
-// ggml_sqr
-
-static struct ggml_tensor * ggml_sqr_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        bool inplace) {
-    bool is_node = false;
-
-    if (!inplace && (a->grad)) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    result->op   = GGML_OP_SQR;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_sqr(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_sqr_impl(ctx, a, false);
-}
-
-struct ggml_tensor * ggml_sqr_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_sqr_impl(ctx, a, true);
-}
-
-// ggml_sqrt
-
-static struct ggml_tensor * ggml_sqrt_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        bool inplace) {
-    bool is_node = false;
-
-    if (!inplace && (a->grad)) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    result->op   = GGML_OP_SQRT;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_sqrt(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_sqrt_impl(ctx, a, false);
-}
-
-struct ggml_tensor * ggml_sqrt_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_sqrt_impl(ctx, a, true);
-}
-
-// ggml_log
-
-static struct ggml_tensor * ggml_log_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        bool inplace) {
-    bool is_node = false;
-
-    if (!inplace && (a->grad)) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    result->op   = GGML_OP_LOG;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_log(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_log_impl(ctx, a, false);
-}
-
-struct ggml_tensor * ggml_log_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_log_impl(ctx, a, true);
-}
-
-// ggml_sum
-
-struct ggml_tensor * ggml_sum(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a) {
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = ggml_new_tensor_1d(ctx, a->type, 1);
-
-    result->op   = GGML_OP_SUM;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-// ggml_sum_rows
-
-struct ggml_tensor * ggml_sum_rows(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a) {
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
-
-    int64_t ne[GGML_MAX_DIMS] = { 1 };
-    for (int i = 1; i < GGML_MAX_DIMS; ++i) {
-        ne[i] = a->ne[i];
-    }
-
-    struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, ne);
-
-    result->op   = GGML_OP_SUM_ROWS;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-// ggml_mean
-
-struct ggml_tensor * ggml_mean(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a) {
-    bool is_node = false;
-
-    if (a->grad) {
-        GGML_ASSERT(false); // TODO: implement
-        is_node = true;
-    }
-
-    int64_t ne[4] = { 1, a->ne[1], a->ne[2], a->ne[3] };
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
-
-    result->op   = GGML_OP_MEAN;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-// ggml_argmax
-
-struct ggml_tensor * ggml_argmax(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a) {
-    GGML_ASSERT(ggml_is_matrix(a));
-    bool is_node = false;
-
-    if (a->grad) {
-        GGML_ASSERT(false);
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, a->ne[1]);
-
-    result->op   = GGML_OP_ARGMAX;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-// ggml_repeat
-
-struct ggml_tensor * ggml_repeat(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b) {
-    GGML_ASSERT(ggml_can_repeat(a, b));
-
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, b->ne);
-
-    result->op   = GGML_OP_REPEAT;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-// ggml_repeat_back
-
-struct ggml_tensor * ggml_repeat_back(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b) {
-    GGML_ASSERT(ggml_can_repeat(b, a));
-
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
-
-    if (ggml_are_same_shape(a, b) && !is_node) {
-        return a;
-    }
-
-    struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, b->ne);
-
-    result->op   = GGML_OP_REPEAT_BACK;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-// ggml_concat
-
-struct ggml_tensor * ggml_concat(
-    struct ggml_context* ctx,
-    struct ggml_tensor* a,
-    struct ggml_tensor* b) {
-    GGML_ASSERT(a->ne[0] == b->ne[0] && a->ne[1] == b->ne[1] && a->ne[3] == b->ne[3]);
-
-    bool is_node = false;
-
-    if (a->grad || b->grad) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, a->ne[0], a->ne[1], a->ne[2] + b->ne[2], a->ne[3]);
-
-    result->op = GGML_OP_CONCAT;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-
-    return result;
-}
-
-// ggml_abs
-
-struct ggml_tensor * ggml_abs(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_unary(ctx, a, GGML_UNARY_OP_ABS);
-}
-
-struct ggml_tensor * ggml_abs_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_ABS);
-}
-
-// ggml_sgn
-
-struct ggml_tensor * ggml_sgn(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_unary(ctx, a, GGML_UNARY_OP_SGN);
-}
-
-struct ggml_tensor * ggml_sgn_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_SGN);
-}
-
-// ggml_neg
-
-struct ggml_tensor * ggml_neg(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_unary(ctx, a, GGML_UNARY_OP_NEG);
-}
-
-struct ggml_tensor * ggml_neg_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_NEG);
-}
-
-// ggml_step
-
-struct ggml_tensor * ggml_step(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_unary(ctx, a, GGML_UNARY_OP_STEP);
-}
-
-struct ggml_tensor * ggml_step_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_STEP);
-}
-
-// ggml_tanh
-
-struct ggml_tensor * ggml_tanh(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_unary(ctx, a, GGML_UNARY_OP_TANH);
-}
-
-struct ggml_tensor * ggml_tanh_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_TANH);
-}
-
-// ggml_elu
-
-struct ggml_tensor * ggml_elu(
-    struct ggml_context * ctx,
-    struct ggml_tensor  * a) {
-    return ggml_unary(ctx, a, GGML_UNARY_OP_ELU);
-}
-
-struct ggml_tensor * ggml_elu_inplace(
-    struct ggml_context * ctx,
-    struct ggml_tensor  * a) {
-    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_ELU);
-}
-
-// ggml_relu
-
-struct ggml_tensor * ggml_relu(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_unary(ctx, a, GGML_UNARY_OP_RELU);
-}
-
-struct ggml_tensor * ggml_relu_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_RELU);
-}
-
-// ggml_leaky_relu
-
-struct ggml_tensor * ggml_leaky_relu(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a, float negative_slope, bool inplace) {
-    bool is_node = false;
-
-    if (!inplace && (a->grad)) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-    ggml_set_op_params(result, &negative_slope, sizeof(negative_slope));
-
-    result->op   = GGML_OP_LEAKY_RELU;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-// ggml_gelu
-
-struct ggml_tensor * ggml_gelu(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_unary(ctx, a, GGML_UNARY_OP_GELU);
-}
-
-struct ggml_tensor * ggml_gelu_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_GELU);
-}
-
-// ggml_gelu_quick
-
-struct ggml_tensor * ggml_gelu_quick(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_unary(ctx, a, GGML_UNARY_OP_GELU_QUICK);
-}
-
-struct ggml_tensor * ggml_gelu_quick_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_GELU_QUICK);
-}
-
-// ggml_silu
-
-struct ggml_tensor * ggml_silu(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_unary(ctx, a, GGML_UNARY_OP_SILU);
-}
-
-struct ggml_tensor * ggml_silu_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_SILU);
-}
-
-// ggml_silu_back
-
-struct ggml_tensor * ggml_silu_back(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b) {
-    bool is_node = false;
-
-    if (a->grad || b->grad) {
-        // TODO: implement backward
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
-
-    result->op   = GGML_OP_SILU_BACK;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-
-    return result;
-}
-
-// ggml_norm
-
-static struct ggml_tensor * ggml_norm_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        float eps,
-        bool inplace) {
-    bool is_node = false;
-
-    if (!inplace && (a->grad)) {
-        GGML_ASSERT(false); // TODO: implement backward
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    ggml_set_op_params(result, &eps, sizeof(eps));
-
-    result->op   = GGML_OP_NORM;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_norm(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        float eps) {
-    return ggml_norm_impl(ctx, a, eps, false);
-}
-
-struct ggml_tensor * ggml_norm_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        float eps) {
-    return ggml_norm_impl(ctx, a, eps, true);
-}
-
-// ggml_rms_norm
-
-static struct ggml_tensor * ggml_rms_norm_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        float eps,
-        bool inplace) {
-    bool is_node = false;
-
-    if (!inplace && (a->grad)) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    ggml_set_op_params(result, &eps, sizeof(eps));
-
-    result->op   = GGML_OP_RMS_NORM;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_rms_norm(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        float  eps) {
-    return ggml_rms_norm_impl(ctx, a, eps, false);
-}
-
-struct ggml_tensor * ggml_rms_norm_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        float eps) {
-    return ggml_rms_norm_impl(ctx, a, eps, true);
-}
-
-// ggml_rms_norm_back
-
-struct ggml_tensor * ggml_rms_norm_back(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b,
-        float  eps) {
-    bool is_node = false;
-
-    if (a->grad) {
-        // TODO: implement backward
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
-
-    ggml_set_op_params(result, &eps, sizeof(eps));
-
-    result->op   = GGML_OP_RMS_NORM_BACK;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-
-    return result;
-}
-
-// ggml_group_norm
-
-static struct ggml_tensor * ggml_group_norm_impl(
-    struct ggml_context * ctx,
-    struct ggml_tensor * a,
-    int n_groups,
-    bool inplace) {
-
-    bool is_node = false;
-    if (!inplace && (a->grad)) {
-        GGML_ASSERT(false); // TODO: implement backward
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    result->op_params[0] = n_groups;
-
-    result->op = GGML_OP_GROUP_NORM;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_group_norm(
-    struct ggml_context * ctx,
-    struct ggml_tensor * a,
-    int n_groups) {
-    return ggml_group_norm_impl(ctx, a, n_groups, false);
-}
-
-struct ggml_tensor * ggml_group_norm_inplace(
-    struct ggml_context * ctx,
-    struct ggml_tensor * a,
-    int n_groups) {
-    return ggml_group_norm_impl(ctx, a, n_groups, true);
-}
-
-// ggml_mul_mat
-
-struct ggml_tensor * ggml_mul_mat(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b) {
-    GGML_ASSERT(ggml_can_mul_mat(a, b));
-    GGML_ASSERT(!ggml_is_transposed(a));
-
-    bool is_node = false;
-
-    if (a->grad || b->grad) {
-        is_node = true;
-    }
-
-    const int64_t ne[4] = { a->ne[1], b->ne[1], b->ne[2], b->ne[3] };
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
-
-    result->op   = GGML_OP_MUL_MAT;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-
-    return result;
-}
-
-void ggml_mul_mat_set_prec(
-        struct ggml_tensor * a,
-        enum ggml_prec       prec) {
-    const int32_t prec_i32 = (int32_t) prec;
-
-    ggml_set_op_params_i32(a, 0, prec_i32);
-}
-
-// ggml_mul_mat_id
-
-struct ggml_tensor * ggml_mul_mat_id(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * const as[],
-        int                   n_as,
-        struct ggml_tensor  * ids,
-        int                   id,
-        struct ggml_tensor  * b) {
-
-    GGML_ASSERT(ids->type == GGML_TYPE_I32);
-    GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1);
-    GGML_ASSERT(ids->ne[1] == b->ne[1]);
-    GGML_ASSERT(ids->ne[2] == b->ne[2] && ids->ne[3] == b->ne[3]);
-    GGML_ASSERT(n_as > 0 && n_as <= GGML_MAX_SRC - 2);
-    GGML_ASSERT(id >= 0 && id < ids->ne[0]);
-
-    bool is_node = false;
-
-    if (as[0]->grad || b->grad) {
-        is_node = true;
-    }
-
-    const int64_t ne[4] = { as[0]->ne[1], b->ne[1], b->ne[2], b->ne[3] };
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
-
-    ggml_set_op_params_i32(result, 0, id);
-    ggml_set_op_params_i32(result, 1, n_as);
-
-    result->op   = GGML_OP_MUL_MAT_ID;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = ids;
-    result->src[1] = b;
-
-    for (int i = 0; i < n_as; i++) {
-        struct ggml_tensor * a = as[i];
-        GGML_ASSERT(ggml_are_same_shape(as[0], a));
-        GGML_ASSERT(ggml_can_mul_mat(a, b));
-        GGML_ASSERT(!ggml_is_transposed(a));
-        result->src[i + 2] = a;
-    }
-
-    return result;
-}
-
-// ggml_out_prod
-
-struct ggml_tensor * ggml_out_prod(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b) {
-    GGML_ASSERT(ggml_can_out_prod(a, b));
-    GGML_ASSERT(!ggml_is_transposed(a));
-
-    bool is_node = false;
-
-    if (a->grad || b->grad) {
-        is_node = true;
-    }
-
-    // a is broadcastable to b for ne[2] and ne[3] -> use b->ne[2] and b->ne[3]
-    const int64_t ne[4] = { a->ne[0], b->ne[0], b->ne[2], b->ne[3] };
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
-
-    result->op   = GGML_OP_OUT_PROD;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-
-    return result;
-}
-
-// ggml_scale
-
-static struct ggml_tensor * ggml_scale_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        float                 s,
-        bool inplace) {
-    GGML_ASSERT(ggml_is_padded_1d(a));
-
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    ggml_set_op_params(result, &s, sizeof(s));
-
-    result->op   = GGML_OP_SCALE;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_scale(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        float                s) {
-    return ggml_scale_impl(ctx, a, s, false);
-}
-
-struct ggml_tensor * ggml_scale_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        float                s) {
-    return ggml_scale_impl(ctx, a, s, true);
-}
-
-// ggml_set
-
-static struct ggml_tensor * ggml_set_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b,
-        size_t                nb1,
-        size_t                nb2,
-        size_t                nb3,
-        size_t                offset,
-        bool inplace) {
-    GGML_ASSERT(ggml_nelements(a) >= ggml_nelements(b));
-
-    bool is_node = false;
-
-    if (a->grad || b->grad) {
-        is_node = true;
-    }
-
-    // make a view of the destination
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 };
-    ggml_set_op_params(result, params, sizeof(params));
-
-    result->op   = GGML_OP_SET;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_set(
-        struct ggml_context * ctx,
-        struct ggml_tensor *  a,
-        struct ggml_tensor *  b,
-        size_t                nb1,
-        size_t                nb2,
-        size_t                nb3,
-        size_t                offset) {
-    return ggml_set_impl(ctx, a, b, nb1, nb2, nb3, offset, false);
-}
-
-struct ggml_tensor * ggml_set_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor *  a,
-        struct ggml_tensor *  b,
-        size_t                nb1,
-        size_t                nb2,
-        size_t                nb3,
-        size_t                offset) {
-    return ggml_set_impl(ctx, a, b, nb1, nb2, nb3, offset, true);
-}
-
-struct ggml_tensor * ggml_set_1d(
-        struct ggml_context * ctx,
-        struct ggml_tensor *  a,
-        struct ggml_tensor *  b,
-        size_t                offset) {
-    return ggml_set_impl(ctx, a, b, a->nb[1], a->nb[2], a->nb[3], offset, false);
-}
-
-struct ggml_tensor * ggml_set_1d_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor *  a,
-        struct ggml_tensor *  b,
-        size_t                offset) {
-    return ggml_set_impl(ctx, a, b, a->nb[1], a->nb[2], a->nb[3], offset, true);
-}
-
-struct ggml_tensor * ggml_set_2d(
-        struct ggml_context * ctx,
-        struct ggml_tensor *  a,
-        struct ggml_tensor *  b,
-        size_t                nb1,
-        size_t                offset) {
-    return ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, false);
-}
-
-struct ggml_tensor * ggml_set_2d_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor *  a,
-        struct ggml_tensor *  b,
-        size_t                nb1,
-        size_t                offset) {
-    return ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, true);
-}
-
-// ggml_cpy
-
-static struct ggml_tensor * ggml_cpy_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b) {
-    GGML_ASSERT(ggml_nelements(a) == ggml_nelements(b));
-
-    bool is_node = false;
-
-    if (a->grad || b->grad) {
-        // inplace is false and either one have a grad
-        is_node = true;
-    }
-
-    // make a view of the destination
-    struct ggml_tensor * result = ggml_view_tensor(ctx, b);
-    if (strlen(b->name) > 0) {
-        ggml_format_name(result, "%s (copy of %s)", b->name, a->name);
-    } else {
-        ggml_format_name(result, "%s (copy)", a->name);
-    }
-
-    result->op   = GGML_OP_CPY;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_cpy(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b) {
-    return ggml_cpy_impl(ctx, a, b);
-}
-
-struct ggml_tensor * ggml_cast(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        enum   ggml_type      type) {
-    bool is_node = false;
-
-    struct ggml_tensor * result = ggml_new_tensor(ctx, type, GGML_MAX_DIMS, a->ne);
-    ggml_format_name(result, "%s (copy)", a->name);
-
-    result->op   = GGML_OP_CPY;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = result;
-
-    return result;
-}
-
-// ggml_cont
-
-static struct ggml_tensor * ggml_cont_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
-    ggml_format_name(result, "%s (cont)", a->name);
-
-    result->op   = GGML_OP_CONT;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_cont(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a) {
-    return ggml_cont_impl(ctx, a);
-}
-
-// make contiguous, with new shape
-GGML_API struct ggml_tensor * ggml_cont_1d(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int64_t               ne0) {
-    return ggml_cont_4d(ctx, a, ne0, 1, 1, 1);
-}
-
-GGML_API struct ggml_tensor * ggml_cont_2d(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int64_t               ne0,
-        int64_t               ne1) {
-    return ggml_cont_4d(ctx, a, ne0, ne1, 1, 1);
-}
-
-GGML_API struct ggml_tensor * ggml_cont_3d(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int64_t               ne0,
-        int64_t               ne1,
-        int64_t               ne2) {
-    return ggml_cont_4d(ctx, a, ne0, ne1, ne2, 1);
-}
-
-struct ggml_tensor * ggml_cont_4d(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int64_t               ne0,
-        int64_t               ne1,
-        int64_t               ne2,
-        int64_t               ne3) {
-    GGML_ASSERT(ggml_nelements(a) == (ne0*ne1*ne2*ne3));
-
-    bool is_node = false;
-
-    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, ne0, ne1, ne2, ne3);
-    ggml_format_name(result, "%s (cont)", a->name);
-
-    result->op   = GGML_OP_CONT;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-// ggml_reshape
-
-struct ggml_tensor * ggml_reshape(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b) {
-    GGML_ASSERT(ggml_is_contiguous(a));
-    // as only the shape of b is relevant, and not its memory layout, b is allowed to be non contiguous.
-    GGML_ASSERT(ggml_nelements(a) == ggml_nelements(b));
-
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
-
-    if (b->grad) {
-        // gradient propagation is not supported
-        //GGML_ASSERT(false);
-    }
-
-    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, GGML_MAX_DIMS, b->ne, a, 0);
-    ggml_format_name(result, "%s (reshaped)", a->name);
-
-    result->op   = GGML_OP_RESHAPE;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_reshape_1d(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int64_t               ne0) {
-    GGML_ASSERT(ggml_is_contiguous(a));
-    GGML_ASSERT(ggml_nelements(a) == ne0);
-
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
-
-    const int64_t ne[1] = { ne0 };
-    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, ne, a, 0);
-    ggml_format_name(result, "%s (reshaped)", a->name);
-
-    result->op   = GGML_OP_RESHAPE;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_reshape_2d(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int64_t               ne0,
-        int64_t               ne1) {
-    GGML_ASSERT(ggml_is_contiguous(a));
-    GGML_ASSERT(ggml_nelements(a) == ne0*ne1);
-
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
-
-    const int64_t ne[2] = { ne0, ne1 };
-    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a, 0);
-    ggml_format_name(result, "%s (reshaped)", a->name);
-
-    result->op   = GGML_OP_RESHAPE;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_reshape_3d(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int64_t               ne0,
-        int64_t               ne1,
-        int64_t               ne2) {
-    GGML_ASSERT(ggml_is_contiguous(a));
-    GGML_ASSERT(ggml_nelements(a) == ne0*ne1*ne2);
-
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
-
-    const int64_t ne[3] = { ne0, ne1, ne2 };
-    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a, 0);
-    ggml_format_name(result, "%s (reshaped)", a->name);
-
-    result->op   = GGML_OP_RESHAPE;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_reshape_4d(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int64_t               ne0,
-        int64_t               ne1,
-        int64_t               ne2,
-        int64_t               ne3) {
-    GGML_ASSERT(ggml_is_contiguous(a));
-    GGML_ASSERT(ggml_nelements(a) == ne0*ne1*ne2*ne3);
-
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
-
-    const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
-    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a, 0);
-    ggml_format_name(result, "%s (reshaped)", a->name);
-
-    result->op   = GGML_OP_RESHAPE;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-static struct ggml_tensor * ggml_view_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int                   n_dims,
-        const int64_t       * ne,
-        size_t                offset) {
-
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, n_dims, ne, a, offset);
-    ggml_format_name(result, "%s (view)", a->name);
-
-    ggml_set_op_params(result, &offset, sizeof(offset));
-
-    result->op   = GGML_OP_VIEW;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-// ggml_view_1d
-
-struct ggml_tensor * ggml_view_1d(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int64_t               ne0,
-        size_t                offset) {
-
-    struct ggml_tensor * result = ggml_view_impl(ctx, a, 1, &ne0, offset);
-
-    return result;
-}
-
-// ggml_view_2d
-
-struct ggml_tensor * ggml_view_2d(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int64_t               ne0,
-        int64_t               ne1,
-        size_t                nb1,
-        size_t                offset) {
-
-    const int64_t ne[2] = { ne0, ne1 };
-
-    struct ggml_tensor * result = ggml_view_impl(ctx, a, 2, ne, offset);
-
-    result->nb[1] = nb1;
-    result->nb[2] = result->nb[1]*ne1;
-    result->nb[3] = result->nb[2];
-
-    return result;
-}
-
-// ggml_view_3d
-
-struct ggml_tensor * ggml_view_3d(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int64_t               ne0,
-        int64_t               ne1,
-        int64_t               ne2,
-        size_t                nb1,
-        size_t                nb2,
-        size_t                offset) {
-
-    const int64_t ne[3] = { ne0, ne1, ne2 };
-
-    struct ggml_tensor * result = ggml_view_impl(ctx, a, 3, ne, offset);
-
-    result->nb[1] = nb1;
-    result->nb[2] = nb2;
-    result->nb[3] = result->nb[2]*ne2;
-
-    return result;
-}
-
-// ggml_view_4d
-
-struct ggml_tensor * ggml_view_4d(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int64_t               ne0,
-        int64_t               ne1,
-        int64_t               ne2,
-        int64_t               ne3,
-        size_t                nb1,
-        size_t                nb2,
-        size_t                nb3,
-        size_t                offset) {
-
-    const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
-
-    struct ggml_tensor * result = ggml_view_impl(ctx, a, 4, ne, offset);
-
-    result->nb[1] = nb1;
-    result->nb[2] = nb2;
-    result->nb[3] = nb3;
-
-    return result;
-}
-
-// ggml_permute
-
-struct ggml_tensor * ggml_permute(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int                   axis0,
-        int                   axis1,
-        int                   axis2,
-        int                   axis3) {
-    GGML_ASSERT(axis0 >= 0 && axis0 < GGML_MAX_DIMS);
-    GGML_ASSERT(axis1 >= 0 && axis1 < GGML_MAX_DIMS);
-    GGML_ASSERT(axis2 >= 0 && axis2 < GGML_MAX_DIMS);
-    GGML_ASSERT(axis3 >= 0 && axis3 < GGML_MAX_DIMS);
-
-    GGML_ASSERT(axis0 != axis1);
-    GGML_ASSERT(axis0 != axis2);
-    GGML_ASSERT(axis0 != axis3);
-    GGML_ASSERT(axis1 != axis2);
-    GGML_ASSERT(axis1 != axis3);
-    GGML_ASSERT(axis2 != axis3);
-
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = ggml_view_tensor(ctx, a);
-    ggml_format_name(result, "%s (permuted)", a->name);
-
-    int ne[GGML_MAX_DIMS];
-    int nb[GGML_MAX_DIMS];
-
-    ne[axis0] = a->ne[0];
-    ne[axis1] = a->ne[1];
-    ne[axis2] = a->ne[2];
-    ne[axis3] = a->ne[3];
-
-    nb[axis0] = a->nb[0];
-    nb[axis1] = a->nb[1];
-    nb[axis2] = a->nb[2];
-    nb[axis3] = a->nb[3];
-
-    result->ne[0] = ne[0];
-    result->ne[1] = ne[1];
-    result->ne[2] = ne[2];
-    result->ne[3] = ne[3];
-
-    result->nb[0] = nb[0];
-    result->nb[1] = nb[1];
-    result->nb[2] = nb[2];
-    result->nb[3] = nb[3];
-
-    result->op   = GGML_OP_PERMUTE;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    int32_t params[] = { axis0, axis1, axis2, axis3 };
-    ggml_set_op_params(result, params, sizeof(params));
-
-    return result;
-}
-
-// ggml_transpose
-
-struct ggml_tensor * ggml_transpose(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = ggml_view_tensor(ctx, a);
-    ggml_format_name(result, "%s (transposed)", a->name);
-
-    result->ne[0] = a->ne[1];
-    result->ne[1] = a->ne[0];
-
-    result->nb[0] = a->nb[1];
-    result->nb[1] = a->nb[0];
-
-    result->op   = GGML_OP_TRANSPOSE;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-// ggml_get_rows
-
-struct ggml_tensor * ggml_get_rows(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b) {
-    GGML_ASSERT(a->ne[2] == b->ne[1]);
-    GGML_ASSERT(b->ne[3] == 1);
-    GGML_ASSERT(b->type == GGML_TYPE_I32);
-
-    bool is_node = false;
-
-    if (a->grad || b->grad) {
-        is_node = true;
-    }
-
-    // TODO: implement non F32 return
-    enum ggml_type type = GGML_TYPE_F32;
-    if (a->type == GGML_TYPE_I32) {
-        type = a->type;
-    }
-    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, type, a->ne[0], b->ne[0], b->ne[1], b->ne[2]);
-
-    result->op   = GGML_OP_GET_ROWS;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-
-    return result;
-}
-
-// ggml_get_rows_back
-
-struct ggml_tensor * ggml_get_rows_back(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b,
-        struct ggml_tensor  * c) {
-    GGML_ASSERT(ggml_is_matrix(a) && ggml_is_vector(b) && b->type == GGML_TYPE_I32);
-    GGML_ASSERT(ggml_is_matrix(c) && (a->ne[0] == c->ne[0]));
-
-    bool is_node = false;
-
-    if (a->grad || b->grad) {
-        is_node = true;
-    }
-
-    // TODO: implement non F32 return
-    //struct ggml_tensor * result = ggml_new_tensor_2d(ctx, a->type, a->ne[0], b->ne[0]);
-    struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, c->ne[0], c->ne[1]);
-
-    result->op   = GGML_OP_GET_ROWS_BACK;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-
-    return result;
-}
-
-// ggml_diag
-
-struct ggml_tensor * ggml_diag(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    GGML_ASSERT(a->ne[1] == 1);
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
-
-    const int64_t ne[4] = { a->ne[0], a->ne[0], a->ne[2], a->ne[3] };
-    struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, 4, ne);
-
-    result->op   = GGML_OP_DIAG;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-// ggml_diag_mask_inf
-
-static struct ggml_tensor * ggml_diag_mask_inf_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int                   n_past,
-        bool                  inplace) {
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    int32_t params[] = { n_past };
-    ggml_set_op_params(result, params, sizeof(params));
-
-    result->op   = GGML_OP_DIAG_MASK_INF;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_diag_mask_inf(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int                   n_past) {
-    return ggml_diag_mask_inf_impl(ctx, a, n_past, false);
-}
-
-struct ggml_tensor * ggml_diag_mask_inf_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int                   n_past) {
-    return ggml_diag_mask_inf_impl(ctx, a, n_past, true);
-}
-
-// ggml_diag_mask_zero
-
-static struct ggml_tensor * ggml_diag_mask_zero_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int                   n_past,
-        bool                  inplace) {
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    int32_t params[] = { n_past };
-    ggml_set_op_params(result, params, sizeof(params));
-
-    result->op   = GGML_OP_DIAG_MASK_ZERO;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_diag_mask_zero(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int                   n_past) {
-    return ggml_diag_mask_zero_impl(ctx, a, n_past, false);
-}
-
-struct ggml_tensor * ggml_diag_mask_zero_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int                   n_past) {
-    return ggml_diag_mask_zero_impl(ctx, a, n_past, true);
-}
-
-// ggml_soft_max
-
-static struct ggml_tensor * ggml_soft_max_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * mask,
-        float                 scale,
-        bool                  inplace) {
-    GGML_ASSERT(ggml_is_contiguous(a));
-    if (mask) {
-        GGML_ASSERT(ggml_is_contiguous(mask));
-        GGML_ASSERT(mask->ne[2] == 1);
-        GGML_ASSERT(mask->ne[3] == 1);
-        GGML_ASSERT(ggml_can_repeat_rows(mask, a));
-    }
-
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    float params[] = { scale };
-    ggml_set_op_params(result, params, sizeof(params));
-
-    result->op   = GGML_OP_SOFT_MAX;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = mask;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_soft_max(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_soft_max_impl(ctx, a, NULL, 1.0f, false);
-}
-
-struct ggml_tensor * ggml_soft_max_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_soft_max_impl(ctx, a, NULL, 1.0f, true);
-}
-
-struct ggml_tensor * ggml_soft_max_ext(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * mask,
-        float                 scale) {
-    return ggml_soft_max_impl(ctx, a, mask, scale, false);
-}
-
-// ggml_soft_max_back
-
-static struct ggml_tensor * ggml_soft_max_back_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b,
-        bool                  inplace) {
-    bool is_node = false;
-
-    if (a->grad || b->grad) {
-        is_node = true; // TODO : implement backward pass
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    result->op   = GGML_OP_SOFT_MAX_BACK;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_soft_max_back(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b) {
-    return ggml_soft_max_back_impl(ctx, a, b, false);
-}
-
-struct ggml_tensor * ggml_soft_max_back_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b) {
-    return ggml_soft_max_back_impl(ctx, a, b, true);
-}
-
-// ggml_rope
-
-static struct ggml_tensor * ggml_rope_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b,
-        int                   n_dims,
-        int                   mode,
-        int                   n_ctx,
-        int                   n_orig_ctx,
-        float                 freq_base,
-        float                 freq_scale,
-        float                 ext_factor,
-        float                 attn_factor,
-        float                 beta_fast,
-        float                 beta_slow,
-        float                 xpos_base,
-        bool                  xpos_down,
-        bool                  inplace) {
-    GGML_ASSERT(ggml_is_vector(b));
-    GGML_ASSERT(b->type == GGML_TYPE_I32);
-    GGML_ASSERT(a->ne[2] == b->ne[0]);
-
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    int32_t params[13] = { /*n_past*/ 0, n_dims, mode, n_ctx, n_orig_ctx };
-    memcpy(params +  5, &freq_base,    sizeof(float));
-    memcpy(params +  6, &freq_scale,   sizeof(float));
-    memcpy(params +  7, &ext_factor,   sizeof(float));
-    memcpy(params +  8, &attn_factor,  sizeof(float));
-    memcpy(params +  9, &beta_fast,    sizeof(float));
-    memcpy(params + 10, &beta_slow,    sizeof(float));
-    memcpy(params + 11, &xpos_base,    sizeof(float));
-    memcpy(params + 12, &xpos_down,    sizeof(bool));
-    ggml_set_op_params(result, params, sizeof(params));
-
-    result->op   = GGML_OP_ROPE;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_rope(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b,
-        int                   n_dims,
-        int                   mode,
-        int                   n_ctx) {
-    return ggml_rope_impl(
-        ctx, a, b, n_dims, mode, n_ctx, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, false, false
-    );
-}
-
-struct ggml_tensor * ggml_rope_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b,
-        int                   n_dims,
-        int                   mode,
-        int                   n_ctx) {
-    return ggml_rope_impl(
-        ctx, a, b, n_dims, mode, n_ctx, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, false, true
-    );
-}
-
-struct ggml_tensor * ggml_rope_custom(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b,
-        int                   n_dims,
-        int                   mode,
-        int                   n_ctx,
-        int                   n_orig_ctx,
-        float                 freq_base,
-        float                 freq_scale,
-        float                 ext_factor,
-        float                 attn_factor,
-        float                 beta_fast,
-        float                 beta_slow) {
-    return ggml_rope_impl(
-        ctx, a, b, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale,
-        ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, false
-    );
-}
-
-struct ggml_tensor * ggml_rope_custom_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b,
-        int                   n_dims,
-        int                   mode,
-        int                   n_ctx,
-        int                   n_orig_ctx,
-        float                 freq_base,
-        float                 freq_scale,
-        float                 ext_factor,
-        float                 attn_factor,
-        float                 beta_fast,
-        float                 beta_slow) {
-    return ggml_rope_impl(
-        ctx, a, b, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale,
-        ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, true
-    );
-}
-
-struct ggml_tensor * ggml_rope_xpos_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b,
-        int                   n_dims,
-        float                 base,
-        bool                  down) {
-    return ggml_rope_impl(ctx, a, b, n_dims, 0, 0, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, base, down, true);
-}
-
-// ggml_rope_back
-
-struct ggml_tensor * ggml_rope_back(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b,
-        int                   n_dims,
-        int                   mode,
-        int                   n_ctx,
-        int                   n_orig_ctx,
-        float                 freq_base,
-        float                 freq_scale,
-        float                 ext_factor,
-        float                 attn_factor,
-        float                 beta_fast,
-        float                 beta_slow,
-        float                 xpos_base,
-        bool                  xpos_down) {
-    GGML_ASSERT(ggml_is_vector(b));
-    GGML_ASSERT(b->type == GGML_TYPE_I32);
-    GGML_ASSERT(a->ne[2] == b->ne[0]);
-
-    GGML_ASSERT((mode & 4) == 0 && "ggml_rope_back() for ChatGLM not implemented yet");
-
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = false; // TODO: implement backward
-    }
-
-    struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
-
-    int32_t params[13] = { /*n_past*/ 0, n_dims, mode, n_ctx, n_orig_ctx };
-    memcpy(params +  5, &freq_base,    sizeof(float));
-    memcpy(params +  6, &freq_scale,   sizeof(float));
-    memcpy(params +  7, &ext_factor,   sizeof(float));
-    memcpy(params +  8, &attn_factor,  sizeof(float));
-    memcpy(params +  9, &beta_fast,    sizeof(float));
-    memcpy(params + 10, &beta_slow,    sizeof(float));
-    memcpy(params + 11, &xpos_base,    sizeof(float));
-    memcpy(params + 12, &xpos_down,    sizeof(bool));
-    ggml_set_op_params(result, params, sizeof(params));
-
-    result->op   = GGML_OP_ROPE_BACK;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-
-    return result;
-}
-
-// ggml_alibi
-
-struct ggml_tensor * ggml_alibi(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int                   n_past,
-        int                   n_head,
-        float                 bias_max) {
-    GGML_ASSERT(n_past >= 0);
-    bool is_node = false;
-
-    if (a->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
-        is_node = true;
-    }
-
-    // TODO: when implement backward, fix this:
-    //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-    struct ggml_tensor * result = ggml_view_tensor(ctx, a);
-
-    int32_t op_params[3] = { n_past, n_head };
-    memcpy(op_params + 2, &bias_max, sizeof(float));
-    ggml_set_op_params(result, op_params, sizeof(op_params));
-
-    result->op   = GGML_OP_ALIBI;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-// ggml_clamp
-
-struct ggml_tensor * ggml_clamp(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        float                 min,
-        float                 max) {
-    bool is_node = false;
-
-    if (a->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
-        is_node = true;
-    }
-
-    // TODO: when implement backward, fix this:
-    struct ggml_tensor * result = ggml_view_tensor(ctx, a);
-
-    float params[] = { min, max };
-    ggml_set_op_params(result, params, sizeof(params));
-
-    result->op   = GGML_OP_CLAMP;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-// ggml_conv_1d
-
-static int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d) {
-    return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
-}
-
-GGML_API struct ggml_tensor * ggml_conv_1d(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b,
-        int                   s0,
-        int                   p0,
-        int                   d0) {
-    struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false); // [N, OL, IC * K]
-
-    struct ggml_tensor * result =
-        ggml_mul_mat(ctx,
-                ggml_reshape_2d(ctx, im2col, im2col->ne[0], (im2col->ne[2] * im2col->ne[1])), // [N, OL, IC * K] => [N*OL, IC * K]
-                ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1]), a->ne[2]));                    // [OC,IC, K] => [OC, IC * K]
-
-    result = ggml_reshape_3d(ctx, result, im2col->ne[1], a->ne[2], im2col->ne[2]); // [N, OC, OL]
-
-    return result;
-}
-
-// ggml_conv_1d_ph
-
-struct ggml_tensor* ggml_conv_1d_ph(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b,
-        int                   s,
-        int                   d) {
-    return ggml_conv_1d(ctx, a, b, s, a->ne[0] / 2, d);
-}
-
-// ggml_conv_transpose_1d
-
-static int64_t ggml_calc_conv_transpose_1d_output_size(int64_t ins, int64_t ks, int s, int p, int d) {
-    return (ins - 1) * s - 2 * p + d * (ks - 1) + 1;
-}
-
-GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b,
-        int                   s0,
-        int                   p0,
-        int                   d0) {
-    GGML_ASSERT(ggml_is_matrix(b));
-    GGML_ASSERT(a->ne[2] == b->ne[1]);
-    GGML_ASSERT(a->ne[3] == 1);
-
-    GGML_ASSERT(p0 == 0);
-    GGML_ASSERT(d0 == 1);
-
-    bool is_node = false;
-
-    if (a->grad || b->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
-        is_node = true;
-    }
-
-    const int64_t ne[4] = {
-        ggml_calc_conv_transpose_1d_output_size(b->ne[0], a->ne[0], s0, 0 /*p0*/, 1 /*d0*/),
-        a->ne[1], b->ne[2], 1,
-    };
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
-
-    int32_t params[] = { s0, p0, d0 };
-    ggml_set_op_params(result, params, sizeof(params));
-
-    result->op = GGML_OP_CONV_TRANSPOSE_1D;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-
-    return result;
-}
-
-// ggml_conv_2d
-
-// im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
-// a: [OC,IC, KH, KW]
-// b: [N, IC, IH, IW]
-// result: [N, OH, OW, IC*KH*KW]
-struct ggml_tensor * ggml_im2col(
-    struct ggml_context * ctx,
-    struct ggml_tensor  * a,
-    struct ggml_tensor  * b,
-    int                  s0,
-    int                  s1,
-    int                  p0,
-    int                  p1,
-    int                  d0,
-    int                  d1,
-    bool                 is_2D) {
-
-    if(is_2D) {
-        GGML_ASSERT(a->ne[2] == b->ne[2]);
-    } else {
-        GGML_ASSERT(a->ne[1] == b->ne[1]);
-    }
-    bool is_node = false;
-
-    if (a->grad || b->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
-        is_node = true;
-    }
-
-    const int64_t OH = is_2D ? ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1) : 0;
-    const int64_t OW =         ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
-
-    const int64_t ne[4] = {
-        is_2D ? (a->ne[2] * a->ne[1] * a->ne[0]) : a->ne[1] * a->ne[0],
-        OW,
-        is_2D ? OH : b->ne[2],
-        is_2D ?      b->ne[3] : 1,
-    };
-
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 4, ne);
-    int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) };
-    ggml_set_op_params(result, params, sizeof(params));
-
-    result->op = GGML_OP_IM2COL;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-
-    return result;
-}
-
-// a: [OC,IC, KH, KW]
-// b: [N, IC, IH, IW]
-// result: [N, OC, OH, OW]
-struct ggml_tensor * ggml_conv_2d(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b,
-        int                  s0,
-        int                  s1,
-        int                  p0,
-        int                  p1,
-        int                  d0,
-        int                  d1) {
-    struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, true); // [N, OH, OW, IC * KH * KW]
-
-    struct ggml_tensor * result =
-        ggml_mul_mat(ctx,
-                ggml_reshape_2d(ctx, im2col, im2col->ne[0],  im2col->ne[3] * im2col->ne[2] * im2col->ne[1]), // [N, OH, OW, IC * KH * KW] => [N*OH*OW, IC * KH * KW]
-                ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1] * a->ne[2]),  a->ne[3]));                       // [OC,IC, KH, KW] => [OC, IC * KH * KW]
-
-    result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], a->ne[3], im2col->ne[3]); // [N, OC, OH, OW]
-
-    return result;
-}
-
-// ggml_conv_2d_sk_p0
-struct ggml_tensor * ggml_conv_2d_sk_p0(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b) {
-    return ggml_conv_2d(ctx, a, b, a->ne[0], a->ne[1], 0, 0, 1, 1);
-}
-
-// ggml_conv_2d_s1_ph
-
-struct ggml_tensor * ggml_conv_2d_s1_ph(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b) {
-    return ggml_conv_2d(ctx, a, b, 1, 1, a->ne[0] / 2, a->ne[1] / 2, 1, 1);
-}
-
-// ggml_conv_transpose_2d_p0
-
-static int64_t ggml_calc_conv_transpose_output_size(int64_t ins, int64_t ks, int s, int p) {
-    return (ins - 1) * s - 2 * p + ks;
-}
-
-struct ggml_tensor * ggml_conv_transpose_2d_p0(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b,
-        int                   stride) {
-    GGML_ASSERT(a->ne[3] == b->ne[2]);
-
-    bool is_node = false;
-
-    if (a->grad || b->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
-        is_node = true;
-    }
-
-    const int64_t ne[4] = {
-        ggml_calc_conv_transpose_output_size(b->ne[0], a->ne[0], stride, 0 /*p0*/),
-        ggml_calc_conv_transpose_output_size(b->ne[1], a->ne[1], stride, 0 /*p1*/),
-        a->ne[2], b->ne[3],
-    };
-
-    struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
-
-    ggml_set_op_params_i32(result, 0, stride);
-
-    result->op = GGML_OP_CONV_TRANSPOSE_2D;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-
-    return result;
-}
-
-// ggml_pool_*
-
-static int64_t ggml_calc_pool_output_size(int64_t ins, int ks, int s, float p) {
-    return (ins + 2 * p - ks) / s + 1;
-}
-
-// ggml_pool_1d
-
-struct ggml_tensor * ggml_pool_1d(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        enum ggml_op_pool     op,
-        int                   k0,
-        int                   s0,
-        int                   p0) {
-
-    bool is_node = false;
-
-    if (a->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
-        is_node = true;
-    }
-
-    const int64_t ne[2] = {
-        ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
-        a->ne[1],
-    };
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
-
-    int32_t params[] = { op, k0, s0, p0 };
-    ggml_set_op_params(result, params, sizeof(params));
-
-    result->op = GGML_OP_POOL_1D;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-// ggml_pool_2d
-
-struct ggml_tensor * ggml_pool_2d(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        enum ggml_op_pool     op,
-        int                   k0,
-        int                   k1,
-        int                   s0,
-        int                   s1,
-        float                 p0,
-        float                 p1) {
-
-    bool is_node = false;
-
-    if (a->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
-        is_node = true;
-    }
-
-    const int64_t ne[3] = {
-        ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
-        ggml_calc_pool_output_size(a->ne[1], k1, s1, p1),
-        a->ne[2],
-    };
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
-
-    int32_t params[] = { op, k0, k1, s0, s1, p0, p1 };
-    ggml_set_op_params(result, params, sizeof(params));
-
-    result->op = GGML_OP_POOL_2D;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-// ggml_upscale
-
-static struct ggml_tensor * ggml_upscale_impl(
-    struct ggml_context * ctx,
-    struct ggml_tensor * a,
-    int scale_factor) {
-    bool is_node = false;
-
-    if (a->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
-            a->ne[0] * scale_factor,
-            a->ne[1] * scale_factor,
-            a->ne[2], a->ne[3]);
-
-    result->op = GGML_OP_UPSCALE;
-    result->op_params[0] = scale_factor;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_pad(
-    struct ggml_context * ctx,
-    struct ggml_tensor  * a,
-    int p0, int p1, int p2, int p3) {
-    bool is_node = false;
-
-    if (a->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
-            a->ne[0] + p0,
-            a->ne[1] + p1,
-            a->ne[2] + p2,
-            a->ne[3] + p3);
-
-    result->op = GGML_OP_PAD;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_upscale(
-    struct ggml_context * ctx,
-    struct ggml_tensor * a,
-    int scale_factor) {
-    return ggml_upscale_impl(ctx, a, scale_factor);
-}
-
-// ggml_argsort
-
-struct ggml_tensor * ggml_argsort(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        enum ggml_sort_order  order) {
-    bool is_node = false;
-
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, GGML_MAX_DIMS, a->ne);
-
-    ggml_set_op_params_i32(result, 0, (int32_t) order);
-
-    result->op   = GGML_OP_ARGSORT;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-// ggml_top_k
-
-struct ggml_tensor * ggml_top_k(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int                   k) {
-    GGML_ASSERT(a->ne[0] >= k);
-
-    struct ggml_tensor * result = ggml_argsort(ctx, a, GGML_SORT_DESC);
-
-    result = ggml_view_4d(ctx, result,
-                k, result->ne[1], result->ne[2], result->ne[3],
-                   result->nb[1], result->nb[2], result->nb[3],
-                0);
-
-    return result;
-}
-
-// ggml_flash_attn
-
-struct ggml_tensor * ggml_flash_attn(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * q,
-        struct ggml_tensor  * k,
-        struct ggml_tensor  * v,
-        bool                  masked) {
-    GGML_ASSERT(ggml_can_mul_mat(k, q));
-    // TODO: check if vT can be multiplied by (k*qT)
-
-    bool is_node = false;
-
-    if (q->grad || k->grad || v->grad) {
-        is_node = true;
-    }
-
-    //struct ggml_tensor * result = ggml_dup_tensor(ctx, q);
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, q->ne);
-
-    int32_t t = masked ? 1 : 0;
-    ggml_set_op_params(result, &t, sizeof(t));
-
-    result->op   = GGML_OP_FLASH_ATTN;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = q;
-    result->src[1] = k;
-    result->src[2] = v;
-
-    return result;
-}
-
-// ggml_flash_ff
-
-struct ggml_tensor * ggml_flash_ff(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b0,
-        struct ggml_tensor  * b1,
-        struct ggml_tensor  * c0,
-        struct ggml_tensor  * c1) {
-    GGML_ASSERT(ggml_can_mul_mat(b0, a));
-    // TODO: more checks
-
-    bool is_node = false;
-
-    if (a->grad || b0->grad || b1->grad || c0->grad || c1->grad) {
-        is_node = true;
-    }
-
-    //struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, a->ne);
-
-    result->op   = GGML_OP_FLASH_FF;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b0;
-    result->src[2] = b1;
-    result->src[3] = c0;
-    result->src[4] = c1;
-
-    return result;
-}
-
-// ggml_flash_attn_back
-
-struct ggml_tensor * ggml_flash_attn_back(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * q,
-        struct ggml_tensor  * k,
-        struct ggml_tensor  * v,
-        struct ggml_tensor  * d,
-        bool                  masked) {
-    GGML_ASSERT(ggml_can_mul_mat(k, q));
-    // TODO: check if vT can be multiplied by (k*qT)
-
-    // d shape [D,N,ne2,ne3]
-    // q shape [D,N,ne2,ne3]
-    // k shape [D,M,kvne2,ne3]
-    // v shape [M,D,kvne2,ne3]
-
-    const int64_t     D = q->ne[0];
-    const int64_t     N = q->ne[1];
-    const int64_t     M = k->ne[1];
-    const int64_t   ne2 = q->ne[2];
-    const int64_t   ne3 = q->ne[3];
-    const int64_t kvne2 = k->ne[2];
-
-    GGML_ASSERT(k->ne[0] == D);
-    GGML_ASSERT(v->ne[0] == M);
-    GGML_ASSERT(v->ne[1] == D);
-    GGML_ASSERT(d->ne[0] == D);
-    GGML_ASSERT(d->ne[1] == N);
-    GGML_ASSERT(k->ne[2] == kvne2);
-    GGML_ASSERT(k->ne[3] == ne3);
-    GGML_ASSERT(v->ne[2] == kvne2);
-    GGML_ASSERT(v->ne[3] == ne3);
-    GGML_ASSERT(d->ne[2] == ne2);
-    GGML_ASSERT(d->ne[3] == ne3);
-
-    GGML_ASSERT(ne2 % kvne2 == 0);
-
-    bool is_node = false;
-
-    if (q->grad || k->grad || v->grad) {
-        // when using this operation (in backwards pass) these grads are set.
-        // we don't want to create (big) grad of our result, so is_node is false.
-        is_node = false;
-    }
-
-    // store gradients of q, k and v as continuous tensors concatenated in result.
-    // note: v and gradv are actually transposed, i.e. v->ne[0] != D.
-    const int64_t elem_q = ggml_nelements(q);
-    const int64_t elem_k = ggml_nelements(k);
-    const int64_t elem_v = ggml_nelements(v);
-
-    enum ggml_type result_type = GGML_TYPE_F32;
-    GGML_ASSERT(ggml_blck_size(result_type) == 1);
-    const size_t tsize = ggml_type_size(result_type);
-
-    const size_t offs_q = 0;
-    const size_t offs_k = offs_q + GGML_PAD(elem_q * tsize, GGML_MEM_ALIGN);
-    const size_t offs_v = offs_k + GGML_PAD(elem_k * tsize, GGML_MEM_ALIGN);
-    const size_t end    = offs_v + GGML_PAD(elem_v * tsize, GGML_MEM_ALIGN);
-
-    const size_t nelements = (end + tsize - 1)/tsize;
-
-    struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nelements);
-
-    int32_t masked_i = masked ? 1 : 0;
-    ggml_set_op_params(result, &masked_i, sizeof(masked_i));
-
-    result->op   = GGML_OP_FLASH_ATTN_BACK;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = q;
-    result->src[1] = k;
-    result->src[2] = v;
-    result->src[3] = d;
-
-    return result;
-}
-
-// ggml_win_part
-
-struct ggml_tensor * ggml_win_part(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int                   w) {
-    GGML_ASSERT(a->ne[3] == 1);
-    GGML_ASSERT(a->type  == GGML_TYPE_F32);
-
-    bool is_node = false;
-
-    if (a->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
-        is_node = true;
-    }
-
-    // padding
-    const int px = (w - a->ne[1]%w)%w;
-    const int py = (w - a->ne[2]%w)%w;
-
-    const int npx = (px + a->ne[1])/w;
-    const int npy = (py + a->ne[2])/w;
-    const int np  = npx*npy;
-
-    const int64_t ne[4] = { a->ne[0], w, w, np, };
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
-
-    int32_t params[] = { npx, npy, w };
-    ggml_set_op_params(result, params, sizeof(params));
-
-    result->op   = GGML_OP_WIN_PART;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-// ggml_win_unpart
-
-struct ggml_tensor * ggml_win_unpart(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int                   w0,
-        int                   h0,
-        int                   w) {
-    GGML_ASSERT(a->type == GGML_TYPE_F32);
-
-    bool is_node = false;
-
-    if (a->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
-        is_node = true;
-    }
-
-    const int64_t ne[4] = { a->ne[0], w0, h0, 1, };
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
-
-    int32_t params[] = { w };
-    ggml_set_op_params(result, params, sizeof(params));
-
-    result->op   = GGML_OP_WIN_UNPART;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-// ggml_get_rel_pos
-
-struct ggml_tensor * ggml_get_rel_pos(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int                   qh,
-        int                   kh) {
-    GGML_ASSERT(qh == kh);
-    GGML_ASSERT(2*MAX(qh, kh) - 1 == a->ne[1]);
-
-    bool is_node = false;
-
-    if (a->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
-        is_node = true;
-    }
-
-    const int64_t ne[4] = { a->ne[0], kh, qh, 1, };
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 3, ne);
-
-    result->op   = GGML_OP_GET_REL_POS;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-// ggml_add_rel_pos
-
-static struct ggml_tensor * ggml_add_rel_pos_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * pw,
-        struct ggml_tensor  * ph,
-        bool                  inplace) {
-    GGML_ASSERT(ggml_are_same_shape(pw, ph));
-    GGML_ASSERT(ggml_is_contiguous(a));
-    GGML_ASSERT(ggml_is_contiguous(pw));
-    GGML_ASSERT(ggml_is_contiguous(ph));
-    GGML_ASSERT(ph->type == GGML_TYPE_F32);
-    GGML_ASSERT(pw->type == GGML_TYPE_F32);
-    GGML_ASSERT(pw->ne[3] == a->ne[2]);
-    GGML_ASSERT(pw->ne[0]*pw->ne[0] == a->ne[0]);
-    GGML_ASSERT(pw->ne[1]*pw->ne[2] == a->ne[1]);
-
-    bool is_node = false;
-
-    if (!inplace && (a->grad || pw->grad || ph->grad)) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-    ggml_set_op_params_i32(result, 0, inplace ? 1 : 0);
-
-    result->op   = GGML_OP_ADD_REL_POS;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = pw;
-    result->src[2] = ph;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_add_rel_pos(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * pw,
-        struct ggml_tensor  * ph) {
-    return ggml_add_rel_pos_impl(ctx, a, pw, ph, false);
-}
-
-struct ggml_tensor * ggml_add_rel_pos_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * pw,
-        struct ggml_tensor  * ph) {
-    return ggml_add_rel_pos_impl(ctx, a, pw, ph, true);
-}
-
-// gmml_unary
-
-static struct ggml_tensor * ggml_unary_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        enum ggml_unary_op op,
-        bool inplace) {
-    bool is_node = false;
-
-    if (!inplace && (a->grad)) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    ggml_set_op_params_i32(result, 0, (int32_t) op);
-
-    result->op   = GGML_OP_UNARY;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_unary(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        enum ggml_unary_op op) {
-    return ggml_unary_impl(ctx, a, op, false);
-}
-
-struct ggml_tensor * ggml_unary_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        enum ggml_unary_op op) {
-    return ggml_unary_impl(ctx, a, op, true);
-}
-
-// ggml_map_unary
-
-static struct ggml_tensor * ggml_map_unary_impl_f32(
-        struct ggml_context        * ctx,
-        struct ggml_tensor         * a,
-        const  ggml_unary_op_f32_t fun,
-        bool   inplace) {
-    bool is_node = false;
-
-    if (!inplace && a->grad) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
-
-    result->op = GGML_OP_MAP_UNARY;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_map_unary_f32(
-        struct ggml_context        * ctx,
-        struct ggml_tensor         * a,
-        const  ggml_unary_op_f32_t fun) {
-    return ggml_map_unary_impl_f32(ctx, a, fun, false);
-}
-
-struct ggml_tensor * ggml_map_unary_inplace_f32(
-        struct ggml_context        * ctx,
-        struct ggml_tensor         * a,
-        const  ggml_unary_op_f32_t fun) {
-    return ggml_map_unary_impl_f32(ctx, a, fun, true);
-}
-
-// ggml_map_binary
-
-static struct ggml_tensor * ggml_map_binary_impl_f32(
-        struct ggml_context         * ctx,
-        struct ggml_tensor          * a,
-        struct ggml_tensor          * b,
-        const  ggml_binary_op_f32_t fun,
-        bool   inplace) {
-    GGML_ASSERT(ggml_are_same_shape(a, b));
-
-    bool is_node = false;
-
-    if (!inplace && (a->grad || b->grad)) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
-
-    result->op = GGML_OP_MAP_BINARY;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_map_binary_f32(
-        struct ggml_context         * ctx,
-        struct ggml_tensor          * a,
-        struct ggml_tensor          * b,
-        const  ggml_binary_op_f32_t fun) {
-    return ggml_map_binary_impl_f32(ctx, a, b, fun, false);
-}
-
-struct ggml_tensor * ggml_map_binary_inplace_f32(
-        struct ggml_context         * ctx,
-        struct ggml_tensor          * a,
-        struct ggml_tensor          * b,
-        const  ggml_binary_op_f32_t fun) {
-    return ggml_map_binary_impl_f32(ctx, a, b, fun, true);
-}
-
-// ggml_map_custom1_f32
-
-static struct ggml_tensor * ggml_map_custom1_impl_f32(
-        struct ggml_context          * ctx,
-        struct ggml_tensor           * a,
-        const  ggml_custom1_op_f32_t   fun,
-        bool   inplace) {
-    bool is_node = false;
-
-    if (!inplace && a->grad) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
-
-    result->op = GGML_OP_MAP_CUSTOM1_F32;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_map_custom1_f32(
-        struct ggml_context          * ctx,
-        struct ggml_tensor           * a,
-        const  ggml_custom1_op_f32_t   fun) {
-    return ggml_map_custom1_impl_f32(ctx, a, fun, false);
-}
-
-struct ggml_tensor * ggml_map_custom1_inplace_f32(
-        struct ggml_context          * ctx,
-        struct ggml_tensor           * a,
-        const  ggml_custom1_op_f32_t   fun) {
-    return ggml_map_custom1_impl_f32(ctx, a, fun, true);
-}
-
-// ggml_map_custom2_f32
-
-static struct ggml_tensor * ggml_map_custom2_impl_f32(
-        struct ggml_context          * ctx,
-        struct ggml_tensor           * a,
-        struct ggml_tensor           * b,
-        const  ggml_custom2_op_f32_t   fun,
-        bool   inplace) {
-    bool is_node = false;
-
-    if (!inplace && (a->grad || b->grad)) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
-
-    result->op = GGML_OP_MAP_CUSTOM2_F32;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_map_custom2_f32(
-        struct ggml_context          * ctx,
-        struct ggml_tensor           * a,
-        struct ggml_tensor           * b,
-        const  ggml_custom2_op_f32_t   fun) {
-    return ggml_map_custom2_impl_f32(ctx, a, b, fun, false);
-}
-
-struct ggml_tensor * ggml_map_custom2_inplace_f32(
-        struct ggml_context          * ctx,
-        struct ggml_tensor           * a,
-        struct ggml_tensor           * b,
-        const  ggml_custom2_op_f32_t   fun) {
-    return ggml_map_custom2_impl_f32(ctx, a, b, fun, true);
-}
-
-// ggml_map_custom3_f32
-
-static struct ggml_tensor * ggml_map_custom3_impl_f32(
-        struct ggml_context          * ctx,
-        struct ggml_tensor           * a,
-        struct ggml_tensor           * b,
-        struct ggml_tensor           * c,
-        const  ggml_custom3_op_f32_t   fun,
-        bool   inplace) {
-    bool is_node = false;
-
-    if (!inplace && (a->grad || b->grad || c->grad)) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
-
-    result->op = GGML_OP_MAP_CUSTOM3_F32;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-    result->src[2] = c;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_map_custom3_f32(
-        struct ggml_context          * ctx,
-        struct ggml_tensor           * a,
-        struct ggml_tensor           * b,
-        struct ggml_tensor           * c,
-        const  ggml_custom3_op_f32_t   fun) {
-    return ggml_map_custom3_impl_f32(ctx, a, b, c, fun, false);
-}
-
-struct ggml_tensor * ggml_map_custom3_inplace_f32(
-        struct ggml_context          * ctx,
-        struct ggml_tensor           * a,
-        struct ggml_tensor           * b,
-        struct ggml_tensor           * c,
-        const  ggml_custom3_op_f32_t   fun) {
-    return ggml_map_custom3_impl_f32(ctx, a, b, c, fun, true);
-}
-
-// ggml_map_custom1
-struct ggml_map_custom1_op_params {
-    ggml_custom1_op_t fun;
-    int n_tasks;
-    void * userdata;
-};
-
-static struct ggml_tensor * ggml_map_custom1_impl(
-        struct ggml_context          * ctx,
-        struct ggml_tensor           * a,
-        const  ggml_custom1_op_t       fun,
-        int                            n_tasks,
-        void                         * userdata,
-        bool                           inplace) {
-    GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0);
-
-    bool is_node = false;
-
-    if (!inplace && a->grad) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    struct ggml_map_custom1_op_params params = {
-        /*.fun      =*/ fun,
-        /*.n_tasks  =*/ n_tasks,
-        /*.userdata =*/ userdata
-    };
-    ggml_set_op_params(result, (const void *) &params, sizeof(params));
-
-    result->op = GGML_OP_MAP_CUSTOM1;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_map_custom1(
-        struct ggml_context          * ctx,
-        struct ggml_tensor           * a,
-        const  ggml_custom1_op_t       fun,
-        int                            n_tasks,
-        void                         * userdata) {
-    return ggml_map_custom1_impl(ctx, a, fun, n_tasks, userdata, false);
-}
-
-struct ggml_tensor * ggml_map_custom1_inplace(
-        struct ggml_context          * ctx,
-        struct ggml_tensor           * a,
-        const  ggml_custom1_op_t       fun,
-        int                            n_tasks,
-        void                         * userdata) {
-    return ggml_map_custom1_impl(ctx, a, fun, n_tasks, userdata, true);
-}
-
-// ggml_map_custom2
-
-struct ggml_map_custom2_op_params {
-    ggml_custom2_op_t fun;
-    int n_tasks;
-    void * userdata;
-};
-
-static struct ggml_tensor * ggml_map_custom2_impl(
-        struct ggml_context          * ctx,
-        struct ggml_tensor           * a,
-        struct ggml_tensor           * b,
-        const  ggml_custom2_op_t       fun,
-        int                            n_tasks,
-        void                         * userdata,
-        bool                           inplace) {
-    GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0);
-
-    bool is_node = false;
-
-    if (!inplace && (a->grad || b->grad)) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    struct ggml_map_custom2_op_params params = {
-        /*.fun      =*/ fun,
-        /*.n_tasks  =*/ n_tasks,
-        /*.userdata =*/ userdata
-    };
-    ggml_set_op_params(result, (const void *) &params, sizeof(params));
-
-    result->op = GGML_OP_MAP_CUSTOM2;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_map_custom2(
-        struct ggml_context          * ctx,
-        struct ggml_tensor           * a,
-        struct ggml_tensor           * b,
-        const  ggml_custom2_op_t       fun,
-        int                            n_tasks,
-        void                         * userdata) {
-    return ggml_map_custom2_impl(ctx, a, b, fun, n_tasks, userdata, false);
-}
-
-struct ggml_tensor * ggml_map_custom2_inplace(
-        struct ggml_context          * ctx,
-        struct ggml_tensor           * a,
-        struct ggml_tensor           * b,
-        const  ggml_custom2_op_t       fun,
-        int                            n_tasks,
-        void                         * userdata) {
-    return ggml_map_custom2_impl(ctx, a, b, fun, n_tasks, userdata, true);
-}
-
-// ggml_map_custom3
-
-struct ggml_map_custom3_op_params {
-    ggml_custom3_op_t fun;
-    int n_tasks;
-    void * userdata;
-};
-
-static struct ggml_tensor * ggml_map_custom3_impl(
-        struct ggml_context          * ctx,
-        struct ggml_tensor           * a,
-        struct ggml_tensor           * b,
-        struct ggml_tensor           * c,
-        const  ggml_custom3_op_t       fun,
-        int                            n_tasks,
-        void                         * userdata,
-        bool                           inplace) {
-    GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0);
-
-    bool is_node = false;
-
-    if (!inplace && (a->grad || b->grad || c->grad)) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    struct ggml_map_custom3_op_params params = {
-        /*.fun      =*/ fun,
-        /*.n_tasks  =*/ n_tasks,
-        /*.userdata =*/ userdata
-    };
-    ggml_set_op_params(result, (const void *) &params, sizeof(params));
-
-    result->op = GGML_OP_MAP_CUSTOM3;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-    result->src[2] = c;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_map_custom3(
-        struct ggml_context          * ctx,
-        struct ggml_tensor           * a,
-        struct ggml_tensor           * b,
-        struct ggml_tensor           * c,
-        const  ggml_custom3_op_t       fun,
-        int                            n_tasks,
-        void                         * userdata) {
-    return ggml_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, false);
-}
-
-struct ggml_tensor * ggml_map_custom3_inplace(
-        struct ggml_context          * ctx,
-        struct ggml_tensor           * a,
-        struct ggml_tensor           * b,
-        struct ggml_tensor           * c,
-        const  ggml_custom3_op_t       fun,
-        int                            n_tasks,
-        void                         * userdata) {
-    return ggml_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, true);
-}
-
-// ggml_cross_entropy_loss
-
-struct ggml_tensor * ggml_cross_entropy_loss(
-        struct ggml_context         * ctx,
-        struct ggml_tensor          * a,
-        struct ggml_tensor          * b) {
-    GGML_ASSERT(ggml_are_same_shape(a, b));
-    bool is_node = false;
-
-    if (a->grad || b->grad) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = ggml_new_tensor_1d(ctx, a->type, 1);
-
-    result->op   = GGML_OP_CROSS_ENTROPY_LOSS;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-
-    return result;
-}
-
-// ggml_cross_entropy_loss_back
-
-struct ggml_tensor * ggml_cross_entropy_loss_back(
-        struct ggml_context         * ctx,
-        struct ggml_tensor          * a,
-        struct ggml_tensor          * b,
-        struct ggml_tensor          * c) {
-    GGML_ASSERT(ggml_are_same_shape(a, b));
-    GGML_ASSERT(ggml_is_scalar(c));
-
-    struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
-
-    result->op   = GGML_OP_CROSS_ENTROPY_LOSS_BACK;
-    result->grad = NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-    result->src[2] = c;
-
-    return result;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-
-void ggml_set_param(
-        struct ggml_context * ctx,
-        struct ggml_tensor * tensor) {
-    tensor->is_param = true;
-
-    GGML_ASSERT(tensor->grad == NULL);
-    tensor->grad = ggml_dup_tensor(ctx, tensor);
-    ggml_format_name(tensor->grad, "%s (grad)", tensor->name);
-}
-
-// ggml_compute_forward_dup
-
-static void ggml_compute_forward_dup_same_cont(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
-    GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
-    GGML_ASSERT(src0->type == dst->type);
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    const size_t nb00 = src0->nb[0];
-    const size_t nb0 = dst->nb[0];
-
-    const int ith = params->ith; // thread index
-    const int nth = params->nth; // number of threads
-
-    // parallelize by elements
-    const int ne = ggml_nelements(dst);
-    const int dr = (ne + nth - 1) / nth;
-    const int ie0 = dr * ith;
-    const int ie1 = MIN(ie0 + dr, ne);
-
-    if (ie0 < ie1) {
-        memcpy(
-            ((char *)  dst->data + ie0*nb0),
-            ((char *) src0->data + ie0*nb00),
-            (ie1 - ie0) * ggml_type_size(src0->type));
-    }
-
-}
-static void ggml_compute_forward_dup_f16(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    const int ith = params->ith; // thread index
-    const int nth = params->nth; // number of threads
-
-    if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
-        ggml_compute_forward_dup_same_cont(params, src0, dst);
-        return;
-    }
-
-    // parallelize by rows
-    const int nr = ne01;
-    // number of rows per thread
-    const int dr = (nr + nth - 1) / nth;
-    // row range for this thread
-    const int ir0 = dr * ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    if (src0->type == dst->type &&
-        ne00 == ne0 &&
-        nb00 == ggml_type_size(src0->type) && nb0 == ggml_type_size(dst->type)) {
-        // copy by rows
-        const size_t rs = ne00*nb00;
-        for (int64_t i03 = 0; i03 < ne03; i03++) {
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                for (int64_t i01 = ir0; i01 < ir1; i01++) {
-                    memcpy(
-                        ((char *)  dst->data + i01*nb1  + i02*nb2  + i03*nb3),
-                        ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03),
-                        rs);
-                }
-            }
-        }
-        return;
-    }
-
-    // TODO: add more special-case implementations for tensor shapes/strides that can benefit from memcpy
-
-    if (ggml_is_contiguous(dst)) {
-        if (nb00 == sizeof(ggml_fp16_t)) {
-            if (dst->type == GGML_TYPE_F16) {
-                size_t id = 0;
-                const size_t rs = ne00 * nb00;
-                char * dst_ptr = (char *) dst->data;
-
-                for (int i03 = 0; i03 < ne03; i03++) {
-                    for (int i02 = 0; i02 < ne02; i02++) {
-                        id += rs * ir0;
-                        for (int i01 = ir0; i01 < ir1; i01++) {
-                            const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
-                            memcpy(dst_ptr + id, src0_ptr, rs);
-                            id += rs;
-                        }
-                        id += rs * (ne01 - ir1);
-                    }
-                }
-            } else if (dst->type == GGML_TYPE_F32) {
-                size_t id = 0;
-                float * dst_ptr = (float *) dst->data;
-
-                for (int i03 = 0; i03 < ne03; i03++) {
-                    for (int i02 = 0; i02 < ne02; i02++) {
-                        id += ne00 * ir0;
-                        for (int i01 = ir0; i01 < ir1; i01++) {
-                            const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
-                            for (int i00 = 0; i00 < ne00; i00++) {
-                                dst_ptr[id] = GGML_FP16_TO_FP32(src0_ptr[i00]);
-                                id++;
-                            }
-                        }
-                        id += ne00 * (ne01 - ir1);
-                    }
-                }
-            } else if (type_traits[dst->type].from_float) {
-                ggml_from_float_t const quantize_row_q = type_traits[dst->type].from_float;
-                float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith;
-
-                size_t id = 0;
-                size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type));
-                char * dst_ptr = (char *) dst->data;
-
-                for (int i03 = 0; i03 < ne03; i03++) {
-                    for (int i02 = 0; i02 < ne02; i02++) {
-                        id += rs * ir0;
-                        for (int i01 = ir0; i01 < ir1; i01++) {
-                            const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
-
-                            for (int i00 = 0; i00 < ne00; i00++) {
-                                src0_f32[i00] = GGML_FP16_TO_FP32(src0_ptr[i00]);
-                            }
-
-                            quantize_row_q(src0_f32, dst_ptr + id, ne00);
-                            id += rs;
-                        }
-                        id += rs * (ne01 - ir1);
-                    }
-                }
-            } else {
-                GGML_ASSERT(false); // TODO: implement
-            }
-        } else {
-            //printf("%s: this is not optimal - fix me\n", __func__);
-
-            if (dst->type == GGML_TYPE_F32) {
-                size_t id = 0;
-                float * dst_ptr = (float *) dst->data;
-
-                for (int i03 = 0; i03 < ne03; i03++) {
-                    for (int i02 = 0; i02 < ne02; i02++) {
-                        id += ne00 * ir0;
-                        for (int i01 = ir0; i01 < ir1; i01++) {
-                            for (int i00 = 0; i00 < ne00; i00++) {
-                                const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-
-                                dst_ptr[id] = GGML_FP16_TO_FP32(*src0_ptr);
-                                id++;
-                            }
-                        }
-                        id += ne00 * (ne01 - ir1);
-                    }
-                }
-            } else if (dst->type == GGML_TYPE_F16) {
-                size_t id = 0;
-                ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data;
-
-                for (int i03 = 0; i03 < ne03; i03++) {
-                    for (int i02 = 0; i02 < ne02; i02++) {
-                        id += ne00 * ir0;
-                        for (int i01 = ir0; i01 < ir1; i01++) {
-                            for (int i00 = 0; i00 < ne00; i00++) {
-                                const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-
-                                dst_ptr[id] = *src0_ptr;
-                                id++;
-                            }
-                        }
-                        id += ne00 * (ne01 - ir1);
-                    }
-                }
-            } else {
-                GGML_ASSERT(false); // TODO: implement
-            }
-        }
-        return;
-    }
-
-    // dst counters
-    int64_t i10 = 0;
-    int64_t i11 = 0;
-    int64_t i12 = 0;
-    int64_t i13 = 0;
-
-    if (dst->type == GGML_TYPE_F16) {
-        for (int64_t i03 = 0; i03 < ne03; i03++) {
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                i10 += ne00 * ir0;
-                while (i10 >= ne0) {
-                    i10 -= ne0;
-                    if (++i11 == ne1) {
-                        i11 = 0;
-                        if (++i12 == ne2) {
-                            i12 = 0;
-                            if (++i13 == ne3) {
-                                i13 = 0;
-                            }
-                        }
-                    }
-                }
-                for (int64_t i01 = ir0; i01 < ir1; i01++) {
-                    for (int64_t i00 = 0; i00 < ne00; i00++) {
-                        const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-                              char * dst_ptr  = ((char *)  dst->data + i10*nb0  + i11*nb1  + i12*nb2  + i13*nb3);
-
-                        memcpy(dst_ptr, src0_ptr, sizeof(ggml_fp16_t));
-
-                        if (++i10 == ne00) {
-                            i10 = 0;
-                            if (++i11 == ne01) {
-                                i11 = 0;
-                                if (++i12 == ne02) {
-                                    i12 = 0;
-                                    if (++i13 == ne03) {
-                                        i13 = 0;
-                                    }
-                                }
-                            }
-                        }
-                    }
-                }
-                i10 += ne00 * (ne01 - ir1);
-                while (i10 >= ne0) {
-                    i10 -= ne0;
-                    if (++i11 == ne1) {
-                        i11 = 0;
-                        if (++i12 == ne2) {
-                            i12 = 0;
-                            if (++i13 == ne3) {
-                                i13 = 0;
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    } else if (dst->type == GGML_TYPE_F32) {
-        for (int64_t i03 = 0; i03 < ne03; i03++) {
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                i10 += ne00 * ir0;
-                while (i10 >= ne0) {
-                    i10 -= ne0;
-                    if (++i11 == ne1) {
-                        i11 = 0;
-                        if (++i12 == ne2) {
-                            i12 = 0;
-                            if (++i13 == ne3) {
-                                i13 = 0;
-                            }
-                        }
-                    }
-                }
-                for (int64_t i01 = ir0; i01 < ir1; i01++) {
-                    for (int64_t i00 = 0; i00 < ne00; i00++) {
-                        const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-                              char * dst_ptr  = ((char *)  dst->data + i10*nb0  + i11*nb1  + i12*nb2  + i13*nb3);
-
-                        *(float *) dst_ptr = GGML_FP16_TO_FP32(*(const ggml_fp16_t *) src0_ptr);
-
-                        if (++i10 == ne0) {
-                            i10 = 0;
-                            if (++i11 == ne1) {
-                                i11 = 0;
-                                if (++i12 == ne2) {
-                                    i12 = 0;
-                                    if (++i13 == ne3) {
-                                        i13 = 0;
-                                    }
-                                }
-                            }
-                        }
-                    }
-                }
-                i10 += ne00 * (ne01 - ir1);
-                while (i10 >= ne0) {
-                    i10 -= ne0;
-                    if (++i11 == ne1) {
-                        i11 = 0;
-                        if (++i12 == ne2) {
-                            i12 = 0;
-                            if (++i13 == ne3) {
-                                i13 = 0;
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    } else {
-        GGML_ASSERT(false); // TODO: implement
-    }
-}
-
-static void ggml_compute_forward_dup_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    const int ith = params->ith; // thread index
-    const int nth = params->nth; // number of threads
-
-    if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
-        ggml_compute_forward_dup_same_cont(params, src0, dst);
-        return;
-    }
-
-    // parallelize by rows
-    const int nr = ne01;
-    // number of rows per thread
-    const int dr = (nr + nth - 1) / nth;
-    // row range for this thread
-    const int ir0 = dr * ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    if (src0->type == dst->type &&
-        ne00 == ne0 &&
-        nb00 == ggml_type_size(src0->type) && nb0 == ggml_type_size(dst->type)) {
-        // copy by rows
-        const size_t rs = ne00*nb00;
-        for (int64_t i03 = 0; i03 < ne03; i03++) {
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                for (int64_t i01 = ir0; i01 < ir1; i01++) {
-                    memcpy(
-                        ((char *)  dst->data + i01*nb1  + i02*nb2  + i03*nb3),
-                        ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03),
-                        rs);
-                }
-            }
-        }
-        return;
-    }
-
-    if (ggml_is_contiguous(dst)) {
-        // TODO: simplify
-        if (nb00 == sizeof(float)) {
-            if (dst->type == GGML_TYPE_F32) {
-                size_t id = 0;
-                const size_t rs = ne00 * nb00;
-                char * dst_ptr = (char *) dst->data;
-
-                for (int i03 = 0; i03 < ne03; i03++) {
-                    for (int i02 = 0; i02 < ne02; i02++) {
-                        id += rs * ir0;
-                        for (int i01 = ir0; i01 < ir1; i01++) {
-                            const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
-                            memcpy(dst_ptr + id, src0_ptr, rs);
-                            id += rs;
-                        }
-                        id += rs * (ne01 - ir1);
-                    }
-                }
-            } else if (type_traits[dst->type].from_float) {
-                ggml_from_float_t const quantize_row_q = type_traits[dst->type].from_float;
-
-                size_t id = 0;
-                size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type));
-                char * dst_ptr = (char *) dst->data;
-
-                for (int i03 = 0; i03 < ne03; i03++) {
-                    for (int i02 = 0; i02 < ne02; i02++) {
-                        id += rs * ir0;
-                        for (int i01 = ir0; i01 < ir1; i01++) {
-                            const float * src0_ptr = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
-                            quantize_row_q(src0_ptr, dst_ptr + id, ne00);
-                            id += rs;
-                        }
-                        id += rs * (ne01 - ir1);
-                    }
-                }
-            } else {
-                GGML_ASSERT(false); // TODO: implement
-            }
-        } else {
-            //printf("%s: this is not optimal - fix me\n", __func__);
-
-            if (dst->type == GGML_TYPE_F32) {
-                size_t id = 0;
-                float * dst_ptr = (float *) dst->data;
-
-                for (int i03 = 0; i03 < ne03; i03++) {
-                    for (int i02 = 0; i02 < ne02; i02++) {
-                        id += ne00 * ir0;
-                        for (int i01 = ir0; i01 < ir1; i01++) {
-                            for (int i00 = 0; i00 < ne00; i00++) {
-                                const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-
-                                dst_ptr[id] = *src0_ptr;
-                                id++;
-                            }
-                        }
-                        id += ne00 * (ne01 - ir1);
-                    }
-                }
-            } else if (dst->type == GGML_TYPE_F16) {
-                size_t id = 0;
-                ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data;
-
-                for (int i03 = 0; i03 < ne03; i03++) {
-                    for (int i02 = 0; i02 < ne02; i02++) {
-                        id += ne00 * ir0;
-                        for (int i01 = ir0; i01 < ir1; i01++) {
-                            for (int i00 = 0; i00 < ne00; i00++) {
-                                const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-
-                                dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr);
-                                id++;
-                            }
-                        }
-                        id += ne00 * (ne01 - ir1);
-                    }
-                }
-            } else {
-                GGML_ASSERT(false); // TODO: implement
-            }
-        }
-
-        return;
-    }
-
-    // dst counters
-
-    int64_t i10 = 0;
-    int64_t i11 = 0;
-    int64_t i12 = 0;
-    int64_t i13 = 0;
-
-    if (dst->type == GGML_TYPE_F32) {
-        for (int64_t i03 = 0; i03 < ne03; i03++) {
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                i10 += ne00 * ir0;
-                while (i10 >= ne0) {
-                    i10 -= ne0;
-                    if (++i11 == ne1) {
-                        i11 = 0;
-                        if (++i12 == ne2) {
-                            i12 = 0;
-                            if (++i13 == ne3) {
-                                i13 = 0;
-                            }
-                        }
-                    }
-                }
-                for (int64_t i01 = ir0; i01 < ir1; i01++) {
-                    for (int64_t i00 = 0; i00 < ne00; i00++) {
-                        const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-                              char * dst_ptr  = ((char *)  dst->data + i10*nb0  + i11*nb1  + i12*nb2  + i13*nb3);
-
-                        memcpy(dst_ptr, src0_ptr, sizeof(float));
-
-                        if (++i10 == ne0) {
-                            i10 = 0;
-                            if (++i11 == ne1) {
-                                i11 = 0;
-                                if (++i12 == ne2) {
-                                    i12 = 0;
-                                    if (++i13 == ne3) {
-                                        i13 = 0;
-                                    }
-                                }
-                            }
-                        }
-                    }
-                }
-                i10 += ne00 * (ne01 - ir1);
-                while (i10 >= ne0) {
-                    i10 -= ne0;
-                    if (++i11 == ne1) {
-                        i11 = 0;
-                        if (++i12 == ne2) {
-                            i12 = 0;
-                            if (++i13 == ne3) {
-                                i13 = 0;
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    } else if (dst->type == GGML_TYPE_F16) {
-        for (int64_t i03 = 0; i03 < ne03; i03++) {
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                i10 += ne00 * ir0;
-                while (i10 >= ne0) {
-                    i10 -= ne0;
-                    if (++i11 == ne1) {
-                        i11 = 0;
-                        if (++i12 == ne2) {
-                            i12 = 0;
-                            if (++i13 == ne3) {
-                                i13 = 0;
-                            }
-                        }
-                    }
-                }
-                for (int64_t i01 = ir0; i01 < ir1; i01++) {
-                    for (int64_t i00 = 0; i00 < ne00; i00++) {
-                        const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-                              char * dst_ptr  = ((char *)  dst->data + i10*nb0  + i11*nb1  + i12*nb2  + i13*nb3);
-
-                        *(ggml_fp16_t *) dst_ptr = GGML_FP32_TO_FP16(*(const float *) src0_ptr);
-
-                        if (++i10 == ne0) {
-                            i10 = 0;
-                            if (++i11 == ne1) {
-                                i11 = 0;
-                                if (++i12 == ne2) {
-                                    i12 = 0;
-                                    if (++i13 == ne3) {
-                                        i13 = 0;
-                                    }
-                                }
-                            }
-                        }
-                    }
-                }
-                i10 += ne00 * (ne01 - ir1);
-                while (i10 >= ne0) {
-                    i10 -= ne0;
-                    if (++i11 == ne1) {
-                        i11 = 0;
-                        if (++i12 == ne2) {
-                            i12 = 0;
-                            if (++i13 == ne3) {
-                                i13 = 0;
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    } else {
-        GGML_ASSERT(false); // TODO: implement
-    }
-}
-
-// A simplified version of ggml_compute_forward_dup that doesn't do float upcasting, and just plain old memcpy.
-static void ggml_compute_forward_dup_bytes(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
-    GGML_ASSERT(src0->type == dst->type);
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) {
-        ggml_compute_forward_dup_same_cont(params, src0, dst);
-        return;
-    }
-
-    GGML_TENSOR_UNARY_OP_LOCALS;
-
-    const size_t type_size = ggml_type_size(src0->type);
-    const int ith = params->ith; // thread index
-    const int nth = params->nth; // number of threads
-
-
-    // parallelize by rows
-    const int nr = ne01;
-    // number of rows per thread
-    const int dr = (nr + nth - 1) / nth;
-    // row range for this thread
-    const int ir0 = dr * ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    if (src0->type == dst->type &&
-        ne00 == ne0 &&
-        nb00 == type_size && nb0 == type_size) {
-        // copy by rows
-        const size_t rs = ne00 * type_size;
-        for (int64_t i03 = 0; i03 < ne03; i03++) {
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                for (int64_t i01 = ir0; i01 < ir1; i01++) {
-                    memcpy(
-                        ((char *)  dst->data + i01*nb1  + i02*nb2  + i03*nb3),
-                        ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03),
-                        rs);
-                }
-            }
-        }
-        return;
-    }
-
-    if (ggml_is_contiguous(dst)) {
-        size_t id = 0;
-        char * dst_ptr = (char *) dst->data;
-        const size_t rs = ne00 * type_size;
-
-        if (nb00 == type_size) {
-            // src0 is contigous on first dimension, copy by rows
-            for (int64_t i03 = 0; i03 < ne03; i03++) {
-                for (int64_t i02 = 0; i02 < ne02; i02++) {
-                    id += rs * ir0;
-                    for (int64_t i01 = ir0; i01 < ir1; i01++) {
-                        const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
-                        memcpy(dst_ptr + id, src0_ptr, rs);
-                        id += rs;
-                    }
-                    id += rs * (ne01 - ir1);
-                }
-            }
-        } else {
-            //printf("%s: this is not optimal - fix me\n", __func__);
-
-            for (int64_t i03 = 0; i03 < ne03; i03++) {
-                for (int64_t i02 = 0; i02 < ne02; i02++) {
-                    id += rs * ir0;
-                    for (int64_t i01 = ir0; i01 < ir1; i01++) {
-                        for (int64_t i00 = 0; i00 < ne00; i00++) {
-                            const char * src0_ptr = (char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03;
-                            memcpy(dst_ptr + id, src0_ptr, type_size);
-
-                            id += type_size;
-                        }
-                    }
-                    id += rs * (ne01 - ir1);
-                }
-            }
-        }
-
-        return;
-    }
-
-    // dst counters
-
-    int64_t i10 = 0;
-    int64_t i11 = 0;
-    int64_t i12 = 0;
-    int64_t i13 = 0;
-
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
-            i10 += ne00 * ir0;
-            while (i10 >= ne0) {
-                i10 -= ne0;
-                if (++i11 == ne1) {
-                    i11 = 0;
-                    if (++i12 == ne2) {
-                        i12 = 0;
-                        if (++i13 == ne3) {
-                            i13 = 0;
-                        }
-                    }
-                }
-            }
-            for (int64_t i01 = ir0; i01 < ir1; i01++) {
-                for (int64_t i00 = 0; i00 < ne00; i00++) {
-                    const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-                          char * dst_ptr  = ((char *)  dst->data + i10*nb0  + i11*nb1  + i12*nb2  + i13*nb3);
-
-                    memcpy(dst_ptr, src0_ptr, type_size);
-
-                    if (++i10 == ne0) {
-                        i10 = 0;
-                        if (++i11 == ne1) {
-                            i11 = 0;
-                            if (++i12 == ne2) {
-                                i12 = 0;
-                                if (++i13 == ne3) {
-                                    i13 = 0;
-                                }
-                            }
-                        }
-                    }
-                }
-            }
-            i10 += ne00 * (ne01 - ir1);
-            while (i10 >= ne0) {
-                i10 -= ne0;
-                if (++i11 == ne1) {
-                    i11 = 0;
-                    if (++i12 == ne2) {
-                        i12 = 0;
-                        if (++i13 == ne3) {
-                            i13 = 0;
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_dup(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    if (src0->type == dst->type) {
-        ggml_compute_forward_dup_bytes(params, src0, dst);
-        return;
-    }
-
-    switch (src0->type) {
-        case GGML_TYPE_F16:
-            {
-                ggml_compute_forward_dup_f16(params, src0, dst);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_dup_f32(params, src0, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_add
-
-static void ggml_compute_forward_add_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nr  = ggml_nrows(src0);
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    GGML_ASSERT( nb0 == sizeof(float));
-    GGML_ASSERT(nb00 == sizeof(float));
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    if (nb10 == sizeof(float)) {
-        for (int ir = ir0; ir < ir1; ++ir) {
-            // src1 is broadcastable across src0 and dst in i1, i2, i3
-            const int64_t i03 = ir/(ne02*ne01);
-            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
-            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
-
-            const int64_t i13 = i03 % ne13;
-            const int64_t i12 = i02 % ne12;
-            const int64_t i11 = i01 % ne11;
-            const int64_t nr0 = ne00 / ne10;
-
-            float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
-            float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
-            float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
-
-            for (int64_t r = 0; r < nr0; ++r) {
-#ifdef GGML_USE_ACCELERATE
-                vDSP_vadd(src0_ptr + r*ne10, 1, src1_ptr, 1, dst_ptr + r*ne10, 1, ne10);
-#else
-                ggml_vec_add_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
-#endif
-            }
-        }
-    } else {
-        // src1 is not contiguous
-        for (int ir = ir0; ir < ir1; ++ir) {
-            // src1 is broadcastable across src0 and dst in i1, i2, i3
-            const int64_t i03 = ir/(ne02*ne01);
-            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
-            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
-
-            const int64_t i13 = i03 % ne13;
-            const int64_t i12 = i02 % ne12;
-            const int64_t i11 = i01 % ne11;
-
-            float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
-            float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
-
-            for (int64_t i0 = 0; i0 < ne0; ++i0) {
-                const int64_t i10 = i0 % ne10;
-                float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);
-
-                dst_ptr[i0] = src0_ptr[i0] + *src1_ptr;
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_add_f16_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nr  = ggml_nrows(src0);
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
-    if (dst->type == GGML_TYPE_F32) {
-        GGML_ASSERT( nb0 == sizeof(float));
-    }
-    else {
-        GGML_ASSERT(dst->type  == GGML_TYPE_F16);
-        GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
-    }
-
-    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    if (nb10 == sizeof(float)) {
-        if (dst->type == GGML_TYPE_F16) {
-            for (int ir = ir0; ir < ir1; ++ir) {
-                // src0, src1 and dst are same shape => same indices
-                const int i3 = ir/(ne2*ne1);
-                const int i2 = (ir - i3*ne2*ne1)/ne1;
-                const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
-
-                ggml_fp16_t * dst_ptr  = (ggml_fp16_t *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1);
-                ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
-                float *       src1_ptr = (float *)       ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);
-
-                for (int i = 0; i < ne0; i++) {
-                    dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + src1_ptr[i]);
-                }
-            }
-        } else {
-            for (int ir = ir0; ir < ir1; ++ir) {
-                // src0, src1 and dst are same shape => same indices
-                const int i3 = ir/(ne2*ne1);
-                const int i2 = (ir - i3*ne2*ne1)/ne1;
-                const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
-
-                float *       dst_ptr  = (float *)       ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1);
-                ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
-                float *       src1_ptr = (float *)       ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);
-
-                for (int i = 0; i < ne0; i++) {
-                    dst_ptr[i] = GGML_FP16_TO_FP32(src0_ptr[i]) + src1_ptr[i];
-                }
-            }
-        }
-    }
-    else {
-        // src1 is not contiguous
-        GGML_ASSERT(false);
-    }
-}
-
-static void ggml_compute_forward_add_f16_f16(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nr  = ggml_nrows(src0);
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F16);
-    GGML_ASSERT(dst->type  == GGML_TYPE_F16);
-
-    GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    if (nb10 == sizeof(ggml_fp16_t)) {
-        for (int ir = ir0; ir < ir1; ++ir) {
-            // src0, src1 and dst are same shape => same indices
-            const int i3 = ir/(ne2*ne1);
-            const int i2 = (ir - i3*ne2*ne1)/ne1;
-            const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
-
-            ggml_fp16_t * dst_ptr  = (ggml_fp16_t *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1);
-            ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
-            ggml_fp16_t * src1_ptr = (ggml_fp16_t *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);
-
-            for (int i = 0; i < ne0; i++) {
-                dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + GGML_FP16_TO_FP32(src1_ptr[i]));
-            }
-        }
-    }
-    else {
-        // src1 is not contiguous
-        GGML_ASSERT(false);
-    }
-}
-
-static void ggml_compute_forward_add_q_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    const int nr  = ggml_nrows(src0);
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const enum ggml_type type = src0->type;
-    const enum ggml_type dtype = dst->type;
-    ggml_to_float_t const dequantize_row_q = type_traits[type].to_float;
-    ggml_from_float_t const quantize_row_q = type_traits[dtype].from_float;
-
-    // we don't support permuted src0 or src1
-    GGML_ASSERT(nb00 == ggml_type_size(type));
-    GGML_ASSERT(nb10 == sizeof(float));
-
-    // dst cannot be transposed or permuted
-    GGML_ASSERT(nb0 <= nb1);
-    GGML_ASSERT(nb1 <= nb2);
-    GGML_ASSERT(nb2 <= nb3);
-
-    GGML_ASSERT(ggml_is_quantized(src0->type));
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    float * wdata = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith;
-
-    for (int ir = ir0; ir < ir1; ++ir) {
-        // src0 indices
-        const int i03 = ir/(ne02*ne01);
-        const int i02 = (ir - i03*ne02*ne01)/ne01;
-        const int i01 = (ir - i03*ne02*ne01 - i02*ne01);
-
-        // src1 and dst are same shape as src0 => same indices
-        const int i13 = i03;
-        const int i12 = i02;
-        const int i11 = i01;
-
-        const int i3 = i03;
-        const int i2 = i02;
-        const int i1 = i01;
-
-        void  * src0_row = (void *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03));
-        float * src1_row = (float *)((char *) src1->data + (i11*nb11 + i12*nb12 + i13*nb13));
-        void  * dst_row  = (void *) ((char *)  dst->data + ( i1*nb1  +  i2*nb2  +  i3*nb3));
-
-        assert(ne00 % 32 == 0);
-
-        // unquantize row from src0 to temp buffer
-        dequantize_row_q(src0_row, wdata, ne00);
-        // add src1
-        ggml_vec_acc_f32(ne00, wdata, src1_row);
-        // quantize row to dst
-        if (quantize_row_q != NULL) {
-            quantize_row_q(wdata, dst_row, ne00);
-        } else {
-            memcpy(dst_row, wdata, ne0*nb0);
-        }
-    }
-}
-
-static void ggml_compute_forward_add(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_add_f32(params, src0, src1, dst);
-            } break;
-        case GGML_TYPE_F16:
-            {
-                if (src1->type == GGML_TYPE_F16) {
-                    ggml_compute_forward_add_f16_f16(params, src0, src1, dst);
-                }
-                else if (src1->type == GGML_TYPE_F32) {
-                    ggml_compute_forward_add_f16_f32(params, src0, src1, dst);
-                }
-                else {
-                    GGML_ASSERT(false);
-                }
-            } break;
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-        case GGML_TYPE_Q5_0:
-        case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q8_0:
-        case GGML_TYPE_Q2_K:
-        case GGML_TYPE_Q3_K:
-        case GGML_TYPE_Q4_K:
-        case GGML_TYPE_Q5_K:
-        case GGML_TYPE_Q6_K:
-        case GGML_TYPE_IQ2_XXS:
-        case GGML_TYPE_IQ2_XS:
-            {
-                ggml_compute_forward_add_q_f32(params, src0, src1, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_add1
-
-static void ggml_compute_forward_add1_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
-    GGML_ASSERT(ggml_is_scalar(src1));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nr  = ggml_nrows(src0);
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    GGML_ASSERT( nb0 == sizeof(float));
-    GGML_ASSERT(nb00 == sizeof(float));
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    for (int ir = ir0; ir < ir1; ++ir) {
-        // src0 and dst are same shape => same indices
-        const int i3 = ir/(ne2*ne1);
-        const int i2 = (ir - i3*ne2*ne1)/ne1;
-        const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
-
-#ifdef GGML_USE_ACCELERATE
-        UNUSED(ggml_vec_add1_f32);
-
-        vDSP_vadd(
-                (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1,
-                (float *) ((char *) src1->data), 0,
-                (float *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 ), 1,
-                ne0);
-#else
-        ggml_vec_add1_f32(ne0,
-                (float *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 ),
-                (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01),
-               *(float *) src1->data);
-#endif
-    }
-}
-
-static void ggml_compute_forward_add1_f16_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
-    GGML_ASSERT(ggml_is_scalar(src1));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    // scalar to add
-    const float v = *(float *) src1->data;
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nr  = ggml_nrows(src0);
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type  == GGML_TYPE_F16);
-
-    GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    for (int ir = ir0; ir < ir1; ++ir) {
-        // src0 and dst are same shape => same indices
-        const int i3 = ir/(ne2*ne1);
-        const int i2 = (ir - i3*ne2*ne1)/ne1;
-        const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
-
-        ggml_fp16_t * dst_ptr  = (ggml_fp16_t *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 );
-        ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
-        for (int i = 0; i < ne0; i++) {
-            dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + v);
-        }
-    }
-}
-
-static void ggml_compute_forward_add1_f16_f16(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
-    GGML_ASSERT(ggml_is_scalar(src1));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    // scalar to add
-    const float v = GGML_FP16_TO_FP32(*(ggml_fp16_t *) src1->data);
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nr  = ggml_nrows(src0);
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F16);
-    GGML_ASSERT(dst->type  == GGML_TYPE_F16);
-
-    GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    for (int ir = ir0; ir < ir1; ++ir) {
-        // src0 and dst are same shape => same indices
-        const int i3 = ir/(ne2*ne1);
-        const int i2 = (ir - i3*ne2*ne1)/ne1;
-        const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
-
-        ggml_fp16_t * dst_ptr  = (ggml_fp16_t *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 );
-        ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
-        for (int i = 0; i < ne0; i++) {
-            dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + v);
-        }
-    }
-}
-
-static void ggml_compute_forward_add1_q_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
-    GGML_ASSERT(ggml_is_scalar(src1));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    // scalar to add
-    const float v = *(float *) src1->data;
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nr  = ggml_nrows(src0);
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    const enum ggml_type type = src0->type;
-    ggml_to_float_t const dequantize_row_q = type_traits[type].to_float;
-    ggml_from_float_t const quantize_row_q = type_traits[type].from_float;
-
-    // we don't support permuted src0
-    GGML_ASSERT(nb00 == ggml_type_size(type));
-
-    // dst cannot be transposed or permuted
-    GGML_ASSERT(nb0 <= nb1);
-    GGML_ASSERT(nb1 <= nb2);
-    GGML_ASSERT(nb2 <= nb3);
-
-    GGML_ASSERT(ggml_is_quantized(src0->type));
-    GGML_ASSERT(dst->type == src0->type);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    float * wdata = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32) * ith;
-
-    for (int ir = ir0; ir < ir1; ++ir) {
-        // src0 and dst are same shape => same indices
-        const int i3 = ir/(ne2*ne1);
-        const int i2 = (ir - i3*ne2*ne1)/ne1;
-        const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
-
-        void  * src0_row = (void *) ((char *) src0->data + (i1*nb01 + i2*nb02 + i3*nb03));
-        void  * dst_row  = (void *) ((char *)  dst->data + (i1*nb1  + i2*nb2  + i3*nb0 ));
-
-        assert(ne0 % 32 == 0);
-
-        // unquantize row from src0 to temp buffer
-        dequantize_row_q(src0_row, wdata, ne0);
-        // add src1
-        ggml_vec_acc1_f32(ne0, wdata, v);
-        // quantize row to dst
-        quantize_row_q(wdata, dst_row, ne0);
-    }
-}
-
-static void ggml_compute_forward_add1(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_add1_f32(params, src0, src1, dst);
-            } break;
-        case GGML_TYPE_F16:
-            {
-                if (src1->type == GGML_TYPE_F16) {
-                    ggml_compute_forward_add1_f16_f16(params, src0, src1, dst);
-                }
-                else if (src1->type == GGML_TYPE_F32) {
-                    ggml_compute_forward_add1_f16_f32(params, src0, src1, dst);
-                }
-                else {
-                    GGML_ASSERT(false);
-                }
-            } break;
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-        case GGML_TYPE_Q5_0:
-        case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q8_0:
-        case GGML_TYPE_Q8_1:
-        case GGML_TYPE_Q2_K:
-        case GGML_TYPE_Q3_K:
-        case GGML_TYPE_Q4_K:
-        case GGML_TYPE_Q5_K:
-        case GGML_TYPE_Q6_K:
-        case GGML_TYPE_IQ2_XXS:
-        case GGML_TYPE_IQ2_XS:
-            {
-                ggml_compute_forward_add1_q_f32(params, src0, src1, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_acc
-
-static void ggml_compute_forward_acc_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
-    GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
-
-    // view src0 and dst with these strides and data offset inbytes during acc
-    // nb0 is implicitly element_size because src0 and dst are contiguous
-    size_t nb1     = ((int32_t *) dst->op_params)[0];
-    size_t nb2     = ((int32_t *) dst->op_params)[1];
-    size_t nb3     = ((int32_t *) dst->op_params)[2];
-    size_t offset  = ((int32_t *) dst->op_params)[3];
-    bool   inplace = (bool) ((int32_t *) dst->op_params)[4];
-
-    if (!inplace && (params->type == GGML_TASK_INIT)) {
-        // memcpy needs to be synchronized across threads to avoid race conditions.
-        // => do it in INIT phase
-        memcpy(
-            ((char *)  dst->data),
-            ((char *) src0->data),
-            ggml_nbytes(dst));
-    }
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nr = ggml_nrows(src1);
-    const int nc = src1->ne[0];
-
-    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne)
-    GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb)
-
-    // src0 and dst as viewed during acc
-    const size_t nb0 = ggml_element_size(src0);
-
-    const size_t nb00 = nb0;
-    const size_t nb01 = nb1;
-    const size_t nb02 = nb2;
-    const size_t nb03 = nb3;
-
-    GGML_ASSERT(offset + (ne10 == 0 ? 0 : ne10-1)*nb0  + (ne11 == 0 ? 0 : ne11-1)*nb1  + (ne12 == 0 ? 0 : ne12-1)*nb2  + (ne13 == 0 ? 0 : ne13-1)*nb3  < ggml_nbytes(dst));
-    GGML_ASSERT(offset + (ne10 == 0 ? 0 : ne10-1)*nb00 + (ne11 == 0 ? 0 : ne11-1)*nb01 + (ne12 == 0 ? 0 : ne12-1)*nb02 + (ne13 == 0 ? 0 : ne13-1)*nb03 < ggml_nbytes(src0));
-
-    GGML_ASSERT(nb10 == sizeof(float));
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    for (int ir = ir0; ir < ir1; ++ir) {
-        // src0 and dst are viewed with shape of src1 and offset
-        // => same indices
-        const int i3 = ir/(ne12*ne11);
-        const int i2 = (ir - i3*ne12*ne11)/ne11;
-        const int i1 = (ir - i3*ne12*ne11 - i2*ne11);
-
-#ifdef GGML_USE_ACCELERATE
-        vDSP_vadd(
-                (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + offset), 1,
-                (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1,
-                (float *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1  + offset), 1, nc);
-#else
-        ggml_vec_add_f32(nc,
-                (float *) ((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + offset),
-                (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + offset),
-                (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11));
-#endif
-    }
-}
-
-static void ggml_compute_forward_acc(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_acc_f32(params, src0, src1, dst);
-            } break;
-        case GGML_TYPE_F16:
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-        case GGML_TYPE_Q5_0:
-        case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q8_0:
-        case GGML_TYPE_Q8_1:
-        case GGML_TYPE_Q2_K:
-        case GGML_TYPE_Q3_K:
-        case GGML_TYPE_Q4_K:
-        case GGML_TYPE_Q5_K:
-        case GGML_TYPE_Q6_K:
-        case GGML_TYPE_IQ2_XXS:
-        case GGML_TYPE_IQ2_XS:
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_sub
-
-static void ggml_compute_forward_sub_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    assert(params->ith == 0);
-    assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    const int nr  = ggml_nrows(src0);
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    GGML_ASSERT( nb0 == sizeof(float));
-    GGML_ASSERT(nb00 == sizeof(float));
-
-    if (nb10 == sizeof(float)) {
-        for (int ir = 0; ir < nr; ++ir) {
-            // src0, src1 and dst are same shape => same indices
-            const int i3 = ir/(ne2*ne1);
-            const int i2 = (ir - i3*ne2*ne1)/ne1;
-            const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
-
-#ifdef GGML_USE_ACCELERATE
-            vDSP_vsub(
-                    (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1,
-                    (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1,
-                    (float *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 ), 1,
-                    ne0);
-#else
-            ggml_vec_sub_f32(ne0,
-                    (float *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 ),
-                    (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01),
-                    (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11));
-#endif
-                // }
-            // }
-        }
-    } else {
-        // src1 is not contiguous
-        for (int ir = 0; ir < nr; ++ir) {
-            // src0, src1 and dst are same shape => same indices
-            const int i3 = ir/(ne2*ne1);
-            const int i2 = (ir - i3*ne2*ne1)/ne1;
-            const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
-
-            float * dst_ptr  = (float *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 );
-            float * src0_ptr = (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
-            for (int i0 = 0; i0 < ne0; i0++) {
-                float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11 + i0*nb10);
-
-                dst_ptr[i0] = src0_ptr[i0] - *src1_ptr;
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_sub(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_sub_f32(params, src0, src1, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_mul
-
-static void ggml_compute_forward_mul_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-#ifdef GGML_USE_CLBLAST
-    if (src1->backend == GGML_BACKEND_GPU) {
-        // TODO: OpenCL kernel support full broadcast
-        GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
-        if (ith == 0) {
-            ggml_cl_mul(src0, src1, dst);
-        }
-        return;
-    }
-#endif
-
-    const int64_t nr = ggml_nrows(src0);
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    GGML_ASSERT( nb0 == sizeof(float));
-    GGML_ASSERT(nb00 == sizeof(float));
-
-    if (nb10 == sizeof(float)) {
-        for (int64_t ir = ith; ir < nr; ir += nth) {
-            // src0 and dst are same shape => same indices
-            const int64_t i03 = ir/(ne02*ne01);
-            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
-            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
-
-            const int64_t i13 = i03 % ne13;
-            const int64_t i12 = i02 % ne12;
-            const int64_t i11 = i01 % ne11;
-            const int64_t nr0 = ne00 / ne10;
-
-            float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
-            float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
-            float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
-
-            for (int64_t r = 0 ; r < nr0; ++r) {
-#ifdef GGML_USE_ACCELERATE
-                UNUSED(ggml_vec_mul_f32);
-
-                vDSP_vmul(src0_ptr + r*ne10, 1, src1_ptr, 1, dst_ptr + r*ne10, 1, ne10);
-#else
-                ggml_vec_mul_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
-#endif
-            }
-        }
-    } else {
-        // src1 is not contiguous
-        for (int64_t ir = ith; ir < nr; ir += nth) {
-            // src0 and dst are same shape => same indices
-            // src1 is broadcastable across src0 and dst in i1, i2, i3
-            const int64_t i03 = ir/(ne02*ne01);
-            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
-            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
-
-            const int64_t i13 = i03 % ne13;
-            const int64_t i12 = i02 % ne12;
-            const int64_t i11 = i01 % ne11;
-
-            float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
-            float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
-
-            for (int64_t i0 = 0; i0 < ne00; ++i0) {
-                const int64_t i10 = i0 % ne10;
-                float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);
-
-                dst_ptr[i0] = src0_ptr[i0] * (*src1_ptr);
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_mul(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    GGML_ASSERT(src1->type == GGML_TYPE_F32 && "only f32 src1 supported for now");
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_mul_f32(params, src0, src1, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_div
-
-static void ggml_compute_forward_div_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int64_t nr = ggml_nrows(src0);
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    GGML_ASSERT( nb0 == sizeof(float));
-    GGML_ASSERT(nb00 == sizeof(float));
-
-    if (nb10 == sizeof(float)) {
-        for (int64_t ir = ith; ir < nr; ir += nth) {
-            // src0 and dst are same shape => same indices
-            const int64_t i03 = ir/(ne02*ne01);
-            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
-            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
-
-            const int64_t i13 = i03 % ne13;
-            const int64_t i12 = i02 % ne12;
-            const int64_t i11 = i01 % ne11;
-            const int64_t nr0 = ne00 / ne10;
-
-            float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
-            float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
-            float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
-
-            for (int64_t r = 0; r < nr0; ++r) {
-#ifdef GGML_USE_ACCELERATE
-                UNUSED(ggml_vec_div_f32);
-
-                vDSP_vdiv(src1_ptr, 1, src0_ptr + r*ne10, 1, dst_ptr + r*ne10, 1, ne10);
-#else
-                ggml_vec_div_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
-#endif
-            }
-        }
-    } else {
-        // src1 is not contiguous
-        for (int64_t ir = ith; ir < nr; ir += nth) {
-            // src0 and dst are same shape => same indices
-            // src1 is broadcastable across src0 and dst in i1, i2, i3
-            const int64_t i03 = ir/(ne02*ne01);
-            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
-            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
-
-            const int64_t i13 = i03 % ne13;
-            const int64_t i12 = i02 % ne12;
-            const int64_t i11 = i01 % ne11;
-
-            float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
-            float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
-
-            for (int64_t i0 = 0; i0 < ne00; ++i0) {
-                const int64_t i10 = i0 % ne10;
-                float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);
-
-                dst_ptr[i0] = src0_ptr[i0] / (*src1_ptr);
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_div(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_div_f32(params, src0, src1, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_sqr
-
-static void ggml_compute_forward_sqr_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    assert(params->ith == 0);
-    assert(ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    const int n     = ggml_nrows(src0);
-    const int nc    = src0->ne[0];
-
-    assert( dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
-    for (int i = 0; i < n; i++) {
-        ggml_vec_sqr_f32(nc,
-                (float *) ((char *) dst->data  + i*( dst->nb[1])),
-                (float *) ((char *) src0->data + i*(src0->nb[1])));
-    }
-}
-
-static void ggml_compute_forward_sqr(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_sqr_f32(params, src0, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_sqrt
-
-static void ggml_compute_forward_sqrt_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    assert(params->ith == 0);
-    assert(ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-
-    assert( dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
-    for (int i = 0; i < n; i++) {
-        ggml_vec_sqrt_f32(nc,
-                (float *) ((char *) dst->data  + i*( dst->nb[1])),
-                (float *) ((char *) src0->data + i*(src0->nb[1])));
-    }
-}
-
-static void ggml_compute_forward_sqrt(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_sqrt_f32(params, src0, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_log
-
-static void ggml_compute_forward_log_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    GGML_ASSERT(params->ith == 0);
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-
-    GGML_ASSERT( dst->nb[0] == sizeof(float));
-    GGML_ASSERT(src0->nb[0] == sizeof(float));
-
-    for (int i = 0; i < n; i++) {
-        ggml_vec_log_f32(nc,
-                (float *) ((char *) dst->data  + i*( dst->nb[1])),
-                (float *) ((char *) src0->data + i*(src0->nb[1])));
-    }
-}
-
-static void ggml_compute_forward_log(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_log_f32(params, src0, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_sum
-
-static void ggml_compute_forward_sum_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    assert(params->ith == 0);
-    assert(ggml_is_scalar(dst));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    assert(ggml_is_scalar(dst));
-    assert(src0->nb[0] == sizeof(float));
-
-    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne)
-    GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb)
-
-    ggml_float sum     = 0;
-    ggml_float row_sum = 0;
-
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
-            for (int64_t i01 = 0; i01 < ne01; i01++) {
-                ggml_vec_sum_f32_ggf(ne00,
-                        &row_sum,
-                        (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03));
-                sum += row_sum;
-            }
-        }
-    }
-    ((float *) dst->data)[0] = sum;
-}
-
-static void ggml_compute_forward_sum_f16(
-    const struct ggml_compute_params * params,
-    const struct ggml_tensor * src0,
-          struct ggml_tensor * dst) {
-    assert(params->ith == 0);
-    assert(ggml_is_scalar(dst));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    assert(src0->nb[0] == sizeof(ggml_fp16_t));
-
-    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne)
-    GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb)
-
-    float sum = 0;
-    float row_sum = 0;
-
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
-            for (int64_t i01 = 0; i01 < ne01; i01++) {
-                ggml_vec_sum_f16_ggf(ne00,
-                    &row_sum,
-                    (ggml_fp16_t *) ((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03));
-                sum += row_sum;
-            }
-        }
-    }
-    ((ggml_fp16_t *) dst->data)[0] = GGML_FP32_TO_FP16(sum);
-}
-
-static void ggml_compute_forward_sum(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_sum_f32(params, src0, dst);
-            } break;
-        case GGML_TYPE_F16:
-            {
-                ggml_compute_forward_sum_f16(params, src0, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_sum_rows
-
-static void ggml_compute_forward_sum_rows_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    GGML_ASSERT(params->ith == 0);
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    GGML_ASSERT(src0->nb[0] == sizeof(float));
-    GGML_ASSERT(dst->nb[0] == sizeof(float));
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    GGML_ASSERT(ne0 == 1);
-    GGML_ASSERT(ne1 == ne01);
-    GGML_ASSERT(ne2 == ne02);
-    GGML_ASSERT(ne3 == ne03);
-
-    for (int64_t i3 = 0; i3 < ne03; i3++) {
-        for (int64_t i2 = 0; i2 < ne02; i2++) {
-            for (int64_t i1 = 0; i1 < ne01; i1++) {
-                float * src_row = (float *) ((char *) src0->data + i1*nb01 + i2*nb02 + i3*nb03);
-                float * dst_row = (float *) ((char *) dst->data  + i1*nb1  + i2*nb2  + i3*nb3);
-                float row_sum = 0;
-                ggml_vec_sum_f32(ne00, &row_sum, src_row);
-                dst_row[0] = row_sum;
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_sum_rows(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_sum_rows_f32(params, src0, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_mean
-
-static void ggml_compute_forward_mean_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    assert(params->ith == 0);
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    assert(src0->nb[0] == sizeof(float));
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    assert(ne0 == 1);
-    assert(ne1 == ne01);
-    assert(ne2 == ne02);
-    assert(ne3 == ne03);
-
-    UNUSED(ne0);
-    UNUSED(ne1);
-    UNUSED(ne2);
-    UNUSED(ne3);
-
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
-            for (int64_t i01 = 0; i01 < ne01; i01++) {
-                ggml_vec_sum_f32(ne00,
-                        (float *) ((char *)  dst->data + i01*nb1  + i02*nb2  + i03*nb3),
-                        (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03));
-
-                *(float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3) /= (float) ne00;
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_mean(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_mean_f32(params, src0, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_argmax
-
-static void ggml_compute_forward_argmax_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    assert(params->ith == 0);
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    assert(src0->nb[0] == sizeof(float));
-    assert(dst->nb[0] == sizeof(float));
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-
-    const size_t nb01 = src0->nb[1];
-    const size_t nb0 = dst->nb[0];
-
-    for (int64_t i1 = 0; i1 < ne01; i1++) {
-        float * src = (float *) ((char *) src0->data + i1*nb01);
-        int32_t * dst_ = (int32_t *) ((char *)  dst->data + i1*nb0);
-        int v = 0;
-        ggml_vec_argmax_f32(ne00, &v, src);
-        dst_[0] = v;
-    }
-}
-
-static void ggml_compute_forward_argmax(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_argmax_f32(params, src0, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_repeat
-
-static void ggml_compute_forward_repeat_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    GGML_ASSERT(params->ith == 0);
-    GGML_ASSERT(ggml_can_repeat(src0, dst));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    // guaranteed to be an integer due to the check in ggml_can_repeat
-    const int nr0 = (int)(ne0/ne00);
-    const int nr1 = (int)(ne1/ne01);
-    const int nr2 = (int)(ne2/ne02);
-    const int nr3 = (int)(ne3/ne03);
-
-    // TODO: support for transposed / permuted tensors
-    GGML_ASSERT(nb0  == sizeof(float));
-    GGML_ASSERT(nb00 == sizeof(float));
-
-    // TODO: maybe this is not optimal?
-    for                         (int i3 = 0; i3 < nr3;  i3++) {
-        for                     (int k3 = 0; k3 < ne03; k3++) {
-            for                 (int i2 = 0; i2 < nr2;  i2++) {
-                for             (int k2 = 0; k2 < ne02; k2++) {
-                    for         (int i1 = 0; i1 < nr1;  i1++) {
-                        for     (int k1 = 0; k1 < ne01; k1++) {
-                            for (int i0 = 0; i0 < nr0;  i0++) {
-                                ggml_vec_cpy_f32(ne00,
-                                        (float *) ((char *)  dst->data + (i3*ne03 + k3)*nb3  + (i2*ne02 + k2)*nb2  + (i1*ne01 + k1)*nb1  + (i0*ne00)*nb0),
-                                        (float *) ((char *) src0->data + (          k3)*nb03 + (          k2)*nb02 + (          k1)*nb01));
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_repeat_f16(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    GGML_ASSERT(params->ith == 0);
-    GGML_ASSERT(ggml_can_repeat(src0, dst));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    // guaranteed to be an integer due to the check in ggml_can_repeat
-    const int nr0 = (int)(ne0/ne00);
-    const int nr1 = (int)(ne1/ne01);
-    const int nr2 = (int)(ne2/ne02);
-    const int nr3 = (int)(ne3/ne03);
-
-    // TODO: support for transposed / permuted tensors
-    GGML_ASSERT(nb0  == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
-
-    // TODO: maybe this is not optimal?
-    for                         (int i3 = 0; i3 < nr3;  i3++) {
-        for                     (int k3 = 0; k3 < ne03; k3++) {
-            for                 (int i2 = 0; i2 < nr2;  i2++) {
-                for             (int k2 = 0; k2 < ne02; k2++) {
-                    for         (int i1 = 0; i1 < nr1;  i1++) {
-                        for     (int k1 = 0; k1 < ne01; k1++) {
-                            for (int i0 = 0; i0 < nr0;  i0++) {
-                                ggml_fp16_t * y = (ggml_fp16_t *) ((char *)  dst->data + (i3*ne03 + k3)*nb3  + (i2*ne02 + k2)*nb2  + (i1*ne01 + k1)*nb1  + (i0*ne00)*nb0);
-                                ggml_fp16_t * x = (ggml_fp16_t *) ((char *) src0->data + (          k3)*nb03 + (          k2)*nb02 + (          k1)*nb01);
-                                // ggml_vec_cpy_f16(ne00, y, x)
-                                for (int i = 0; i < ne00; ++i) {
-                                    y[i]  = x[i];
-                                }
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_repeat(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F16:
-        case GGML_TYPE_I16:
-            {
-                ggml_compute_forward_repeat_f16(params, src0, dst);
-            } break;
-        case GGML_TYPE_F32:
-        case GGML_TYPE_I32:
-            {
-                ggml_compute_forward_repeat_f32(params, src0, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_repeat_back
-
-static void ggml_compute_forward_repeat_back_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    GGML_ASSERT(params->ith == 0);
-    GGML_ASSERT(ggml_can_repeat(dst, src0));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    // guaranteed to be an integer due to the check in ggml_can_repeat
-    const int nr0 = (int)(ne00/ne0);
-    const int nr1 = (int)(ne01/ne1);
-    const int nr2 = (int)(ne02/ne2);
-    const int nr3 = (int)(ne03/ne3);
-
-    // TODO: support for transposed / permuted tensors
-    GGML_ASSERT(nb0  == sizeof(float));
-    GGML_ASSERT(nb00 == sizeof(float));
-
-    if (ggml_is_contiguous(dst)) {
-        ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
-    } else {
-        for         (int k3 = 0; k3 < ne3; k3++) {
-            for     (int k2 = 0; k2 < ne2; k2++) {
-                for (int k1 = 0; k1 < ne1; k1++) {
-                    ggml_vec_set_f32(ne0,
-                        (float *) ((char *) dst->data + k1*nb1 + k2*nb2 + k3*nb3),
-                        0);
-                }
-            }
-        }
-    }
-
-    // TODO: maybe this is not optimal?
-    for                         (int i3 = 0; i3 < nr3; i3++) {
-        for                     (int k3 = 0; k3 < ne3; k3++) {
-            for                 (int i2 = 0; i2 < nr2; i2++) {
-                for             (int k2 = 0; k2 < ne2; k2++) {
-                    for         (int i1 = 0; i1 < nr1; i1++) {
-                        for     (int k1 = 0; k1 < ne1; k1++) {
-                            for (int i0 = 0; i0 < nr0; i0++) {
-                                ggml_vec_acc_f32(ne0,
-                                        (float *) ((char *)  dst->data + (         k3)*nb3  + (         k2)*nb2  + (         k1)*nb1),
-                                        (float *) ((char *) src0->data + (i3*ne3 + k3)*nb03 + (i2*ne2 + k2)*nb02 + (i1*ne1 + k1)*nb01 + (i0*ne0)*nb00));
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_repeat_back(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_repeat_back_f32(params, src0, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_concat
-
-static void ggml_compute_forward_concat_f32(
-    const struct ggml_compute_params * params,
-    const struct ggml_tensor * src0,
-    const struct ggml_tensor * src1,
-    struct ggml_tensor * dst) {
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    GGML_ASSERT(src0->nb[0] == sizeof(float));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    // TODO: support for transposed / permuted tensors
-    GGML_ASSERT(nb0  == sizeof(float));
-    GGML_ASSERT(nb00 == sizeof(float));
-    GGML_ASSERT(nb10 == sizeof(float));
-
-    for (int i3 = 0; i3 < ne3; i3++) {
-        for (int i2 = ith; i2 < ne2; i2 += nth) {
-            if (i2 < ne02) { // src0
-                for (int i1 = 0; i1 < ne1; i1++) {
-                    for (int i0 = 0; i0 < ne0; i0++) {
-                        const float * x = (float *)((char *) src0->data + i0 * nb00 + i1 * nb01 + i2 * nb02 + i3 * nb03);
-
-                        float * y = (float *)((char *)dst->data + i0 * nb0 + i1 * nb1 + i2 * nb2 + i3 * nb3);
-                        *y = *x;
-                    }
-                }
-            } // src1
-            else {
-                for (int i1 = 0; i1 < ne1; i1++) {
-                    for (int i0 = 0; i0 < ne0; i0++) {
-                        const float * x = (float *)((char *) src1->data + i0 * nb10 + i1 * nb11 + (i2 - ne02) * nb12 + i3 * nb13);
-
-                        float * y = (float *)((char *)dst->data + i0 * nb0 + i1 * nb1 + i2 * nb2 + i3 * nb3);
-                        *y = *x;
-                    }
-                }
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_concat(
-    const struct ggml_compute_params* params,
-    const struct ggml_tensor* src0,
-    const struct ggml_tensor* src1,
-    struct ggml_tensor* dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-        case GGML_TYPE_I32:
-            {
-                ggml_compute_forward_concat_f32(params, src0, src1, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_abs
-
-static void ggml_compute_forward_abs_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    assert(params->ith == 0);
-    assert(ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-
-    assert(dst->nb[0]  == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
-    for (int i = 0; i < n; i++) {
-        ggml_vec_abs_f32(nc,
-                (float *) ((char *) dst->data  + i*( dst->nb[1])),
-                (float *) ((char *) src0->data + i*(src0->nb[1])));
-    }
-}
-
-static void ggml_compute_forward_abs(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_abs_f32(params, src0, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_sgn
-
-static void ggml_compute_forward_sgn_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    assert(params->ith == 0);
-    assert(ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-
-    assert(dst->nb[0]  == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
-    for (int i = 0; i < n; i++) {
-        ggml_vec_sgn_f32(nc,
-                (float *) ((char *) dst->data  + i*( dst->nb[1])),
-                (float *) ((char *) src0->data + i*(src0->nb[1])));
-    }
-}
-
-static void ggml_compute_forward_sgn(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_sgn_f32(params, src0, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_neg
-
-static void ggml_compute_forward_neg_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    assert(params->ith == 0);
-    assert(ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-
-    assert(dst->nb[0]  == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
-    for (int i = 0; i < n; i++) {
-        ggml_vec_neg_f32(nc,
-                (float *) ((char *) dst->data  + i*( dst->nb[1])),
-                (float *) ((char *) src0->data + i*(src0->nb[1])));
-    }
-}
-
-static void ggml_compute_forward_neg(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_neg_f32(params, src0, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_step
-
-static void ggml_compute_forward_step_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    assert(params->ith == 0);
-    assert(ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-
-    assert(dst->nb[0]  == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
-    for (int i = 0; i < n; i++) {
-        ggml_vec_step_f32(nc,
-                (float *) ((char *) dst->data  + i*( dst->nb[1])),
-                (float *) ((char *) src0->data + i*(src0->nb[1])));
-    }
-}
-
-static void ggml_compute_forward_step(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_step_f32(params, src0, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_tanh
-
-static void ggml_compute_forward_tanh_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    assert(params->ith == 0);
-    assert(ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-
-    assert(dst->nb[0]  == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
-    for (int i = 0; i < n; i++) {
-        ggml_vec_tanh_f32(nc,
-                (float *) ((char *) dst->data  + i*( dst->nb[1])),
-                (float *) ((char *) src0->data + i*(src0->nb[1])));
-    }
-}
-
-static void ggml_compute_forward_tanh(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_tanh_f32(params, src0, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_elu
-
-static void ggml_compute_forward_elu_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    assert(params->ith == 0);
-    assert(ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-
-    assert(dst->nb[0]  == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
-    for (int i = 0; i < n; i++) {
-        ggml_vec_elu_f32(nc,
-                (float *) ((char *) dst->data  + i*( dst->nb[1])),
-                (float *) ((char *) src0->data + i*(src0->nb[1])));
-    }
-}
-
-static void ggml_compute_forward_elu(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_elu_f32(params, src0, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_relu
-
-static void ggml_compute_forward_relu_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    assert(params->ith == 0);
-    assert(ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-
-    assert(dst->nb[0]  == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
-    for (int i = 0; i < n; i++) {
-        ggml_vec_relu_f32(nc,
-                (float *) ((char *) dst->data  + i*( dst->nb[1])),
-                (float *) ((char *) src0->data + i*(src0->nb[1])));
-    }
-}
-
-static void ggml_compute_forward_relu(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_relu_f32(params, src0, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_gelu
-
-static void ggml_compute_forward_gelu_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
-    GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nc = src0->ne[0];
-    const int nr = ggml_nrows(src0);
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    for (int i1 = ir0; i1 < ir1; i1++) {
-        ggml_vec_gelu_f32(nc,
-                (float *) ((char *) dst->data  + i1*( dst->nb[1])),
-                (float *) ((char *) src0->data + i1*(src0->nb[1])));
-
-#ifndef NDEBUG
-        for (int k = 0; k < nc; k++) {
-            const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
-            UNUSED(x);
-            assert(!isnan(x));
-            assert(!isinf(x));
-        }
-#endif
-    }
-}
-
-static void ggml_compute_forward_gelu(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_gelu_f32(params, src0, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_gelu_quick
-
-static void ggml_compute_forward_gelu_quick_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
-    GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nc = src0->ne[0];
-    const int nr = ggml_nrows(src0);
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    for (int i1 = ir0; i1 < ir1; i1++) {
-        ggml_vec_gelu_quick_f32(nc,
-                (float *) ((char *) dst->data  + i1*( dst->nb[1])),
-                (float *) ((char *) src0->data + i1*(src0->nb[1])));
-
-#ifndef NDEBUG
-        for (int k = 0; k < nc; k++) {
-            const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
-            UNUSED(x);
-            assert(!isnan(x));
-            assert(!isinf(x));
-        }
-#endif
-    }
-}
-
-static void ggml_compute_forward_gelu_quick(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_gelu_quick_f32(params, src0, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_silu
-
-static void ggml_compute_forward_silu_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
-    GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nc = src0->ne[0];
-    const int nr = ggml_nrows(src0);
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    for (int i1 = ir0; i1 < ir1; i1++) {
-        ggml_vec_silu_f32(nc,
-                (float *) ((char *) dst->data  + i1*( dst->nb[1])),
-                (float *) ((char *) src0->data + i1*(src0->nb[1])));
-
-#ifndef NDEBUG
-        for (int k = 0; k < nc; k++) {
-            const float x = ((float *) ((char *) dst->data + i1*(dst->nb[1])))[k];
-            UNUSED(x);
-            assert(!isnan(x));
-            assert(!isinf(x));
-        }
-#endif
-    }
-}
-
-static void ggml_compute_forward_silu(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_silu_f32(params, src0, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-// ggml_compute_forward_leaky_relu
-
-static void ggml_compute_forward_leaky_relu_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    assert(params->ith == 0);
-    assert(ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-
-    float negative_slope;
-    memcpy(&negative_slope, dst->op_params, sizeof(float));
-
-    assert(dst->nb[0]  == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
-    for (int i = 0; i < n; i++) {
-        ggml_vec_leaky_relu_f32(nc,
-                (float *) ((char *) dst->data  + i*( dst->nb[1])),
-                (float *) ((char *) src0->data + i*(src0->nb[1])), negative_slope);
-    }
-}
-
-static void ggml_compute_forward_leaky_relu(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_leaky_relu_f32(params, src0, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_silu_back
-
-static void ggml_compute_forward_silu_back_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * grad,
-        struct ggml_tensor * dst) {
-    GGML_ASSERT(ggml_is_contiguous_except_dim_1(grad));
-    GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
-    GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
-    GGML_ASSERT(ggml_are_same_shape(src0, grad));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nc = src0->ne[0];
-    const int nr = ggml_nrows(src0);
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    for (int i1 = ir0; i1 < ir1; i1++) {
-        ggml_vec_silu_backward_f32(nc,
-                (float *) ((char *) dst->data  + i1*( dst->nb[1])),
-                (float *) ((char *) src0->data + i1*(src0->nb[1])),
-                (float *) ((char *) grad->data + i1*(grad->nb[1])));
-
-#ifndef NDEBUG
-        for (int k = 0; k < nc; k++) {
-            const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
-            UNUSED(x);
-            assert(!isnan(x));
-            assert(!isinf(x));
-        }
-#endif
-    }
-}
-
-static void ggml_compute_forward_silu_back(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * grad,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_silu_back_f32(params, src0, grad, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_norm
-
-static void ggml_compute_forward_norm_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    GGML_ASSERT(src0->nb[0] == sizeof(float));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    float eps;
-    memcpy(&eps, dst->op_params, sizeof(float));
-
-    GGML_ASSERT(eps > 0.0f);
-
-    // TODO: optimize
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
-            for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
-                const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
-
-                ggml_float sum = 0.0;
-                for (int64_t i00 = 0; i00 < ne00; i00++) {
-                    sum += (ggml_float)x[i00];
-                }
-
-                float mean = sum/ne00;
-
-                float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3);
-
-                ggml_float sum2 = 0.0;
-                for (int64_t i00 = 0; i00 < ne00; i00++) {
-                    float v = x[i00] - mean;
-                    y[i00] = v;
-                    sum2 += (ggml_float)(v*v);
-                }
-
-                float variance = sum2/ne00;
-                const float scale = 1.0f/sqrtf(variance + eps);
-
-                ggml_vec_scale_f32(ne00, y, scale);
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_norm(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_norm_f32(params, src0, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_group_rms_norm
-
-static void ggml_compute_forward_rms_norm_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    GGML_ASSERT(src0->nb[0] == sizeof(float));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    float eps;
-    memcpy(&eps, dst->op_params, sizeof(float));
-
-    GGML_ASSERT(eps > 0.0f);
-
-    // TODO: optimize
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
-            for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
-                const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
-
-                ggml_float sum = 0.0;
-                for (int64_t i00 = 0; i00 < ne00; i00++) {
-                    sum += (ggml_float)(x[i00] * x[i00]);
-                }
-
-                const float mean = sum/ne00;
-
-                float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3);
-
-                memcpy(y, x, ne00 * sizeof(float));
-                // for (int i00 = 0; i00 < ne00; i00++) {
-                //     y[i00] = x[i00];
-                // }
-
-                const float scale = 1.0f/sqrtf(mean + eps);
-
-                ggml_vec_scale_f32(ne00, y, scale);
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_rms_norm(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_rms_norm_f32(params, src0, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-static void ggml_compute_forward_rms_norm_back_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    GGML_ASSERT(ggml_are_same_shape(src0, dst) && ggml_are_same_shape(src0, src1));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    GGML_ASSERT(src0->nb[0] == sizeof(float));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    float eps;
-    memcpy(&eps, dst->op_params, sizeof(float));
-
-    // TODO: optimize
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
-            for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
-                // src1 is same shape as src0 => same indices
-                const int64_t i11 = i01;
-                const int64_t i12 = i02;
-                const int64_t i13 = i03;
-
-                const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
-                const float * dz = (float *) ((char *) src1->data + i11*nb11 + i12*nb12 + i13*nb13);
-
-                ggml_float sum_xx  = 0.0;
-                ggml_float sum_xdz = 0.0;
-
-                for (int64_t i00 = 0; i00 < ne00; i00++) {
-                    sum_xx  += (ggml_float)(x[i00] * x[i00]);
-                    sum_xdz += (ggml_float)(x[i00] * dz[i00]);
-                }
-
-                //const float mean     = (float)(sum_xx)/ne00;
-                const float mean_eps = (float)(sum_xx)/ne00 + eps;
-                const float sum_eps  = (float)(sum_xx) + eps*ne00;
-                //const float mean_xdz = (float)(sum_xdz)/ne00;
-                // we could cache rms from forward pass to improve performance.
-                // to do this implement ggml_rms and compose ggml_rms_norm using ggml_rms.
-                //const float rms      = sqrtf(mean_eps);
-                const float rrms     = 1.0f / sqrtf(mean_eps);
-                //const float scale    = -rrms/(ne00 * mean_eps); // -1/(n*rms**3)
-
-                {
-                    // z = rms_norm(x)
-                    //
-                    // rms_norm(src0) =
-                    //     scale(
-                    //         src0,
-                    //         div(
-                    //             1,
-                    //             sqrt(
-                    //                 add(
-                    //                     scale(
-                    //                         sum(
-                    //                             sqr(
-                    //                                 src0)),
-                    //                         (1.0/N)),
-                    //                     eps))));
-
-                    // postorder:
-                    // ## op    args         grad
-                    // 00 param src0         grad[#00]
-                    // 01 const 1
-                    // 02 sqr   (#00)        grad[#02]
-                    // 03 sum   (#02)        grad[#03]
-                    // 04 const 1/N
-                    // 05 scale (#03, #04)   grad[#05]
-                    // 06 const eps
-                    // 07 add   (#05, #06)   grad[#07]
-                    // 08 sqrt  (#07)        grad[#08]
-                    // 09 div   (#01,#08)    grad[#09]
-                    // 10 scale (#00,#09)    grad[#10]
-                    //
-                    // backward pass, given grad[#10]
-                    // #10: scale
-                    // grad[#00] += scale(grad[#10],#09)
-                    // grad[#09] += sum(mul(grad[#10],#00))
-                    // #09: div
-                    // grad[#08] += neg(mul(grad[#09], div(#09,#08)))
-                    // #08: sqrt
-                    // grad[#07] += mul(grad[#08], div(0.5, #08))
-                    // #07: add
-                    // grad[#05] += grad[#07]
-                    // #05: scale
-                    // grad[#03] += scale(grad[#05],#04)
-                    // #03: sum
-                    // grad[#02] += repeat(grad[#03], #02)
-                    // #02:
-                    // grad[#00] += scale(mul(#00, grad[#02]), 2.0)
-                    //
-                    // substitute and simplify:
-                    // grad[#00] = scale(grad(#10), #09) + scale(mul(#00, grad[#02]), 2.0)
-                    // grad[#02] = repeat(grad[#03], #02)
-                    // grad[#02] = repeat(scale(grad[#05],#04), #02)
-                    // grad[#02] = repeat(scale(grad[#07],#04), #02)
-                    // grad[#02] = repeat(scale(mul(grad[#08], div(0.5, #08)),#04), #02)
-                    // grad[#02] = repeat(scale(mul(neg(mul(grad[#09], div(#09,#08))), div(0.5, #08)),#04), #02)
-                    // grad[#02] = repeat(scale(mul(neg(mul(sum(mul(grad[#10],#00)), div(#09,#08))), div(0.5, #08)),#04), #02)
-                    // grad[#02] = repeat(-(sum(mul(grad[#10],#00)) * div(#09,#08) * div(0.5, #08) * (1/N)), #02)
-                    // grad[#02] = repeat(-(sum(mul(grad[#10],#00)) * div(div(#01,#08),#08) * div(0.5, #08) * (1/N)), #02)
-                    // grad[#02] = repeat(-(sum(mul(grad[#10],#00)) * div(1,#08*#08) * div(0.5, #08) * (1/N)), #02)
-                    // grad[#02] = repeat(-(sum(mul(grad[#10],#00)) * div(1,#07) * div(0.5, #08) * (1/N)), #02)
-                    // grad[#00] = scale(grad(#10), #09) + scale(mul(#00, grad[#02]), 2.0)
-                    // grad[#00] = scale(grad(#10), #09) + scale(mul(#00, repeat(-(sum(mul(grad[#10],#00)) * div(1,#07) * div(0.5, #08) * (1/N)), #02)), 2.0)
-                    // grad[#00] = scale(grad(#10), #09) + scale(scale(#00, -(sum(mul(grad[#10],#00)) * div(1,#07) * div(0.5, #08) * (1/N))), 2.0)
-                    // grad[#00] = scale(grad(#10), #09) + scale(#00, -(sum(mul(grad[#10],#00)) * div(1,#07) * div(1,#08) * (1/N)))
-                    // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(1,#07*#08) * (-1/N))
-                    // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(1,#07*#08) * (-1/N))
-                    // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(1,mean_eps*rms) * (-1/N))
-                    // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(-1,rms*N*mean_eps))
-                    // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(-1,rms*N*(sum_xx/N+eps)))
-                    // grad[#00] = scale(grad(#10), #09) + scale(#00, sum(mul(grad[#10],#00)) * div(-1,rms*N*sum_xx+rms*N*eps))
-                    // grad[#00] = scale(dz, rrms) + scale(x, sum(mul(dz,x)) * div(-1,rms*N*mean_eps))
-                    // grad[#00] = scale(dz, rrms) + scale(x, sum_xdz * div(-1,rms*N*mean_eps))
-                    // a = b*c + d*e
-                    // a = b*c*f/f + d*e*f/f
-                    // a = (b*c*f + d*e*f)*(1/f)
-                    // a = (b*c*(1/c) + d*e*(1/c))*(1/(1/c))
-                    // a = (b + d*e/c)*c
-                    // b = dz, c = rrms, d = x, e = sum_xdz * div(-1,rms*N*mean_eps)
-                    // a = (dz + x*sum_xdz * div(-1,rms*N*mean_eps)/rrms)*rrms
-                    // a = (dz + x*sum_xdz * div(-1,rms*N*mean_eps)*rms)*rrms
-                    // a = (dz + x*sum_xdz * div(-rms,rms*N*mean_eps))*rrms
-                    // a = (dz + x*sum_xdz * div(-1,N*mean_eps))*rrms
-                    // a = (dz + x*div(-sum_xdz,N*mean_eps))*rrms
-                    // a = (dz + x*div(-mean_xdz,mean_eps))*rrms
-                    // grad[#00] = scale(dz + scale(x, div(-mean_xdz,mean_eps)),rrms)
-                    // grad[#00] = scale(dz + scale(x, -mean_xdz/mean_eps),rrms)
-                    // dx = scale(dz + scale(x, -mean_xdz/mean_eps),rrms)
-                }
-                // dx = scale(dz + scale(x, -mean_xdz/mean_eps),rrms)
-                // post-order:
-                // dx := x
-                // dx := scale(dx,-mean_xdz/mean_eps)
-                // dx := add(dx, dz)
-                // dx := scale(dx, rrms)
-                float * dx = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3);
-
-                ggml_vec_cpy_f32  (ne00, dx, x);
-                // ggml_vec_scale_f32(ne00, dx, -mean_xdz/mean_eps);
-                ggml_vec_scale_f32(ne00, dx, (float)(-sum_xdz)/sum_eps);
-                ggml_vec_acc_f32  (ne00, dx, dz);
-                ggml_vec_scale_f32(ne00, dx, rrms);
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_rms_norm_back(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_rms_norm_back_f32(params, src0, src1, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_group_norm
-
-static void ggml_compute_forward_group_norm_f32(
-    const struct ggml_compute_params * params,
-    const struct ggml_tensor * src0,
-    struct ggml_tensor * dst) {
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    GGML_ASSERT(src0->nb[0] == sizeof(float));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    const float eps = 1e-6f; // TODO: make this a parameter
-
-    // TODO: optimize
-
-    int n_channels = src0->ne[2];
-    int n_groups = dst->op_params[0];
-    int n_channels_per_group = (n_channels + n_groups - 1) / n_groups;
-    for (int i = ith; i < n_groups; i+=nth) {
-        int start = i * n_channels_per_group;
-        int end = start + n_channels_per_group;
-        if (end > n_channels) {
-            end = n_channels;
-        }
-        int step = end - start;
-
-        for (int64_t i03 = 0; i03 < ne03; i03++) {
-            ggml_float sum = 0.0;
-            for (int64_t i02 = start; i02 < end; i02++) {
-                for (int64_t i01 = 0; i01 < ne01; i01++) {
-                    const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);
-
-                    for (int64_t i00 = 0; i00 < ne00; i00++) {
-                        sum += (ggml_float)x[i00] / (ne00 * ne01 * step);
-                    }
-                }
-            }
-            float mean = sum;
-            ggml_float sum2 = 0.0;
-
-            for (int64_t i02 = start; i02 < end; i02++) {
-                for (int64_t i01 = 0; i01 < ne01; i01++) {
-                    const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);
-
-                    float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3);
-
-                    for (int64_t i00 = 0; i00 < ne00; i00++) {
-                        float v = x[i00] - mean;
-                        y[i00] = v;
-                        sum2 += (ggml_float)(v * v) / (ne00 * ne01 * step);
-                    }
-                }
-            }
-            float variance = sum2;
-            const float scale = 1.0f / sqrtf(variance + eps);
-
-            for (int64_t i02 = start; i02 < end; i02++) {
-                for (int64_t i01 = 0; i01 < ne01; i01++) {
-                    float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3);
-                    ggml_vec_scale_f32(ne00, y, scale);
-                }
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_group_norm(
-    const struct ggml_compute_params * params,
-    const struct ggml_tensor * src0,
-    struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_group_norm_f32(params, src0, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_mul_mat
-
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-// helper function to determine if it is better to use BLAS or not
-// for large matrices, BLAS is faster
-static bool ggml_compute_forward_mul_mat_use_blas(struct ggml_tensor * dst) {
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    //const int64_t ne00 = src0->ne[0];
-    //const int64_t ne01 = src0->ne[1];
-
-    const int64_t ne10 = src1->ne[0];
-
-    const int64_t ne0 = dst->ne[0];
-    const int64_t ne1 = dst->ne[1];
-
-    // NOTE: with GGML_OP_MUL_MAT_ID we don't want to go through the BLAS branch because it will dequantize (to_float)
-    //       all the experts for each batch element and the processing would become incredibly slow
-    // TODO: find the optimal values for these
-    if (dst->op != GGML_OP_MUL_MAT_ID &&
-        ggml_is_contiguous(src0) &&
-        ggml_is_contiguous(src1) &&
-      //src0->type == GGML_TYPE_F32 &&
-        src1->type == GGML_TYPE_F32 &&
-        (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
-
-        /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
-        return true;
-    }
-
-    return false;
-}
-#endif
-
-static void ggml_compute_forward_mul_mat(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-              struct ggml_tensor * dst) {
-    int64_t t0 = ggml_perf_time_us();
-    UNUSED(t0);
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    if (ith == 1 && g_imatrix_collect) {
-        g_imatrix_collect(src0, src1);
-    }
-
-    const enum ggml_type type = src0->type;
-
-    const bool src1_cont = ggml_is_contiguous(src1);
-
-    ggml_vec_dot_t    const vec_dot               = type_traits[type].vec_dot;
-    enum ggml_type    const vec_dot_type          = type_traits[type].vec_dot_type;
-    ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
-
-    GGML_ASSERT(ne0 == ne01);
-    GGML_ASSERT(ne1 == ne11);
-    GGML_ASSERT(ne2 == ne12);
-    GGML_ASSERT(ne3 == ne13);
-
-    // we don't support permuted src0 or src1
-    GGML_ASSERT(nb00 == ggml_type_size(type));
-    GGML_ASSERT(nb10 == ggml_type_size(src1->type));
-
-    // dst cannot be transposed or permuted
-    GGML_ASSERT(nb0 == sizeof(float));
-    GGML_ASSERT(nb0 <= nb1);
-    GGML_ASSERT(nb1 <= nb2);
-    GGML_ASSERT(nb2 <= nb3);
-
-    // broadcast factors
-    const int64_t r2 = ne12/ne02;
-    const int64_t r3 = ne13/ne03;
-
-    // nb01 >= nb00 - src0 is not transposed
-    //   compute by src0 rows
-
-#if defined(GGML_USE_CLBLAST)
-    if (ggml_cl_can_mul_mat(src0, src1, dst)) {
-        if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
-            ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
-        }
-        return;
-    }
-#endif
-
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-    if (ggml_compute_forward_mul_mat_use_blas(dst)) {
-        if (params->ith != 0) {
-            return;
-        }
-
-        if (params->type == GGML_TASK_INIT) {
-            return;
-        }
-
-        if (params->type == GGML_TASK_FINALIZE) {
-            return;
-        }
-
-        for (int64_t i13 = 0; i13 < ne13; i13++) {
-            for (int64_t i12 = 0; i12 < ne12; i12++) {
-                // broadcast src0 into src1 across 2nd,3rd dimension
-                const int64_t i03 = i13/r3;
-                const int64_t i02 = i12/r2;
-
-                const void  * x = (char *)            src0->data + i02*nb02 + i03*nb03;
-                const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13);
-                      float * d = (float *) ((char *)  dst->data + i12*nb2  + i13*nb3);
-
-                if (type != GGML_TYPE_F32) {
-                            float * const wdata    = params->wdata;
-                    ggml_to_float_t const to_float = type_traits[type].to_float;
-
-                    size_t id = 0;
-                    for (int64_t i01 = 0; i01 < ne01; ++i01) {
-                        to_float((const char *) x + i01*nb01, wdata + id, ne00);
-                        id += ne00;
-                    }
-
-                    assert(id*sizeof(float) <= params->wsize);
-                    x = wdata;
-                }
-
-                cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
-                          ne1, ne01, ne10,
-                         1.0f,    y, ne10,
-                                  x, ne00,
-                         0.0f,    d, ne01);
-            }
-        }
-
-        //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
-
-        return;
-    }
-#endif
-
-    if (params->type == GGML_TASK_INIT) {
-        if (src1->type != vec_dot_type) {
-            char * wdata = params->wdata;
-            const size_t row_size = ggml_row_size(vec_dot_type, ne10);
-
-            assert(params->wsize >= ne11*ne12*ne13*row_size);
-            GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
-            for (int64_t i13 = 0; i13 < ne13; ++i13) {
-                for (int64_t i12 = 0; i12 < ne12; ++i12) {
-                    for (int64_t i11 = 0; i11 < ne11; ++i11) {
-                        from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10);
-                        wdata += row_size;
-                    }
-                }
-            }
-        }
-
-        return;
-    }
-
-    if (params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    const void * wdata    = (src1->type == vec_dot_type) ? src1->data : params->wdata;
-    const size_t row_size = ggml_row_size(vec_dot_type, ne10);
-
-    const int64_t nr0 = ne01;          // src0 rows
-    const int64_t nr1 = ne1*ne12*ne13; // src1 rows
-
-    //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
-
-    // distribute the thread work across the inner or outer loop based on which one is larger
-
-    const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
-    const int64_t nth1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
-
-    const int64_t ith0 = ith % nth0;
-    const int64_t ith1 = ith / nth0;
-
-    const int64_t dr0 = (nr0 + nth0 - 1)/nth0;
-    const int64_t dr1 = (nr1 + nth1 - 1)/nth1;
-
-    const int64_t ir010 = dr0*ith0;
-    const int64_t ir011 = MIN(ir010 + dr0, nr0);
-
-    const int64_t ir110 = dr1*ith1;
-    const int64_t ir111 = MIN(ir110 + dr1, nr1);
-
-    //printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111);
-
-    // threads with no work simply yield (not sure if it helps)
-    if (ir010 >= ir011 || ir110 >= ir111) {
-        sched_yield();
-        return;
-    }
-
-    assert(ne12 % ne02 == 0);
-    assert(ne13 % ne03 == 0);
-
-    // block-tiling attempt
-    const int64_t blck_0 = 16;
-    const int64_t blck_1 = 16;
-
-    // attempt to reduce false-sharing (does not seem to make a difference)
-    float tmp[16];
-
-    for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
-        for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
-            for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
-                const int64_t i13 = (ir1/(ne12*ne1));
-                const int64_t i12 = (ir1 - i13*ne12*ne1)/ne1;
-                const int64_t i11 = (ir1 - i13*ne12*ne1 - i12*ne1);
-
-                // broadcast src0 into src1
-                const int64_t i03 = i13/r3;
-                const int64_t i02 = i12/r2;
-
-                const int64_t i1 = i11;
-                const int64_t i2 = i12;
-                const int64_t i3 = i13;
-
-                const char * src0_row = (const char *) src0->data + (0 + i02*nb02 + i03*nb03);
-
-                // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
-                //       if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
-                //       the original src1 data pointer, so we should index using the indices directly
-                // TODO: this is a bit of a hack, we should probably have a better way to handle this
-                const char * src1_col = (const char *) wdata +
-                    (src1_cont || src1->type != vec_dot_type
-                     ? (i11      + i12*ne11 + i13*ne12*ne11)*row_size
-                     : (i11*nb11 + i12*nb12 + i13*nb13));
-
-                float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
-
-                //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
-                //    vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
-                //}
-
-                for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
-                    vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col);
-                }
-                memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
-            }
-        }
-    }
-}
-
-// ggml_compute_forward_mul_mat_id
-
-static void ggml_compute_forward_mul_mat_id(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * ids,
-        const struct ggml_tensor * src1,
-              struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[2]; // only for GGML_TENSOR_BINARY_OP_LOCALS
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const enum ggml_type type = src0->type;
-
-    const bool src1_cont = ggml_is_contiguous(src1);
-
-    ggml_vec_dot_t    const vec_dot               = type_traits[type].vec_dot;
-    enum ggml_type    const vec_dot_type          = type_traits[type].vec_dot_type;
-    ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
-
-    GGML_ASSERT(ne0 == ne01);
-    GGML_ASSERT(ne1 == ne11);
-    GGML_ASSERT(ne2 == ne12);
-    GGML_ASSERT(ne3 == ne13);
-
-    // we don't support permuted src0 or src1
-    GGML_ASSERT(nb00 == ggml_type_size(type));
-    GGML_ASSERT(nb10 == ggml_type_size(src1->type));
-
-    // dst cannot be transposed or permuted
-    GGML_ASSERT(nb0 == sizeof(float));
-    GGML_ASSERT(nb0 <= nb1);
-    GGML_ASSERT(nb1 <= nb2);
-    GGML_ASSERT(nb2 <= nb3);
-
-    // broadcast factors
-    const int64_t r2 = ne12/ne02;
-    const int64_t r3 = ne13/ne03;
-
-    // row groups
-    const int id   = ggml_get_op_params_i32(dst, 0);
-    const int n_as = ggml_get_op_params_i32(dst, 1);
-
-    char * wdata_src1_end = (src1->type == vec_dot_type) ?
-            (char *) params->wdata :
-            (char *) params->wdata + GGML_PAD(ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t));
-
-    int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
-    int64_t * matrix_rows       = matrix_row_counts + n_as;     // [n_as][ne11]
-
-    #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]
-
-   if (params->type == GGML_TASK_INIT) {
-        char * wdata = params->wdata;
-        if (src1->type != vec_dot_type) {
-            const size_t row_size = ggml_row_size(vec_dot_type, ne10);
-
-            assert(params->wsize >= ne11*ne12*ne13*row_size);
-            assert(src1->type == GGML_TYPE_F32);
-
-            for (int64_t i13 = 0; i13 < ne13; ++i13) {
-                for (int64_t i12 = 0; i12 < ne12; ++i12) {
-                    for (int64_t i11 = 0; i11 < ne11; ++i11) {
-                        from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10);
-                        wdata += row_size;
-                    }
-                }
-            }
-        }
-
-        // initialize matrix_row_counts
-        GGML_ASSERT(wdata == wdata_src1_end);
-        memset(matrix_row_counts, 0, n_as*sizeof(int64_t));
-
-        // group rows by src0 matrix
-        for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
-            const int32_t row_id = *(const int32_t *) ((const char *) ids->data + i01*ids->nb[1] + id*ids->nb[0]);
-
-            GGML_ASSERT(row_id >= 0 && row_id < n_as);
-            MMID_MATRIX_ROW(row_id, matrix_row_counts[row_id]) = i01;
-            matrix_row_counts[row_id] += 1;
-        }
-
-        return;
-    }
-
-    if (params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    // compute each matrix multiplication in sequence
-    for (int cur_a = 0; cur_a < n_as; ++cur_a) {
-        const int64_t cne1 = matrix_row_counts[cur_a];
-
-        if (cne1 == 0) {
-            continue;
-        }
-
-        const struct ggml_tensor * src0_cur = dst->src[cur_a + 2];
-
-        if (ith == 1 && g_imatrix_collect) {
-            g_imatrix_collect(src0_cur, src1);
-        }
-
-        const void * wdata    = (src1->type == vec_dot_type) ? src1->data : params->wdata;
-        const size_t row_size = ggml_row_size(vec_dot_type, ne10);
-
-        const int64_t nr0 = ne01;           // src0 rows
-        const int64_t nr1 = cne1*ne12*ne13; // src1 rows
-
-        //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
-
-        // distribute the thread work across the inner or outer loop based on which one is larger
-
-        const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
-        const int64_t nth1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
-
-        const int64_t ith0 = ith % nth0;
-        const int64_t ith1 = ith / nth0;
-
-        const int64_t dr0 = (nr0 + nth0 - 1)/nth0;
-        const int64_t dr1 = (nr1 + nth1 - 1)/nth1;
-
-        const int64_t ir010 = dr0*ith0;
-        const int64_t ir011 = MIN(ir010 + dr0, nr0);
-
-        const int64_t ir110 = dr1*ith1;
-        const int64_t ir111 = MIN(ir110 + dr1, nr1);
-
-        //printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111);
-
-        // threads with no work simply yield (not sure if it helps)
-        if (ir010 >= ir011 || ir110 >= ir111) {
-            sched_yield();
-            continue;
-        }
-
-        assert(ne12 % ne02 == 0);
-        assert(ne13 % ne03 == 0);
-
-        // block-tiling attempt
-        const int64_t blck_0 = 16;
-        const int64_t blck_1 = 16;
-
-        // attempt to reduce false-sharing (does not seem to make a difference)
-        float tmp[16];
-
-        for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
-            for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
-                for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
-                    const int64_t  i13 = (ir1/(ne12*cne1)); // Note: currently, src1 is always a matrix
-                    const int64_t  i12 = (ir1 - i13*ne12*cne1)/cne1;
-                    const int64_t _i11 = (ir1 - i13*ne12*cne1 - i12*cne1);
-                    const int64_t  i11 = MMID_MATRIX_ROW(cur_a, _i11);
-
-                    // broadcast src0 into src1
-                    const int64_t i03 = i13/r3;
-                    const int64_t i02 = i12/r2;
-
-                    const int64_t i1 = i11;
-                    const int64_t i2 = i12;
-                    const int64_t i3 = i13;
-
-                    const char * src0_row = (const char *) src0_cur->data + (0 + i02*nb02 + i03*nb03);
-
-                    // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
-                    //       if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
-                    //       the original src1 data pointer, so we should index using the indices directly
-                    // TODO: this is a bit of a hack, we should probably have a better way to handle this
-                    const char * src1_col = (const char *) wdata +
-                        (src1_cont || src1->type != vec_dot_type
-                        ? (i11      + i12*ne11 + i13*ne12*ne11)*row_size
-                        : (i11*nb11 + i12*nb12 + i13*nb13));
-
-                    float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
-
-                    //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
-                    //    vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
-                    //}
-
-                    for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
-                        vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col);
-                    }
-                    memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
-                }
-            }
-        }
-    }
-
-    #undef MMID_MATRIX_ROW
-}
-
-// ggml_compute_forward_out_prod
-
-static void ggml_compute_forward_out_prod_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-              struct ggml_tensor * dst) {
-    // int64_t t0 = ggml_perf_time_us();
-    // UNUSED(t0);
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    GGML_ASSERT(ne0  == ne00);
-    GGML_ASSERT(ne1  == ne10);
-    GGML_ASSERT(ne2  == ne02);
-    GGML_ASSERT(ne02 == ne12);
-    GGML_ASSERT(ne3  == ne13);
-    GGML_ASSERT(ne03 == ne13);
-
-    // we don't support permuted src0 or src1
-    GGML_ASSERT(nb00 == sizeof(float));
-
-    // dst cannot be transposed or permuted
-    GGML_ASSERT(nb0 == sizeof(float));
-    // GGML_ASSERT(nb0 <= nb1);
-    // GGML_ASSERT(nb1 <= nb2);
-    // GGML_ASSERT(nb2 <= nb3);
-
-    // nb01 >= nb00 - src0 is not transposed
-    //   compute by src0 rows
-
-    // TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod
-    // TODO: #if defined(GGML_USE_CLBLAST)
-
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-    bool use_blas = ggml_is_matrix(src0) &&
-        ggml_is_matrix(src1) &&
-        ggml_is_contiguous(src0) &&
-        (ggml_is_contiguous(src1) || ggml_is_transposed(src1));
-#endif
-
-    if (params->type == GGML_TASK_INIT) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) // gemm beta will zero dst
-        if (use_blas) {
-            return;
-        }
-#endif
-        ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
-        return;
-    }
-
-    if (params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-    if (use_blas) {
-        if (params->ith != 0) { // All threads other than the first do no work.
-            return;
-        }
-        // Arguments to ggml_compute_forward_out_prod (expressed as major,minor)
-        // src0: (k,n)
-        // src1: (k,m)
-        // dst:  (m,n)
-        //
-        // Arguments to sgemm (see https://github.com/Reference-LAPACK/lapack/blob/master/BLAS/SRC/sgemm.f)
-        // Also expressed as (major,minor)
-        // a: (m,k): so src1 transposed
-        // b: (k,n): so src0
-        // c: (m,n)
-        //
-        // However, if ggml_is_transposed(src1) is true, then
-        // src1->data already contains a transposed version, so sgemm mustn't
-        // transpose it further.
-
-        int n = src0->ne[0];
-        int k = src0->ne[1];
-        int m = src1->ne[0];
-
-        int transposeA, lda;
-
-        if (!ggml_is_transposed(src1)) {
-            transposeA = CblasTrans;
-            lda = m;
-        } else {
-            transposeA = CblasNoTrans;
-            lda = k;
-        }
-
-        float * a = (float *) ((char *) src1->data);
-        float * b = (float *) ((char *) src0->data);
-        float * c = (float *) ((char *) dst->data);
-
-        cblas_sgemm(CblasRowMajor, transposeA, CblasNoTrans, m, n, k, 1.0, a, lda, b, n, 0.0, c, n);
-
-        return;
-    }
-#endif
-
-    // dst[:,:,:,:] = 0
-    // for i2,i3:
-    //   for i1:
-    //     for i01:
-    //       for i0:
-    //         dst[i0,i1,i2,i3] += src0[i0,i01,i2,i3] * src1[i1,i01,i2,i3]
-
-    // parallelize by last three dimensions
-
-    // total rows in dst
-    const int64_t nr = ne1*ne2*ne3;
-
-    // rows per thread
-    const int64_t dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int64_t ir0 = dr*ith;
-    const int64_t ir1 = MIN(ir0 + dr, nr);
-
-    // block-tiling attempt
-    const int64_t blck_0 = MAX(GGML_VEC_MAD_UNROLL, 32);
-    const int64_t blck_1 = 16;
-
-    for (int64_t bir = ir0; bir < ir1; bir += blck_1) {
-        const int64_t bir1 = MIN(bir + blck_1, ir1);
-        for (int64_t bi01 = 0; bi01 < ne01; bi01 += blck_0) {
-            const int64_t bne01 = MIN(bi01 + blck_0, ne01);
-            for (int64_t ir = bir; ir < bir1; ++ir) {
-                // dst indices
-                const int64_t i3 = ir/(ne2*ne1);
-                const int64_t i2 = (ir - i3*ne2*ne1)/ne1;
-                const int64_t i1 = (ir - i3*ne2*ne1 - i2*ne1);
-
-                const int64_t i02 = i2;
-                const int64_t i03 = i3;
-
-                //const int64_t i10 = i1;
-                const int64_t i12 = i2;
-                const int64_t i13 = i3;
-
-#if GGML_VEC_MAD_UNROLL > 2
-                const int64_t bne01_unroll = bne01 - (bne01 % GGML_VEC_MAD_UNROLL);
-                for (int64_t i01 = bi01; i01 < bne01_unroll; i01 += GGML_VEC_MAD_UNROLL) {
-                    const int64_t i11 = i01;
-
-                    float * s0 = (float *) ((char *) src0->data + (          i01*nb01 + i02*nb02 + i03*nb03));
-                    float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13));
-                    float * d  = (float *) ((char *)  dst->data + (          i1*nb1 + i2*nb2 + i3*nb3));
-
-                    ggml_vec_mad_f32_unroll(ne0, nb01, nb11, d, s0, s1);
-                }
-                for (int64_t i01 = bne01_unroll; i01 < bne01; ++i01) {
-                    const int64_t i11 = i01;
-
-                    float * s0 = (float *) ((char *) src0->data + (          i01*nb01 + i02*nb02 + i03*nb03));
-                    float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13));
-                    float * d  = (float *) ((char *)  dst->data + (          i1*nb1 + i2*nb2 + i3*nb3));
-
-                    ggml_vec_mad_f32(ne0, d, s0, *s1);
-                }
-#else
-                for (int64_t i01 = bi01; i01 < bne01; ++i01) {
-                    const int64_t i11 = i01;
-
-                    float * s0 = (float *) ((char *) src0->data + (          i01*nb01 + i02*nb02 + i03*nb03));
-                    float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13));
-                    float * d  = (float *) ((char *)  dst->data + (          i1*nb1 + i2*nb2 + i3*nb3));
-
-                    ggml_vec_mad_f32(ne0, d, s0, *s1);
-                }
-#endif
-            }
-        }
-    }
-
-    //int64_t t1 = ggml_perf_time_us();
-    //static int64_t acc = 0;
-    //acc += t1 - t0;
-    //if (t1 - t0 > 10) {
-    //    printf("\n");
-    //    printf("ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\n", ne00, ne01, ne02, ne03);
-    //    printf("nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\n", nb00, nb01, nb02, nb03);
-    //    printf("ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\n", ne10, ne11, ne12, ne13);
-    //    printf("nb10 = %5d, nb11 = %5d, nb12 = %5d, nb13 = %5d\n", nb10, nb11, nb12, nb13);
-
-    //    printf("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX task %d/%d: %d us, acc = %d\n", ith, nth, (int) (t1 - t0), (int) acc);
-    //}
-}
-
-static void ggml_compute_forward_out_prod_q_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-              struct ggml_tensor * dst) {
-    // int64_t t0 = ggml_perf_time_us();
-    // UNUSED(t0);
-
-    GGML_TENSOR_BINARY_OP_LOCALS;
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const enum ggml_type type = src0->type;
-    ggml_to_float_t const dequantize_row_q = type_traits[type].to_float;
-
-    GGML_ASSERT(ne02 == ne12);
-    GGML_ASSERT(ne03 == ne13);
-    GGML_ASSERT(ne2  == ne12);
-    GGML_ASSERT(ne3  == ne13);
-
-    // we don't support permuted src0 dim0
-    GGML_ASSERT(nb00 == ggml_type_size(type));
-
-    // dst dim0 cannot be transposed or permuted
-    GGML_ASSERT(nb0 == sizeof(float));
-    // GGML_ASSERT(nb0 <= nb1);
-    // GGML_ASSERT(nb1 <= nb2);
-    // GGML_ASSERT(nb2 <= nb3);
-
-    GGML_ASSERT(ne0 == ne00);
-    GGML_ASSERT(ne1 == ne10);
-    GGML_ASSERT(ne2 == ne02);
-    GGML_ASSERT(ne3 == ne03);
-
-    // nb01 >= nb00 - src0 is not transposed
-    //   compute by src0 rows
-
-    // TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod
-    // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
-
-    if (params->type == GGML_TASK_INIT) {
-        ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
-        return;
-    }
-
-    if (params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    // parallelize by last three dimensions
-
-    // total rows in dst
-    const int64_t nr = ne1*ne2*ne3;
-
-    // rows per thread
-    const int64_t dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int64_t ir0 = dr*ith;
-    const int64_t ir1 = MIN(ir0 + dr, nr);
-
-    // dst[:,:,:,:] = 0
-    // for i2,i3:
-    //   for i1:
-    //     for i01:
-    //       for i0:
-    //         dst[i0,i1,i2,i3] += src0[i0,i01,i2,i3] * src1[i1,i01,i2,i3]
-
-    float * wdata = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32) * ith;
-
-    for (int64_t ir = ir0; ir < ir1; ++ir) {
-        // dst indices
-        const int64_t i3 = ir/(ne2*ne1);
-        const int64_t i2 = (ir - i3*ne2*ne1)/ne1;
-        const int64_t i1 = (ir - i3*ne2*ne1 - i2*ne1);
-
-        const int64_t i02 = i2;
-        const int64_t i03 = i3;
-
-        //const int64_t i10 = i1;
-        const int64_t i12 = i2;
-        const int64_t i13 = i3;
-
-        for (int64_t i01 = 0; i01 < ne01; ++i01) {
-            const int64_t i11 = i01;
-
-            float * s0 = (float *) ((char *) src0->data + (          i01*nb01 + i02*nb02 + i03*nb03));
-            float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13));
-            float * d  = (float *) ((char *)  dst->data + (          i1*nb1 + i2*nb2 + i3*nb3));
-
-            dequantize_row_q(s0, wdata, ne0);
-            ggml_vec_mad_f32(ne0, d, wdata, *s1);
-        }
-    }
-
-    //int64_t t1 = ggml_perf_time_us();
-    //static int64_t acc = 0;
-    //acc += t1 - t0;
-    //if (t1 - t0 > 10) {
-    //    printf("\n");
-    //    printf("ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\n", ne00, ne01, ne02, ne03);
-    //    printf("nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\n", nb00, nb01, nb02, nb03);
-    //    printf("ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\n", ne10, ne11, ne12, ne13);
-    //    printf("nb10 = %5d, nb11 = %5d, nb12 = %5d, nb13 = %5d\n", nb10, nb11, nb12, nb13);
-
-    //    printf("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX task %d/%d: %d us, acc = %d\n", ith, nth, (int) (t1 - t0), (int) acc);
-    //}
-}
-
-static void ggml_compute_forward_out_prod(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-        case GGML_TYPE_Q5_0:
-        case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q8_0:
-        case GGML_TYPE_Q2_K:
-        case GGML_TYPE_Q3_K:
-        case GGML_TYPE_Q4_K:
-        case GGML_TYPE_Q5_K:
-        case GGML_TYPE_Q6_K:
-        case GGML_TYPE_IQ2_XXS:
-        case GGML_TYPE_IQ2_XS:
-            {
-                ggml_compute_forward_out_prod_q_f32(params, src0, src1, dst);
-            } break;
-        case GGML_TYPE_F16:
-            {
-                GGML_ASSERT(false); // todo
-                // ggml_compute_forward_out_prod_f16_f32(params, src0, src1, dst);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_out_prod_f32(params, src0, src1, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_scale
-
-static void ggml_compute_forward_scale_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    GGML_ASSERT(ggml_is_contiguous(src0));
-    GGML_ASSERT(ggml_is_contiguous(dst));
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    // scale factor
-    float v;
-    memcpy(&v, dst->op_params, sizeof(float));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nc = src0->ne[0];
-    const int nr = ggml_nrows(src0);
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    const size_t nb01 = src0->nb[1];
-
-    const size_t nb1 = dst->nb[1];
-
-    for (int i1 = ir0; i1 < ir1; i1++) {
-        if (dst->data != src0->data) {
-            // src0 is same shape as dst => same indices
-            memcpy((char *)dst->data + i1*nb1, (char *)src0->data + i1*nb01, nc * sizeof(float));
-        }
-        ggml_vec_scale_f32(nc, (float *) ((char *) dst->data + i1*nb1), v);
-    }
-}
-
-static void ggml_compute_forward_scale(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_scale_f32(params, src0, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_set
-
-static void ggml_compute_forward_set_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
-    GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
-
-    // view src0 and dst with these strides and data offset inbytes during set
-    // nb0 is implicitly element_size because src0 and dst are contiguous
-    size_t nb1     = ((int32_t *) dst->op_params)[0];
-    size_t nb2     = ((int32_t *) dst->op_params)[1];
-    size_t nb3     = ((int32_t *) dst->op_params)[2];
-    size_t offset  = ((int32_t *) dst->op_params)[3];
-    bool   inplace = (bool) ((int32_t *) dst->op_params)[4];
-
-    if (!inplace && (params->type == GGML_TASK_INIT)) {
-        // memcpy needs to be synchronized across threads to avoid race conditions.
-        // => do it in INIT phase
-        memcpy(
-            ((char *)  dst->data),
-            ((char *) src0->data),
-            ggml_nbytes(dst));
-    }
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nr = ggml_nrows(src1);
-    const int nc = src1->ne[0];
-
-    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne)
-    GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb)
-
-    // src0 and dst as viewed during set
-    const size_t nb0 = ggml_element_size(src0);
-
-    const int im0 = (ne10 == 0 ? 0 : ne10-1);
-    const int im1 = (ne11 == 0 ? 0 : ne11-1);
-    const int im2 = (ne12 == 0 ? 0 : ne12-1);
-    const int im3 = (ne13 == 0 ? 0 : ne13-1);
-
-    GGML_ASSERT(offset + im0*nb0  + im1*nb1  + im2*nb2  + im3*nb3  <= ggml_nbytes(dst));
-
-    GGML_ASSERT(nb10 == sizeof(float));
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    for (int ir = ir0; ir < ir1; ++ir) {
-        // src0 and dst are viewed with shape of src1 and offset
-        // => same indices
-        const int i3 = ir/(ne12*ne11);
-        const int i2 = (ir - i3*ne12*ne11)/ne11;
-        const int i1 = (ir - i3*ne12*ne11 - i2*ne11);
-
-        ggml_vec_cpy_f32(nc,
-                (float *) ((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + offset),
-                (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11));
-    }
-}
-
-static void ggml_compute_forward_set(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_set_f32(params, src0, src1, dst);
-            } break;
-        case GGML_TYPE_F16:
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-        case GGML_TYPE_Q5_0:
-        case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q8_0:
-        case GGML_TYPE_Q8_1:
-        case GGML_TYPE_Q2_K:
-        case GGML_TYPE_Q3_K:
-        case GGML_TYPE_Q4_K:
-        case GGML_TYPE_Q5_K:
-        case GGML_TYPE_Q6_K:
-        case GGML_TYPE_IQ2_XXS:
-        case GGML_TYPE_IQ2_XS:
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_cpy
-
-static void ggml_compute_forward_cpy(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    ggml_compute_forward_dup(params, src0, dst);
-}
-
-// ggml_compute_forward_cont
-
-static void ggml_compute_forward_cont(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    ggml_compute_forward_dup(params, src0, dst);
-}
-
-// ggml_compute_forward_reshape
-
-static void ggml_compute_forward_reshape(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    // NOP
-    UNUSED(params);
-    UNUSED(src0);
-    UNUSED(dst);
-}
-
-// ggml_compute_forward_view
-
-static void ggml_compute_forward_view(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0) {
-    // NOP
-    UNUSED(params);
-    UNUSED(src0);
-}
-
-// ggml_compute_forward_permute
-
-static void ggml_compute_forward_permute(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0) {
-    // NOP
-    UNUSED(params);
-    UNUSED(src0);
-}
-
-// ggml_compute_forward_transpose
-
-static void ggml_compute_forward_transpose(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0) {
-    // NOP
-    UNUSED(params);
-    UNUSED(src0);
-}
-
-// ggml_compute_forward_get_rows
-
-static void ggml_compute_forward_get_rows_q(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-              struct ggml_tensor * dst) {
-    assert(params->ith == 0);
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    const int64_t nc = ne00;
-    const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr);
-
-    const enum ggml_type type = src0->type;
-    ggml_to_float_t const dequantize_row_q = type_traits[type].to_float;
-
-    assert(ne0  == nc);
-    assert(ne02 == ne11);
-    assert(nb00 == ggml_type_size(type));
-    assert(ggml_nrows(dst) == nr);
-
-    // TODO: multi-thread
-    for (int64_t i12 = 0; i12 < ne12; ++i12) {
-        for (int64_t i11 = 0; i11 < ne11; ++i11) {
-            for (int64_t i10 = 0; i10 < ne10; ++i10) {
-                const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
-
-                dequantize_row_q(
-                        (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
-                             (float *) ((char *)  dst->data + i10*nb1  + i11*nb2  + i12*nb3), nc);
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_get_rows_f16(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-              struct ggml_tensor * dst) {
-    assert(params->ith == 0);
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    const int64_t nc = ne00;
-    const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr);
-
-    assert(ne0  == nc);
-    assert(ne02 == ne11);
-    assert(nb00 == sizeof(ggml_fp16_t));
-    assert(ggml_nrows(dst) == nr);
-
-    // TODO: multi-thread
-    for (int64_t i12 = 0; i12 < ne12; ++i12) {
-        for (int64_t i11 = 0; i11 < ne11; ++i11) {
-            for (int64_t i10 = 0; i10 < ne10; ++i10) {
-                const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
-
-                ggml_fp16_to_fp32_row(
-                        (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
-                             (float *) ((char *)  dst->data + i10*nb1  + i11*nb2  + i12*nb3), nc);
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_get_rows_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-              struct ggml_tensor * dst) {
-    assert(params->ith == 0);
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    const int64_t nc = ne00;
-    const int64_t nr = ggml_nelements(src1); GGML_UNUSED(nr);
-
-    assert(ne0  == nc);
-    assert(ne02 == ne11);
-    assert(nb00 == sizeof(float));
-    assert(ggml_nrows(dst) == nr);
-
-    // TODO: multi-thread
-    for (int64_t i12 = 0; i12 < ne12; ++i12) {
-        for (int64_t i11 = 0; i11 < ne11; ++i11) {
-            for (int64_t i10 = 0; i10 < ne10; ++i10) {
-                const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
-
-                ggml_vec_cpy_f32(nc,
-                        (float *) ((char *)  dst->data + i10*nb1  + i11*nb2  + i12*nb3),
-                        (float *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03));
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_get_rows(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-        case GGML_TYPE_Q5_0:
-        case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q8_0:
-        case GGML_TYPE_Q8_1:
-        case GGML_TYPE_Q2_K:
-        case GGML_TYPE_Q3_K:
-        case GGML_TYPE_Q4_K:
-        case GGML_TYPE_Q5_K:
-        case GGML_TYPE_Q6_K:
-        case GGML_TYPE_IQ2_XXS:
-        case GGML_TYPE_IQ2_XS:
-            {
-                ggml_compute_forward_get_rows_q(params, src0, src1, dst);
-            } break;
-        case GGML_TYPE_F16:
-            {
-                ggml_compute_forward_get_rows_f16(params, src0, src1, dst);
-            } break;
-        case GGML_TYPE_F32:
-        case GGML_TYPE_I32:
-            {
-                ggml_compute_forward_get_rows_f32(params, src0, src1, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-
-    //static bool first = true;
-    //printf("ne0 = %d, ne1 = %d, ne2 = %d\n", dst->ne[0], dst->ne[1], dst->ne[2]);
-    //if (first) {
-    //    first = false;
-    //} else {
-    //    for (int k = 0; k < dst->ne[1]; ++k) {
-    //        for (int j = 0; j < dst->ne[0]/16; ++j) {
-    //            for (int i = 0; i < 16; ++i) {
-    //                printf("%8.4f ", ((float *) dst->data)[k*dst->ne[0] + j*16 + i]);
-    //            }
-    //            printf("\n");
-    //        }
-    //        printf("\n");
-    //    }
-    //    printf("\n");
-    //    exit(0);
-    //}
-}
-
-// ggml_compute_forward_get_rows_back
-
-static void ggml_compute_forward_get_rows_back_f32_f16(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-              struct ggml_tensor * dst) {
-    GGML_ASSERT(params->ith == 0);
-    GGML_ASSERT(ggml_is_contiguous(dst));
-
-    // ggml_compute_forward_dup_same_cont(params, opt0, dst);
-
-    if (params->type == GGML_TASK_INIT) {
-        memset(dst->data, 0, ggml_nbytes(dst));
-    }
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    const int nc = src0->ne[0];
-    const int nr = ggml_nelements(src1);
-
-    GGML_ASSERT( dst->ne[0] == nc);
-    GGML_ASSERT(src0->nb[0] == sizeof(ggml_fp16_t));
-
-    for (int i = 0; i < nr; ++i) {
-        const int r = ((int32_t *) src1->data)[i];
-
-        for (int j = 0; j < nc; ++j) {
-            ggml_fp16_t v = ((ggml_fp16_t *) ((char *) src0->data + i*src0->nb[1]))[j];
-            ((float *) ((char *) dst->data + r*dst->nb[1]))[j] += GGML_FP16_TO_FP32(v);
-        }
-    }
-}
-
-static void ggml_compute_forward_get_rows_back_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-              struct ggml_tensor * dst) {
-    GGML_ASSERT(params->ith == 0);
-    GGML_ASSERT(ggml_is_contiguous(dst));
-
-    // ggml_compute_forward_dup_same_cont(params, opt0, dst);
-
-    if (params->type == GGML_TASK_INIT) {
-        memset(dst->data, 0, ggml_nbytes(dst));
-    }
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    const int nc = src0->ne[0];
-    const int nr = ggml_nelements(src1);
-
-    GGML_ASSERT( dst->ne[0] == nc);
-    GGML_ASSERT(src0->nb[0] == sizeof(float));
-
-    for (int i = 0; i < nr; ++i) {
-        const int r = ((int32_t *) src1->data)[i];
-
-        ggml_vec_add_f32(nc,
-                (float *) ((char *)  dst->data + r*dst->nb[1]),
-                (float *) ((char *)  dst->data + r*dst->nb[1]),
-                (float *) ((char *) src0->data + i*src0->nb[1]));
-    }
-}
-
-static void ggml_compute_forward_get_rows_back(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F16:
-            {
-                ggml_compute_forward_get_rows_back_f32_f16(params, src0, src1, dst);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_get_rows_back_f32(params, src0, src1, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-
-    //static bool first = true;
-    //printf("ne0 = %d, ne1 = %d, ne2 = %d\n", dst->ne[0], dst->ne[1], dst->ne[2]);
-    //if (first) {
-    //    first = false;
-    //} else {
-    //    for (int k = 0; k < dst->ne[1]; ++k) {
-    //        for (int j = 0; j < dst->ne[0]/16; ++j) {
-    //            for (int i = 0; i < 16; ++i) {
-    //                printf("%8.4f ", ((float *) dst->data)[k*dst->ne[0] + j*16 + i]);
-    //            }
-    //            printf("\n");
-    //        }
-    //        printf("\n");
-    //    }
-    //    printf("\n");
-    //    exit(0);
-    //}
-}
-
-// ggml_compute_forward_diag
-
-static void ggml_compute_forward_diag_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    GGML_ASSERT(params->ith == 0);
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    // TODO: handle transposed/permuted matrices
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    GGML_ASSERT(ne00 == ne0);
-    GGML_ASSERT(ne00 == ne1);
-    GGML_ASSERT(ne01 == 1);
-    GGML_ASSERT(ne02 == ne2);
-    GGML_ASSERT(ne03 == ne3);
-
-    GGML_ASSERT(nb00 == sizeof(float));
-    GGML_ASSERT(nb0  == sizeof(float));
-
-    for (int i3 = 0; i3 < ne3; i3++) {
-        for (int i2 = 0; i2 < ne2; i2++) {
-            for (int i1 = 0; i1 < ne1; i1++) {
-                float * d = (float *)((char *)  dst->data + i3*nb3  + i2*nb2 + i1*nb1);
-                float * s = (float *)((char *) src0->data + i3*nb03 + i2*nb02);
-                for (int i0 = 0; i0 < i1; i0++) {
-                    d[i0] = 0;
-                }
-                d[i1] = s[i1];
-                for (int i0 = i1+1; i0 < ne0; i0++) {
-                    d[i0] = 0;
-                }
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_diag(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_diag_f32(params, src0, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_diag_mask_inf
-
-static void ggml_compute_forward_diag_mask_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst,
-        const float value) {
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int  n_past  = ((int32_t *) dst->op_params)[0];
-    const bool inplace = src0->data == dst->data;
-
-    GGML_ASSERT(n_past >= 0);
-
-    if (!inplace && (params->type == GGML_TASK_INIT)) {
-        // memcpy needs to be synchronized across threads to avoid race conditions.
-        // => do it in INIT phase
-        GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
-        GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
-        memcpy(
-            ((char *)  dst->data),
-            ((char *) src0->data),
-            ggml_nbytes(dst));
-    }
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    // TODO: handle transposed/permuted matrices
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-    const int nr = src0->ne[1];
-    const int nz = n/nr;
-
-    GGML_ASSERT( dst->nb[0] == sizeof(float));
-    GGML_ASSERT(src0->nb[0] == sizeof(float));
-
-    for (int k = 0; k < nz; k++) {
-        for (int j = ith; j < nr; j += nth) {
-            for (int i = n_past; i < nc; i++) {
-                if (i > n_past + j) {
-                    *(float *)((char *) dst->data + k*dst->nb[2] + j*dst->nb[1] + i*dst->nb[0]) = value;
-                }
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_diag_mask_inf(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_diag_mask_f32(params, src0, dst, -INFINITY);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-static void ggml_compute_forward_diag_mask_zero(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_diag_mask_f32(params, src0, dst, 0);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_soft_max
-
-static void ggml_compute_forward_soft_max_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-              struct ggml_tensor * dst) {
-    assert(ggml_is_contiguous(dst));
-    assert(ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    float scale = 1.0f;
-    memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
-
-    // TODO: handle transposed/permuted matrices
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int64_t ne11 = src1 ? src1->ne[1] : 1;
-
-    const int nc = src0->ne[0];
-    const int nr = ggml_nrows(src0);
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    float * wp = (float *) params->wdata + (nc + CACHE_LINE_SIZE_F32) * ith;
-
-    for (int i1 = ir0; i1 < ir1; i1++) {
-        float * sp = (float *)((char *) src0->data + i1*src0->nb[1]);
-        float * dp = (float *)((char *)  dst->data +  i1*dst->nb[1]);
-
-        // broadcast the mask across rows
-        float * mp = src1 ? (float *)((char *) src1->data + (i1%ne11)*src1->nb[1]) : NULL;
-
-        ggml_vec_cpy_f32  (nc, wp, sp);
-        ggml_vec_scale_f32(nc, wp, scale);
-        if (mp) {
-            ggml_vec_acc_f32(nc, wp, mp);
-        }
-
-#ifndef NDEBUG
-        for (int i = 0; i < nc; ++i) {
-            //printf("p[%d] = %f\n", i, p[i]);
-            assert(!isnan(wp[i]));
-        }
-#endif
-
-        float max = -INFINITY;
-        ggml_vec_max_f32(nc, &max, wp);
-
-        ggml_float sum = 0.0;
-
-        uint16_t scvt;
-        for (int i = 0; i < nc; i++) {
-            if (wp[i] == -INFINITY) {
-                dp[i] = 0.0f;
-            } else {
-                // const float val = (wp[i] == -INFINITY) ? 0.0 : exp(wp[i] - max);
-                ggml_fp16_t s = GGML_FP32_TO_FP16(wp[i] - max);
-                memcpy(&scvt, &s, sizeof(scvt));
-                const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt]);
-                sum += (ggml_float)val;
-                dp[i] = val;
-            }
-        }
-
-        assert(sum > 0.0);
-
-        sum = 1.0/sum;
-        ggml_vec_scale_f32(nc, dp, sum);
-
-#ifndef NDEBUG
-        for (int i = 0; i < nc; ++i) {
-            assert(!isnan(dp[i]));
-            assert(!isinf(dp[i]));
-        }
-#endif
-    }
-}
-
-static void ggml_compute_forward_soft_max(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-              struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_soft_max_f32(params, src0, src1, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_soft_max_back
-
-static void ggml_compute_forward_soft_max_back_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    GGML_ASSERT(ggml_is_contiguous(src0));
-    GGML_ASSERT(ggml_is_contiguous(src1));
-    GGML_ASSERT(ggml_is_contiguous(dst));
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
-    GGML_ASSERT(ggml_are_same_shape(src1, dst));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    // TODO: handle transposed/permuted matrices
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nc = src0->ne[0];
-    const int nr = ggml_nrows(src0);
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    for (int i1 = ir0; i1 < ir1; i1++) {
-        float *dy = (float *)((char *) src0->data + i1*src0->nb[1]);
-        float *y  = (float *)((char *) src1->data + i1*src1->nb[1]);
-        float *dx = (float *)((char *) dst->data  + i1*dst->nb[1]);
-
-#ifndef NDEBUG
-        for (int i = 0; i < nc; ++i) {
-            //printf("p[%d] = %f\n", i, p[i]);
-            assert(!isnan(dy[i]));
-            assert(!isnan(y[i]));
-        }
-#endif
-        // Jii = yi - yi*yi
-        // Jij = -yi*yj
-        // J = diag(y)-y.T*y
-        // dx = J * dy
-        // dxk = sum_i(Jki * dyi)
-        // dxk = sum_i(-yk*yi * dyi) - (-yk*yk)*dyk + (yk - yk*yk)*dyk
-        // dxk = sum_i(-yk*yi * dyi) + yk*yk*dyk + yk*dyk - yk*yk*dyk
-        // dxk = sum_i(-yk*yi * dyi) + yk*dyk
-        // dxk = -yk * sum_i(yi * dyi) + yk*dyk
-        // dxk = -yk * dot(y, dy) + yk*dyk
-        // dxk = yk * (- dot(y, dy) + dyk)
-        // dxk = yk * (dyk - dot(y, dy))
-        //
-        // post-order:
-        // dot_y_dy := dot(y, dy)
-        // dx := dy
-        // dx := dx - dot_y_dy
-        // dx := dx * y
-
-        // linear runtime, no additional memory
-        float dot_y_dy = 0;
-        ggml_vec_dot_f32 (nc, &dot_y_dy, y, dy);
-        ggml_vec_cpy_f32 (nc, dx, dy);
-        ggml_vec_acc1_f32(nc, dx, -dot_y_dy);
-        ggml_vec_mul_f32 (nc, dx, dx, y);
-
-#ifndef NDEBUG
-        for (int i = 0; i < nc; ++i) {
-            assert(!isnan(dx[i]));
-            assert(!isinf(dx[i]));
-        }
-#endif
-    }
-}
-
-static void ggml_compute_forward_soft_max_back(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_soft_max_back_f32(params, src0, src1, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_alibi
-
-static void ggml_compute_forward_alibi_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    assert(params->ith == 0);
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    //const int n_past = ((int32_t *) dst->op_params)[0];
-    const int n_head = ((int32_t *) dst->op_params)[1];
-    float max_bias;
-    memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
-
-    const int64_t ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
-    const int64_t ne1 = src0->ne[1]; // seq_len_without_past
-    const int64_t ne2 = src0->ne[2]; // n_head -> this is k
-    //const int64_t ne3 = src0->ne[3]; // 1 -> bsz
-
-    const int64_t n  = ggml_nrows(src0);
-    const int64_t ne2_ne3 = n/ne1; // ne2*ne3
-
-    const size_t nb0 = src0->nb[0];
-    const size_t nb1 = src0->nb[1];
-    const size_t nb2 = src0->nb[2];
-    //const int nb3 = src0->nb[3];
-
-    GGML_ASSERT(nb0 == sizeof(float));
-    GGML_ASSERT(n_head == ne2);
-
-    // add alibi to src0 (KQ_scaled)
-    const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
-
-    const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
-    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
-
-    for (int64_t i = 0; i < ne0; i++) {
-        for (int64_t j = 0; j < ne1; j++) {
-            for (int64_t k = 0; k < ne2_ne3; k++) {
-                float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
-                float *      pdst = (float *)((char *)  dst->data + i*nb0 + j*nb1 + k*nb2);
-
-                // TODO: k*nb2 or k*nb3
-
-                float m_k;
-
-                if (k < n_heads_log2_floor) {
-                    m_k = powf(m0, k + 1);
-                } else {
-                    m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
-                }
-
-                pdst[0] = i * m_k + src[0];
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_alibi_f16(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    assert(params->ith == 0);
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    //const int n_past = ((int32_t *) dst->op_params)[0];
-    const int n_head = ((int32_t *) dst->op_params)[1];
-    float max_bias;
-    memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
-
-    const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
-    const int ne1 = src0->ne[1]; // seq_len_without_past
-    const int ne2 = src0->ne[2]; // n_head -> this is k
-    //const int ne3 = src0->ne[3]; // 1 -> bsz
-
-    const int n  = ggml_nrows(src0);
-    const int ne2_ne3 = n/ne1; // ne2*ne3
-
-    const int nb0 = src0->nb[0];
-    const int nb1 = src0->nb[1];
-    const int nb2 = src0->nb[2];
-    //const int nb3 = src0->nb[3];
-
-    GGML_ASSERT(nb0 == sizeof(ggml_fp16_t));
-    //GGML_ASSERT(ne1 + n_past == ne0); (void) n_past;
-    GGML_ASSERT(n_head == ne2);
-
-    // add alibi to src0 (KQ_scaled)
-    const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
-
-    const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
-    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
-
-    for (int i = 0; i < ne0; i++) {
-        for (int j = 0; j < ne1; j++) {
-            for (int k = 0; k < ne2_ne3; k++) {
-                ggml_fp16_t * const src  = (ggml_fp16_t *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
-                      float *      pdst  =       (float *)((char *)  dst->data + i*nb0 + j*nb1 + k*nb2);
-
-                // TODO: k*nb2 or k*nb3
-
-                float m_k;
-
-                if (k < n_heads_log2_floor) {
-                    m_k = powf(m0, k + 1);
-                } else {
-                    m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
-                }
-
-                // we return F32
-                pdst[0] = i * m_k + GGML_FP16_TO_FP32(src[0]);
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_alibi(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F16:
-            {
-                ggml_compute_forward_alibi_f16(params, src0, dst);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_alibi_f32(params, src0, dst);
-            } break;
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-        case GGML_TYPE_Q5_0:
-        case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q8_0:
-        case GGML_TYPE_Q8_1:
-        case GGML_TYPE_Q2_K:
-        case GGML_TYPE_Q3_K:
-        case GGML_TYPE_Q4_K:
-        case GGML_TYPE_Q5_K:
-        case GGML_TYPE_Q6_K:
-        case GGML_TYPE_IQ2_XXS:
-        case GGML_TYPE_IQ2_XS:
-        case GGML_TYPE_Q8_K:
-        case GGML_TYPE_I8:
-        case GGML_TYPE_I16:
-        case GGML_TYPE_I32:
-        case GGML_TYPE_COUNT:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_clamp
-
-static void ggml_compute_forward_clamp_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    assert(params->ith == 0);
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    float min;
-    float max;
-    memcpy(&min, (float *) dst->op_params + 0, sizeof(float));
-    memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-
-    const size_t nb00 = src0->nb[0];
-    const size_t nb01 = src0->nb[1];
-
-    const size_t nb0 = dst->nb[0];
-    const size_t nb1 = dst->nb[1];
-
-    GGML_ASSERT( nb0 == sizeof(float));
-    GGML_ASSERT(nb00 == sizeof(float));
-
-    for (int j = ith; j < n; j += nth) {
-        float * dst_ptr  = (float *) ((char *)  dst->data + j*nb1);
-        float * src0_ptr = (float *) ((char *) src0->data + j*nb01);
-
-        for (int i = 0; i < nc; i++) {
-            dst_ptr[i] = MAX(MIN(src0_ptr[i], max), min);
-        }
-    }
-}
-
-static void ggml_compute_forward_clamp(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_clamp_f32(params, src0, dst);
-            } break;
-        case GGML_TYPE_F16:
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-        case GGML_TYPE_Q5_0:
-        case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q8_0:
-        case GGML_TYPE_Q8_1:
-        case GGML_TYPE_Q2_K:
-        case GGML_TYPE_Q3_K:
-        case GGML_TYPE_Q4_K:
-        case GGML_TYPE_Q5_K:
-        case GGML_TYPE_Q6_K:
-        case GGML_TYPE_IQ2_XXS:
-        case GGML_TYPE_IQ2_XS:
-        case GGML_TYPE_Q8_K:
-        case GGML_TYPE_I8:
-        case GGML_TYPE_I16:
-        case GGML_TYPE_I32:
-        case GGML_TYPE_COUNT:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_rope
-
-static float rope_yarn_ramp(const float low, const float high, const int i0) {
-    const float y = (i0 / 2 - low) / MAX(0.001f, high - low);
-    return 1 - MIN(1, MAX(0, y));
-}
-
-// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
-// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
-static void rope_yarn(
-    float theta_extrap, float freq_scale, float corr_dims[2], int64_t i0, float ext_factor, float mscale,
-    float * cos_theta, float * sin_theta
-) {
-    // Get n-d rotational scaling corrected for extrapolation
-    float theta_interp = freq_scale * theta_extrap;
-    float theta = theta_interp;
-    if (ext_factor != 0.0f) {
-        float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor;
-        theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
-
-        // Get n-d magnitude scaling corrected for interpolation
-        mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale);
-    }
-    *cos_theta = cosf(theta) * mscale;
-    *sin_theta = sinf(theta) * mscale;
-}
-
-// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get
-// `corr_dim(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))`
-static float ggml_rope_yarn_corr_dim(int n_dims, int n_orig_ctx, float n_rot, float base) {
-    return n_dims * logf(n_orig_ctx / (n_rot * 2 * (float)M_PI)) / (2 * logf(base));
-}
-
-static void ggml_rope_cache_init(
-     float theta_base, float freq_scale, float corr_dims[2], int64_t ne0, float ext_factor, float mscale,
-     float * cache, float sin_sign, float theta_scale
-) {
-    float theta = theta_base;
-    for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
-        rope_yarn(
-            theta, freq_scale, corr_dims, i0, ext_factor, mscale, &cache[i0 + 0], &cache[i0 + 1]
-        );
-        cache[i0 + 1] *= sin_sign;
-
-        theta *= theta_scale;
-    }
-}
-
-void ggml_rope_yarn_corr_dims(
-    int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]
-) {
-    // start and end correction dims
-    dims[0] = MAX(0,         floorf(ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_fast, freq_base)));
-    dims[1] = MIN(n_dims - 1, ceilf(ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_slow, freq_base)));
-}
-
-static void ggml_compute_forward_rope_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst,
-        const bool forward) {
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
-
-    // these two only relevant for xPos RoPE:
-    float xpos_base;
-    bool  xpos_down;
-
-    //const int n_past     = ((int32_t *) dst->op_params)[0];
-    const int n_dims     = ((int32_t *) dst->op_params)[1];
-    const int mode       = ((int32_t *) dst->op_params)[2];
-    const int n_ctx      = ((int32_t *) dst->op_params)[3];
-    const int n_orig_ctx = ((int32_t *) dst->op_params)[4];
-
-    memcpy(&freq_base,   (int32_t *) dst->op_params +  5, sizeof(float));
-    memcpy(&freq_scale,  (int32_t *) dst->op_params +  6, sizeof(float));
-    memcpy(&ext_factor,  (int32_t *) dst->op_params +  7, sizeof(float));
-    memcpy(&attn_factor, (int32_t *) dst->op_params +  8, sizeof(float));
-    memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
-    memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
-    memcpy(&xpos_base,   (int32_t *) dst->op_params + 11, sizeof(float));
-    memcpy(&xpos_down,   (int32_t *) dst->op_params + 12, sizeof(bool));
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
-    //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
-
-    GGML_ASSERT(nb00 == sizeof(float));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nr = ggml_nrows(dst);
-
-    GGML_ASSERT(n_dims <= ne0);
-    GGML_ASSERT(n_dims % 2 == 0);
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    // row index used to determine which thread to use
-    int ir = 0;
-
-    const float theta_scale = powf(freq_base, -2.0f/n_dims);
-    const float inv_ndims = -1.f/n_dims;
-    float corr_dims[2];
-    ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims);
-
-    const bool is_neox = mode & 2;
-    const bool is_glm  = mode & 4;
-
-    // backward process uses inverse rotation by cos and sin.
-    // cos and sin build a rotation matrix, where the inverse is the transpose.
-    // this essentially just switches the sign of sin.
-    const float sin_sign = forward ? 1.0f : -1.0f;
-
-    const int32_t * pos = (const int32_t *) src1->data;
-
-    for (int64_t i3 = 0; i3 < ne3; i3++) {
-        for (int64_t i2 = 0; i2 < ne2; i2++) {
-            const int64_t p = pos[i2];
-
-            float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
-            if (!is_glm && !is_neox) { // TODO: cache sin/cos for glm, neox
-                ggml_rope_cache_init(p, freq_scale, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
-            }
-
-            for (int64_t i1 = 0; i1 < ne1; i1++) {
-                if (ir++ < ir0) continue;
-                if (ir   > ir1) break;
-
-                float theta_base = (float)p;
-
-                if (is_glm) {
-                    theta_base = MIN(p, n_ctx - 2);
-                    float block_theta = MAX(p - (n_ctx - 2), 0);
-                    for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
-                        const float cos_theta = cosf(theta_base);
-                        const float sin_theta = sinf(theta_base) * sin_sign;
-                        const float cos_block_theta = cosf(block_theta);
-                        const float sin_block_theta = sinf(block_theta) * sin_sign;
-
-                        theta_base *= theta_scale;
-                        block_theta *= theta_scale;
-
-                        const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                              float * dst_data  = (float *)((char *)  dst->data +  i3*nb3 + i2*nb2  + i1*nb1  + i0*nb0);
-
-                        const float x0 = src[0];
-                        const float x1 = src[n_dims/2];
-                        const float x2 = src[n_dims];
-                        const float x3 = src[n_dims/2*3];
-
-                        dst_data[0]          = x0*cos_theta - x1*sin_theta;
-                        dst_data[n_dims/2]   = x0*sin_theta + x1*cos_theta;
-                        dst_data[n_dims]     = x2*cos_block_theta - x3*sin_block_theta;
-                        dst_data[n_dims/2*3] = x2*sin_block_theta + x3*cos_block_theta;
-                    }
-                } else if (!is_neox) {
-                    for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
-                        const float cos_theta = cache[i0 + 0];
-                        const float sin_theta = cache[i0 + 1];
-
-                        // zeta scaling for xPos only:
-                        float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f;
-                        if (xpos_down) zeta = 1.0f / zeta;
-
-                        const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                              float * dst_data  = (float *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
-
-                        const float x0 = src[0];
-                        const float x1 = src[1];
-
-                        dst_data[0] = x0*cos_theta*zeta - x1*sin_theta*zeta;
-                        dst_data[1] = x0*sin_theta*zeta + x1*cos_theta*zeta;
-                    }
-                } else {
-                    // TODO: this might be wrong for ne0 != n_dims - need double check
-                    //       it seems we have to rope just the first n_dims elements and do nothing with the rest
-                    // ref:  https://github.com/ml-explore/mlx/blob/dc2edc762c797e3b8de50b1dad4dc0a131691033/benchmarks/python/llama_jax_bench.py#L11-L26
-                    theta_base *= freq_scale;
-                    for (int64_t ic = 0; ic < ne0; ic += 2) {
-                        if (ic < n_dims) {
-                            const int64_t ib = 0;
-
-                            // simplified from `(ib * n_dims + ic) * inv_ndims`
-                            float cur_rot = inv_ndims * ic - ib;
-
-                            float cos_theta, sin_theta;
-                            rope_yarn(
-                                theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
-                                &cos_theta, &sin_theta
-                            );
-                            sin_theta *= sin_sign;
-
-                            theta_base *= theta_scale;
-
-                            const int64_t i0 = ib*n_dims + ic/2;
-
-                            const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                                  float * dst_data  = (float *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
-
-                            const float x0 = src[0];
-                            const float x1 = src[n_dims/2];
-
-                            dst_data[0]        = x0*cos_theta - x1*sin_theta;
-                            dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
-                        } else {
-                            const int64_t i0 = ic;
-
-                            const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                                  float * dst_data  = (float *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
-
-                            dst_data[0] = src[0];
-                            dst_data[1] = src[1];
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_rope_f16(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst,
-        const bool forward) {
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
-
-    //const int n_past     = ((int32_t *) dst->op_params)[0];
-    const int n_dims     = ((int32_t *) dst->op_params)[1];
-    const int mode       = ((int32_t *) dst->op_params)[2];
-    const int n_ctx      = ((int32_t *) dst->op_params)[3];
-    const int n_orig_ctx = ((int32_t *) dst->op_params)[4];
-    memcpy(&freq_base,   (int32_t *) dst->op_params +  5, sizeof(float));
-    memcpy(&freq_scale,  (int32_t *) dst->op_params +  6, sizeof(float));
-    memcpy(&ext_factor,  (int32_t *) dst->op_params +  7, sizeof(float));
-    memcpy(&attn_factor, (int32_t *) dst->op_params +  8, sizeof(float));
-    memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
-    memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
-    //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
-
-    GGML_ASSERT(nb0 == sizeof(ggml_fp16_t));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nr = ggml_nrows(dst);
-
-    GGML_ASSERT(n_dims <= ne0);
-    GGML_ASSERT(n_dims % 2 == 0);
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    // row index used to determine which thread to use
-    int ir = 0;
-
-    const float theta_scale = powf(freq_base, -2.0f/n_dims);
-    const float inv_ndims = -1.f/n_dims;
-    float corr_dims[2];
-    ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims);
-
-    const bool is_neox = mode & 2;
-    const bool is_glm  = mode & 4;
-
-    // backward process uses inverse rotation by cos and sin.
-    // cos and sin build a rotation matrix, where the inverse is the transpose.
-    // this essentially just switches the sign of sin.
-    const float sin_sign = forward ? 1.0f : -1.0f;
-
-    const int32_t * pos = (const int32_t *) src1->data;
-
-    for (int64_t i3 = 0; i3 < ne3; i3++) {
-        for (int64_t i2 = 0; i2 < ne2; i2++) {
-            const int64_t p = pos[i2];
-
-            float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
-            if (!is_glm && !is_neox) { // TODO: cache sin/cos for glm, neox
-                ggml_rope_cache_init(p, freq_scale, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
-            }
-
-            for (int64_t i1 = 0; i1 < ne1; i1++) {
-                if (ir++ < ir0) continue;
-                if (ir   > ir1) break;
-
-                float theta_base = (float)p;
-
-                if (is_glm) {
-                    theta_base = MIN(p, n_ctx - 2);
-                    float block_theta = MAX(p - (n_ctx - 2), 0);
-                    for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
-                        const float cos_theta = cosf(theta_base);
-                        const float sin_theta = sinf(theta_base) * sin_sign;
-                        const float cos_block_theta = cosf(block_theta);
-                        const float sin_block_theta = sinf(block_theta) * sin_sign;
-
-                        theta_base *= theta_scale;
-                        block_theta *= theta_scale;
-
-                        const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                              ggml_fp16_t * dst_data  = (ggml_fp16_t *)((char *)  dst->data +  i3*nb3 + i2*nb2  + i1*nb1  + i0*nb0);
-
-                        const float x0 = GGML_FP16_TO_FP32(src[0]);
-                        const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);
-                        const float x2 = GGML_FP16_TO_FP32(src[n_dims]);
-                        const float x3 = GGML_FP16_TO_FP32(src[n_dims/2*3]);
-
-                        dst_data[0]          = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
-                        dst_data[n_dims/2]   = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
-                        dst_data[n_dims]     = GGML_FP32_TO_FP16(x2*cos_block_theta - x3*sin_block_theta);
-                        dst_data[n_dims/2*3] = GGML_FP32_TO_FP16(x2*sin_block_theta + x3*cos_block_theta);
-                    }
-                } else if (!is_neox) {
-                    for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
-                        const float cos_theta = cache[i0 + 0];
-                        const float sin_theta = cache[i0 + 1];
-
-                        const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                              ggml_fp16_t * dst_data  = (ggml_fp16_t *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
-
-                        const float x0 = GGML_FP16_TO_FP32(src[0]);
-                        const float x1 = GGML_FP16_TO_FP32(src[1]);
-
-                        dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
-                        dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
-                    }
-                } else {
-                    // TODO: this might be wrong for ne0 != n_dims - need double check
-                    //       it seems we have to rope just the first n_dims elements and do nothing with the rest
-                    // ref:  https://github.com/ml-explore/mlx/blob/dc2edc762c797e3b8de50b1dad4dc0a131691033/benchmarks/python/llama_jax_bench.py#L11-L26
-                    theta_base *= freq_scale;
-                    for (int64_t ic = 0; ic < ne0; ic += 2) {
-                        if (ic < n_dims) {
-                            const int64_t ib = 0;
-
-                            // simplified from `(ib * n_dims + ic) * inv_ndims`
-                            float cur_rot = inv_ndims * ic - ib;
-
-                            float cos_theta, sin_theta;
-                            rope_yarn(
-                                theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
-                                &cos_theta, &sin_theta
-                            );
-                            sin_theta *= sin_sign;
-
-                            theta_base *= theta_scale;
-
-                            const int64_t i0 = ib*n_dims + ic/2;
-
-                            const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                                  ggml_fp16_t * dst_data  = (ggml_fp16_t *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
-
-                            const float x0 = GGML_FP16_TO_FP32(src[0]);
-                            const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);
-
-                            dst_data[0]        = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
-                            dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
-                        } else {
-                            const int64_t i0 = ic;
-
-                            const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                                  ggml_fp16_t * dst_data  = (ggml_fp16_t *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
-
-                            dst_data[0] = src[0];
-                            dst_data[1] = src[1];
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_rope(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F16:
-            {
-                ggml_compute_forward_rope_f16(params, src0, src1, dst, true);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_rope_f32(params, src0, src1, dst, true);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_rope_back
-
-static void ggml_compute_forward_rope_back(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F16:
-            {
-                ggml_compute_forward_rope_f16(params, src0, src1, dst, false);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_rope_f32(params, src0, src1, dst, false);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_conv_transpose_1d
-
-static void ggml_compute_forward_conv_transpose_1d_f16_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-              struct ggml_tensor * dst) {
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    int64_t t0 = ggml_perf_time_us();
-    UNUSED(t0);
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nk = ne00*ne01*ne02;
-
-    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nb10 == sizeof(float));
-
-    if (params->type == GGML_TASK_INIT) {
-        memset(params->wdata, 0, params->wsize);
-
-        // permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
-        {
-            ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
-
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                for (int64_t i01 = 0; i01 < ne01; i01++) {
-                    const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01);
-                    ggml_fp16_t * dst_data = wdata + i01*ne00*ne02;
-                    for (int64_t i00 = 0; i00 < ne00; i00++) {
-                        dst_data[i00*ne02 + i02] = src[i00];
-                    }
-                }
-            }
-        }
-
-        // permute source data (src1) from (L x Cin) to (Cin x L)
-        {
-            ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + nk;
-            ggml_fp16_t * dst_data = wdata;
-
-            for (int64_t i11 = 0; i11 < ne11; i11++) {
-                const float * const src = (float *)((char *) src1->data + i11*nb11);
-                for (int64_t i10 = 0; i10 < ne10; i10++) {
-                    dst_data[i10*ne11 + i11] = GGML_FP32_TO_FP16(src[i10]);
-                }
-            }
-        }
-
-        // need to zero dst since we are accumulating into it
-        memset(dst->data, 0, ggml_nbytes(dst));
-
-        return;
-    }
-
-    if (params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
-
-    // total rows in dst
-    const int nr = ne1;
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    ggml_fp16_t * const wdata     = (ggml_fp16_t *) params->wdata + 0;
-    ggml_fp16_t * const wdata_src = wdata + nk;
-
-    for (int i1 = ir0; i1 < ir1; i1++) {
-        float * dst_data = (float *)((char *) dst->data + i1*nb1);
-        ggml_fp16_t * wdata_kernel = wdata + i1*ne02*ne00;
-        for (int i10 = 0; i10 < ne10; i10++) {
-            const int i1n = i10*ne11;
-            for (int i00 = 0; i00 < ne00; i00++) {
-                float v = 0;
-                ggml_vec_dot_f16(ne02, &v,
-                        (ggml_fp16_t *)    wdata_src + i1n,
-                        (ggml_fp16_t *) wdata_kernel + i00*ne02);
-                dst_data[i10*s0 + i00] += v;
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_conv_transpose_1d_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-              struct ggml_tensor * dst) {
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    int64_t t0 = ggml_perf_time_us();
-    UNUSED(t0);
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nk = ne00*ne01*ne02;
-
-    GGML_ASSERT(nb00 == sizeof(float));
-    GGML_ASSERT(nb10 == sizeof(float));
-
-    if (params->type == GGML_TASK_INIT) {
-        memset(params->wdata, 0, params->wsize);
-
-        // prepare kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
-        {
-            float * const wdata = (float *) params->wdata + 0;
-
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                for (int64_t i01 = 0; i01 < ne01; i01++) {
-                    const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01);
-                    float * dst_data = wdata + i01*ne00*ne02;
-                    for (int64_t i00 = 0; i00 < ne00; i00++) {
-                        dst_data[i00*ne02 + i02] = src[i00];
-                    }
-                }
-            }
-        }
-
-        // prepare source data (src1)
-        {
-            float * const wdata = (float *) params->wdata + nk;
-            float * dst_data = wdata;
-
-            for (int64_t i11 = 0; i11 < ne11; i11++) {
-                const float * const src = (float *)((char *) src1->data + i11*nb11);
-                for (int64_t i10 = 0; i10 < ne10; i10++) {
-                    dst_data[i10*ne11 + i11] = src[i10];
-                }
-            }
-        }
-
-        // need to zero dst since we are accumulating into it
-        memset(dst->data, 0, ggml_nbytes(dst));
-
-        return;
-    }
-
-    if (params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
-
-    // total rows in dst
-    const int nr = ne1;
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    float * const wdata     = (float *) params->wdata + 0;
-    float * const wdata_src = wdata + nk;
-
-    for (int i1 = ir0; i1 < ir1; i1++) {
-        float * dst_data = (float *)((char *) dst->data + i1*nb1);
-        float * wdata_kernel = wdata + i1*ne02*ne00;
-        for (int i10 = 0; i10 < ne10; i10++) {
-            const int i1n = i10*ne11;
-            for (int i00 = 0; i00 < ne00; i00++) {
-                float v = 0;
-                ggml_vec_dot_f32(ne02, &v,
-                        wdata_src + i1n,
-                        wdata_kernel + i00*ne02);
-                dst_data[i10*s0 + i00] += v;
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_conv_transpose_1d(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-              struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F16:
-            {
-                ggml_compute_forward_conv_transpose_1d_f16_f32(params, src0, src1, dst);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_conv_transpose_1d_f32(params, src0, src1, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// src0: kernel [OC, IC, KH, KW]
-// src1: image [N, IC, IH, IW]
-// dst:  result [N, OH, OW, IC*KH*KW]
-static void ggml_compute_forward_im2col_f16(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-              struct ggml_tensor * dst) {
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F16);
-
-    int64_t t0 = ggml_perf_time_us();
-    UNUSED(t0);
-
-    GGML_TENSOR_BINARY_OP_LOCALS;
-
-    const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
-    const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
-    const int32_t p0 = ((const int32_t *)(dst->op_params))[2];
-    const int32_t p1 = ((const int32_t *)(dst->op_params))[3];
-    const int32_t d0 = ((const int32_t *)(dst->op_params))[4];
-    const int32_t d1 = ((const int32_t *)(dst->op_params))[5];
-    const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1;
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int64_t N  = is_2D ? ne13 : ne12;
-    const int64_t IC = is_2D ? ne12 : ne11;
-    const int64_t IH = is_2D ? ne11 : 1;
-    const int64_t IW = ne10;
-
-    const int64_t KH = is_2D ? ne01 : 1;
-    const int64_t KW = ne00;
-
-    const int64_t OH = is_2D ? ne2 : 1;
-    const int64_t OW = ne1;
-
-    int ofs0 = is_2D ? nb13 : nb12;
-    int ofs1 = is_2D ? nb12 : nb11;
-
-    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nb10 == sizeof(float));
-
-    if (params->type == GGML_TASK_INIT) {
-        return;
-    }
-
-    if (params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
-    {
-        ggml_fp16_t * const wdata = (ggml_fp16_t *) dst->data;
-
-        for (int64_t in = 0; in < N; in++) {
-            for (int64_t ioh = 0; ioh < OH; ioh++) { // 1
-                for (int64_t iow = 0; iow < OW; iow++) {
-                    for (int64_t iic = ith; iic < IC; iic += nth) {
-
-                        // micro kernel
-                        ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW]
-                        const float * const src_data = (float *)((char *) src1->data + in*ofs0 + iic*ofs1); // [IH, IW]
-
-                        for (int64_t ikh = 0; ikh < KH; ikh++) {  // 1
-                            for (int64_t ikw = 0; ikw < KW; ikw++) {
-                                const int64_t iiw = iow*s0 + ikw*d0 - p0;
-                                const int64_t iih = ioh*s1 + ikh*d1 - p1;
-
-                                if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
-                                    dst_data[iic*(KH*KW) + ikh*KW + ikw] = 0;
-                                } else {
-                                    dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_FP32_TO_FP16(src_data[iih*IW + iiw]);
-                                }
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_im2col(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-              struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F16:
-            {
-                ggml_compute_forward_im2col_f16(params, src0, src1, dst);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                GGML_ASSERT(false);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_conv_transpose_2d
-
-static void ggml_compute_forward_conv_transpose_2d(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-              struct ggml_tensor * dst) {
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    int64_t t0 = ggml_perf_time_us();
-    UNUSED(t0);
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nk = ne00*ne01*ne02*ne03;
-
-    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nb10 == sizeof(float));
-
-    if (params->type == GGML_TASK_INIT) {
-        memset(params->wdata, 0, params->wsize);
-
-        // permute kernel data (src0) from (Kw x Kh x Cout x Cin) to (Cin x Kw x Kh x Cout)
-        {
-            ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
-
-            for (int64_t i03 = 0; i03 < ne03; i03++) {
-                for (int64_t i02 = 0; i02 < ne02; i02++) {
-                    const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i03*nb03 + i02*nb02);
-                    ggml_fp16_t * dst_data = wdata + i02*ne01*ne00*ne03;
-                    for (int64_t i01 = 0; i01 < ne01; i01++) {
-                        for (int64_t i00 = 0; i00 < ne00; i00++) {
-                            dst_data[i01*ne00*ne03 + i00*ne03 + i03] = src[i01 * ne00 + i00];
-                        }
-                    }
-                }
-            }
-        }
-
-        // permute source data (src1) from (Sw x Sh x Cin) to (Cin x Sw x Sh)
-        {
-            ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + nk;
-            for (int i12 = 0; i12 < ne12; i12++) {
-                for (int i11 = 0; i11 < ne11; i11++) {
-                    const float * const src = (float *)((char *) src1->data + i12*nb12 + i11*nb11);
-                    ggml_fp16_t * dst_data = wdata + i11*ne10*ne12;
-                    for (int i10 = 0; i10 < ne10; i10++) {
-                        dst_data[i10*ne12 + i12] = GGML_FP32_TO_FP16(src[i10]);
-                    }
-                }
-            }
-        }
-
-        memset(dst->data, 0, ggml_nbytes(dst));
-
-        return;
-    }
-
-    if (params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    const int32_t stride = ggml_get_op_params_i32(dst, 0);
-
-    // total patches in dst
-    const int np = ne2;
-
-    // patches per thread
-    const int dp = (np + nth - 1)/nth;
-
-    // patch range for this thread
-    const int ip0 = dp*ith;
-    const int ip1 = MIN(ip0 + dp, np);
-
-    ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
-    ggml_fp16_t * const wdata_src = wdata + nk;
-
-    for (int i2 = ip0; i2 < ip1; i2++) { // Cout
-        float * dst_data = (float *)((char *) dst->data + i2*nb2);
-        ggml_fp16_t * wdata_kernel = wdata + i2*ne01*ne00*ne03;
-        for (int i11 = 0; i11 < ne11; i11++) {
-            for (int i10 = 0; i10 < ne10; i10++) {
-                const int i1n = i11*ne10*ne12 + i10*ne12;
-                for (int i01 = 0; i01 < ne01; i01++) {
-                    for (int i00 = 0; i00 < ne00; i00++) {
-                        float v = 0;
-                        ggml_vec_dot_f16(ne03, &v,
-                                wdata_src + i1n,
-                                wdata_kernel + i01*ne00*ne03 + i00*ne03);
-                        dst_data[(i11*stride + i01)*ne0 + i10*stride + i00] += v;
-                    }
-                }
-            }
-        }
-    }
-}
-
-// ggml_compute_forward_pool_1d_sk_p0
-
-static void ggml_compute_forward_pool_1d_sk_p0(
-        const struct ggml_compute_params * params,
-        const enum ggml_op_pool op,
-        const struct ggml_tensor * src,
-        const int k,
-        struct ggml_tensor * dst) {
-    assert(src->type == GGML_TYPE_F32);
-    assert(params->ith == 0);
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    const char * cdata = (const char *)src->data;
-    const char * const data_end = cdata + ggml_nbytes(src);
-    float * drow = (float *)dst->data;
-
-    const int64_t rs = dst->ne[0];
-
-    while (cdata < data_end) {
-        const float * const srow = (const float *)cdata;
-
-        int j = 0;
-
-        for (int64_t i = 0; i < rs; ++i) {
-            switch (op) {
-                case GGML_OP_POOL_AVG:   drow[i] = 0;        break;
-                case GGML_OP_POOL_MAX:   drow[i] = -FLT_MAX; break;
-                case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
-            }
-            for (int ki = 0; ki < k; ++ki) {
-                switch (op) {
-                    case GGML_OP_POOL_AVG:                          drow[i] += srow[j]; break;
-                    case GGML_OP_POOL_MAX:   if (srow[j] > drow[i]) drow[i]  = srow[j]; break;
-                    case GGML_OP_POOL_COUNT:                        GGML_ASSERT(false); break;
-                }
-                ++j;
-            }
-            switch (op) {
-                case GGML_OP_POOL_AVG:         drow[i] /= k; break;
-                case GGML_OP_POOL_MAX:                       break;
-                case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
-            }
-        }
-
-        cdata += src->nb[1];
-        drow  += rs;
-    }
-}
-
-// ggml_compute_forward_pool_1d
-
-static void ggml_compute_forward_pool_1d(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-              struct ggml_tensor * dst) {
-
-    const int32_t * opts = (const int32_t *)dst->op_params;
-    enum ggml_op_pool op = opts[0];
-    const int k0 = opts[1];
-    const int s0 = opts[2];
-    const int p0 = opts[3];
-    GGML_ASSERT(p0 == 0); // padding not supported
-    GGML_ASSERT(k0 == s0); // only s = k supported
-
-    ggml_compute_forward_pool_1d_sk_p0(params, op, src0, k0, dst);
-}
-
-// ggml_compute_forward_pool_2d
-
-static void ggml_compute_forward_pool_2d(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src,
-        struct ggml_tensor * dst) {
-    assert(src->type == GGML_TYPE_F32);
-    assert(params->ith == 0);
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    const int32_t * opts = (const int32_t *)dst->op_params;
-    enum ggml_op_pool op = opts[0];
-    const int k0 = opts[1];
-    const int k1 = opts[2];
-    const int s0 = opts[3];
-    const int s1 = opts[4];
-    const int p0 = opts[5];
-    const int p1 = opts[6];
-    const char * cdata = (const char*)src->data;
-    const char * const data_end = cdata + ggml_nbytes(src);
-
-    const int64_t px = dst->ne[0];
-    const int64_t py = dst->ne[1];
-    const int64_t pa = px * py;
-
-    float * dplane = (float *)dst->data;
-
-    const int ka = k0 * k1;
-    const int offset0 = -p0;
-    const int offset1 = -p1;
-
-    while (cdata < data_end) {
-        for (int oy = 0; oy < py; ++oy) {
-            float * const drow = dplane + oy * px;
-            for (int ox = 0; ox < px; ++ox) {
-                float * const out =  drow + ox;
-                switch (op) {
-                    case GGML_OP_POOL_AVG:     *out = 0;        break;
-                    case GGML_OP_POOL_MAX:     *out = -FLT_MAX; break;
-                    case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
-                }
-
-                const int ix = offset0 + ox * s0;
-                const int iy = offset1 + oy * s1;
-
-                for (int ky = 0; ky < k1; ++ky) {
-                    if (iy + ky < 0 || iy + ky >= src->ne[1]) continue;
-                    const float * const srow = (const float *)(cdata + src->nb[1] * (iy + ky));
-                    for (int kx = 0; kx < k0; ++kx) {
-                        int j = ix + kx;
-                        if (j < 0 || j >= src->ne[0]) continue;
-                        switch (op) {
-                            case GGML_OP_POOL_AVG:                     *out += srow[j]; break;
-                            case GGML_OP_POOL_MAX: if (srow[j] > *out) *out  = srow[j]; break;
-                            case GGML_OP_POOL_COUNT:                GGML_ASSERT(false); break;
-                        }
-                    }
-                }
-                switch (op) {
-                    case GGML_OP_POOL_AVG:           *out /= ka; break;
-                    case GGML_OP_POOL_MAX:                       break;
-                    case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
-                }
-            }
-        }
-
-        cdata  += src->nb[2];
-        dplane += pa;
-    }
-}
-
-// ggml_compute_forward_upscale
-
-static void ggml_compute_forward_upscale_f32(
-    const struct ggml_compute_params * params,
-    const struct ggml_tensor * src0,
-    struct ggml_tensor * dst) {
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    GGML_ASSERT(src0->nb[0] == sizeof(float));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    const int scale_factor = dst->op_params[0];
-
-    // TODO: optimize
-
-    for (int64_t i3 = 0; i3 < ne3; i3++) {
-        const int64_t i03 = i3;
-        for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
-            const int64_t i02 = i2;
-            for (int64_t i1 = 0; i1 < ne1; i1++) {
-                const int64_t i01 = i1 / scale_factor;
-                for (int64_t i0 = 0; i0 < ne0; i0++) {
-                    const int64_t i00 = i0 / scale_factor;
-
-                    const float * x = (float *)((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-                          float * y = (float *)((char *)  dst->data +  i0*nb0  +  i1*nb1  +  i2*nb2  +  i3*nb3);
-
-                    *y = *x;
-                }
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_upscale(
-    const struct ggml_compute_params * params,
-    const struct ggml_tensor * src0,
-    struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_upscale_f32(params, src0, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_pad
-
-static void ggml_compute_forward_pad_f32(
-    const struct ggml_compute_params * params,
-    const struct ggml_tensor * src0,
-          struct ggml_tensor * dst) {
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    GGML_ASSERT(src0->nb[0] == sizeof(float));
-    GGML_ASSERT( dst->nb[0] == sizeof(float));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    float * dst_ptr = (float *) dst->data;
-
-    // TODO: optimize
-
-    for (int64_t i2 = 0; i2 < ne2; ++i2) {
-        for (int64_t i1 = ith; i1 < ne1; i1 += nth) {
-            for (int64_t i0 = 0; i0 < ne0; ++i0) {
-                for (int64_t i3 = 0; i3 < ne3; ++i3) {
-                    const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
-
-                    const float * src_ptr = (const float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-
-                    if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
-                        dst_ptr[dst_idx] = *src_ptr;
-                    } else {
-                        dst_ptr[dst_idx] = 0;
-                    }
-                }
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_pad(
-    const struct ggml_compute_params * params,
-    const struct ggml_tensor * src0,
-    struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_pad_f32(params, src0, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_argsort
-
-static void ggml_compute_forward_argsort_f32(
-    const struct ggml_compute_params * params,
-    const struct ggml_tensor * src0,
-    struct ggml_tensor * dst) {
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    GGML_ASSERT(nb0 == sizeof(float));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int64_t nr = ggml_nrows(src0);
-
-    enum ggml_sort_order order = (enum ggml_sort_order) ggml_get_op_params_i32(dst, 0);
-
-    for (int64_t i = ith; i < nr; i += nth) {
-        int32_t * dst_data = (int32_t *)((char *) dst->data + i*nb1);
-        const float * src_data = (float *)((char *) src0->data + i*nb01);
-
-        for (int64_t j = 0; j < ne0; j++) {
-            dst_data[j] = j;
-        }
-
-        // C doesn't have a functional sort, so we do a bubble sort instead
-        for (int64_t j = 0; j < ne0; j++) {
-            for (int64_t k = j + 1; k < ne0; k++) {
-                if ((order == GGML_SORT_ASC && src_data[dst_data[j]] > src_data[dst_data[k]]) ||
-                    (order == GGML_SORT_DESC && src_data[dst_data[j]] < src_data[dst_data[k]])) {
-                    int32_t tmp = dst_data[j];
-                    dst_data[j] = dst_data[k];
-                    dst_data[k] = tmp;
-                }
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_argsort(
-    const struct ggml_compute_params * params,
-    const struct ggml_tensor * src0,
-    struct ggml_tensor * dst) {
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_argsort_f32(params, src0, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_flash_attn
-
-static void ggml_compute_forward_flash_attn_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * q,
-        const struct ggml_tensor * k,
-        const struct ggml_tensor * v,
-        const bool masked,
-        struct ggml_tensor * dst) {
-    int64_t t0 = ggml_perf_time_us();
-    UNUSED(t0);
-
-    GGML_TENSOR_LOCALS(int64_t, neq, q,   ne)
-    GGML_TENSOR_LOCALS(size_t,  nbq, q,   nb)
-    GGML_TENSOR_LOCALS(int64_t, nek, k,   ne)
-    GGML_TENSOR_LOCALS(size_t,  nbk, k,   nb)
-    GGML_TENSOR_LOCALS(int64_t, nev, v,   ne)
-    GGML_TENSOR_LOCALS(size_t,  nbv, v,   nb)
-    GGML_TENSOR_LOCALS(int64_t, ne,  dst, ne)
-    GGML_TENSOR_LOCALS(size_t,  nb,  dst, nb)
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int64_t D = neq0;
-    const int64_t N = neq1;
-    const int64_t P = nek1 - N;
-    const int64_t M = P + N;
-
-    const int Mup = ggml_up(M, GGML_SOFT_MAX_UNROLL);
-
-    GGML_ASSERT(ne0 == D);
-    GGML_ASSERT(ne1 == N);
-    GGML_ASSERT(P >= 0);
-
-    GGML_ASSERT(nbq0 == sizeof(float));
-    GGML_ASSERT(nbk0 == sizeof(float));
-    GGML_ASSERT(nbv0 == sizeof(float));
-
-    GGML_ASSERT(neq0 == D);
-    GGML_ASSERT(nek0 == D);
-    GGML_ASSERT(nev1 == D);
-
-    GGML_ASSERT(neq1 == N);
-    GGML_ASSERT(nek1 == N + P);
-    GGML_ASSERT(nev1 == D);
-
-    // dst cannot be transposed or permuted
-    GGML_ASSERT(nb0 == sizeof(float));
-    GGML_ASSERT(nb0 <= nb1);
-    GGML_ASSERT(nb1 <= nb2);
-    GGML_ASSERT(nb2 <= nb3);
-
-    if (params->type == GGML_TASK_INIT) {
-        return;
-    }
-
-    if (params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    // parallelize by q rows using ggml_vec_dot_f32
-
-    // total rows in q
-    const int nr = neq1*neq2*neq3;
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    const float scale = 1.0f/sqrtf(D);
-
-    //printf("P=%d N=%d D=%d ir0=%d ir1=%d scale = %f\n", P, N, D, ir0, ir1, scale);
-
-    for (int ir = ir0; ir < ir1; ++ir) {
-        // q indices
-        const int iq3 = ir/(neq2*neq1);
-        const int iq2 = (ir - iq3*neq2*neq1)/neq1;
-        const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1);
-
-        float * S = (float *) params->wdata + ith*(Mup + CACHE_LINE_SIZE_F32);
-
-        for (int i = M; i < Mup; ++i) {
-            S[i] = -INFINITY;
-        }
-
-        const int64_t masked_begin = masked ? (P + iq1 + 1) : M;
-        for (int64_t ic = 0; ic < masked_begin; ++ic) {
-            // k indices
-            const int ik3 = iq3;
-            const int ik2 = iq2 % nek2;
-            const int ik1 = ic;
-
-            // S indices
-            const int i1 = ik1;
-
-            ggml_vec_dot_f32(neq0,
-                    S + i1,
-                    (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)),
-                    (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)));
-        }
-
-        // scale
-        ggml_vec_scale_f32(masked_begin, S, scale);
-
-        for (int64_t i = masked_begin; i < M; i++) {
-            S[i] = -INFINITY;
-        }
-
-        // softmax
-        // exclude known -INF S[..] values from max and loop
-        // dont forget to set their SW values to zero
-        {
-            float max = -INFINITY;
-            ggml_vec_max_f32(masked_begin, &max, S);
-
-            ggml_float sum = 0.0;
-            {
-#ifdef GGML_SOFT_MAX_ACCELERATE
-                max = -max;
-                vDSP_vsadd(S, 1, &max, S, 1, Mup);
-                vvexpf(S, S, &Mup);
-                ggml_vec_sum_f32(Mup, &sum, S);
-#else
-                uint16_t   scvt[GGML_SOFT_MAX_UNROLL]; UNUSED(scvt);
-                ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
-
-                for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
-                    if (i >= masked_begin) {
-                        break;
-                    }
-                    float * SS = S + i;
-
-                    for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) {
-                        if (i + j >= masked_begin) {
-                            break;
-                        } else if (SS[j] == -INFINITY) {
-                            SS[j] = 0.0f;
-                        } else {
-#ifndef GGML_FLASH_ATTN_EXP_FP16
-                            const float val = expf(SS[j] - max);
-#else
-                            ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max);
-                            memcpy(&scvt[j], &s, sizeof(uint16_t));
-                            const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt[j]]);
-#endif
-                            sump[j] += (ggml_float)val;
-                            SS[j] = val;
-                        }
-                    }
-                }
-
-                for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) {
-                    sum += sump[i];
-                }
-#endif
-            }
-
-            assert(sum > 0.0);
-
-            sum = 1.0/sum;
-            ggml_vec_scale_f32(masked_begin, S, sum);
-
-#ifndef NDEBUG
-            for (int i = 0; i < masked_begin; ++i) {
-                assert(!isnan(S[i]));
-                assert(!isinf(S[i]));
-            }
-#endif
-        }
-
-        for (int64_t ic = 0; ic < nev1; ++ic) {
-            // dst indices
-            const int i1 = iq1;
-            const int i2 = iq2;
-            const int i3 = iq3;
-
-            // v indices
-            const int iv2 = iq2 % nev2;
-            const int iv3 = iq3;
-
-            ggml_vec_dot_f32(masked_begin,
-                    (float *) ((char *) dst->data + (ic*nb0 + i1*nb1  + i2*nb2   + i3*nb3)),
-                    (float *) ((char *) v->data   + (         ic*nbv1 + iv2*nbv2 + iv3*nbv3)),
-                    S);
-        }
-    }
-}
-
-static void ggml_compute_forward_flash_attn_f16(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * q,
-        const struct ggml_tensor * k,
-        const struct ggml_tensor * v,
-        const bool masked,
-        struct ggml_tensor * dst) {
-    int64_t t0 = ggml_perf_time_us();
-    UNUSED(t0);
-
-    GGML_TENSOR_LOCALS(int64_t, neq, q,   ne)
-    GGML_TENSOR_LOCALS(size_t,  nbq, q,   nb)
-    GGML_TENSOR_LOCALS(int64_t, nek, k,   ne)
-    GGML_TENSOR_LOCALS(size_t,  nbk, k,   nb)
-    GGML_TENSOR_LOCALS(int64_t, nev, v,   ne)
-    GGML_TENSOR_LOCALS(size_t,  nbv, v,   nb)
-    GGML_TENSOR_LOCALS(int64_t, ne,  dst, ne)
-    GGML_TENSOR_LOCALS(size_t,  nb,  dst, nb)
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int64_t D = neq0;
-    const int64_t N = neq1;
-    const int64_t P = nek1 - N;
-    const int64_t M = P + N;
-
-    const int Mup = ggml_up(M, GGML_SOFT_MAX_UNROLL);
-
-    GGML_ASSERT(ne0 == D);
-    GGML_ASSERT(ne1 == N);
-    GGML_ASSERT(P >= 0);
-
-    GGML_ASSERT(nbq0 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nbk0 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nbv0 == sizeof(ggml_fp16_t));
-
-    GGML_ASSERT(neq0 == D);
-    GGML_ASSERT(nek0 == D);
-    GGML_ASSERT(nev1 == D);
-
-    GGML_ASSERT(neq1 == N);
-    GGML_ASSERT(nek1 == N + P);
-    GGML_ASSERT(nev1 == D);
-
-    // dst cannot be transposed or permuted
-    GGML_ASSERT(nb0 == sizeof(float));
-    GGML_ASSERT(nb0 <= nb1);
-    GGML_ASSERT(nb1 <= nb2);
-    GGML_ASSERT(nb2 <= nb3);
-
-    if (params->type == GGML_TASK_INIT) {
-        return;
-    }
-
-    if (params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    // parallelize by q rows using ggml_vec_dot_f32
-
-    // total rows in q
-    const int nr = neq1*neq2*neq3;
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    const float scale = 1.0f/sqrtf(D);
-
-    //printf("P=%d N=%d D=%d ir0=%d ir1=%d scale = %f\n", P, N, D, ir0, ir1, scale);
-
-    for (int ir = ir0; ir < ir1; ++ir) {
-        // q indices
-        const int iq3 = ir/(neq2*neq1);
-        const int iq2 = (ir - iq3*neq2*neq1)/neq1;
-        const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1);
-
-        float * S = (float *) params->wdata + ith*(2*Mup + CACHE_LINE_SIZE_F32);
-
-        for (int i = M; i < Mup; ++i) {
-            S[i] = -INFINITY;
-        }
-
-        if (GGML_VEC_DOT_UNROLL > 2 || nek1 % GGML_VEC_DOT_UNROLL != 0) {
-            for (int64_t ic = 0; ic < nek1; ++ic) {
-                // k indices
-                const int ik3 = iq3;
-                const int ik2 = iq2 % nek2;
-                const int ik1 = ic;
-
-                // S indices
-                const int i1 = ik1;
-
-                ggml_vec_dot_f16(neq0,
-                        S + i1,
-                        (ggml_fp16_t *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)),
-                        (ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)));
-            }
-        } else {
-            for (int64_t ic = 0; ic < nek1; ic += GGML_VEC_DOT_UNROLL) {
-                // k indices
-                const int ik3 = iq3;
-                const int ik2 = iq2 % nek2;
-                const int ik1 = ic;
-
-                // S indices
-                const int i1 = ik1;
-
-                ggml_vec_dot_f16_unroll(neq0, nbk1,
-                        S + i1,
-                        ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)),
-                        (ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)));
-            }
-        }
-
-        // scale
-        ggml_vec_scale_f32(nek1, S, scale);
-
-        if (masked) {
-            for (int64_t i = P; i < M; i++) {
-                if (i > P + iq1) {
-                    S[i] = -INFINITY;
-                }
-            }
-        }
-
-        // softmax
-        // todo: exclude known -INF S[..] values from max and loop, assuming their results to be zero.
-        // dont forget to set their S values to zero
-        {
-            float max = -INFINITY;
-            ggml_vec_max_f32(M, &max, S);
-
-            ggml_float sum = 0.0;
-            {
-#ifdef GGML_SOFT_MAX_ACCELERATE
-                max = -max;
-                vDSP_vsadd(S, 1, &max, S, 1, Mup);
-                vvexpf(S, S, &Mup);
-                ggml_vec_sum_f32(Mup, &sum, S);
-#else
-                uint16_t   scvt[GGML_SOFT_MAX_UNROLL];
-                ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
-
-                for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
-                    float * SS = S + i;
-
-                    for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) {
-                        if (SS[j] == -INFINITY) {
-                            SS[j] = 0.0f;
-                        } else {
-                            ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max);
-                            memcpy(&scvt[j], &s, sizeof(uint16_t));
-                            const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt[j]]);
-                            sump[j] += (ggml_float)val;
-                            SS[j] = val;
-                        }
-                    }
-                }
-
-                for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) {
-                    sum += sump[i];
-                }
-#endif
-            }
-
-            assert(sum > 0.0);
-
-            sum = 1.0/sum;
-            ggml_vec_scale_f32(M, S, sum);
-
-#ifndef NDEBUG
-            for (int i = 0; i < M; ++i) {
-                assert(!isnan(S[i]));
-                assert(!isinf(S[i]));
-            }
-#endif
-        }
-
-        ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*Mup + CACHE_LINE_SIZE_F32) + Mup);
-
-        for (int64_t i = 0; i < M; i++) {
-            S16[i] = GGML_FP32_TO_FP16(S[i]);
-        }
-
-        // todo: exclude known zero S[..] values from dot (reducing nev0 and increasing begin of v and S16).
-        if (GGML_VEC_DOT_UNROLL == 1 || (nev1 % GGML_VEC_DOT_UNROLL != 0)) {
-            for (int64_t ic = 0; ic < nev1; ++ic) {
-                // dst indices
-                const int i1 = iq1;
-                const int i2 = iq2;
-                const int i3 = iq3;
-
-                // v indices
-                const int iv2 = iq2 % nev2;
-                const int iv3 = iq3;
-
-                ggml_vec_dot_f16(nev0,
-                        (float *)       ((char *) dst->data + (ic*nb0 + i1*nb1  + i2*nb2   + i3*nb3)),
-                        (ggml_fp16_t *) ((char *) v->data   + (         ic*nbv1 + iv2*nbv2 + iv3*nbv3)),
-                        S16);
-            }
-        } else {
-            for (int64_t ic = 0; ic < nev1; ic += GGML_VEC_DOT_UNROLL) {
-                // dst indices
-                const int i1 = iq1;
-                const int i2 = iq2;
-                const int i3 = iq3;
-
-                // v indices
-                const int iv2 = iq2 % nev2;
-                const int iv3 = iq3;
-
-                ggml_vec_dot_f16_unroll(nev0, nbv1,
-                        (float *) ((char *) dst->data + (ic*nb0 + i1*nb1  + i2*nb2   + i3*nb3)),
-                        ((char *)             v->data + (         ic*nbv1 + iv2*nbv2 + iv3*nbv3)),
-                        S16);
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_flash_attn(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * q,
-        const struct ggml_tensor * k,
-        const struct ggml_tensor * v,
-        const bool masked,
-        struct ggml_tensor * dst) {
-    switch (q->type) {
-        case GGML_TYPE_F16:
-            {
-                ggml_compute_forward_flash_attn_f16(params, q, k, v, masked, dst);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_flash_attn_f32(params, q, k, v, masked, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_flash_ff
-
-static void ggml_compute_forward_flash_ff_f16(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * a,  // F16
-        const struct ggml_tensor * b0, // F16 fc_w
-        const struct ggml_tensor * b1, // F32 fc_b
-        const struct ggml_tensor * c0, // F16 proj_w
-        const struct ggml_tensor * c1, // F32 proj_b
-        struct ggml_tensor * dst) {
-    int64_t t0 = ggml_perf_time_us();
-    UNUSED(t0);
-
-    GGML_TENSOR_LOCALS(int64_t, nea,  a,   ne)
-    GGML_TENSOR_LOCALS(size_t,  nba,  a,   nb)
-    GGML_TENSOR_LOCALS(int64_t, neb0, b0,  ne)
-    GGML_TENSOR_LOCALS(size_t,  nbb0, b0,  nb)
-    GGML_TENSOR_LOCALS(int64_t, neb1, b1,  ne)
-    GGML_TENSOR_LOCALS(size_t,  nbb1, b1,  nb)
-    GGML_TENSOR_LOCALS(int64_t, nec0, c0,  ne)
-    GGML_TENSOR_LOCALS(size_t,  nbc0, c0,  nb)
-    GGML_TENSOR_LOCALS(int64_t, nec1, c1,  ne)
-    GGML_TENSOR_LOCALS(size_t,  nbc1, c1,  nb)
-    GGML_TENSOR_LOCALS(int64_t, ne,   dst, ne)
-    GGML_TENSOR_LOCALS(size_t,  nb,   dst, nb)
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int64_t D = nea0;
-    //const int64_t N = nea1;
-    const int64_t M = neb01;
-
-    GGML_ASSERT(ne0 == nea0);
-    GGML_ASSERT(ne1 == nea1);
-    GGML_ASSERT(ne2 == nea2);
-
-    GGML_ASSERT(nba0  == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nbb00 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nbb10 == sizeof(float));
-    GGML_ASSERT(nbc00 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nbc10 == sizeof(float));
-
-    GGML_ASSERT(neb00 == D);
-    GGML_ASSERT(neb01 == M);
-    GGML_ASSERT(neb10 == M);
-    GGML_ASSERT(neb11 == 1);
-
-    GGML_ASSERT(nec00 == M);
-    GGML_ASSERT(nec01 == D);
-    GGML_ASSERT(nec10 == D);
-    GGML_ASSERT(nec11 == 1);
-
-    // dst cannot be transposed or permuted
-    GGML_ASSERT(nb0 == sizeof(float));
-    GGML_ASSERT(nb0 <= nb1);
-    GGML_ASSERT(nb1 <= nb2);
-    GGML_ASSERT(nb2 <= nb3);
-
-    if (params->type == GGML_TASK_INIT) {
-        return;
-    }
-
-    if (params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    // parallelize by a rows using ggml_vec_dot_f32
-
-    // total rows in a
-    const int nr = nea1*nea2*nea3;
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    for (int ir = ir0; ir < ir1; ++ir) {
-        // a indices
-        const int ia3 = ir/(nea2*nea1);
-        const int ia2 = (ir - ia3*nea2*nea1)/nea1;
-        const int ia1 = (ir - ia3*nea2*nea1 - ia2*nea1);
-
-        float * S = (float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32);
-
-        for (int64_t ic = 0; ic < neb01; ++ic) {
-            // b0 indices
-            const int ib03 = ia3;
-            const int ib02 = ia2;
-            const int ib01 = ic;
-
-            // S indices
-            const int i1 = ib01;
-
-            ggml_vec_dot_f16(nea0,
-                    S + i1,
-                    (ggml_fp16_t *) ((char *) b0->data + (ib01*nbb01 + ib02*nbb02 + ib03*nbb03)),
-                    (ggml_fp16_t *) ((char *)  a->data + ( ia1*nba1  +  ia2*nba2  +  ia3*nba3)));
-        }
-
-        ggml_vec_add_f32(neb01, S, S, (float *) b1->data);
-        //ggml_vec_gelu_f32(neb01, S, S);
-
-        ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32) + M);
-
-        for (int64_t i = 0; i < M; i++) {
-            S16[i] = GGML_FP32_TO_FP16(S[i]);
-        }
-
-        ggml_vec_gelu_f16(neb01, S16, S16);
-
-        {
-            // dst indices
-            const int i1 = ia1;
-            const int i2 = ia2;
-            const int i3 = ia3;
-
-            for (int64_t ic = 0; ic < nec01; ++ic) {
-
-                ggml_vec_dot_f16(neb01,
-                        (float *)       ((char *) dst->data + (ic*nb0 + i1*nb1   + i2*nb2   + i3*nb3)),
-                        (ggml_fp16_t *) ((char *) c0->data  + (         ic*nbc01 + i2*nbc02 + i3*nbc03)),
-                        S16);
-            }
-
-            ggml_vec_add_f32(nec01,
-                    (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)),
-                    (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)),
-                    (float *) c1->data);
-        }
-    }
-}
-
-static void ggml_compute_forward_flash_ff(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * a,
-        const struct ggml_tensor * b0,
-        const struct ggml_tensor * b1,
-        const struct ggml_tensor * c0,
-        const struct ggml_tensor * c1,
-        struct ggml_tensor * dst) {
-    switch (b0->type) {
-        case GGML_TYPE_F16:
-            {
-                ggml_compute_forward_flash_ff_f16(params, a, b0, b1, c0, c1, dst);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                GGML_ASSERT(false); // TODO
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_flash_attn_back
-
-static void ggml_compute_forward_flash_attn_back_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * q,
-        const struct ggml_tensor * k,
-        const struct ggml_tensor * v,
-        const struct ggml_tensor * d,
-        const bool masked,
-              struct ggml_tensor * dst) {
-    int64_t t0 = ggml_perf_time_us();
-    UNUSED(t0);
-
-    GGML_TENSOR_LOCALS(int64_t, neq, q,   ne)
-    GGML_TENSOR_LOCALS(size_t,  nbq, q,   nb)
-    GGML_TENSOR_LOCALS(int64_t, nek, k,   ne)
-    GGML_TENSOR_LOCALS(size_t,  nbk, k,   nb)
-    GGML_TENSOR_LOCALS(int64_t, nev, v,   ne)
-    GGML_TENSOR_LOCALS(size_t,  nbv, v,   nb)
-    GGML_TENSOR_LOCALS(int64_t, ned, d,   ne)
-    GGML_TENSOR_LOCALS(size_t,  nbd, d,   nb)
-    GGML_TENSOR_LOCALS(int64_t, ne,  dst, ne)
-    GGML_TENSOR_LOCALS(size_t,  nb,  dst, nb)
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int64_t D = neq0;
-    const int64_t N = neq1;
-    const int64_t P = nek1 - N;
-    const int64_t M = P + N;
-
-    const int Mup  = ggml_up(M, GGML_SOFT_MAX_UNROLL);
-    const int mxDM = MAX(D, Mup);
-
-    // GGML_ASSERT(ne0 == D);
-    // GGML_ASSERT(ne1 == N);
-    GGML_ASSERT(P >= 0);
-
-    GGML_ASSERT(nbq0 == sizeof(float));
-    GGML_ASSERT(nbk0 == sizeof(float));
-    GGML_ASSERT(nbv0 == sizeof(float));
-
-    GGML_ASSERT(neq0 == D);
-    GGML_ASSERT(nek0 == D);
-    GGML_ASSERT(nev1 == D);
-    GGML_ASSERT(ned0 == D);
-
-    GGML_ASSERT(neq1 == N);
-    GGML_ASSERT(nek1 == N + P);
-    GGML_ASSERT(nev1 == D);
-    GGML_ASSERT(ned1 == N);
-
-    // dst cannot be transposed or permuted
-    GGML_ASSERT(nb0 == sizeof(float));
-    GGML_ASSERT(nb0 <= nb1);
-    GGML_ASSERT(nb1 <= nb2);
-    GGML_ASSERT(nb2 <= nb3);
-
-    if (params->type == GGML_TASK_INIT) {
-        if (ith == 0) {
-            memset(dst->data, 0, nb0*ne0*ne1*ne2*ne3);
-        }
-        return;
-    }
-
-    if (params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    const int64_t elem_q = ggml_nelements(q);
-    const int64_t elem_k = ggml_nelements(k);
-
-    enum ggml_type result_type = dst->type;
-    GGML_ASSERT(ggml_blck_size(result_type) == 1);
-    const size_t tsize = ggml_type_size(result_type);
-
-    const size_t offs_q = 0;
-    const size_t offs_k = offs_q + GGML_PAD(elem_q * tsize, GGML_MEM_ALIGN);
-    const size_t offs_v = offs_k + GGML_PAD(elem_k * tsize, GGML_MEM_ALIGN);
-
-    void * grad_q = (char *) dst->data;
-    void * grad_k = (char *) dst->data + offs_k;
-    void * grad_v = (char *) dst->data + offs_v;
-
-    const size_t nbgq1 = nb0*neq0;
-    const size_t nbgq2 = nb0*neq0*neq1;
-    const size_t nbgq3 = nb0*neq0*neq1*neq2;
-
-    const size_t nbgk1 = nb0*nek0;
-    const size_t nbgk2 = nb0*nek0*nek1;
-    const size_t nbgk3 = nb0*nek0*nek1*neq2;
-
-    const size_t nbgv1 = nb0*nev0;
-    const size_t nbgv2 = nb0*nev0*nev1;
-    const size_t nbgv3 = nb0*nev0*nev1*neq2;
-
-    // parallelize by k rows using ggml_vec_dot_f32
-
-    // total rows in k
-    const int nr = nek2*nek3;
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    const float scale = 1.0f/sqrtf(D);
-
-    //printf("P=%d N=%d D=%d ir0=%d ir1=%d scale = %f\n", P, N, D, ir0, ir1, scale);
-
-    // how often k2 (and v2) is repeated in q2
-    int nrep = neq2/nek2;
-
-    for (int ir = ir0; ir < ir1; ++ir) {
-        // q indices
-        const int ik3 = ir/(nek2);
-        const int ik2 = ir - ik3*nek2;
-
-        const int iq3 = ik3;
-        const int id3 = ik3;
-        const int iv3 = ik3;
-        const int iv2 = ik2;
-
-        for (int irep = 0; irep < nrep; ++irep) {
-            const int iq2 = ik2 + irep*nek2;
-            const int id2 = iq2;
-
-            // (ik2 + irep*nek2) % nek2 == ik2
-            for (int iq1 = 0; iq1 < neq1; ++iq1) {
-                const int id1 = iq1;
-
-                // not sure about CACHE_LINE_SIZE_F32..
-                // - maybe it must not be multiplied by 2 and excluded from .. in SM 1*(..) offset?
-                float * S  = (float *) params->wdata + ith*2*(mxDM + CACHE_LINE_SIZE_F32) + 0*(mxDM+CACHE_LINE_SIZE_F32);
-                float * SM = (float *) params->wdata + ith*2*(mxDM + CACHE_LINE_SIZE_F32) + 1*(mxDM+CACHE_LINE_SIZE_F32);
-
-                for (int i = M; i < Mup; ++i) {
-                    S[i] = -INFINITY;
-                }
-
-                const int64_t masked_begin = masked ? (P + iq1 + 1) : M;
-                for (int64_t ic = 0; ic < masked_begin; ++ic) {
-                    // k indices
-                    const int ik1 = ic;
-
-                    // S indices
-                    const int i1 = ik1;
-
-                    ggml_vec_dot_f32(neq0,
-                            S + i1,
-                            (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)),
-                            (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)));
-                }
-
-                // scale
-                ggml_vec_scale_f32(masked_begin, S, scale);
-
-                for (int64_t i = masked_begin; i < M; i++) {
-                    S[i] = -INFINITY;
-                }
-
-                // softmax
-                // exclude known -INF S[..] values from max and loop
-                // dont forget to set their SM values to zero
-                {
-                    float max = -INFINITY;
-                    ggml_vec_max_f32(masked_begin, &max, S);
-
-                    ggml_float sum = 0.0;
-                    {
-#ifdef GGML_SOFT_MAX_ACCELERATE
-                        max = -max;
-                        vDSP_vsadd(SM, 1, &max, SM, 1, Mup);
-                        vvexpf(SM, SM, &Mup);
-                        ggml_vec_sum_f32(Mup, &sum, SM);
-#else
-                        uint16_t   scvt[GGML_SOFT_MAX_UNROLL]; UNUSED(scvt);
-                        ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
-
-                        for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
-                            if (i >= masked_begin) {
-                                break;
-                            }
-                            float * SR =  S + i;
-                            float * SW = SM + i;
-
-                            for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) {
-                                if (i + j >= masked_begin) {
-                                    break;
-                                } else if (SR[j] == -INFINITY) {
-                                    SW[j] = 0.0f;
-                                } else {
-#ifndef GGML_FLASH_ATTN_EXP_FP16
-                                    const float val = expf(SR[j] - max);
-#else
-                                    ggml_fp16_t s = GGML_FP32_TO_FP16(SR[j] - max);
-                                    memcpy(&scvt[j], &s, sizeof(uint16_t));
-                                    const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt[j]]);
-#endif
-                                    sump[j] += (ggml_float)val;
-                                    SW[j] = val;
-                                }
-                            }
-                        }
-
-                        for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) {
-                            sum += sump[i];
-                        }
-#endif
-                    }
-
-                    assert(sum > 0.0);
-
-                    sum = 1.0/sum;
-                    ggml_vec_scale_f32(masked_begin, SM, sum);
-
-                }
-
-                // step-by-step explanation
-                {
-                    // forward-process                    shape      grads from backward process
-                    // parallel_for ik2,ik3:
-                    //  for irep:
-                    //   iq2 = ik2 + irep*nek2
-                    //   k[:D,:M,:,:]                     [D,M,:,:]  grad[k][:D,:M,ik2,ik3]  += grad[kcur]
-                    //   q[:D,:N,:,:]                     [D,N,:,:]  grad[q][:D,iq1,iq2,iq3] += grad[qcur]
-                    //   v[:M,:D,:,:]                     [M,D,:,:]  grad[v][:M,:D,iv2,iv3]  += grad[vcur]
-                    //   for iq1:
-                    //    kcur   = k[:D,:M,ik2,ik3]       [D,M,1,1]  grad[kcur] = grad[S1].T @ qcur
-                    //    qcur   = q[:D,iq1,iq2,iq3]      [D,1,1,1]  grad[qcur] = grad[S1]   @ kcur
-                    //    vcur   = v[:M,:D,iv2,iv3]       [M,D,1,1]  grad[vcur] = grad[S5].T @ S4
-                    //    S0     = -Inf                   [D,1,1,1]
-                    //   ~S1[i]  = dot(kcur[:D,i], qcur)
-                    //    S1     = qcur @ kcur.T          [M,1,1,1]  grad[S1]   = grad[S2] * scale
-                    //    S2     = S1 * scale             [M,1,1,1]  grad[S2]   = diag_mask_zero(grad[S3], P)
-                    //    S3     = diag_mask_inf(S2, P)   [M,1,1,1]  grad[S3]   = S4 * (grad[S4] - dot(S4, grad[S4]))
-                    //    S4     = softmax(S3)            [M,1,1,1]  grad[S4]   = grad[S5] @ vcur
-                    //   ~S5[i]  = dot(vcur[:,i], S4)
-                    //    S5     = S4 @ vcur.T            [D,1,1,1]  grad[S5]   = d[:D,id1,id2,id3]
-                    //   ~dst[i,iq1,iq2,iq3]  = S5[i]              ^
-                    //    dst[:D,iq1,iq2,iq3] = S5                 | grad[dst[:D,iq1,iq2,iq3]] = d[:D,id1,id2,id3]
-                    // dst                               backward-/ grad[dst]                 = d
-                    //
-                    // output gradients with their dependencies:
-                    //
-                    // grad[kcur] = grad[S1].T @ qcur
-                    // grad[S1]   = diag_mask_zero(grad[S3], P) * scale
-                    // grad[S3]   = S4 * (grad[S4] - dot(S4, grad[S4]))
-                    // grad[S4]   = grad[S5] @ vcur
-                    // grad[S4]   = d[:D,id1,id2,id3] @ vcur
-                    // grad[qcur] = grad[S1]   @ kcur
-                    // grad[vcur] = grad[S5].T @ S4
-                    // grad[vcur] = d[:D,id1,id2,id3].T @ S4
-                    //
-                    // in post-order:
-                    //
-                    // S1         = qcur @ kcur.T
-                    // S2         = S1 * scale
-                    // S3         = diag_mask_inf(S2, P)
-                    // S4         = softmax(S3)
-                    // grad[S4]   = d[:D,id1,id2,id3] @ vcur
-                    // grad[S3]   = S4 * (grad[S4] - dot(S4, grad[S4]))
-                    // grad[S1]   = diag_mask_zero(grad[S3], P) * scale
-                    // grad[qcur] = grad[S1]   @ kcur
-                    // grad[kcur] = grad[S1].T @ qcur
-                    // grad[vcur] = d[:D,id1,id2,id3].T @ S4
-                    //
-                    // using less variables (SM=S4):
-                    //
-                    // S             = diag_mask_inf(qcur @ kcur.T * scale, P)
-                    // SM            = softmax(S)
-                    // S             = d[:D,iq1,iq2,iq3] @ vcur
-                    // dot_SM_gradSM = dot(SM, S)
-                    // S             = SM * (S - dot(SM, S))
-                    // S             = diag_mask_zero(S, P) * scale
-                    //
-                    // grad[q][:D,iq1,iq2,iq3] += S   @ kcur
-                    // grad[k][:D,:M,ik2,ik3]  += S.T @ qcur
-                    // grad[v][:M,:D,iv2,iv3]  += d[:D,id1,id2,id3].T @ SM
-                }
-
-                // S = gradSM = d[:D,id1,id2,id3] @ vcur[:,:,iv2,iv3]
-                // S = d[:D,id1,id2,id3] @ vcur[:,:,iv2,iv3]
-                // for ic:
-                //   S[:M] += vcur[:M,ic,iv2,iv3] * d[ic,id1,id2,id3]
-                // exclude known future zero S[..] values from operation
-                ggml_vec_set_f32(masked_begin, S, 0);
-                for (int64_t ic = 0; ic < D; ++ic) {
-                    ggml_vec_mad_f32(masked_begin,
-                            S,
-                             (float *) ((char *) v->data + (          ic*nbv1  + iv2*nbv2 + iv3*nbv3)),
-                            *(float *) ((char *) d->data + (ic*nbd0 + id1*nbd1 + id2*nbd2 + id3*nbd3)));
-                }
-
-                // S = SM * (S - dot(SM, S))
-                float dot_SM_gradSM = 0;
-                ggml_vec_dot_f32 (masked_begin, &dot_SM_gradSM, SM, S);
-                ggml_vec_acc1_f32(M, S, -dot_SM_gradSM);
-                ggml_vec_mul_f32 (masked_begin, S, S, SM);
-
-                // S = diag_mask_zero(S, P) * scale
-                // already done by above ggml_vec_set_f32
-
-                // exclude known zero S[..] values from operation
-                ggml_vec_scale_f32(masked_begin, S, scale);
-
-                // S    shape [M,1]
-                // SM   shape [M,1]
-                // kcur shape [D,M]
-                // qcur shape [D,1]
-                // vcur shape [M,D]
-
-                // grad[q][:D,iq1,iq2,iq3] += S @ kcur
-                // grad[q][:D,iq1,iq2,iq3] += shape[M,1] @ shape[D,M]
-                // for ic:
-                //  grad[q][:D,iq1,iq2,iq3] += S[ic] * kcur[:D,ic,ik2,ik3]
-                // exclude known zero S[..] values from loop
-                for (int64_t ic = 0; ic < masked_begin; ++ic) {
-                    ggml_vec_mad_f32(D,
-                            (float *) ((char *) grad_q  + (iq1*nbgq1 + iq2*nbgq2  + iq3*nbgq3)),
-                            (float *) ((char *) k->data + (ic*nbk1   + ik2*nbk2   + ik3*nbk3)),
-                            S[ic]);
-                }
-
-                // grad[k][:D,:M,iq2,iq3] += S.T @ qcur
-                // for ic:
-                //  grad[k][:D,ic,iq2,iq3] += S.T[0,ic] * qcur[:D,0]
-                //  grad[k][:D,ic,iq2,iq3] += S[ic]     * qcur[:D,0]
-                // exclude known zero S[..] values from loop
-                for (int64_t ic = 0; ic < masked_begin; ++ic) {
-                    ggml_vec_mad_f32(D,
-                            (float *) ((char *) grad_k  + (ic*nbgk1  + ik2*nbgk2  + ik3*nbgk3)),
-                            (float *) ((char *) q->data + (iq1*nbq1  + iq2*nbq2   + iq3*nbq3)),
-                            S[ic]);
-                }
-
-                // grad[v][:M,:D,iv2,iv3] += d[:D,id1,id2,id3].T       @ SM
-                // for ic:
-                //  grad[v][:M,ic,iv2,iv3] += d[:D,id1,id2,id3].T[0,ic] * SM[:M]
-                //  grad[v][:M,ic,iv2,iv3] += d[ic,id1,id2,id3]         * SM[:M]
-                // exclude known zero SM[..] values from mad
-                for (int64_t ic = 0; ic < D; ++ic) {
-                    ggml_vec_mad_f32(masked_begin,
-                            (float *) ((char *) grad_v   + (          ic*nbgv1 + iv2*nbgv2 + iv3*nbgv3)),
-                            SM,
-                            *(float *) ((char *) d->data + (ic*nbd0 + id1*nbd1 + id2*nbd2  + id3*nbd3)));
-                }
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_flash_attn_back(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * q,
-        const struct ggml_tensor * k,
-        const struct ggml_tensor * v,
-        const struct ggml_tensor * d,
-        const bool masked,
-        struct ggml_tensor * dst) {
-    switch (q->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_flash_attn_back_f32(params, q, k, v, d, masked, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_win_part
-
-static void ggml_compute_forward_win_part_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne)
-    GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne)
-
-    const int32_t nep0 = ((const int32_t *)(dst->op_params))[0];
-    const int32_t nep1 = ((const int32_t *)(dst->op_params))[1];
-    const int32_t w    = ((const int32_t *)(dst->op_params))[2];
-
-    assert(ne00 == ne0);
-    assert(ne3  == nep0*nep1);
-
-    // TODO: optimize / multi-thread
-    for (int py = 0; py < nep1; ++py) {
-        for (int px = 0; px < nep0; ++px) {
-            const int64_t i3 = py*nep0 + px;
-            for (int64_t i2 = 0; i2 < ne2; ++i2) {
-                for (int64_t i1 = 0; i1 < ne1; ++i1) {
-                    for (int64_t i0 = 0; i0 < ne0; ++i0) {
-                        const int64_t i02 = py*w + i2;
-                        const int64_t i01 = px*w + i1;
-                        const int64_t i00 = i0;
-
-                        const int64_t i = i3*ne2*ne1*ne0 + i2*ne1*ne0    + i1*ne0   + i0;
-                        const int64_t j =                  i02*ne01*ne00 + i01*ne00 + i00;
-
-                        if (py*w + i2 >= ne02 || px*w + i1 >= ne01) {
-                            ((float *) dst->data)[i] = 0.0f;
-                        } else {
-                            ((float *) dst->data)[i] = ((float *) src0->data)[j];
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_win_part(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_win_part_f32(params, src0, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_win_unpart
-
-static void ggml_compute_forward_win_unpart_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne)
-    GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne)
-
-    const int32_t w = ((const int32_t *)(dst->op_params))[0];
-
-    // padding
-    const int px = (w - ne1%w)%w;
-    //const int py = (w - ne2%w)%w;
-
-    const int npx = (px + ne1)/w;
-    //const int npy = (py + ne2)/w;
-
-    assert(ne0 == ne00);
-
-    // TODO: optimize / multi-thread
-    for (int64_t i2 = 0; i2 < ne2; ++i2) {
-        for (int64_t i1 = 0; i1 < ne1; ++i1) {
-            for (int64_t i0 = 0; i0 < ne0; ++i0) {
-                const int ip2 = i2/w;
-                const int ip1 = i1/w;
-
-                const int64_t i02 = i2%w;
-                const int64_t i01 = i1%w;
-                const int64_t i00 = i0;
-
-                const int64_t i = (ip2*npx + ip1)*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00 + i00;
-                const int64_t j =                                  i2*ne1*ne0    + i1*ne0   + i0;
-
-                ((float *) dst->data)[j] = ((float *) src0->data)[i];
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_win_unpart(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_win_unpart_f32(params, src0, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-//gmml_compute_forward_unary
-
-static void ggml_compute_forward_unary(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    const enum ggml_unary_op op = ggml_get_unary_op(dst);
-
-    switch (op) {
-        case GGML_UNARY_OP_ABS:
-            {
-                ggml_compute_forward_abs(params, src0, dst);
-            } break;
-        case GGML_UNARY_OP_SGN:
-            {
-                ggml_compute_forward_sgn(params, src0, dst);
-            } break;
-        case GGML_UNARY_OP_NEG:
-            {
-                ggml_compute_forward_neg(params, src0, dst);
-            } break;
-        case GGML_UNARY_OP_STEP:
-            {
-                ggml_compute_forward_step(params, src0, dst);
-            } break;
-        case GGML_UNARY_OP_TANH:
-            {
-                ggml_compute_forward_tanh(params, src0, dst);
-            } break;
-        case GGML_UNARY_OP_ELU:
-            {
-                ggml_compute_forward_elu(params, src0, dst);
-            } break;
-        case GGML_UNARY_OP_RELU:
-            {
-                ggml_compute_forward_relu(params, src0, dst);
-            } break;
-        case GGML_UNARY_OP_GELU:
-            {
-                ggml_compute_forward_gelu(params, src0, dst);
-            } break;
-        case GGML_UNARY_OP_GELU_QUICK:
-            {
-                ggml_compute_forward_gelu_quick(params, src0, dst);
-            } break;
-        case GGML_UNARY_OP_SILU:
-            {
-                ggml_compute_forward_silu(params, src0, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_get_rel_pos
-
-static void ggml_compute_forward_get_rel_pos_f16(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L292-L322
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    const int64_t w = ne1;
-
-    ggml_fp16_t * src0_data = (ggml_fp16_t *) src0->data;
-    ggml_fp16_t * dst_data  = (ggml_fp16_t *) dst->data;
-
-    for (int64_t i2 = 0; i2 < ne2; ++i2) {
-        for (int64_t i1 = 0; i1 < ne1; ++i1) {
-            const int64_t pos = (w - i1 - 1) + i2;
-            for (int64_t i0 = 0; i0 < ne0; ++i0) {
-                dst_data[i2*ne1*ne0 + i1*ne0 + i0] = src0_data[pos*ne00 + i0];
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_get_rel_pos(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F16:
-            {
-                ggml_compute_forward_get_rel_pos_f16(params, src0, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_add_rel_pos
-
-static void ggml_compute_forward_add_rel_pos_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        const struct ggml_tensor * src2,
-        struct ggml_tensor * dst) {
-
-    const bool inplace = (bool) ((int32_t *) dst->op_params)[0];
-    if (!inplace && params->type == GGML_TASK_INIT) {
-        memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst));
-        return;
-    }
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    int64_t t0 = ggml_perf_time_us();
-    UNUSED(t0);
-
-    // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L357-L359
-
-    float * src1_data = (float *) src1->data;
-    float * src2_data = (float *) src2->data;
-    float * dst_data  = (float *) dst->data;
-
-    const int64_t ne10 = src1->ne[0];
-    const int64_t ne11 = src1->ne[1];
-    const int64_t ne12 = src1->ne[2];
-    const int64_t ne13 = src1->ne[3];
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    // total patches in dst
-    const int np = ne13;
-
-    // patches per thread
-    const int dp = (np + nth - 1)/nth;
-
-    // patch range for this thread
-    const int ip0 = dp*ith;
-    const int ip1 = MIN(ip0 + dp, np);
-
-    for (int64_t i13 = ip0; i13 < ip1; ++i13) {
-        for (int64_t i12 = 0; i12 < ne12; ++i12) {
-            for (int64_t i11 = 0; i11 < ne11; ++i11) {
-                const int64_t jp1 = i13*ne12*ne11*ne10 + i12*ne11*ne10 + i11*ne10;
-                for (int64_t i10 = 0; i10 < ne10; ++i10) {
-                    const int64_t jp0  = jp1 + i10;
-                    const float src1_e = src1_data[jp0];
-                    const float src2_e = src2_data[jp0];
-
-                    const int64_t jdh = jp0 * ne10;
-                    const int64_t jdw = jdh - (ne10 - 1) * i10;
-
-                    for (int64_t j = 0; j < ne10; ++j) {
-                        dst_data[jdh + j     ] += src2_e;
-                        dst_data[jdw + j*ne10] += src1_e;
-                    }
-                }
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_add_rel_pos(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        const struct ggml_tensor * src2,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_add_rel_pos_f32(params, src0, src1, src2, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_map_unary
-
-static void ggml_compute_forward_map_unary_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst,
-        const ggml_unary_op_f32_t fun) {
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-
-    assert( dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
-    for (int i = 0; i < n; i++) {
-        fun(nc,
-                (float *) ((char *) dst->data  + i*( dst->nb[1])),
-                (float *) ((char *) src0->data + i*(src0->nb[1])));
-    }
-}
-
-static void ggml_compute_forward_map_unary(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst,
-        const ggml_unary_op_f32_t fun) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_map_unary_f32(params, src0, dst, fun);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_map_binary
-
-static void ggml_compute_forward_map_binary_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst,
-        const ggml_binary_op_f32_t fun) {
-    assert(params->ith == 0);
-    assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-
-    assert( dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-    assert(src1->nb[0] == sizeof(float));
-
-    for (int i = 0; i < n; i++) {
-        fun(nc,
-                (float *) ((char *) dst->data  + i*( dst->nb[1])),
-                (float *) ((char *) src0->data + i*(src0->nb[1])),
-                (float *) ((char *) src1->data + i*(src1->nb[1])));
-    }
-}
-
-static void ggml_compute_forward_map_binary(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst,
-        const ggml_binary_op_f32_t fun) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_map_binary_f32(params, src0, src1, dst, fun);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_map_custom1
-
-static void ggml_compute_forward_map_custom1_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * a,
-        struct ggml_tensor * dst,
-        const ggml_custom1_op_f32_t fun) {
-    assert(params->ith == 0);
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    fun(dst, a);
-}
-
-// ggml_compute_forward_map_custom2
-
-static void ggml_compute_forward_map_custom2_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * a,
-        const struct ggml_tensor * b,
-        struct ggml_tensor * dst,
-        const ggml_custom2_op_f32_t fun) {
-    assert(params->ith == 0);
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    fun(dst, a, b);
-}
-
-// ggml_compute_forward_map_custom3
-
-static void ggml_compute_forward_map_custom3_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * a,
-        const struct ggml_tensor * b,
-        const struct ggml_tensor * c,
-        struct ggml_tensor * dst,
-        const ggml_custom3_op_f32_t fun) {
-    assert(params->ith == 0);
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    fun(dst, a, b, c);
-}
-
-// ggml_compute_forward_map_custom1
-
-static void ggml_compute_forward_map_custom1(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * a,
-              struct ggml_tensor * dst) {
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    struct ggml_map_custom1_op_params * p = (struct ggml_map_custom1_op_params *) dst->op_params;
-
-    p->fun(dst, a, params->ith, params->nth, p->userdata);
-}
-
-// ggml_compute_forward_map_custom2
-
-static void ggml_compute_forward_map_custom2(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * a,
-        const struct ggml_tensor * b,
-              struct ggml_tensor * dst) {
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    struct ggml_map_custom2_op_params * p = (struct ggml_map_custom2_op_params *) dst->op_params;
-
-    p->fun(dst, a, b, params->ith, params->nth, p->userdata);
-}
-
-// ggml_compute_forward_map_custom3
-
-static void ggml_compute_forward_map_custom3(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * a,
-        const struct ggml_tensor * b,
-        const struct ggml_tensor * c,
-              struct ggml_tensor * dst) {
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    struct ggml_map_custom3_op_params * p = (struct ggml_map_custom3_op_params *) dst->op_params;
-
-    p->fun(dst, a, b, c, params->ith, params->nth, p->userdata);
-}
-
-// ggml_compute_forward_cross_entropy_loss
-
-static void ggml_compute_forward_cross_entropy_loss_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    GGML_ASSERT(ggml_is_contiguous(src0));
-    GGML_ASSERT(ggml_is_contiguous(src1));
-    GGML_ASSERT(ggml_is_scalar(dst));
-    GGML_ASSERT(ggml_are_same_shape(src0, src1));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    float * sums = (float *) params->wdata;
-
-    // TODO: handle transposed/permuted matrices
-    const int nc = src0->ne[0];
-    const int nr = ggml_nrows(src0);
-
-    GGML_ASSERT(params->wsize >= sizeof(float) * (nth + nth * nc));
-
-    if (params->type == GGML_TASK_INIT) {
-        if (ith == 0) {
-            memset(sums, 0, sizeof(float) * (nth + nth * nc));
-        }
-        return;
-    }
-
-    if (params->type == GGML_TASK_FINALIZE) {
-        if (ith == 0) {
-            float * dp = (float *) dst->data;
-            ggml_vec_sum_f32(nth, dp, sums);
-            dp[0] *= -1.0f / (float) nr;
-        }
-        return;
-    }
-
-    const double eps = 1e-9;
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    for (int i1 = ir0; i1 < ir1; i1++) {
-        float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]);
-        float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]);
-        float * st = ((float *) params->wdata) + nth + ith*nc;
-
-#ifndef NDEBUG
-        for (int i = 0; i < nc; ++i) {
-            //printf("p[%d] = %f\n", i, p[i]);
-            assert(!isnan(s0[i]));
-            assert(!isnan(s1[i]));
-        }
-#endif
-        // soft_max
-        ggml_float sum = 0.0;
-        {
-            float max = -INFINITY;
-            ggml_vec_max_f32(nc, &max, s0);
-
-            uint16_t scvt; UNUSED(scvt);
-            for (int i = 0; i < nc; i++) {
-                if (s0[i] == -INFINITY) {
-                    st[i] = 0.0f;
-                } else {
-#ifndef GGML_CROSS_ENTROPY_EXP_FP16
-                    const float s = s0[i] - max;
-                    const float val = expf(s);
-#else
-                    ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max);
-                    memcpy(&scvt, &s, sizeof(scvt));
-                    const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt]);
-#endif
-                    sum += (ggml_float)val;
-                    st[i] = val;
-                }
-            }
-
-            assert(sum > 0.0);
-            // sum = 1.0/sum;
-        }
-        // avoid log(0) by rescaling from [0..1] to [eps..1]
-        sum = (1.0 - eps) / sum;
-        ggml_vec_scale_f32(nc, st, sum);
-        ggml_vec_add1_f32(nc, st, st, eps);
-        ggml_vec_log_f32(nc, st, st);
-        ggml_vec_mul_f32(nc, st, st, s1);
-
-        float st_sum = 0;
-        ggml_vec_sum_f32(nc, &st_sum, st);
-        sums[ith] += st_sum;
-
-#ifndef NDEBUG
-        for (int i = 0; i < nc; ++i) {
-            assert(!isnan(st[i]));
-            assert(!isinf(st[i]));
-        }
-#endif
-    }
-
-}
-
-static void ggml_compute_forward_cross_entropy_loss(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_cross_entropy_loss_f32(params, src0, src1, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_cross_entropy_loss_back
-
-static void ggml_compute_forward_cross_entropy_loss_back_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        const struct ggml_tensor * opt0,
-        struct ggml_tensor * dst) {
-    GGML_ASSERT(ggml_is_contiguous(dst));
-    GGML_ASSERT(ggml_is_contiguous(src0));
-    GGML_ASSERT(ggml_is_contiguous(src1));
-    GGML_ASSERT(ggml_is_contiguous(opt0));
-    GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
-
-    const int64_t ith = params->ith;
-    const int64_t nth = params->nth;
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    const double eps = 1e-9;
-
-    // TODO: handle transposed/permuted matrices
-    const int64_t nc = src0->ne[0];
-    const int64_t nr = ggml_nrows(src0);
-
-    // rows per thread
-    const int64_t dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int64_t ir0 = dr*ith;
-    const int64_t ir1 = MIN(ir0 + dr, nr);
-
-    float * d   = (float *) opt0->data;
-
-    for (int64_t i1 = ir0; i1 < ir1; i1++) {
-        float * ds0 = (float *)((char *) dst->data  + i1*dst->nb[1]);
-        float * s0  = (float *)((char *) src0->data + i1*src0->nb[1]);
-        float * s1  = (float *)((char *) src1->data + i1*src1->nb[1]);
-
-#ifndef NDEBUG
-        for (int i = 0; i < nc; ++i) {
-            //printf("p[%d] = %f\n", i, p[i]);
-            assert(!isnan(s0[i]));
-            assert(!isnan(s1[i]));
-        }
-#endif
-
-        // soft_max
-        ggml_float sum = 0.0;
-        {
-            float max = -INFINITY;
-            ggml_vec_max_f32(nc, &max, s0);
-
-            uint16_t scvt; UNUSED(scvt);
-            for (int i = 0; i < nc; i++) {
-                if (s0[i] == -INFINITY) {
-                    ds0[i] = 0.0f;
-                } else {
-#ifndef GGML_CROSS_ENTROPY_EXP_FP16
-                    const float s = s0[i] - max;
-                    const float val = expf(s);
-#else
-                    ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max);
-                    memcpy(&scvt, &s, sizeof(scvt));
-                    const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt]);
-#endif
-                    sum += (ggml_float)val;
-                    ds0[i] = val;
-                }
-            }
-
-            assert(sum > 0.0);
-            sum = (1.0 - eps)/sum;
-        }
-
-        // grad(src0) = (softmax(src0) - src1) * grad(cross_entropy_loss(src0, src1)) / nr
-        ggml_vec_scale_f32(nc, ds0, sum);
-        ggml_vec_add1_f32(nc, ds0, ds0, eps);
-        ggml_vec_sub_f32(nc, ds0, ds0, s1);
-        ggml_vec_scale_f32(nc, ds0, d[0] / (float) nr);
-
-#ifndef NDEBUG
-        for (int i = 0; i < nc; ++i) {
-            assert(!isnan(ds0[i]));
-            assert(!isinf(ds0[i]));
-        }
-#endif
-    }
-}
-
-static void ggml_compute_forward_cross_entropy_loss_back(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        const struct ggml_tensor * opt0,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_cross_entropy_loss_back_f32(params, src0, src1, opt0, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-/////////////////////////////////
-
-static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
-    GGML_ASSERT(params);
-
-    if (tensor->op == GGML_OP_NONE) {
-        return;
-    }
-
-#ifdef GGML_USE_CUBLAS
-    bool skip_cpu = ggml_cuda_compute_forward(params, tensor);
-    if (skip_cpu) {
-        return;
-    }
-    GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU);
-    GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
-#endif // GGML_USE_CUBLAS
-
-    switch (tensor->op) {
-        case GGML_OP_DUP:
-            {
-                ggml_compute_forward_dup(params, tensor->src[0], tensor);
-            } break;
-        case GGML_OP_ADD:
-            {
-                ggml_compute_forward_add(params, tensor->src[0], tensor->src[1], tensor);
-            } break;
-        case GGML_OP_ADD1:
-            {
-                ggml_compute_forward_add1(params, tensor->src[0], tensor->src[1], tensor);
-            } break;
-        case GGML_OP_ACC:
-            {
-                ggml_compute_forward_acc(params, tensor->src[0], tensor->src[1], tensor);
-            } break;
-        case GGML_OP_SUB:
-            {
-                ggml_compute_forward_sub(params, tensor->src[0], tensor->src[1], tensor);
-            } break;
-        case GGML_OP_MUL:
-            {
-                ggml_compute_forward_mul(params, tensor->src[0], tensor->src[1], tensor);
-            } break;
-        case GGML_OP_DIV:
-            {
-                ggml_compute_forward_div(params, tensor->src[0], tensor->src[1], tensor);
-            } break;
-        case GGML_OP_SQR:
-            {
-                ggml_compute_forward_sqr(params, tensor->src[0], tensor);
-            } break;
-        case GGML_OP_SQRT:
-            {
-                ggml_compute_forward_sqrt(params, tensor->src[0], tensor);
-            } break;
-        case GGML_OP_LOG:
-            {
-                ggml_compute_forward_log(params, tensor->src[0], tensor);
-            } break;
-        case GGML_OP_SUM:
-            {
-                ggml_compute_forward_sum(params, tensor->src[0], tensor);
-            } break;
-        case GGML_OP_SUM_ROWS:
-            {
-                ggml_compute_forward_sum_rows(params, tensor->src[0], tensor);
-            } break;
-        case GGML_OP_MEAN:
-            {
-                ggml_compute_forward_mean(params, tensor->src[0], tensor);
-            } break;
-        case GGML_OP_ARGMAX:
-            {
-                ggml_compute_forward_argmax(params, tensor->src[0], tensor);
-            } break;
-        case GGML_OP_REPEAT:
-            {
-                ggml_compute_forward_repeat(params, tensor->src[0], tensor);
-            } break;
-        case GGML_OP_REPEAT_BACK:
-            {
-                ggml_compute_forward_repeat_back(params, tensor->src[0], tensor);
-            } break;
-        case GGML_OP_CONCAT:
-            {
-                ggml_compute_forward_concat(params, tensor->src[0], tensor->src[1], tensor);
-            } break;
-        case GGML_OP_SILU_BACK:
-            {
-                ggml_compute_forward_silu_back(params, tensor->src[0], tensor->src[1], tensor);
-            } break;
-        case GGML_OP_NORM:
-            {
-                ggml_compute_forward_norm(params, tensor->src[0], tensor);
-            } break;
-        case GGML_OP_RMS_NORM:
-            {
-                ggml_compute_forward_rms_norm(params, tensor->src[0], tensor);
-            } break;
-        case GGML_OP_RMS_NORM_BACK:
-            {
-                ggml_compute_forward_rms_norm_back(params, tensor->src[0], tensor->src[1], tensor);
-            } break;
-        case GGML_OP_GROUP_NORM:
-            {
-                ggml_compute_forward_group_norm(params, tensor->src[0], tensor);
-            } break;
-        case GGML_OP_MUL_MAT:
-            {
-                ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor);
-            } break;
-        case GGML_OP_MUL_MAT_ID:
-            {
-                ggml_compute_forward_mul_mat_id(params, tensor->src[0], tensor->src[1], tensor);
-            } break;
-        case GGML_OP_OUT_PROD:
-            {
-                ggml_compute_forward_out_prod(params, tensor->src[0], tensor->src[1], tensor);
-            } break;
-        case GGML_OP_SCALE:
-            {
-                ggml_compute_forward_scale(params, tensor->src[0], tensor);
-            } break;
-        case GGML_OP_SET:
-            {
-                ggml_compute_forward_set(params, tensor->src[0], tensor->src[1], tensor);
-            } break;
-        case GGML_OP_CPY:
-            {
-                ggml_compute_forward_cpy(params, tensor->src[0], tensor);
-            } break;
-        case GGML_OP_CONT:
-            {
-                ggml_compute_forward_cont(params, tensor->src[0], tensor);
-            } break;
-        case GGML_OP_RESHAPE:
-            {
-                ggml_compute_forward_reshape(params, tensor->src[0], tensor);
-            } break;
-        case GGML_OP_VIEW:
-            {
-                ggml_compute_forward_view(params, tensor->src[0]);
-            } break;
-        case GGML_OP_PERMUTE:
-            {
-                ggml_compute_forward_permute(params, tensor->src[0]);
-            } break;
-        case GGML_OP_TRANSPOSE:
-            {
-                ggml_compute_forward_transpose(params, tensor->src[0]);
-            } break;
-        case GGML_OP_GET_ROWS:
-            {
-                ggml_compute_forward_get_rows(params, tensor->src[0], tensor->src[1], tensor);
-            } break;
-        case GGML_OP_GET_ROWS_BACK:
-            {
-                ggml_compute_forward_get_rows_back(params, tensor->src[0], tensor->src[1], tensor);
-            } break;
-        case GGML_OP_DIAG:
-            {
-                ggml_compute_forward_diag(params, tensor->src[0], tensor);
-            } break;
-        case GGML_OP_DIAG_MASK_INF:
-            {
-                ggml_compute_forward_diag_mask_inf(params, tensor->src[0], tensor);
-            } break;
-        case GGML_OP_DIAG_MASK_ZERO:
-            {
-                ggml_compute_forward_diag_mask_zero(params, tensor->src[0], tensor);
-            } break;
-        case GGML_OP_SOFT_MAX:
-            {
-                ggml_compute_forward_soft_max(params, tensor->src[0], tensor->src[1], tensor);
-            } break;
-        case GGML_OP_SOFT_MAX_BACK:
-            {
-                ggml_compute_forward_soft_max_back(params, tensor->src[0], tensor->src[1], tensor);
-            } break;
-        case GGML_OP_ROPE:
-            {
-                ggml_compute_forward_rope(params, tensor->src[0], tensor->src[1], tensor);
-            } break;
-        case GGML_OP_ROPE_BACK:
-            {
-                ggml_compute_forward_rope_back(params, tensor->src[0], tensor->src[1], tensor);
-            } break;
-        case GGML_OP_ALIBI:
-            {
-                ggml_compute_forward_alibi(params, tensor->src[0], tensor);
-            } break;
-        case GGML_OP_CLAMP:
-            {
-                ggml_compute_forward_clamp(params, tensor->src[0], tensor);
-            } break;
-        case GGML_OP_CONV_TRANSPOSE_1D:
-            {
-                ggml_compute_forward_conv_transpose_1d(params, tensor->src[0], tensor->src[1], tensor);
-            } break;
-        case GGML_OP_IM2COL:
-            {
-                ggml_compute_forward_im2col(params, tensor->src[0], tensor->src[1], tensor);
-            } break;
-        case GGML_OP_CONV_TRANSPOSE_2D:
-            {
-                ggml_compute_forward_conv_transpose_2d(params, tensor->src[0], tensor->src[1], tensor);
-            } break;
-        case GGML_OP_POOL_1D:
-            {
-                ggml_compute_forward_pool_1d(params, tensor->src[0], tensor);
-            } break;
-        case GGML_OP_POOL_2D:
-            {
-                ggml_compute_forward_pool_2d(params, tensor->src[0], tensor);
-            } break;
-        case GGML_OP_UPSCALE:
-            {
-                ggml_compute_forward_upscale(params, tensor->src[0], tensor);
-            } break;
-        case GGML_OP_PAD:
-            {
-                ggml_compute_forward_pad(params, tensor->src[0], tensor);
-            } break;
-        case GGML_OP_ARGSORT:
-            {
-                ggml_compute_forward_argsort(params, tensor->src[0], tensor);
-            } break;
-        case GGML_OP_LEAKY_RELU:
-            {
-                ggml_compute_forward_leaky_relu(params, tensor->src[0], tensor);
-            } break;
-        case GGML_OP_FLASH_ATTN:
-            {
-                const int32_t t = ggml_get_op_params_i32(tensor, 0);
-                GGML_ASSERT(t == 0 || t == 1);
-                const bool masked = t != 0;
-                ggml_compute_forward_flash_attn(params, tensor->src[0], tensor->src[1], tensor->src[2], masked, tensor);
-            } break;
-        case GGML_OP_FLASH_FF:
-            {
-                ggml_compute_forward_flash_ff(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], tensor->src[4], tensor);
-            } break;
-        case GGML_OP_FLASH_ATTN_BACK:
-            {
-                int32_t t = ggml_get_op_params_i32(tensor, 0);
-                GGML_ASSERT(t == 0 || t == 1);
-                bool masked = t != 0;
-                ggml_compute_forward_flash_attn_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], masked, tensor);
-            } break;
-        case GGML_OP_WIN_PART:
-            {
-                ggml_compute_forward_win_part(params, tensor->src[0], tensor);
-            } break;
-        case GGML_OP_WIN_UNPART:
-            {
-                ggml_compute_forward_win_unpart(params, tensor->src[0], tensor);
-            } break;
-        case GGML_OP_UNARY:
-            {
-                ggml_compute_forward_unary(params, tensor->src[0], tensor);
-            } break;
-        case GGML_OP_GET_REL_POS:
-            {
-                ggml_compute_forward_get_rel_pos(params, tensor->src[0], tensor);
-            } break;
-        case GGML_OP_ADD_REL_POS:
-            {
-                ggml_compute_forward_add_rel_pos(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
-            } break;
-        case GGML_OP_MAP_UNARY:
-            {
-                ggml_unary_op_f32_t fun;
-                memcpy(&fun, tensor->op_params, sizeof(fun));
-                ggml_compute_forward_map_unary(params, tensor->src[0], tensor, fun);
-            }
-            break;
-        case GGML_OP_MAP_BINARY:
-            {
-                ggml_binary_op_f32_t fun;
-                memcpy(&fun, tensor->op_params, sizeof(fun));
-                ggml_compute_forward_map_binary(params, tensor->src[0], tensor->src[1], tensor, fun);
-            }
-            break;
-        case GGML_OP_MAP_CUSTOM1_F32:
-            {
-                ggml_custom1_op_f32_t fun;
-                memcpy(&fun, tensor->op_params, sizeof(fun));
-                ggml_compute_forward_map_custom1_f32(params, tensor->src[0], tensor, fun);
-            }
-            break;
-        case GGML_OP_MAP_CUSTOM2_F32:
-            {
-                ggml_custom2_op_f32_t fun;
-                memcpy(&fun, tensor->op_params, sizeof(fun));
-                ggml_compute_forward_map_custom2_f32(params, tensor->src[0], tensor->src[1], tensor, fun);
-            }
-            break;
-        case GGML_OP_MAP_CUSTOM3_F32:
-            {
-                ggml_custom3_op_f32_t fun;
-                memcpy(&fun, tensor->op_params, sizeof(fun));
-                ggml_compute_forward_map_custom3_f32(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor, fun);
-            }
-            break;
-        case GGML_OP_MAP_CUSTOM1:
-            {
-                ggml_compute_forward_map_custom1(params, tensor->src[0], tensor);
-            }
-            break;
-        case GGML_OP_MAP_CUSTOM2:
-            {
-                ggml_compute_forward_map_custom2(params, tensor->src[0], tensor->src[1], tensor);
-            }
-            break;
-        case GGML_OP_MAP_CUSTOM3:
-            {
-                ggml_compute_forward_map_custom3(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
-            }
-            break;
-        case GGML_OP_CROSS_ENTROPY_LOSS:
-            {
-                ggml_compute_forward_cross_entropy_loss(params, tensor->src[0], tensor->src[1], tensor);
-            }
-            break;
-        case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
-            {
-                ggml_compute_forward_cross_entropy_loss_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
-            }
-            break;
-        case GGML_OP_NONE:
-            {
-                // nop
-            } break;
-        case GGML_OP_COUNT:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-////////////////////////////////////////////////////////////////////////////////
-
-static size_t ggml_hash_size(size_t min_sz) {
-    // next primes after powers of two
-    static const size_t primes[] = {
-        2, 3, 5, 11, 17, 37, 67, 131, 257, 521, 1031,
-        2053, 4099, 8209, 16411, 32771, 65537, 131101,
-        262147, 524309, 1048583, 2097169, 4194319, 8388617,
-        16777259, 33554467, 67108879, 134217757, 268435459,
-        536870923, 1073741827, 2147483659
-    };
-    static const size_t n_primes = sizeof(primes)/sizeof(primes[0]);
-
-    // find the smallest prime that is larger or equal to min_sz
-    size_t l = 0;
-    size_t r = n_primes;
-    while (l < r) {
-        size_t m = (l + r)/2;
-        if (primes[m] < min_sz) {
-            l = m + 1;
-        } else {
-            r = m;
-        }
-    }
-    size_t sz = l < n_primes ? primes[l] : min_sz | 1;
-    return sz;
-}
-
-static size_t ggml_hash(const void * p) {
-    return (size_t)p;
-}
-
-size_t ggml_hash_find(const struct ggml_hash_set hash_set, struct ggml_tensor * key) {
-    size_t h = ggml_hash(key) % hash_set.size;
-
-    // linear probing
-    size_t i = h;
-    while (hash_set.keys[i] != NULL && hash_set.keys[i] != key) {
-        i = (i + 1) % hash_set.size;
-        if (i == h) {
-            // visited all hash table entries -> not found
-            return GGML_HASHTABLE_FULL;
-        }
-    }
-    return i;
-}
-
-bool ggml_hash_contains(struct ggml_hash_set hash_set, struct ggml_tensor * key) {
-    size_t i = ggml_hash_find(hash_set, key);
-    return i != GGML_HASHTABLE_FULL && hash_set.keys[i] == key;
-}
-
-size_t ggml_hash_insert(struct ggml_hash_set hash_set, struct ggml_tensor * key) {
-    size_t i = ggml_hash_find(hash_set, key);
-
-    GGML_ASSERT(i != GGML_HASHTABLE_FULL);
-
-    if (hash_set.keys[i] == key) {
-        return GGML_HASHTABLE_ALREADY_EXISTS;
-    }
-
-    // insert
-    GGML_ASSERT(hash_set.keys[i] == NULL);
-    hash_set.keys[i] = key;
-    return i;
-}
-
-size_t ggml_hash_find_or_insert(struct ggml_hash_set hash_set, struct ggml_tensor * key) {
-    size_t i = ggml_hash_find(hash_set, key);
-
-    GGML_ASSERT(i != GGML_HASHTABLE_FULL);
-
-    hash_set.keys[i] = key;
-    return i;
-}
-
-struct ggml_hash_set ggml_hash_set_new(size_t size) {
-    size = ggml_hash_size(size);
-    struct ggml_hash_set result;
-    result.size = size;
-    result.keys = malloc(sizeof(struct ggml_tensor *) * size);
-    memset(result.keys, 0, sizeof(struct ggml_tensor *) * size);
-    return result;
-}
-
-static void ggml_hash_set_free(struct ggml_hash_set hash_set) {
-    free(hash_set.keys);
-}
-
-struct hash_map {
-    struct ggml_hash_set set;
-    struct ggml_tensor ** vals;
-};
-
-static struct hash_map * ggml_new_hash_map(size_t size) {
-    struct hash_map * result = malloc(sizeof(struct hash_map));
-    result->set = ggml_hash_set_new(size);
-    result->vals = malloc(sizeof(struct ggml_tensor *) * result->set.size);
-    memset(result->vals, 0, sizeof(struct ggml_tensor *) * result->set.size);
-    return result;
-}
-
-static void ggml_hash_map_free(struct hash_map * map) {
-    ggml_hash_set_free(map->set);
-    free(map->vals);
-    free(map);
-}
-
-// gradient checkpointing
-
-static struct ggml_tensor * ggml_recompute_graph_node(
-        struct ggml_context * ctx,
-        struct ggml_cgraph  * graph,
-        struct hash_map     * replacements,
-        struct ggml_tensor  * node) {
-
-    if (node == NULL) {
-        return NULL;
-    }
-
-    if (node->is_param) {
-        return node;
-    }
-
-    if (!ggml_hash_contains(graph->visited_hash_table, node)) {
-        return node;
-    }
-
-    int count_children = 0;
-    for (int k = 0; k < GGML_MAX_SRC; ++k) {
-        if (node->src[k]) {
-            ++count_children;
-        }
-    }
-
-    if (count_children == 0) {
-        return node;
-    }
-
-    size_t i = ggml_hash_find(replacements->set, node);
-    GGML_ASSERT(i != GGML_HASHTABLE_FULL); // assert that not full
-    if (replacements->set.keys[i] == node) {
-        return replacements->vals[i];
-    }
-
-    struct ggml_tensor * clone = ggml_new_tensor(ctx, node->type, GGML_MAX_DIMS, node->ne);
-
-    // insert clone into replacements
-    GGML_ASSERT(replacements->set.keys[i] == NULL); // assert that we don't overwrite
-    replacements->set.keys[i] = node;
-    replacements->vals[i] = clone;
-
-    clone->op       = node->op;
-    clone->grad     = node->grad;
-    clone->is_param = node->is_param;
-    clone->extra    = node->extra;
-    for (int k = 0; k < GGML_MAX_DIMS; ++k) {
-        clone->nb[k] = node->nb[k];
-    }
-    for (int k = 0; k < GGML_MAX_SRC; ++k) {
-        clone->src[k] = ggml_recompute_graph_node(ctx, graph, replacements, node->src[k]);
-    }
-    if (node->view_src != NULL) {
-        clone->data = (node->view_src->data == NULL)
-                        ? NULL // view_src not yet allocated
-                        : (char *) node->view_src->data // view_src already allocated
-                                 + node->view_offs;
-        clone->view_src  = node->view_src;
-        clone->view_offs = node->view_offs;
-    }
-
-    GGML_ASSERT(sizeof(node->op_params) == sizeof(int32_t) * (GGML_MAX_OP_PARAMS / sizeof(int32_t)));
-    GGML_ASSERT(sizeof(node->name)      == GGML_MAX_NAME);
-    memcpy(clone->op_params, node->op_params, sizeof(node->op_params));
-    ggml_format_name(clone, "%s (clone)", ggml_get_name(node));
-
-    return clone;
-}
-
-void ggml_build_backward_gradient_checkpointing(
-        struct ggml_context   * ctx,
-        struct ggml_cgraph    * gf,
-        struct ggml_cgraph    * gb,
-        struct ggml_cgraph    * gb_tmp,
-        struct ggml_tensor  * * checkpoints,
-        int                     n_checkpoints) {
-    ggml_graph_cpy(gf, gb_tmp);
-    ggml_build_backward_expand(ctx, gf, gb_tmp, true);
-
-    if (n_checkpoints <= 0) {
-        ggml_graph_cpy(gb_tmp, gb);
-        return;
-    }
-
-    struct hash_map * replacements = ggml_new_hash_map(gf->n_nodes + gf->n_leafs + n_checkpoints);
-
-    // insert checkpoints in replacements
-    for (int i = 0; i < n_checkpoints; ++i) {
-        size_t k = ggml_hash_find(replacements->set, checkpoints[i]);
-        GGML_ASSERT(k != GGML_HASHTABLE_FULL); // assert that not full
-        GGML_ASSERT(replacements->set.keys[k] == NULL); // assert that we don't overwrite
-        replacements->set.keys[k] = checkpoints[i];
-        replacements->vals[k]     = checkpoints[i];
-    }
-
-    ggml_graph_cpy(gf, gb);
-    // rewrite gb_tmp->nodes[gf->n_nodes:gb_tmp->n_nodes],
-    // replacing references to gb_tmp->nodes[0:gf->n_nodes] ( == gf->nodes[0:gf->n_nodes]),
-    // by recomputing them from checkpoints
-    for (int i = gf->n_nodes; i<gb_tmp->n_nodes; ++i) {
-        struct ggml_tensor * node = gb_tmp->nodes[i];
-        for (int k = 0; k < GGML_MAX_SRC; ++k) {
-            // insert new tensors recomputing src, reusing already made replacements,
-            // remember replacements: remember new tensors with mapping from corresponding gf nodes
-            // recurse for input tensors,
-            // unless (i.e. terminating when) input tensors are replacements (like checkpoints)
-            node->src[k] = ggml_recompute_graph_node(ctx, gf, replacements, node->src[k]);
-        }
-        // insert rewritten backward node with replacements made into resulting backward graph gb
-        ggml_build_forward_expand(gb, node);
-    }
-
-    ggml_hash_map_free(replacements);
-}
-
-// functions to change gradients considering the case that input a might be initial gradient with zero value
-
-static struct ggml_tensor * ggml_add_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set zero_table) {
-    if (ggml_hash_contains(zero_table, a)) {
-        return b;
-    } else {
-        return ggml_add_impl(ctx, a, b, false);
-    }
-}
-
-static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset, struct ggml_hash_set zero_table) {
-    if (ggml_hash_contains(zero_table, a)) {
-        struct ggml_tensor * a_zero = ggml_scale(ctx, a, 0.0f);
-        return ggml_acc_impl(ctx, a_zero, b, nb1, nb2, nb3, offset, false);
-    } else {
-        return ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, false);
-    }
-}
-
-static struct ggml_tensor * ggml_add1_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set zero_table) {
-    if (ggml_hash_contains(zero_table, a)) {
-        return ggml_repeat(ctx, b, a);
-    } else {
-        return ggml_add1_impl(ctx, a, b, false);
-    }
-}
-
-static struct ggml_tensor * ggml_sub_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set zero_table) {
-    if (ggml_hash_contains(zero_table, a)) {
-        return ggml_neg(ctx, b);
-    } else {
-        return ggml_sub_impl(ctx, a, b, false);
-    }
-}
-
-static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, struct ggml_hash_set zero_table) {
-    struct ggml_tensor * src0 = tensor->src[0];
-    struct ggml_tensor * src1 = tensor->src[1];
-
-    switch (tensor->op) {
-        case GGML_OP_DUP:
-            {
-                if (src0->grad) {
-                    src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table);
-                }
-            } break;
-        case GGML_OP_ADD:
-            {
-                if (src0->grad) {
-                    src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table);
-                }
-                if (src1->grad) {
-                    src1->grad = ggml_add_or_set(ctx, src1->grad, tensor->grad, zero_table);
-                }
-            } break;
-        case GGML_OP_ADD1:
-            {
-                if (src0->grad) {
-                    src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table);
-                }
-                if (src1->grad) {
-                    src1->grad = ggml_add_or_set(ctx,
-                        src1->grad,
-                        ggml_mean(ctx, tensor->grad), // TODO: should probably be sum instead of mean
-                        zero_table);
-                }
-            } break;
-        case GGML_OP_ACC:
-            {
-                if (src0->grad) {
-                    src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table);
-                }
-                if (src1->grad) {
-                    const size_t nb1     = ((int32_t *) tensor->op_params)[0];
-                    const size_t nb2     = ((int32_t *) tensor->op_params)[1];
-                    const size_t nb3     = ((int32_t *) tensor->op_params)[2];
-                    const size_t offset  = ((int32_t *) tensor->op_params)[3];
-
-                    struct ggml_tensor * tensor_grad_view = ggml_view_4d(ctx,
-                        tensor->grad,
-                        src1->grad->ne[0],
-                        src1->grad->ne[1],
-                        src1->grad->ne[2],
-                        src1->grad->ne[3],
-                        nb1, nb2, nb3, offset);
-
-                    src1->grad =
-                        ggml_add_or_set(ctx,
-                            src1->grad,
-                            ggml_reshape(ctx,
-                                ggml_cont(ctx, tensor_grad_view),
-                                src1->grad),
-                            zero_table);
-                }
-            } break;
-        case GGML_OP_SUB:
-            {
-                if (src0->grad) {
-                    src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table);
-                }
-                if (src1->grad) {
-                    src1->grad = ggml_sub_or_set(ctx, src1->grad, tensor->grad, zero_table);
-                }
-            } break;
-        case GGML_OP_MUL:
-            {
-                if (src0->grad) {
-                    src0->grad =
-                        ggml_add_or_set(ctx,
-                                src0->grad,
-                                ggml_mul(ctx, src1, tensor->grad),
-                                zero_table);
-                }
-                if (src1->grad) {
-                    src1->grad =
-                        ggml_add_or_set(ctx,
-                                src1->grad,
-                                ggml_mul(ctx, src0, tensor->grad),
-                                zero_table);
-                }
-            } break;
-        case GGML_OP_DIV:
-            {
-                if (src0->grad) {
-                    src0->grad =
-                        ggml_add_or_set(ctx,
-                                src0->grad,
-                                ggml_div(ctx, tensor->grad, src1),
-                                zero_table);
-                }
-                if (src1->grad) {
-                    src1->grad =
-                        ggml_sub_or_set(ctx,
-                                src1->grad,
-                                ggml_mul(ctx,
-                                    tensor->grad,
-                                    ggml_div(ctx, tensor, src1)),
-                                zero_table);
-                }
-            } break;
-        case GGML_OP_SQR:
-            {
-                if (src0->grad) {
-                    src0->grad =
-                        ggml_add_or_set(ctx,
-                                src0->grad,
-                                ggml_scale(ctx,
-                                    ggml_mul(ctx, src0, tensor->grad),
-                                    2.0f),
-                                zero_table);
-                }
-            } break;
-        case GGML_OP_SQRT:
-            {
-                if (src0->grad) {
-                    src0->grad =
-                        ggml_add_or_set(ctx,
-                                src0->grad,
-                                ggml_scale(ctx,
-                                    ggml_div(ctx,
-                                        tensor->grad,
-                                        tensor),
-                                    0.5f),
-                                zero_table);
-                }
-            } break;
-        case GGML_OP_LOG:
-            {
-                if (src0->grad) {
-                    src0->grad =
-                        ggml_add_or_set(ctx,
-                                src0->grad,
-                                ggml_div(ctx,
-                                    tensor->grad,
-                                    src0),
-                                zero_table);
-                }
-            } break;
-        case GGML_OP_SUM:
-            {
-                if (src0->grad) {
-                    src0->grad =
-                        ggml_add1_or_set(ctx,
-                                src0->grad,
-                                tensor->grad,
-                                zero_table);
-                }
-            } break;
-        case GGML_OP_SUM_ROWS:
-            {
-                if (src0->grad) {
-                    src0->grad =
-                        ggml_add_or_set(ctx,
-                                src0->grad,
-                                ggml_repeat(ctx,
-                                    tensor->grad,
-                                    src0->grad),
-                                zero_table);
-                }
-            } break;
-        case GGML_OP_MEAN:
-        case GGML_OP_ARGMAX:
-            {
-                GGML_ASSERT(false); // TODO: implement
-            } break;
-        case GGML_OP_REPEAT:
-            {
-                // necessary for llama
-                if (src0->grad) {
-                    src0->grad = ggml_add_or_set(ctx,
-                            src0->grad,
-                            ggml_repeat_back(ctx, tensor->grad, src0->grad),
-                            zero_table);
-                }
-            } break;
-        case GGML_OP_REPEAT_BACK:
-            {
-                if (src0->grad) {
-                    // TODO: test this
-                    src0->grad = ggml_add_or_set(ctx,
-                            src0->grad,
-                            ggml_repeat(ctx, tensor->grad, src0->grad),
-                            zero_table);
-                }
-            } break;
-        case GGML_OP_CONCAT:
-            {
-                GGML_ASSERT(false); // TODO: implement
-            } break;
-        case GGML_OP_SILU_BACK:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_NORM:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_RMS_NORM:
-            {
-                // necessary for llama
-                if (src0->grad) {
-                    float eps;
-                    memcpy(&eps, tensor->op_params, sizeof(float));
-
-                    src0->grad = ggml_add_or_set(ctx,
-                            src0->grad,
-                            ggml_rms_norm_back(ctx, src0, tensor->grad, eps),
-                            zero_table);
-                }
-            } break;
-        case GGML_OP_RMS_NORM_BACK:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_GROUP_NORM:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_MUL_MAT:
-            {
-                // https://cs231n.github.io/optimization-2/#staged
-                // # forward pass
-                // s0 = np.random.randn(5, 10)
-                // s1 = np.random.randn(10, 3)
-                // t = s0.dot(s1)
-
-                // # now suppose we had the gradient on t from above in the circuit
-                // dt = np.random.randn(*t.shape) # same shape as t
-                // ds0 = dt.dot(s1.T) #.T gives the transpose of the matrix
-                // ds1 = t.T.dot(dt)
-
-                // tensor.shape [m,p,qq,rr]
-                // src0.shape   [n,m,q1,r1]
-                // src1.shape   [n,p,qq,rr]
-
-                // necessary for llama
-                if (src0->grad) {
-                    struct ggml_tensor * s1_tg =
-                        ggml_out_prod(ctx, // [n,m,qq,rr]
-                            src1,          // [n,p,qq,rr]
-                            tensor->grad); // [m,p,qq,rr]
-                    const int64_t qq = s1_tg->ne[2];
-                    const int64_t rr = s1_tg->ne[3];
-                    const int64_t q1 = src0->ne[2];
-                    const int64_t r1 = src0->ne[3];
-                    const bool ne2_broadcasted = qq > q1;
-                    const bool ne3_broadcasted = rr > r1;
-                    if (ne2_broadcasted || ne3_broadcasted) {
-                        // sum broadcast repetitions of s1_tg into shape of src0
-                        s1_tg = ggml_repeat_back(ctx, s1_tg, src0);
-                    }
-                    src0->grad =
-                        ggml_add_or_set(ctx,
-                                src0->grad, // [n,m,q1,r1]
-                                s1_tg,      // [n,m,q1,r1]
-                                zero_table);
-                }
-                if (src1->grad) {
-                    src1->grad =
-                        ggml_add_or_set(ctx,
-                                src1->grad,                            // [n,p,qq,rr]
-                                // ggml_mul_mat(ctx,                   // [n,p,qq,rr]
-                                //     ggml_cont(ctx,                  // [m,n,q1,r1]
-                                //         ggml_transpose(ctx, src0)), // [m,n,q1,r1]
-                                //     tensor->grad),                  // [m,p,qq,rr]
-
-                                // // when src0 is bigger than tensor->grad (this is mostly the case in llama),
-                                // // avoid transpose of src0, rather transpose smaller tensor->grad
-                                // // and then use ggml_out_prod
-                                ggml_out_prod(ctx,                  // [n,p,qq,rr]
-                                    src0,                           // [n,m,q1,r1]
-                                    ggml_transpose(ctx,             // [p,m,qq,rr]
-                                        tensor->grad)),             // [m,p,qq,rr]
-                                zero_table);
-                }
-            } break;
-        case GGML_OP_MUL_MAT_ID:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_OUT_PROD:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_SCALE:
-            {
-                // necessary for llama
-                if (src0->grad) {
-                    float s;
-                    memcpy(&s, tensor->op_params, sizeof(float));
-
-                    src0->grad =
-                        ggml_add_or_set(ctx,
-                            src0->grad,
-                            ggml_scale_impl(ctx, tensor->grad, s, false),
-                            zero_table);
-                }
-            } break;
-        case GGML_OP_SET:
-            {
-                const size_t nb1     = ((int32_t *) tensor->op_params)[0];
-                const size_t nb2     = ((int32_t *) tensor->op_params)[1];
-                const size_t nb3     = ((int32_t *) tensor->op_params)[2];
-                const size_t offset  = ((int32_t *) tensor->op_params)[3];
-
-                struct ggml_tensor * tensor_grad_view = NULL;
-
-                if (src0->grad || src1->grad) {
-                    GGML_ASSERT(src0->type == tensor->type);
-                    GGML_ASSERT(tensor->grad->type == tensor->type);
-                    GGML_ASSERT(tensor->grad->type == src1->grad->type);
-
-                    tensor_grad_view = ggml_view_4d(ctx,
-                        tensor->grad,
-                        src1->grad->ne[0],
-                        src1->grad->ne[1],
-                        src1->grad->ne[2],
-                        src1->grad->ne[3],
-                        nb1, nb2, nb3, offset);
-                }
-
-                if (src0->grad) {
-                    src0->grad = ggml_add_or_set(ctx,
-                        src0->grad,
-                        ggml_acc_impl(ctx,
-                            tensor->grad,
-                            ggml_neg(ctx, tensor_grad_view),
-                            nb1, nb2, nb3, offset, false),
-                        zero_table);
-                }
-
-                if (src1->grad) {
-                    src1->grad =
-                        ggml_add_or_set(ctx,
-                            src1->grad,
-                            ggml_reshape(ctx,
-                                ggml_cont(ctx, tensor_grad_view),
-                                src1->grad),
-                            zero_table);
-                }
-            } break;
-        case GGML_OP_CPY:
-            {
-                // necessary for llama
-                // cpy overwrites value of src1 by src0 and returns view(src1)
-                // the overwriting is mathematically equivalent to:
-                // tensor = src0 * 1 + src1 * 0
-                if (src0->grad) {
-                    // dsrc0 = dtensor * 1
-                    src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table);
-                }
-                if (src1->grad) {
-                    // dsrc1 = dtensor * 0 -> noop
-                }
-            } break;
-        case GGML_OP_CONT:
-            {
-                // same as cpy
-                if (src0->grad) {
-                    GGML_ASSERT(ggml_is_contiguous(src0->grad));
-                    GGML_ASSERT(ggml_is_contiguous(tensor->grad));
-                    src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table);
-                }
-            } break;
-        case GGML_OP_RESHAPE:
-            {
-                // necessary for llama
-                if (src0->grad) {
-                    src0->grad =
-                        ggml_add_or_set(ctx, src0->grad,
-                            ggml_reshape(ctx,
-                                ggml_is_contiguous(tensor->grad)
-                                    ? tensor->grad
-                                    : ggml_cont(ctx, tensor->grad),
-                                src0->grad),
-                        zero_table);
-                }
-            } break;
-        case GGML_OP_VIEW:
-            {
-                // necessary for llama
-                if (src0->grad) {
-                    size_t offset;
-
-                    memcpy(&offset, tensor->op_params, sizeof(offset));
-
-                    size_t nb1     = tensor->nb[1];
-                    size_t nb2     = tensor->nb[2];
-                    size_t nb3     = tensor->nb[3];
-
-                    if (src0->type != src0->grad->type) {
-                        // gradient is typically F32, but src0 could be other type
-                        size_t ng = ggml_element_size(src0->grad);
-                        size_t n0 = ggml_element_size(src0);
-                        GGML_ASSERT(offset % n0 == 0);
-                        GGML_ASSERT(nb1 % n0 == 0);
-                        GGML_ASSERT(nb2 % n0 == 0);
-                        GGML_ASSERT(nb3 % n0 == 0);
-                        offset = (offset / n0) * ng;
-                        nb1 = (nb1 / n0) * ng;
-                        nb2 = (nb2 / n0) * ng;
-                        nb3 = (nb3 / n0) * ng;
-                    }
-
-                    src0->grad = ggml_acc_or_set(ctx, src0->grad, tensor->grad, nb1, nb2, nb3, offset, zero_table);
-                }
-            } break;
-        case GGML_OP_PERMUTE:
-            {
-                // necessary for llama
-                if (src0->grad) {
-                    int32_t * axes = (int32_t *) tensor->op_params;
-                    int axis0 = axes[0] & 0x3;
-                    int axis1 = axes[1] & 0x3;
-                    int axis2 = axes[2] & 0x3;
-                    int axis3 = axes[3] & 0x3;
-                    int axes_backward[4] = {0,0,0,0};
-                    axes_backward[axis0] = 0;
-                    axes_backward[axis1] = 1;
-                    axes_backward[axis2] = 2;
-                    axes_backward[axis3] = 3;
-                    src0->grad =
-                        ggml_add_or_set(ctx, src0->grad,
-                            ggml_permute(ctx,
-                                tensor->grad,
-                                axes_backward[0],
-                                axes_backward[1],
-                                axes_backward[2],
-                                axes_backward[3]),
-                            zero_table);
-                }
-            } break;
-        case GGML_OP_TRANSPOSE:
-            {
-                // necessary for llama
-                if (src0->grad) {
-                    src0->grad =
-                        ggml_add_or_set(ctx, src0->grad,
-                            ggml_transpose(ctx, tensor->grad),
-                        zero_table);
-                }
-            } break;
-        case GGML_OP_GET_ROWS:
-            {
-                // necessary for llama (only for tokenizer)
-                if (src0->grad) {
-                    src0->grad =
-                        ggml_add_or_set(ctx, src0->grad,
-                            // last ggml_get_rows_back argument src0->grad is only
-                            // necessary to setup correct output shape
-                            ggml_get_rows_back(ctx, tensor->grad, src1, src0->grad),
-                        zero_table);
-                }
-                if (src1->grad) {
-                    // noop
-                }
-            } break;
-        case GGML_OP_GET_ROWS_BACK:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_DIAG:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_DIAG_MASK_INF:
-            {
-                // necessary for llama
-                if (src0->grad) {
-                    const int n_past = ((int32_t *) tensor->op_params)[0];
-                    src0->grad =
-                        ggml_add_or_set(ctx, src0->grad,
-                            /* ggml_diag_mask_inf_impl() shouldn't be here */
-                            /* ref:  https://github.com/ggerganov/llama.cpp/pull/4203#discussion_r1412377992 */
-                            ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false),
-                        zero_table);
-                }
-            } break;
-        case GGML_OP_DIAG_MASK_ZERO:
-            {
-                // necessary for llama
-                if (src0->grad) {
-                    const int n_past = ((int32_t *) tensor->op_params)[0];
-                    src0->grad =
-                        ggml_add_or_set(ctx, src0->grad,
-                            ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false),
-                        zero_table);
-                }
-            } break;
-        case GGML_OP_SOFT_MAX:
-            {
-                // necessary for llama
-                if (src0->grad) {
-                    src0->grad =
-                        ggml_add_or_set(ctx, src0->grad,
-                            ggml_soft_max_back(ctx, tensor->grad, tensor),
-                        zero_table);
-                }
-
-            } break;
-        case GGML_OP_SOFT_MAX_BACK:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_ROPE:
-            {
-                // necessary for llama
-                if (src0->grad) {
-                    //const int n_past = ((int32_t *) tensor->op_params)[0];
-                    const int n_dims     = ((int32_t *) tensor->op_params)[1];
-                    const int mode       = ((int32_t *) tensor->op_params)[2];
-                    const int n_ctx      = ((int32_t *) tensor->op_params)[3];
-                    const int n_orig_ctx = ((int32_t *) tensor->op_params)[4];
-                    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, xpos_base, xpos_down;
-
-                    memcpy(&freq_base,   (int32_t *) tensor->op_params +  5, sizeof(float));
-                    memcpy(&freq_scale,  (int32_t *) tensor->op_params +  6, sizeof(float));
-                    memcpy(&ext_factor,  (int32_t *) tensor->op_params +  7, sizeof(float));
-                    memcpy(&attn_factor, (int32_t *) tensor->op_params +  8, sizeof(float));
-                    memcpy(&beta_fast,   (int32_t *) tensor->op_params +  9, sizeof(float));
-                    memcpy(&beta_slow,   (int32_t *) tensor->op_params + 10, sizeof(float));
-                    memcpy(&xpos_base,   (int32_t *) tensor->op_params + 11, sizeof(float));
-                    memcpy(&xpos_down,   (int32_t *) tensor->op_params + 12, sizeof(bool));
-
-                    src0->grad = ggml_add_or_set(ctx,
-                            src0->grad,
-                            ggml_rope_back(ctx,
-                                tensor->grad,
-                                src1,
-                                n_dims,
-                                mode,
-                                n_ctx,
-                                n_orig_ctx,
-                                freq_base,
-                                freq_scale,
-                                ext_factor,
-                                attn_factor,
-                                beta_fast,
-                                beta_slow,
-                                xpos_base,
-                                xpos_down),
-                            zero_table);
-                }
-            } break;
-        case GGML_OP_ROPE_BACK:
-            {
-                if (src0->grad) {
-                    //const int n_past = ((int32_t *) tensor->op_params)[0];
-                    const int n_dims     = ((int32_t *) tensor->op_params)[1];
-                    const int mode       = ((int32_t *) tensor->op_params)[2];
-                    const int n_ctx      = ((int32_t *) tensor->op_params)[3];
-                    const int n_orig_ctx = ((int32_t *) tensor->op_params)[4];
-                    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, xpos_base, xpos_down;
-
-                    memcpy(&freq_base,   (int32_t *) tensor->op_params +  5, sizeof(float));
-                    memcpy(&freq_scale,  (int32_t *) tensor->op_params +  6, sizeof(float));
-                    memcpy(&ext_factor,  (int32_t *) tensor->op_params +  7, sizeof(float));
-                    memcpy(&attn_factor, (int32_t *) tensor->op_params +  8, sizeof(float));
-                    memcpy(&beta_fast,   (int32_t *) tensor->op_params +  9, sizeof(float));
-                    memcpy(&beta_slow,   (int32_t *) tensor->op_params + 10, sizeof(float));
-                    memcpy(&xpos_base,   (int32_t *) tensor->op_params + 11, sizeof(float));
-                    memcpy(&xpos_down,   (int32_t *) tensor->op_params + 12, sizeof(bool));
-
-                    src0->grad = ggml_add_or_set(ctx,
-                            src0->grad,
-                            ggml_rope_impl(ctx,
-                                tensor->grad,
-                                src1,
-                                n_dims,
-                                mode,
-                                n_ctx,
-                                n_orig_ctx,
-                                freq_base,
-                                freq_scale,
-                                ext_factor,
-                                attn_factor,
-                                beta_fast,
-                                beta_slow,
-                                xpos_base,
-                                xpos_down,
-                                false),
-                            zero_table);
-                }
-            } break;
-        case GGML_OP_ALIBI:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_CLAMP:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_CONV_TRANSPOSE_1D:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_IM2COL:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_CONV_TRANSPOSE_2D:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_POOL_1D:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_POOL_2D:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_UPSCALE:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_PAD:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_ARGSORT:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_LEAKY_RELU:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_FLASH_ATTN:
-            {
-                struct ggml_tensor * flash_grad = NULL;
-                if (src0->grad || src1->grad || tensor->src[2]->grad) {
-                    int32_t t = ggml_get_op_params_i32(tensor, 0);
-                    GGML_ASSERT(t == 0 || t == 1);
-                    bool masked = t != 0;
-                    flash_grad =
-                        ggml_flash_attn_back(ctx,
-                            src0,
-                            src1,
-                            tensor->src[2],
-                            tensor->grad,
-                            masked);
-                }
-
-                struct ggml_tensor * src2 = tensor->src[2];
-                const int64_t elem_q = ggml_nelements(src0);
-                const int64_t elem_k = ggml_nelements(src1);
-                const int64_t elem_v = ggml_nelements(src2);
-
-                enum ggml_type result_type = flash_grad->type;
-                GGML_ASSERT(ggml_blck_size(result_type) == 1);
-                const size_t tsize = ggml_type_size(result_type);
-
-                const size_t offs_q = 0;
-                const size_t offs_k = offs_q + GGML_PAD(elem_q * tsize, GGML_MEM_ALIGN);
-                const size_t offs_v = offs_k + GGML_PAD(elem_k * tsize, GGML_MEM_ALIGN);
-
-                if (src0->grad) {
-                    struct ggml_tensor * view_q = ggml_view_1d(ctx, flash_grad, elem_q, offs_q);
-                    struct ggml_tensor * grad_q = ggml_reshape(ctx, view_q, src0);
-                    src0->grad = ggml_add_or_set(ctx,
-                            src0->grad,
-                            grad_q,
-                            zero_table);
-                }
-                if (src1->grad) {
-                    struct ggml_tensor * view_k = ggml_view_1d(ctx, flash_grad, elem_k, offs_k);
-                    struct ggml_tensor * grad_k = ggml_reshape(ctx, view_k, src1);
-                    src1->grad = ggml_add_or_set(ctx,
-                            src1->grad,
-                            grad_k,
-                            zero_table);
-                }
-                if (src2->grad) {
-                    struct ggml_tensor * view_v = ggml_view_1d(ctx, flash_grad, elem_v, offs_v);
-                    struct ggml_tensor * grad_v = ggml_reshape(ctx, view_v, src2);
-                    src2->grad = ggml_add_or_set(ctx,
-                            src2->grad,
-                            grad_v,
-                            zero_table);
-                }
-            } break;
-        case GGML_OP_FLASH_FF:
-            {
-                GGML_ASSERT(false); // not supported
-            } break;
-        case GGML_OP_FLASH_ATTN_BACK:
-            {
-                GGML_ASSERT(false); // not supported
-            } break;
-        case GGML_OP_WIN_PART:
-        case GGML_OP_WIN_UNPART:
-        case GGML_OP_UNARY:
-            {
-                switch (ggml_get_unary_op(tensor)) {
-                    case GGML_UNARY_OP_ABS:
-                        {
-                            if (src0->grad) {
-                                src0->grad =
-                                    ggml_add_or_set(ctx,
-                                            src0->grad,
-                                            ggml_mul(ctx,
-                                                ggml_sgn(ctx, src0),
-                                                tensor->grad),
-                                            zero_table);
-                            }
-                        } break;
-                    case GGML_UNARY_OP_SGN:
-                        {
-                            if (src0->grad) {
-                                // noop
-                            }
-                        } break;
-                    case GGML_UNARY_OP_NEG:
-                        {
-                            if (src0->grad) {
-                                src0->grad = ggml_sub_or_set(ctx, src0->grad, tensor->grad, zero_table);
-                            }
-                        } break;
-                    case GGML_UNARY_OP_STEP:
-                        {
-                            if (src0->grad) {
-                                // noop
-                            }
-                        } break;
-                    case GGML_UNARY_OP_TANH:
-                        {
-                            GGML_ASSERT(false); // TODO: not implemented
-                        } break;
-                    case GGML_UNARY_OP_ELU:
-                        {
-                            GGML_ASSERT(false); // TODO: not implemented
-                        } break;
-                    case GGML_UNARY_OP_RELU:
-                        {
-                            if (src0->grad) {
-                                src0->grad = ggml_add_or_set(ctx,
-                                        src0->grad,
-                                        ggml_mul(ctx,
-                                            ggml_step(ctx, src0),
-                                            tensor->grad),
-                                        zero_table);
-                            }
-                        } break;
-                    case GGML_UNARY_OP_GELU:
-                        {
-                            GGML_ASSERT(false); // TODO: not implemented
-                        } break;
-                    case GGML_UNARY_OP_GELU_QUICK:
-                        {
-                            GGML_ASSERT(false); // TODO: not implemented
-                        } break;
-                    case GGML_UNARY_OP_SILU:
-                        {
-                            // necessary for llama
-                            if (src0->grad) {
-                                src0->grad = ggml_add_or_set(ctx,
-                                        src0->grad,
-                                        ggml_silu_back(ctx, src0, tensor->grad),
-                                        zero_table);
-                            }
-                        } break;
-                    default:
-                        GGML_ASSERT(false);
-                }
-            } break;
-        case GGML_OP_GET_REL_POS:
-        case GGML_OP_ADD_REL_POS:
-        case GGML_OP_MAP_UNARY:
-        case GGML_OP_MAP_BINARY:
-        case GGML_OP_MAP_CUSTOM1_F32:
-        case GGML_OP_MAP_CUSTOM2_F32:
-        case GGML_OP_MAP_CUSTOM3_F32:
-        case GGML_OP_MAP_CUSTOM1:
-        case GGML_OP_MAP_CUSTOM2:
-        case GGML_OP_MAP_CUSTOM3:
-            {
-                GGML_ASSERT(false); // not supported
-            } break;
-        case GGML_OP_CROSS_ENTROPY_LOSS:
-            {
-                if (src0->grad) {
-                    src0->grad = ggml_add_or_set(ctx,
-                                src0->grad,
-                                ggml_cross_entropy_loss_back(ctx,
-                                    src0,
-                                    src1,
-                                    tensor->grad),
-                                zero_table);
-                }
-            } break;
-        case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
-            {
-                GGML_ASSERT(false); // not supported
-            } break;
-        case GGML_OP_NONE:
-            {
-                // nop
-            } break;
-        case GGML_OP_COUNT:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-
-    for (int i = 0; i < GGML_MAX_SRC; ++i) {
-        if (tensor->src[i] && tensor->src[i]->grad) {
-            GGML_ASSERT(ggml_are_same_shape(tensor->src[i], tensor->src[i]->grad));
-        }
-    }
-}
-
-static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * node) {
-    if (node->grad == NULL) {
-        // this usually happens when we generate intermediate nodes from constants in the backward pass
-        // it can also happen during forward pass, if the user performs computations with constants
-        if (node->op != GGML_OP_NONE) {
-            //GGML_PRINT_DEBUG("%s: warning: node %p has no grad, but op %d\n", __func__, (void *) node, node->op);
-        }
-    }
-
-    // check if already visited
-    if (ggml_hash_insert(cgraph->visited_hash_table, node) == GGML_HASHTABLE_ALREADY_EXISTS) {
-        return;
-    }
-
-    for (int i = 0; i < GGML_MAX_SRC; ++i) {
-        const int k =
-            (cgraph->order == GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT) ? i :
-            (cgraph->order == GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT) ? (GGML_MAX_SRC-1-i) :
-            /* unknown order, just fall back to using i*/ i;
-        if (node->src[k]) {
-            ggml_visit_parents(cgraph, node->src[k]);
-        }
-    }
-
-    if (node->op == GGML_OP_NONE && node->grad == NULL) {
-        // reached a leaf node, not part of the gradient graph (e.g. a constant)
-        GGML_ASSERT(cgraph->n_leafs < cgraph->size);
-
-        if (strlen(node->name) == 0) {
-            ggml_format_name(node, "leaf_%d", cgraph->n_leafs);
-        }
-
-        cgraph->leafs[cgraph->n_leafs] = node;
-        cgraph->n_leafs++;
-    } else {
-        GGML_ASSERT(cgraph->n_nodes < cgraph->size);
-
-        if (strlen(node->name) == 0) {
-            ggml_format_name(node, "node_%d", cgraph->n_nodes);
-        }
-
-        cgraph->nodes[cgraph->n_nodes] = node;
-        if (cgraph->grads) {
-            cgraph->grads[cgraph->n_nodes] = node->grad;
-        }
-        cgraph->n_nodes++;
-    }
-}
-
-static void ggml_build_forward_impl(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor, bool expand) {
-    if (!expand) {
-        // TODO: this branch isn't accessible anymore, maybe move this to ggml_build_forward_expand
-        ggml_graph_clear(cgraph);
-    }
-
-    const int n0 = cgraph->n_nodes;
-    UNUSED(n0);
-
-    ggml_visit_parents(cgraph, tensor);
-
-    const int n_new = cgraph->n_nodes - n0;
-    GGML_PRINT_DEBUG("%s: visited %d new nodes\n", __func__, n_new);
-
-    if (n_new > 0) {
-        // the last added node should always be starting point
-        GGML_ASSERT(cgraph->nodes[cgraph->n_nodes - 1] == tensor);
-    }
-}
-
-void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor) {
-    ggml_build_forward_impl(cgraph, tensor, true);
-}
-
-void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep) {
-    GGML_ASSERT(gf->n_nodes > 0);
-
-    // if we are keeping the gradient graph, we have to detach the gradient nodes from the original graph
-    if (keep) {
-        for (int i = 0; i < gf->n_nodes; i++) {
-            struct ggml_tensor * node = gf->nodes[i];
-
-            if (node->grad) {
-                node->grad = ggml_dup_tensor(ctx, node);
-                gf->grads[i] = node->grad;
-            }
-        }
-    }
-
-    // remember original gradients which start with zero values
-    struct ggml_hash_set zero_table = ggml_hash_set_new(gf->size);
-    for (int i = 0; i < gf->n_nodes; i++) {
-        if (gf->grads[i]) {
-            ggml_hash_insert(zero_table, gf->grads[i]);
-        }
-    }
-
-    for (int i = gf->n_nodes - 1; i >= 0; i--) {
-        struct ggml_tensor * node = gf->nodes[i];
-
-        // inplace operations to add gradients are not created by ggml_compute_backward
-        // use allocator to automatically make inplace operations
-        if (node->grad) {
-            ggml_compute_backward(ctx, node, zero_table);
-        }
-    }
-
-    for (int i = 0; i < gf->n_nodes; i++) {
-        struct ggml_tensor * node = gf->nodes[i];
-
-        if (node->is_param) {
-            GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node);
-            ggml_build_forward_expand(gb, node->grad);
-        }
-    }
-
-    ggml_hash_set_free(zero_table);
-}
-
-static size_t ggml_graph_nbytes(size_t size, bool grads) {
-    size_t nbytes = sizeof(struct ggml_cgraph);
-    nbytes += size * sizeof(struct ggml_tensor *) * 2; // leafs + nodes
-    if (grads) {
-        nbytes += size * sizeof(struct ggml_tensor *); // grads
-    }
-    nbytes += ggml_hash_size(size * 2) * sizeof(struct ggml_tensor *); // hash set
-    return nbytes;
-}
-
-size_t ggml_graph_overhead_custom(size_t size, bool grads) {
-    return GGML_OBJECT_SIZE + GGML_PAD(ggml_graph_nbytes(size, grads), GGML_MEM_ALIGN);
-}
-
-size_t ggml_graph_overhead(void) {
-    return ggml_graph_overhead_custom(GGML_DEFAULT_GRAPH_SIZE, false);
-}
-
-struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads) {
-    const size_t obj_size = ggml_graph_nbytes(size, grads);
-    struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_GRAPH, obj_size);
-    struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs);
-
-    struct ggml_tensor ** data_start = (struct ggml_tensor **) (cgraph + 1);
-
-    size_t hash_size = ggml_hash_size(size * 2);
-    struct ggml_tensor ** nodes_ptr = data_start;
-    struct ggml_tensor ** leafs_ptr = nodes_ptr + size;
-    struct ggml_tensor ** hash_keys_ptr = leafs_ptr + size;
-    struct ggml_tensor ** grads_ptr = grads ? hash_keys_ptr + hash_size : NULL;
-
-    // check that we allocated the correct amount of memory
-    assert(obj_size == (size_t) (
-        (grads ? (char *)(grads_ptr + size) : (char *)(hash_keys_ptr + hash_size)) - (char *)cgraph));
-
-    memset(hash_keys_ptr, 0, hash_size * sizeof(struct ggml_tensor *));
-
-    *cgraph = (struct ggml_cgraph) {
-        /*.size         =*/ size,
-        /*.n_nodes      =*/ 0,
-        /*.n_leafs      =*/ 0,
-        /*.nodes        =*/ nodes_ptr,
-        /*.grads        =*/ grads_ptr,
-        /*.leafs        =*/ leafs_ptr,
-        /*.hash_table   =*/ { hash_size, hash_keys_ptr },
-        /*.order        =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
-        /*.perf_runs    =*/ 0,
-        /*.perf_cycles  =*/ 0,
-        /*.perf_time_us =*/ 0,
-    };
-
-    return cgraph;
-}
-
-struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {
-    return ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, false);
-}
-
-struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1) {
-    struct ggml_cgraph cgraph = {
-        /*.size         =*/ 0,
-        /*.n_nodes      =*/ i1 - i0,
-        /*.n_leafs      =*/ 0,
-        /*.nodes        =*/ cgraph0->nodes + i0,
-        /*.grads        =*/ cgraph0->grads ? cgraph0->grads + i0 : NULL,
-        /*.leafs        =*/ NULL,
-        /*.hash_table   =*/ { 0, NULL },
-        /*.order        =*/ cgraph0->order,
-        /*.perf_runs    =*/ 0,
-        /*.perf_cycles  =*/ 0,
-        /*.perf_time_us =*/ 0,
-    };
-
-    return cgraph;
-}
-
-void ggml_graph_cpy(struct ggml_cgraph * src, struct ggml_cgraph * dst) {
-    GGML_ASSERT(dst->size >= src->n_leafs);
-    GGML_ASSERT(dst->size >= src->n_nodes);
-    GGML_ASSERT(dst->visited_hash_table.size >= src->visited_hash_table.size);
-
-    dst->n_leafs = src->n_leafs;
-    dst->n_nodes = src->n_nodes;
-    dst->order   = src->order;
-
-    for (int i = 0; i < src->n_leafs; ++i) {
-        dst->leafs[i] = src->leafs[i];
-    }
-
-    for (int i = 0; i < src->n_nodes; ++i) {
-        dst->nodes[i] = src->nodes[i];
-    }
-
-    if (src->grads) {
-        GGML_ASSERT(dst->grads != NULL);
-        for (int i = 0; i < src->n_nodes; ++i) {
-            dst->grads[i] = src->grads[i];
-        }
-    }
-
-    for (size_t i = 0; i < src->visited_hash_table.size; ++i) {
-        if (src->visited_hash_table.keys[i]) {
-            ggml_hash_insert(dst->visited_hash_table, src->visited_hash_table.keys[i]);
-        }
-    }
-}
-
-struct ggml_cgraph * ggml_graph_dup(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
-    struct ggml_cgraph * result = ggml_new_graph_custom(ctx, cgraph->size, cgraph->grads != NULL);
-    ggml_graph_cpy(cgraph, result);
-    return result;
-}
-
-void ggml_graph_reset(struct ggml_cgraph * cgraph) {
-    GGML_ASSERT(cgraph->grads != NULL);
-
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        struct ggml_tensor * grad = cgraph->grads[i];
-
-        if (grad) {
-            ggml_set_zero(grad);
-        }
-    }
-}
-
-void ggml_graph_clear(struct ggml_cgraph * cgraph) {
-    cgraph->n_leafs = 0;
-    cgraph->n_nodes = 0;
-    memset(cgraph->visited_hash_table.keys, 0, cgraph->visited_hash_table.size * sizeof(struct ggml_tensor *));
-}
-
-//
-// thread data
-//
-// synchronization is done via busy loops
-// I tried using spin locks, but not sure how to use them correctly - the things I tried were slower than busy loops
-//
-
-#ifdef __APPLE__
-
-//#include <os/lock.h>
-//
-//typedef os_unfair_lock ggml_lock_t;
-//
-//#define ggml_lock_init(x)    UNUSED(x)
-//#define ggml_lock_destroy(x) UNUSED(x)
-//#define ggml_lock_lock       os_unfair_lock_lock
-//#define ggml_lock_unlock     os_unfair_lock_unlock
-//
-//#define GGML_LOCK_INITIALIZER OS_UNFAIR_LOCK_INIT
-
-typedef int ggml_lock_t;
-
-#define ggml_lock_init(x)    UNUSED(x)
-#define ggml_lock_destroy(x) UNUSED(x)
-#define ggml_lock_lock(x)    UNUSED(x)
-#define ggml_lock_unlock(x)  UNUSED(x)
-
-#define GGML_LOCK_INITIALIZER 0
-
-typedef pthread_t ggml_thread_t;
-
-#define ggml_thread_create pthread_create
-#define ggml_thread_join   pthread_join
-
-#else
-
-//typedef pthread_spinlock_t ggml_lock_t;
-
-//#define ggml_lock_init(x) pthread_spin_init(x, PTHREAD_PROCESS_PRIVATE)
-//#define ggml_lock_destroy pthread_spin_destroy
-//#define ggml_lock_lock    pthread_spin_lock
-//#define ggml_lock_unlock  pthread_spin_unlock
-
-typedef int ggml_lock_t;
-
-#define ggml_lock_init(x)    UNUSED(x)
-#define ggml_lock_destroy(x) UNUSED(x)
-#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64))
-#define ggml_lock_lock(x)    _mm_pause()
-#else
-#define ggml_lock_lock(x)    UNUSED(x)
-#endif
-#define ggml_lock_unlock(x)  UNUSED(x)
-
-#define GGML_LOCK_INITIALIZER 0
-
-typedef pthread_t ggml_thread_t;
-
-#define ggml_thread_create pthread_create
-#define ggml_thread_join   pthread_join
-
-#endif
-
-// Android's libc implementation "bionic" does not support setting affinity
-#if defined(__linux__) && !defined(__BIONIC__)
-static void set_numa_thread_affinity(int thread_n, int n_threads) {
-    if (!ggml_is_numa()) {
-        return;
-    }
-
-    // run thread on node_num thread_n / (threads per node)
-    const int node_num = thread_n / ((n_threads + g_state.numa.n_nodes - 1) / g_state.numa.n_nodes);
-    struct ggml_numa_node * node = &g_state.numa.nodes[node_num];
-    size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus);
-
-    cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus);
-    CPU_ZERO_S(setsize, cpus);
-    for (size_t i = 0; i < node->n_cpus; ++i) {
-        CPU_SET_S(node->cpus[i], setsize, cpus);
-    }
-
-    int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
-    if (rv) {
-            fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",
-                    strerror(rv));
-    }
-
-    CPU_FREE(cpus);
-}
-
-static void clear_numa_thread_affinity(void) {
-    if (!ggml_is_numa()) {
-        return;
-    }
-
-    size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus);
-
-    cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus);
-    CPU_ZERO_S(setsize, cpus);
-    for (unsigned i = 0; i < g_state.numa.total_cpus; ++i) {
-        CPU_SET_S(i, setsize, cpus);
-    }
-
-    int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
-    if (rv) {
-        fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",
-            strerror(rv));
-    }
-
-    CPU_FREE(cpus);
-}
-#else
-// TODO: Windows etc.
-// (the linux implementation may also work on BSD, someone should test)
-static void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads);  }
-static void clear_numa_thread_affinity(void) {}
-#endif
-
-struct ggml_compute_state_shared {
-    const struct ggml_cgraph * cgraph;
-    const struct ggml_cplan  * cplan;
-
-    int64_t perf_node_start_cycles;
-    int64_t perf_node_start_time_us;
-
-    const int n_threads;
-
-    // synchronization primitives
-    atomic_int n_active; // num active threads
-    atomic_int node_n;   // active graph node
-
-    bool (*abort_callback)(void * data); // abort ggml_graph_compute when true
-    void * abort_callback_data;
-};
-
-struct ggml_compute_state {
-    ggml_thread_t thrd;
-    int ith;
-    struct ggml_compute_state_shared * shared;
-};
-
-static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
-    int64_t cycles_cur  = ggml_perf_cycles()  - st->perf_node_start_cycles;
-    int64_t time_us_cur = ggml_perf_time_us() - st->perf_node_start_time_us;
-
-    node->perf_runs++;
-    node->perf_cycles  += cycles_cur;
-    node->perf_time_us += time_us_cur;
-}
-
-static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
-    int n_tasks = 0;
-
-    switch (node->op) {
-        case GGML_OP_CPY:
-        case GGML_OP_DUP:
-        case GGML_OP_ADD:
-        case GGML_OP_ADD1:
-        case GGML_OP_ACC:
-            {
-                n_tasks = n_threads;
-            } break;
-        case GGML_OP_SUB:
-        case GGML_OP_SQR:
-        case GGML_OP_SQRT:
-        case GGML_OP_LOG:
-        case GGML_OP_SUM:
-        case GGML_OP_SUM_ROWS:
-        case GGML_OP_MEAN:
-        case GGML_OP_ARGMAX:
-        case GGML_OP_REPEAT:
-        case GGML_OP_REPEAT_BACK:
-        case GGML_OP_LEAKY_RELU:
-            {
-                n_tasks = 1;
-            } break;
-        case GGML_OP_UNARY:
-            switch (ggml_get_unary_op(node)) {
-                case GGML_UNARY_OP_ABS:
-                case GGML_UNARY_OP_SGN:
-                case GGML_UNARY_OP_NEG:
-                case GGML_UNARY_OP_STEP:
-                case GGML_UNARY_OP_TANH:
-                case GGML_UNARY_OP_ELU:
-                case GGML_UNARY_OP_RELU:
-                    {
-                        n_tasks = 1;
-                    } break;
-
-                case GGML_UNARY_OP_GELU:
-                case GGML_UNARY_OP_GELU_QUICK:
-                case GGML_UNARY_OP_SILU:
-                    {
-                        n_tasks = n_threads;
-                    } break;
-                default:
-                    GGML_ASSERT(false);
-            }
-            break;
-        case GGML_OP_SILU_BACK:
-        case GGML_OP_MUL:
-        case GGML_OP_DIV:
-        case GGML_OP_NORM:
-        case GGML_OP_RMS_NORM:
-        case GGML_OP_RMS_NORM_BACK:
-        case GGML_OP_GROUP_NORM:
-        case GGML_OP_CONCAT:
-            {
-                n_tasks = n_threads;
-            } break;
-        case GGML_OP_MUL_MAT:
-            {
-                n_tasks = n_threads;
-
-                // TODO: use different scheduling for different matrix sizes
-                //const int nr0 = ggml_nrows(node->src[0]);
-                //const int nr1 = ggml_nrows(node->src[1]);
-
-                //n_tasks = MIN(n_threads, MAX(1, nr0/128));
-                //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks);
-            } break;
-        case GGML_OP_MUL_MAT_ID:
-            {
-                n_tasks = n_threads;
-            } break;
-        case GGML_OP_OUT_PROD:
-            {
-                n_tasks = n_threads;
-            } break;
-        case GGML_OP_SCALE:
-        case GGML_OP_SET:
-        case GGML_OP_CONT:
-        case GGML_OP_RESHAPE:
-        case GGML_OP_VIEW:
-        case GGML_OP_PERMUTE:
-        case GGML_OP_TRANSPOSE:
-        case GGML_OP_GET_ROWS:
-        case GGML_OP_GET_ROWS_BACK:
-        case GGML_OP_DIAG:
-            {
-                n_tasks = 1;
-            } break;
-        case GGML_OP_DIAG_MASK_ZERO:
-        case GGML_OP_DIAG_MASK_INF:
-        case GGML_OP_SOFT_MAX_BACK:
-        case GGML_OP_ROPE:
-        case GGML_OP_ROPE_BACK:
-        case GGML_OP_ADD_REL_POS:
-            {
-                n_tasks = n_threads;
-            } break;
-        case GGML_OP_ALIBI:
-            {
-                n_tasks = 1; //TODO
-            } break;
-        case GGML_OP_CLAMP:
-            {
-                n_tasks = 1; //TODO
-            } break;
-        case GGML_OP_SOFT_MAX:
-            {
-                n_tasks = MIN(MIN(4, n_threads), ggml_nrows(node->src[0]));
-            } break;
-        case GGML_OP_CONV_TRANSPOSE_1D:
-            {
-                n_tasks = n_threads;
-            } break;
-        case GGML_OP_IM2COL:
-            {
-                n_tasks = n_threads;
-            } break;
-        case GGML_OP_CONV_TRANSPOSE_2D:
-            {
-                n_tasks = n_threads;
-            } break;
-        case GGML_OP_POOL_1D:
-        case GGML_OP_POOL_2D:
-            {
-                n_tasks = 1;
-            } break;
-        case GGML_OP_UPSCALE:
-            {
-                n_tasks = n_threads;
-            } break;
-        case GGML_OP_PAD:
-            {
-                n_tasks = n_threads;
-            } break;
-        case GGML_OP_ARGSORT:
-            {
-                n_tasks = n_threads;
-            } break;
-        case GGML_OP_FLASH_ATTN:
-            {
-                n_tasks = n_threads;
-            } break;
-        case GGML_OP_FLASH_FF:
-            {
-                n_tasks = n_threads;
-            } break;
-        case GGML_OP_FLASH_ATTN_BACK:
-            {
-                n_tasks = n_threads;
-            } break;
-        case GGML_OP_WIN_PART:
-        case GGML_OP_WIN_UNPART:
-        case GGML_OP_GET_REL_POS:
-        case GGML_OP_MAP_UNARY:
-        case GGML_OP_MAP_BINARY:
-        case GGML_OP_MAP_CUSTOM1_F32:
-        case GGML_OP_MAP_CUSTOM2_F32:
-        case GGML_OP_MAP_CUSTOM3_F32:
-            {
-                n_tasks = 1;
-            } break;
-        case GGML_OP_MAP_CUSTOM1:
-            {
-                struct ggml_map_custom1_op_params * p = (struct ggml_map_custom1_op_params *) node->op_params;
-                if (p->n_tasks == GGML_N_TASKS_MAX) {
-                    n_tasks = n_threads;
-                } else {
-                    n_tasks = MIN(p->n_tasks, n_threads);
-                }
-            } break;
-        case GGML_OP_MAP_CUSTOM2:
-            {
-                struct ggml_map_custom2_op_params * p = (struct ggml_map_custom2_op_params *) node->op_params;
-                if (p->n_tasks == GGML_N_TASKS_MAX) {
-                    n_tasks = n_threads;
-                } else {
-                    n_tasks = MIN(p->n_tasks, n_threads);
-                }
-            } break;
-        case GGML_OP_MAP_CUSTOM3:
-            {
-                struct ggml_map_custom3_op_params * p = (struct ggml_map_custom3_op_params *) node->op_params;
-                if (p->n_tasks == GGML_N_TASKS_MAX) {
-                    n_tasks = n_threads;
-                } else {
-                    n_tasks = MIN(p->n_tasks, n_threads);
-                }
-            } break;
-        case GGML_OP_CROSS_ENTROPY_LOSS:
-            {
-                n_tasks = n_threads;
-            } break;
-        case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
-            {
-                n_tasks = n_threads;
-            } break;
-        case GGML_OP_NONE:
-            {
-                n_tasks = 1;
-            } break;
-        case GGML_OP_COUNT:
-            {
-                GGML_ASSERT(false);
-            } break;
-        default:
-            {
-                fprintf(stderr, "%s: op not implemented: ", __func__);
-                if (node->op < GGML_OP_COUNT) {
-                    fprintf(stderr, "%s\n", ggml_op_name(node->op));
-                } else {
-                    fprintf(stderr, "%d\n", node->op);
-                }
-                GGML_ASSERT(false);
-            } break;
-    }
-
-    assert(n_tasks > 0);
-
-    return n_tasks;
-}
-
-static thread_ret_t ggml_graph_compute_thread(void * data) {
-    struct ggml_compute_state * state = (struct ggml_compute_state *) data;
-
-    const struct ggml_cgraph * cgraph = state->shared->cgraph;
-    const struct ggml_cplan  * cplan  = state->shared->cplan;
-
-    const int   n_threads   = state->shared->n_threads;
-
-    set_numa_thread_affinity(state->ith, n_threads);
-
-    int node_n = -1;
-
-    while (true) {
-        if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
-            state->shared->node_n += 1;
-            return (thread_ret_t) GGML_EXIT_ABORTED;
-        }
-
-        if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
-            // all other threads are finished and spinning
-            // do finalize and init here so we don't have synchronize again
-            struct ggml_compute_params params = {
-                /*.type  =*/ GGML_TASK_FINALIZE,
-                /*.ith   =*/ 0,
-                /*.nth   =*/ 0,
-                /*.wsize =*/ cplan->work_size,
-                /*.wdata =*/ cplan->work_data,
-            };
-
-            if (node_n != -1) {
-                /* FINALIZE */
-                struct ggml_tensor * node = cgraph->nodes[node_n];
-                if (GGML_OP_HAS_FINALIZE[node->op]) {
-                    params.nth = ggml_get_n_tasks(node, n_threads);
-                    ggml_compute_forward(&params, node);
-                }
-                ggml_graph_compute_perf_stats_node(node, state->shared);
-            }
-
-            // distribute new work or execute it direct if 1T
-            while (++node_n < cgraph->n_nodes) {
-                GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes);
-
-                struct ggml_tensor * node = cgraph->nodes[node_n];
-                const int n_tasks = ggml_get_n_tasks(node, n_threads);
-
-                state->shared->perf_node_start_cycles  = ggml_perf_cycles();
-                state->shared->perf_node_start_time_us = ggml_perf_time_us();
-
-                params.nth = n_tasks;
-
-                /* INIT */
-                if (GGML_OP_HAS_INIT[node->op]) {
-                    params.type = GGML_TASK_INIT;
-                    ggml_compute_forward(&params, node);
-                }
-
-                if (n_tasks == 1) {
-                    // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
-                    // they do something more efficient than spinning (?)
-                    params.type = GGML_TASK_COMPUTE;
-                    ggml_compute_forward(&params, node);
-
-                    if (GGML_OP_HAS_FINALIZE[node->op]) {
-                        params.type = GGML_TASK_FINALIZE;
-                        ggml_compute_forward(&params, node);
-                    }
-
-                    ggml_graph_compute_perf_stats_node(node, state->shared);
-                } else {
-                    break;
-                }
-
-                if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
-                    break;
-                }
-            }
-
-            atomic_store(&state->shared->n_active, n_threads);
-            atomic_store(&state->shared->node_n,   node_n);
-        } else {
-            // wait for other threads to finish
-            const int last = node_n;
-
-            const bool do_yield = last < 0 || cgraph->nodes[last]->op == GGML_OP_MUL_MAT;
-
-            while (true) {
-                // TODO: this sched_yield can have significant impact on the performance - either positive or negative
-                //       depending on the workload and the operating system.
-                //       since it is not clear what is the best approach, it should potentially become user-configurable
-                //       ref: https://github.com/ggerganov/ggml/issues/291
-                // UPD:  adding the do_yield flag seems to resolve the issue universally
-                if (do_yield) {
-                    sched_yield();
-                }
-
-                node_n = atomic_load(&state->shared->node_n);
-                if (node_n != last) break;
-            };
-        }
-
-        // check if we should stop
-        if (node_n >= cgraph->n_nodes) break;
-
-        /* COMPUTE */
-        struct ggml_tensor * node = cgraph->nodes[node_n];
-        const int n_tasks = ggml_get_n_tasks(node, n_threads);
-
-        struct ggml_compute_params params = {
-            /*.type  =*/ GGML_TASK_COMPUTE,
-            /*.ith   =*/ state->ith,
-            /*.nth   =*/ n_tasks,
-            /*.wsize =*/ cplan->work_size,
-            /*.wdata =*/ cplan->work_data,
-        };
-
-        if (state->ith < n_tasks) {
-            ggml_compute_forward(&params, node);
-        }
-    }
-
-    return GGML_EXIT_SUCCESS;
-}
-
-struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) {
-    if (n_threads <= 0) {
-        n_threads = GGML_DEFAULT_N_THREADS;
-    }
-
-    size_t work_size = 0;
-
-    struct ggml_cplan cplan;
-    memset(&cplan, 0, sizeof(struct ggml_cplan));
-
-    // thread scheduling for the different operations + work buffer size estimation
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        struct ggml_tensor * node = cgraph->nodes[i];
-
-        const int n_tasks = ggml_get_n_tasks(node, n_threads);
-
-        size_t cur = 0;
-
-        switch (node->op) {
-            case GGML_OP_CPY:
-            case GGML_OP_DUP:
-                {
-                    if (ggml_is_quantized(node->type)) {
-                        cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
-                    }
-                } break;
-            case GGML_OP_ADD:
-            case GGML_OP_ADD1:
-                {
-                    if (ggml_is_quantized(node->src[0]->type)) {
-                        cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
-                    }
-                } break;
-            case GGML_OP_ACC:
-                {
-                    if (ggml_is_quantized(node->src[0]->type)) {
-                        cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks;
-                    }
-                } break;
-            case GGML_OP_MUL_MAT:
-                {
-                    const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type;
-
-#if defined(GGML_USE_CLBLAST)
-                    if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) {
-                        cur = ggml_cl_mul_mat_get_wsize(node->src[0], node->src[1], node);
-                    } else
-#endif
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-                    if (ggml_compute_forward_mul_mat_use_blas(node)) {
-                        if (node->src[0]->type != GGML_TYPE_F32) {
-                            // here we need memory just for single 2D matrix from src0
-                            cur = ggml_type_size(GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]);
-                        }
-                    } else
-#endif
-                    if (node->src[1]->type != vec_dot_type) {
-                        cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
-                    }
-                } break;
-            case GGML_OP_MUL_MAT_ID:
-                {
-                    cur = 0;
-                    const struct ggml_tensor * src0 = node->src[2];
-                    const struct ggml_tensor * src1 = node->src[1];
-                    const enum ggml_type vec_dot_type = type_traits[src0->type].vec_dot_type;
-                    if (src1->type != vec_dot_type) {
-                        cur += ggml_row_size(vec_dot_type, ggml_nelements(src1));
-                    }
-                    const int n_as = ggml_get_op_params_i32(node, 1);
-                    cur += GGML_PAD(cur, sizeof(int64_t));       // align
-                    cur += n_as * sizeof(int64_t);               // matrix_row_counts
-                    cur += n_as * src1->ne[1] * sizeof(int64_t); // matrix_rows
-                } break;
-            case GGML_OP_OUT_PROD:
-                {
-                    if (ggml_is_quantized(node->src[0]->type)) {
-                        cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
-                    }
-                } break;
-            case GGML_OP_SOFT_MAX:
-            case GGML_OP_ROPE:
-                {
-                    cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
-                } break;
-            case GGML_OP_CONV_TRANSPOSE_1D:
-                {
-                    GGML_ASSERT(node->src[0]->ne[3] == 1);
-                    GGML_ASSERT(node->src[1]->ne[2] == 1);
-                    GGML_ASSERT(node->src[1]->ne[3] == 1);
-
-                    const int64_t ne00 = node->src[0]->ne[0];  // K
-                    const int64_t ne01 = node->src[0]->ne[1];  // Cout
-                    const int64_t ne02 = node->src[0]->ne[2];  // Cin
-
-                    const int64_t ne10 = node->src[1]->ne[0];  // L
-                    const int64_t ne11 = node->src[1]->ne[1];  // Cin
-
-                    if (node->src[0]->type == GGML_TYPE_F16 &&
-                        node->src[1]->type == GGML_TYPE_F32) {
-                        cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02;
-                        cur += sizeof(ggml_fp16_t)*ne10*ne11;
-                    } else if (node->src[0]->type == GGML_TYPE_F32 &&
-                               node->src[1]->type == GGML_TYPE_F32) {
-                        cur += sizeof(float)*ne00*ne01*ne02;
-                        cur += sizeof(float)*ne10*ne11;
-                    } else {
-                        GGML_ASSERT(false);
-                    }
-                } break;
-            case GGML_OP_CONV_TRANSPOSE_2D:
-                {
-                    const int64_t ne00 = node->src[0]->ne[0]; // W
-                    const int64_t ne01 = node->src[0]->ne[1]; // H
-                    const int64_t ne02 = node->src[0]->ne[2]; // Channels Out
-                    const int64_t ne03 = node->src[0]->ne[3]; // Channels In
-
-                    const int64_t ne10 = node->src[1]->ne[0]; // W
-                    const int64_t ne11 = node->src[1]->ne[1]; // H
-                    const int64_t ne12 = node->src[1]->ne[2]; // Channels In
-
-                    cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02*ne03;
-                    cur += sizeof(ggml_fp16_t)*ne10*ne11*ne12;
-                } break;
-            case GGML_OP_FLASH_ATTN:
-                {
-                    const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
-
-                    if (node->src[1]->type == GGML_TYPE_F32) {
-                        cur  = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1)
-                        cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2
-                    } else if (node->src[1]->type == GGML_TYPE_F16) {
-                        cur  = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1)
-                        cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2
-                    }
-                } break;
-            case GGML_OP_FLASH_FF:
-                {
-                    if (node->src[1]->type == GGML_TYPE_F32) {
-                        cur  = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
-                        cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2
-                    } else if (node->src[1]->type == GGML_TYPE_F16) {
-                        cur  = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
-                        cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2
-                    }
-                } break;
-            case GGML_OP_FLASH_ATTN_BACK:
-                {
-                    const int64_t    D = node->src[0]->ne[0];
-                    const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
-                    const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back
-                    if (node->src[1]->type == GGML_TYPE_F32) {
-                        cur  = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
-                        cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
-                    } else if (node->src[1]->type == GGML_TYPE_F16) {
-                        cur  = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
-                        cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
-                    }
-                } break;
-
-            case GGML_OP_CROSS_ENTROPY_LOSS:
-                {
-                    cur = ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks);
-                } break;
-            case GGML_OP_COUNT:
-                {
-                    GGML_ASSERT(false);
-                } break;
-            default:
-                break;
-        }
-
-        work_size = MAX(work_size, cur);
-    }
-
-    if (work_size > 0) {
-        work_size += CACHE_LINE_SIZE*(n_threads - 1);
-    }
-
-    cplan.n_threads = n_threads;
-    cplan.work_size = work_size;
-    cplan.work_data = NULL;
-
-    return cplan;
-}
-
-int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
-    {
-        GGML_ASSERT(cplan);
-        GGML_ASSERT(cplan->n_threads > 0);
-
-        if (cplan->work_size > 0) {
-            GGML_ASSERT(cplan->work_data);
-        }
-    }
-
-    const int n_threads = cplan->n_threads;
-
-    struct ggml_compute_state_shared state_shared = {
-        /*.cgraph                  =*/ cgraph,
-        /*.cgraph_plan             =*/ cplan,
-        /*.perf_node_start_cycles  =*/ 0,
-        /*.perf_node_start_time_us =*/ 0,
-        /*.n_threads               =*/ n_threads,
-        /*.n_active                =*/ n_threads,
-        /*.node_n                  =*/ -1,
-        /*.abort_callback          =*/ NULL,
-        /*.abort_callback_data     =*/ NULL,
-    };
-    struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
-
-    // create thread pool
-    if (n_threads > 1) {
-        for (int j = 1; j < n_threads; ++j) {
-            workers[j] = (struct ggml_compute_state) {
-                .thrd   = 0,
-                .ith = j,
-                .shared = &state_shared,
-            };
-
-            const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
-            GGML_ASSERT(rc == 0);
-            UNUSED(rc);
-        }
-    }
-
-    workers[0].ith = 0;
-    workers[0].shared = &state_shared;
-
-    const int64_t perf_start_cycles  = ggml_perf_cycles();
-    const int64_t perf_start_time_us = ggml_perf_time_us();
-
-    // this is a work thread too
-    int compute_status = (size_t) ggml_graph_compute_thread(&workers[0]);
-
-    // don't leave affinity set on the main thread
-    clear_numa_thread_affinity();
-
-    // join or kill thread pool
-    if (n_threads > 1) {
-        for (int j = 1; j < n_threads; j++) {
-            const int rc = ggml_thread_join(workers[j].thrd, NULL);
-            GGML_ASSERT(rc == 0);
-        }
-    }
-
-    // performance stats (graph)
-    {
-        int64_t perf_cycles_cur  = ggml_perf_cycles()  - perf_start_cycles;
-        int64_t perf_time_us_cur = ggml_perf_time_us() - perf_start_time_us;
-
-        cgraph->perf_runs++;
-        cgraph->perf_cycles  += perf_cycles_cur;
-        cgraph->perf_time_us += perf_time_us_cur;
-
-        GGML_PRINT_DEBUG("%s: perf (%d) - cpu = %.3f / %.3f ms, wall = %.3f / %.3f ms\n",
-                __func__, cgraph->perf_runs,
-                (double) perf_cycles_cur      / (double) ggml_cycles_per_ms(),
-                (double) cgraph->perf_cycles  / (double) ggml_cycles_per_ms() / (double) cgraph->perf_runs,
-                (double) perf_time_us_cur     / 1000.0,
-                (double) cgraph->perf_time_us / 1000.0 / cgraph->perf_runs);
-    }
-
-    return compute_status;
-}
-
-void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
-    struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads);
-
-    struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
-
-    cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
-
-    ggml_graph_compute(cgraph, &cplan);
-}
-
-struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) {
-    for (int i = 0; i < cgraph->n_leafs; i++) {
-        struct ggml_tensor * leaf = cgraph->leafs[i];
-
-        if (strcmp(leaf->name, name) == 0) {
-            return leaf;
-        }
-    }
-
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        struct ggml_tensor * node = cgraph->nodes[i];
-
-        if (strcmp(node->name, name) == 0) {
-            return node;
-        }
-    }
-
-    return NULL;
-}
-
-static void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fout) {
-    const int64_t * ne = tensor->ne;
-    const size_t  * nb = tensor->nb;
-
-    fprintf(fout, "%-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %16p %32s\n",
-            ggml_type_name(tensor->type),
-            ggml_op_name  (tensor->op),
-            ggml_n_dims(tensor),
-            ne[0], ne[1], ne[2], ne[3],
-            nb[0], nb[1], nb[2], nb[3],
-            tensor->data,
-            tensor->name);
-}
-
-static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char * arg, FILE * fout) {
-    const int64_t * ne = tensor->ne;
-    const size_t  * nb = tensor->nb;
-
-    fprintf(fout, "%-6s %-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %16p %32s\n",
-            arg,
-            ggml_type_name(tensor->type),
-            ggml_op_name  (tensor->op),
-            ggml_n_dims(tensor),
-            ne[0], ne[1], ne[2], ne[3],
-            nb[0], nb[1], nb[2], nb[3],
-            tensor->data,
-            tensor->name);
-}
-
-void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
-    uint64_t size_eval = 0;
-
-    // compute size of intermediate results
-    // TODO: does not take into account scratch buffers !!!!
-    for (int i = 0; i < cgraph->n_nodes; ++i) {
-        size_eval += ggml_nbytes_pad(cgraph->nodes[i]);
-    }
-
-    // print
-    {
-        FILE * fout = stdout;
-
-        fprintf(fout, "\n");
-        fprintf(fout, "%-16s %8x\n", "magic",        GGML_FILE_MAGIC);
-        fprintf(fout, "%-16s %8d\n", "version",      GGML_FILE_VERSION);
-        fprintf(fout, "%-16s %8d\n", "leafs",        cgraph->n_leafs);
-        fprintf(fout, "%-16s %8d\n", "nodes",        cgraph->n_nodes);
-        fprintf(fout, "%-16s %" PRIu64 "\n", "eval", size_eval);
-
-        // header
-        fprintf(fout, "\n");
-        fprintf(fout, "%-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %16s %16s\n",
-                "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "DATA", "NAME");
-
-        for (int i = 0; i < cgraph->n_leafs; ++i) {
-            ggml_graph_export_leaf(cgraph->leafs[i], fout);
-
-            GGML_ASSERT(cgraph->leafs[i]->op   == GGML_OP_NONE);
-            GGML_ASSERT(cgraph->leafs[i]->src[0] == NULL);
-            GGML_ASSERT(cgraph->leafs[i]->src[1] == NULL);
-        }
-
-        // header
-        fprintf(fout, "\n");
-        fprintf(fout, "%-6s %-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %8s %16s %16s\n",
-                "ARG", "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "NTASKS", "DATA", "NAME");
-
-        for (int i = 0; i < cgraph->n_nodes; ++i) {
-            ggml_graph_export_node(cgraph->nodes[i], "DST", fout);
-
-            for (int j = 0; j < GGML_MAX_SRC; ++j) {
-                if (cgraph->nodes[i]->src[j]) {
-                    ggml_graph_export_node(cgraph->nodes[i]->src[j], "SRC", fout);
-                }
-            }
-
-            fprintf(fout, "\n");
-        }
-
-        fprintf(fout, "\n");
-    }
-
-    // write binary data
-    {
-        FILE * fout = fopen(fname, "wb");
-
-        if (!fout) {
-            fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
-            return;
-        }
-
-        // header
-        {
-            const uint32_t magic   = GGML_FILE_MAGIC;
-            const uint32_t version = GGML_FILE_VERSION;
-            const uint32_t n_leafs = cgraph->n_leafs;
-            const uint32_t n_nodes = cgraph->n_nodes;
-
-            fwrite(&magic,     sizeof(uint32_t), 1, fout);
-            fwrite(&version,   sizeof(uint32_t), 1, fout);
-            fwrite(&n_leafs,   sizeof(uint32_t), 1, fout);
-            fwrite(&n_nodes,   sizeof(uint32_t), 1, fout);
-            fwrite(&size_eval, sizeof(uint64_t), 1, fout);
-        }
-
-        // leafs
-        {
-            for (int i = 0; i < cgraph->n_leafs; ++i) {
-                const struct ggml_tensor * tensor = cgraph->leafs[i];
-
-                const uint32_t type   = tensor->type;
-                const uint32_t op     = tensor->op;
-
-                fwrite(&type,   sizeof(uint32_t), 1, fout);
-                fwrite(&op,     sizeof(uint32_t), 1, fout);
-
-                for (int j = 0; j < GGML_MAX_DIMS; ++j) {
-                    const uint64_t ne = tensor->ne[j];
-                    const uint64_t nb = tensor->nb[j];
-
-                    fwrite(&ne, sizeof(uint64_t), 1, fout);
-                    fwrite(&nb, sizeof(uint64_t), 1, fout);
-                }
-
-                fwrite(tensor->name,      sizeof(char), GGML_MAX_NAME,      fout);
-                fwrite(tensor->op_params, sizeof(char), GGML_MAX_OP_PARAMS, fout);
-
-                // dump the data
-                // TODO: pad this to 32 byte boundary
-                {
-                    const size_t size = ggml_nbytes(tensor);
-
-                    fwrite(tensor->data, sizeof(char), size, fout);
-                }
-            }
-        }
-
-        // nodes
-        {
-            for (int i = 0; i < cgraph->n_nodes; ++i) {
-                const struct ggml_tensor * tensor = cgraph->nodes[i];
-
-                const uint32_t type   = tensor->type;
-                const uint32_t op     = tensor->op;
-
-                fwrite(&type,   sizeof(uint32_t), 1, fout);
-                fwrite(&op,     sizeof(uint32_t), 1, fout);
-
-                for (int j = 0; j < GGML_MAX_DIMS; ++j) {
-                    const uint64_t ne = tensor->ne[j];
-                    const uint64_t nb = tensor->nb[j];
-
-                    fwrite(&ne, sizeof(uint64_t), 1, fout);
-                    fwrite(&nb, sizeof(uint64_t), 1, fout);
-                }
-
-                fwrite(tensor->name,      sizeof(char), GGML_MAX_NAME,      fout);
-                fwrite(tensor->op_params, sizeof(char), GGML_MAX_OP_PARAMS, fout);
-
-                // output the op arguments
-                {
-                    struct ggml_tensor * args[GGML_MAX_SRC] = { NULL };
-
-                    for (int j = 0; j < GGML_MAX_SRC; ++j) {
-                        args[j] = tensor->src[j];
-                    }
-
-                    for (int j = 0; j < GGML_MAX_SRC; ++j) {
-                        if (args[j]) {
-                            int32_t idx = -1;
-
-                            // check if leaf
-                            {
-                                for (int k = 0; k < cgraph->n_leafs; ++k) {
-                                    if (args[j] == cgraph->leafs[k]) {
-                                        idx = k;
-                                        break;
-                                    }
-                                }
-                            }
-
-                            // check if node
-                            if (idx == -1) {
-                                for (int k = 0; k < cgraph->n_nodes; ++k) {
-                                    if (args[j] == cgraph->nodes[k]) {
-                                        idx = cgraph->n_leafs + k;
-                                        break;
-                                    }
-                                }
-                            }
-
-                            if (idx == -1) {
-                                fprintf(stderr, "%s: failed to find tensor, arg = %d, node = %d\n", __func__, j, i);
-                                fclose(fout);
-                                return;
-                            }
-
-                            fwrite(&idx, sizeof(int32_t), 1, fout);
-                        } else {
-                            const int32_t nul = -1;
-
-                            fwrite(&nul, sizeof(int32_t), 1, fout);
-                        }
-                    }
-                }
-            }
-        }
-
-        fclose(fout);
-    }
-}
-
-struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval) {
-    assert(*ctx_data == NULL);
-    assert(*ctx_eval == NULL);
-
-    struct ggml_cgraph * result = NULL;
-
-    struct ggml_tensor * data = NULL;
-
-    // read file into data
-    {
-        FILE * fin = fopen(fname, "rb");
-        if (!fin) {
-            fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
-            return result;
-        }
-
-        size_t fsize = 0;
-
-        fseek(fin, 0, SEEK_END);
-        fsize = ftell(fin);
-        fseek(fin, 0, SEEK_SET);
-
-        // create the data context
-        {
-            const size_t overhead = 1*ggml_tensor_overhead();
-
-            struct ggml_init_params params = {
-                .mem_size   = fsize + overhead,
-                .mem_buffer = NULL,
-                .no_alloc   = false,
-            };
-
-            *ctx_data = ggml_init(params);
-
-            if (!*ctx_data) {
-                fprintf(stderr, "%s: failed to create ggml context\n", __func__);
-                fclose(fin);
-                return result;
-            }
-        }
-
-        data = ggml_new_tensor_1d(*ctx_data, GGML_TYPE_I8, fsize);
-
-        {
-            const size_t ret = fread(data->data, sizeof(char), fsize, fin);
-            if (ret != fsize) {
-                fprintf(stderr, "%s: failed to read %s\n", __func__, fname);
-                fclose(fin);
-                return result;
-            }
-        }
-
-        fclose(fin);
-    }
-
-    // populate result
-    {
-        char * ptr = (char *) data->data;
-
-        const uint32_t magic = *(const uint32_t *) ptr; ptr += sizeof(magic);
-
-        if (magic != GGML_FILE_MAGIC) {
-            fprintf(stderr, "%s: invalid magic number, got %08x\n", __func__, magic);
-            return result;
-        }
-
-        const uint32_t version = *(const uint32_t *) ptr; ptr += sizeof(version);
-
-        if (version != GGML_FILE_VERSION) {
-            fprintf(stderr, "%s: invalid version number\n", __func__);
-            return result;
-        }
-
-        const uint32_t n_leafs   = *(const uint32_t *) ptr; ptr += sizeof(n_leafs);
-        const uint32_t n_nodes   = *(const uint32_t *) ptr; ptr += sizeof(n_nodes);
-        const uint64_t size_eval = *(const uint64_t *) ptr; ptr += sizeof(size_eval);
-        const int     graph_size = MAX(n_leafs, n_nodes);
-
-        // create the data context
-        {
-            const size_t overhead = (n_leafs + n_nodes)*ggml_tensor_overhead() + ggml_graph_overhead_custom(graph_size, false);
-
-            struct ggml_init_params params = {
-                .mem_size   = size_eval + overhead,
-                .mem_buffer = NULL,
-                .no_alloc   = true,
-            };
-
-            *ctx_eval = ggml_init(params);
-
-            if (!*ctx_eval) {
-                fprintf(stderr, "%s: failed to create ggml context\n", __func__);
-                return result;
-            }
-        }
-
-        result = ggml_new_graph_custom(*ctx_eval, graph_size, false);
-
-        result->n_leafs = n_leafs;
-        result->n_nodes = n_nodes;
-
-
-        // leafs
-        {
-            uint32_t type;
-            uint32_t op;
-
-            for (uint32_t i = 0; i < n_leafs; ++i) {
-                type   = *(const uint32_t *) ptr; ptr += sizeof(type);
-                op     = *(const uint32_t *) ptr; ptr += sizeof(op);
-
-                int64_t ne[GGML_MAX_DIMS];
-                size_t  nb[GGML_MAX_DIMS];
-
-                for (int j = 0; j < GGML_MAX_DIMS; ++j) {
-                    uint64_t ne_cur;
-                    uint64_t nb_cur;
-
-                    ne_cur = *(const uint64_t *) ptr; ptr += sizeof(ne_cur);
-                    nb_cur = *(const uint64_t *) ptr; ptr += sizeof(nb_cur);
-
-                    ne[j] = ne_cur;
-                    nb[j] = nb_cur;
-                }
-
-                struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, GGML_MAX_DIMS, ne);
-
-                tensor->op = (enum ggml_op) op;
-
-                memcpy(tensor->name,      ptr, GGML_MAX_NAME);      ptr += GGML_MAX_NAME;
-                memcpy(tensor->op_params, ptr, GGML_MAX_OP_PARAMS); ptr += GGML_MAX_OP_PARAMS;
-
-                tensor->data = (void *) ptr;
-
-                for (int j = 0; j < GGML_MAX_DIMS; ++j) {
-                    tensor->nb[j] = nb[j];
-                }
-
-                result->leafs[i] = tensor;
-
-                ptr += ggml_nbytes(tensor);
-
-                fprintf(stderr, "%s: loaded leaf %d: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
-            }
-        }
-
-        ggml_set_no_alloc(*ctx_eval, false);
-
-        // nodes
-        {
-            uint32_t type;
-            uint32_t op;
-
-            for (uint32_t i = 0; i < n_nodes; ++i) {
-                type   = *(const uint32_t *) ptr; ptr += sizeof(type);
-                op     = *(const uint32_t *) ptr; ptr += sizeof(op);
-
-                enum ggml_op eop = (enum ggml_op) op;
-
-                int64_t ne[GGML_MAX_DIMS];
-                size_t  nb[GGML_MAX_DIMS];
-
-                for (int j = 0; j < GGML_MAX_DIMS; ++j) {
-                    uint64_t ne_cur;
-                    uint64_t nb_cur;
-
-                    ne_cur = *(const uint64_t *) ptr; ptr += sizeof(ne_cur);
-                    nb_cur = *(const uint64_t *) ptr; ptr += sizeof(nb_cur);
-
-                    ne[j] = ne_cur;
-                    nb[j] = nb_cur;
-                }
-
-                const char * ptr_name      = ptr; ptr += GGML_MAX_NAME;
-                const char * ptr_op_params = ptr; ptr += GGML_MAX_OP_PARAMS;
-
-                const int32_t * ptr_arg_idx = (const int32_t *) ptr; ptr += GGML_MAX_SRC*sizeof(int32_t);
-
-                struct ggml_tensor * args[GGML_MAX_SRC] = { NULL };
-
-                // parse args
-                for (int j = 0; j < GGML_MAX_SRC; ++j) {
-                    const int32_t arg_idx = ptr_arg_idx[j];
-
-                    if (arg_idx == -1) {
-                        continue;
-                    }
-
-                    if (arg_idx < result->n_leafs) {
-                        args[j] = result->leafs[arg_idx];
-                    } else {
-                        args[j] = result->nodes[arg_idx - result->n_leafs];
-                    }
-                }
-
-                // create the tensor
-                // "view" operations are handled differently
-                // TODO: handle inplace ops - currently a copy is always made
-
-                struct ggml_tensor * tensor = NULL;
-
-                switch (eop) {
-                    // TODO: implement other view ops
-                    case GGML_OP_RESHAPE:
-                        {
-                            tensor = ggml_reshape_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3]);
-                        } break;
-                    case GGML_OP_VIEW:
-                        {
-                            tensor = ggml_view_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3], 0, 0, 0, 0);
-
-                            size_t offs;
-                            memcpy(&offs, ptr_op_params, sizeof(offs));
-
-                            tensor->data = ((char *) tensor->data) + offs;
-                        } break;
-                    case GGML_OP_TRANSPOSE:
-                        {
-                            tensor = ggml_transpose(*ctx_eval, args[0]);
-                        } break;
-                    case GGML_OP_PERMUTE:
-                        {
-                            tensor = ggml_view_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3], 0, 0, 0, 0);
-                        } break;
-                    default:
-                        {
-                            tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, GGML_MAX_DIMS, ne);
-
-                            tensor->op = eop;
-                        } break;
-                }
-
-                memcpy(tensor->name,      ptr_name,      GGML_MAX_NAME);
-                memcpy(tensor->op_params, ptr_op_params, GGML_MAX_OP_PARAMS);
-
-                for (int j = 0; j < GGML_MAX_DIMS; ++j) {
-                    tensor->nb[j] = nb[j];
-                }
-
-                for (int j = 0; j < GGML_MAX_SRC; ++j) {
-                    tensor->src[j] = args[j];
-                }
-
-                result->nodes[i] = tensor;
-
-                fprintf(stderr, "%s: loaded node %d: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
-            }
-        }
-    }
-
-    return result;
-}
-
-void ggml_graph_print(const struct ggml_cgraph * cgraph) {
-    int64_t perf_total_per_op_us[GGML_OP_COUNT] = {0};
-
-    GGML_PRINT("=== GRAPH ===\n");
-
-    GGML_PRINT("n_nodes = %d\n", cgraph->n_nodes);
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        struct ggml_tensor * node = cgraph->nodes[i];
-
-        perf_total_per_op_us[node->op] += MAX(1, node->perf_time_us);
-
-        GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
-                i,
-                node->ne[0], node->ne[1], node->ne[2],
-                ggml_op_name(node->op), node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
-                (double) node->perf_cycles  / (double) ggml_cycles_per_ms(),
-                (double) node->perf_cycles  / (double) ggml_cycles_per_ms() / (double) node->perf_runs,
-                (double) node->perf_time_us / 1000.0,
-                (double) node->perf_time_us / 1000.0 / node->perf_runs);
-    }
-
-    GGML_PRINT("n_leafs = %d\n", cgraph->n_leafs);
-    for (int i = 0; i < cgraph->n_leafs; i++) {
-        struct ggml_tensor * node = cgraph->leafs[i];
-
-        GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s %16s\n",
-                i,
-                node->ne[0], node->ne[1],
-                ggml_op_name(node->op),
-                ggml_get_name(node));
-    }
-
-    for (int i = 0; i < GGML_OP_COUNT; i++) {
-        if (perf_total_per_op_us[i] == 0) {
-            continue;
-        }
-
-        GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", ggml_op_name(i), (double) perf_total_per_op_us[i] / 1000.0);
-    }
-
-    GGML_PRINT("========================================\n");
-}
-
-// check if node is part of the graph
-static bool ggml_graph_find(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node) {
-    if (cgraph == NULL) {
-        return true;
-    }
-
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        if (cgraph->nodes[i] == node) {
-            return true;
-        }
-    }
-
-    return false;
-}
-
-static struct ggml_tensor * ggml_graph_get_parent(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node) {
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        struct ggml_tensor * parent = cgraph->nodes[i];
-
-        if (parent->grad == node) {
-            return parent;
-        }
-    }
-
-    return NULL;
-}
-
-static void ggml_graph_dump_dot_node_edge(FILE * fp, const struct ggml_cgraph * gb, struct ggml_tensor * node, struct ggml_tensor * parent, const char * label)  {
-    struct ggml_tensor * gparent = ggml_graph_get_parent(gb, node);
-    struct ggml_tensor * gparent0 = ggml_graph_get_parent(gb, parent);
-    fprintf(fp, "  \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"%s\"; ]\n",
-            gparent0 ? (void *) gparent0 : (void *) parent,
-            gparent0 ? "g" : "x",
-            gparent ? (void *) gparent : (void *) node,
-            gparent ? "g" : "x",
-            gparent ? "empty" : "vee",
-            gparent ? "dashed" : "solid",
-            label);
-}
-
-static void ggml_graph_dump_dot_leaf_edge(FILE * fp, struct ggml_tensor * node, struct ggml_tensor * parent, const char * label)  {
-    fprintf(fp, "  \"%p\":%s -> \"%p\":%s [ label = \"%s\"; ]\n",
-            (void *) parent, "x",
-            (void *) node, "x",
-            label);
-}
-
-void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename) {
-    char color[16];
-
-    FILE * fp = fopen(filename, "w");
-    GGML_ASSERT(fp);
-
-    fprintf(fp, "digraph G {\n");
-    fprintf(fp, "  newrank = true;\n");
-    fprintf(fp, "  rankdir = LR;\n");
-
-    for (int i = 0; i < gb->n_nodes; i++) {
-        struct ggml_tensor * node = gb->nodes[i];
-
-        if (ggml_graph_get_parent(gb, node) != NULL) {
-            continue;
-        }
-
-        if (node->is_param) {
-            snprintf(color, sizeof(color), "yellow");
-        } else if (node->grad) {
-            if (ggml_graph_find(gf, node)) {
-                snprintf(color, sizeof(color), "green");
-            } else {
-                snprintf(color, sizeof(color), "lightblue");
-            }
-        } else {
-            snprintf(color, sizeof(color), "white");
-        }
-
-        fprintf(fp, "  \"%p\" [ "
-                    "style = filled; fillcolor = %s; shape = record; "
-                    "label=\"",
-                (void *) node, color);
-
-        if (strlen(node->name) > 0) {
-            fprintf(fp, "%s (%s)|", node->name, ggml_type_name(node->type));
-        } else {
-            fprintf(fp, "(%s)|", ggml_type_name(node->type));
-        }
-
-        if (ggml_is_matrix(node)) {
-            fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], ggml_op_symbol(node->op));
-        } else {
-            fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], node->ne[2], ggml_op_symbol(node->op));
-        }
-
-        if (node->grad) {
-            fprintf(fp, " | <g>%s\"; ]\n", ggml_op_symbol(node->grad->op));
-        } else {
-            fprintf(fp, "\"; ]\n");
-        }
-    }
-
-    for (int i = 0; i < gb->n_leafs; i++) {
-        struct ggml_tensor * node = gb->leafs[i];
-
-        snprintf(color, sizeof(color), "pink");
-
-        fprintf(fp, "  \"%p\" [ "
-                    "style = filled; fillcolor = %s; shape = record; "
-                    "label=\"<x>",
-                (void *) node, color);
-
-        if (strlen(node->name) > 0) {
-            fprintf(fp, "%s (%s)|", node->name, ggml_type_name(node->type));
-        } else {
-            fprintf(fp, "(%s)|", ggml_type_name(node->type));
-        }
-
-        fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]);
-        if (ggml_nelements(node) < 5) {
-            fprintf(fp, " | (");
-            for (int j = 0; j < ggml_nelements(node); j++) {
-                if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) {
-                    fprintf(fp, "%d", ggml_get_i32_1d(node, j));
-                }
-                else if (node->type == GGML_TYPE_F32 || node->type == GGML_TYPE_F16) {
-                    fprintf(fp, "%.1e", (double)ggml_get_f32_1d(node, j));
-                }
-                else {
-                    fprintf(fp, "#");
-                }
-                if (j < ggml_nelements(node) - 1) {
-                    fprintf(fp, ", ");
-                }
-            }
-            fprintf(fp, ")");
-        }
-        fprintf(fp, "\"; ]\n");
-    }
-
-    for (int i = 0; i < gb->n_nodes; i++) {
-        struct ggml_tensor * node = gb->nodes[i];
-
-        for (int j = 0; j < GGML_MAX_SRC; j++) {
-            if (node->src[j]) {
-                char label[16];
-                snprintf(label, sizeof(label), "src %d", j);
-                ggml_graph_dump_dot_node_edge(fp, gb, node, node->src[j], label);
-            }
-        }
-    }
-
-    for (int i = 0; i < gb->n_leafs; i++) {
-        struct ggml_tensor * node = gb->leafs[i];
-
-        for (int j = 0; j < GGML_MAX_SRC; j++) {
-            if (node->src[j]) {
-                char label[16];
-                snprintf(label, sizeof(label), "src %d", j);
-                ggml_graph_dump_dot_leaf_edge(fp, node, node->src[j], label);
-            }
-        }
-    }
-
-    fprintf(fp, "}\n");
-
-    fclose(fp);
-
-    GGML_PRINT("%s: dot -Tpng %s -o %s.png && open %s.png\n", __func__, filename, filename, filename);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-
-static void ggml_opt_set_params(int np, struct ggml_tensor * const ps[], const float * x) {
-    int i = 0;
-    for (int p = 0; p < np; ++p) {
-        const int64_t ne = ggml_nelements(ps[p]) ;
-        // TODO: add function to set tensor from array
-        for (int64_t j = 0; j < ne; ++j) {
-            ggml_set_f32_1d(ps[p], j, x[i++]);
-        }
-    }
-}
-
-static void ggml_opt_get_params(int np, struct ggml_tensor * const ps[], float * x) {
-    int i = 0;
-    for (int p = 0; p < np; ++p) {
-        const int64_t ne = ggml_nelements(ps[p]) ;
-        // TODO: add function to get all elements at once
-        for (int64_t j = 0; j < ne; ++j) {
-            x[i++] = ggml_get_f32_1d(ps[p], j);
-        }
-    }
-}
-
-static void ggml_opt_get_grad(int np, struct ggml_tensor * const ps[], float * g) {
-    int64_t i = 0;
-    for (int p = 0; p < np; ++p) {
-        const int64_t ne = ggml_nelements(ps[p]) ;
-        // TODO: add function to get all elements at once
-        for (int64_t j = 0; j < ne; ++j) {
-            g[i++] = ggml_get_f32_1d(ps[p]->grad, j);
-        }
-    }
-}
-
-static void ggml_opt_acc_grad(int np, struct ggml_tensor * const ps[], float * g, float scale) {
-    int64_t i = 0;
-    for (int p = 0; p < np; ++p) {
-        const int64_t ne = ggml_nelements(ps[p]) ;
-        // TODO: add function to get all elements at once
-        for (int64_t j = 0; j < ne; ++j) {
-            g[i++] += ggml_get_f32_1d(ps[p]->grad, j) * scale;
-        }
-    }
-}
-
-//
-// Using AdamW - ref: https://arxiv.org/pdf/1711.05101v3.pdf
-//
-// (Original Adam - ref: https://arxiv.org/pdf/1412.6980.pdf)
-//
-
-static enum ggml_opt_result ggml_opt_adam(
-        struct ggml_context * ctx,
-        struct ggml_opt_context * opt,
-        struct ggml_opt_params params,
-        struct ggml_tensor * f,
-        struct ggml_cgraph * gf,
-        struct ggml_cgraph * gb,
-        ggml_opt_callback callback,
-        void * callback_data) {
-    GGML_ASSERT(ggml_is_scalar(f));
-
-    // these will store the parameters we want to optimize
-    struct ggml_tensor * ps[GGML_MAX_PARAMS];
-
-    int np = 0;
-    int64_t nx = 0;
-    for (int i = 0; i < gf->n_nodes; ++i) {
-        if (gf->nodes[i]->is_param) {
-            GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op);
-
-            GGML_ASSERT(np < GGML_MAX_PARAMS);
-
-            ps[np++] = gf->nodes[i];
-            nx += ggml_nelements(gf->nodes[i]);
-        }
-    }
-
-    if ((opt->params.type != params.type) || (opt->nx != nx) || (opt->params.past != params.past)) {
-        int iter = opt->iter;
-        ggml_opt_init(opt->ctx, opt, params, nx);
-        opt->iter = iter;
-    }
-
-    // constants
-    float sched = params.adam.sched;
-    const float alpha = params.adam.alpha;
-    const float decay = params.adam.decay * alpha;
-    const float beta1 = params.adam.beta1;
-    const float beta2 = params.adam.beta2;
-    const float eps   = params.adam.eps;
-    const float gclip = params.adam.gclip;
-    const int decay_min_ndim = params.adam.decay_min_ndim;
-    const int n_accum = MAX(1, params.n_gradient_accumulation);
-    const float accum_norm = 1.0f / (float) n_accum;
-
-    float * g  = opt->adam.g->data;  // gradients
-    float * m  = opt->adam.m->data;  // first moment
-    float * v  = opt->adam.v->data;  // second moment
-
-    float * pf = params.past > 0 ? opt->adam.pf->data : NULL; // past function values
-
-    struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads);
-    struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
-    cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
-
-    bool cancel = false;
-
-    // compute the function value
-    float fx = 0;
-    ggml_set_zero(opt->adam.g);
-    for (int accum_step = 0; accum_step < n_accum; ++accum_step) {
-        if (callback) {
-            callback(callback_data, accum_step, &sched, &cancel);
-            if (cancel) {
-                return GGML_OPT_CANCEL;
-            }
-        }
-        // ggml_graph_reset  (gf);
-        ggml_set_f32      (f->grad, 1.0f);
-        ggml_graph_compute(gb, &cplan);
-        ggml_opt_acc_grad(np, ps, g, accum_norm);
-        fx += ggml_get_f32_1d(f, 0);
-    }
-    fx *= accum_norm;
-
-    opt->adam.fx_prev = fx;
-    opt->adam.fx_best = opt->adam.fx_prev;
-    if (pf) {
-        pf[opt->iter % params.past] = opt->adam.fx_prev;
-    }
-
-    opt->loss_before = opt->adam.fx_prev;
-    opt->loss_after  = opt->adam.fx_prev;
-
-    // initialize
-    if (opt->just_initialized) {
-        opt->adam.n_no_improvement = 0;
-        opt->just_initialized = false;
-    }
-
-    float * fx_best = &opt->adam.fx_best;
-    float * fx_prev = &opt->adam.fx_prev;
-    int * n_no_improvement = &opt->adam.n_no_improvement;
-
-    int iter0 = opt->iter;
-
-    // run the optimizer
-    for (int t = 0; t < params.adam.n_iter; ++t) {
-        opt->iter = iter0 + t + 1;
-        GGML_PRINT_DEBUG  ("=== iter %d ===\n", t);
-
-        GGML_PRINT_DEBUG  ("f      = %10.6f\n", ggml_get_f32_1d(f, 0));
-        GGML_PRINT_DEBUG_5("df/dx0 = %10.6f\n", ggml_get_f32_1d(ps[0]->grad, 0));
-        GGML_PRINT_DEBUG_5("df/dx1 = %10.6f\n", ggml_get_f32_1d(ps[1]->grad, 0));
-
-        for (int i = 0; i < np; ++i) {
-            GGML_PRINT_DEBUG("param %d: %10.6f, g = %10.6f\n", i,
-                    ggml_get_f32_1d(ps[i], 0), ggml_get_f32_1d(ps[i]->grad, 0));
-        }
-
-        const int64_t t_start_wall = ggml_time_us();
-        const int64_t t_start_cpu = ggml_cycles();
-        UNUSED(t_start_wall);
-        UNUSED(t_start_cpu);
-
-        {
-            float gnorm = 1.0f;
-            if (gclip > 0.0f) {
-                // gradient clipping
-                ggml_float sum = 0.0;
-                for (int64_t i = 0; i < nx; ++i) {
-                    sum += (ggml_float)(g[i]*g[i]);
-                }
-                ggml_float norm = sqrt(sum);
-                if (norm > (ggml_float) gclip) {
-                    gnorm = (float) ((ggml_float) gclip / norm);
-                }
-            }
-            const float beta1h = alpha*sched/(1.0f - powf(beta1, opt->iter));
-            const float beta2h =        1.0f/(1.0f - powf(beta2, opt->iter));
-            int64_t i = 0;
-            for (int p = 0; p < np; ++p) {
-                const int64_t ne = ggml_nelements(ps[p]);
-                const float p_decay = ((ggml_n_dims(ps[p]) >= decay_min_ndim) ? decay : 0.0f) * sched;
-                for (int64_t j = 0; j < ne; ++j) {
-                    float x  = ggml_get_f32_1d(ps[p], j);
-                    float g_ = g[i]*gnorm;
-                    m[i] = m[i]*beta1 +    g_*(1.0f - beta1);
-                    v[i] = v[i]*beta2 + g_*g_*(1.0f - beta2);
-                    float mh = m[i]*beta1h;
-                    float vh = v[i]*beta2h;
-                    vh = sqrtf(vh) + eps;
-                    x  = x*(1.0f - p_decay) - mh/vh;
-                    ggml_set_f32_1d(ps[p], j, x);
-                    ++i;
-                }
-            }
-        }
-
-        fx = 0;
-        ggml_set_zero(opt->adam.g);
-        for (int accum_step = 0; accum_step < n_accum; ++accum_step) {
-            if (callback) {
-                callback(callback_data, accum_step, &sched, &cancel);
-                if (cancel) {
-                    return GGML_OPT_CANCEL;;
-                }
-            }
-            // ggml_graph_reset  (gf);
-            ggml_set_f32      (f->grad, 1.0f);
-            ggml_graph_compute(gb, &cplan);
-            ggml_opt_acc_grad(np, ps, g, accum_norm);
-            fx += ggml_get_f32_1d(f, 0);
-        }
-        fx *= accum_norm;
-
-        opt->loss_after = fx;
-
-        // check convergence
-        if (fabsf(fx - fx_prev[0])/fx < params.adam.eps_f) {
-            GGML_PRINT_DEBUG("converged\n");
-
-            return GGML_OPT_OK;
-        }
-
-        // delta-based convergence test
-        if (pf != NULL) {
-            // need at least params.past iterations to start checking for convergence
-            if (params.past <= iter0 + t) {
-                const float rate = (pf[(iter0 + t)%params.past] - fx)/fx;
-
-                if (fabsf(rate) < params.delta) {
-                    return GGML_OPT_OK;
-                }
-            }
-
-            pf[(iter0 + t)%params.past] = fx;
-        }
-
-        // check for improvement
-        if (params.max_no_improvement > 0) {
-            if (fx_best[0] > fx) {
-                fx_best[0] = fx;
-                n_no_improvement[0] = 0;
-            } else {
-                ++n_no_improvement[0];
-
-                if (n_no_improvement[0] >= params.max_no_improvement) {
-                    return GGML_OPT_OK;
-                }
-            }
-        }
-
-        fx_prev[0] = fx;
-
-        {
-            const int64_t t_end_cpu = ggml_cycles();
-            GGML_PRINT_DEBUG("time iter:      %5.3f s\n", ((float)(t_end_cpu - t_start_cpu))/CLOCKS_PER_SEC);
-            UNUSED(t_end_cpu);
-
-            const int64_t t_end_wall = ggml_time_us();
-            GGML_PRINT_DEBUG("wall time iter: %5.3f s\n", (t_end_wall - t_start_wall)/1e6);
-            UNUSED(t_end_wall);
-        }
-    }
-
-    return GGML_OPT_DID_NOT_CONVERGE;
-}
-
-//
-// L-BFGS
-//
-// the L-BFGS implementation below is based on the following implementation:
-//
-//   https://github.com/chokkan/liblbfgs
-//
-
-struct ggml_lbfgs_iteration_data {
-    float alpha;
-    float ys;
-    float * s;
-    float * y;
-};
-
-static enum ggml_opt_result linesearch_backtracking(
-        const struct ggml_opt_params * params,
-        int nx,
-        float * x,
-        float * fx,
-        float * g,
-        float * d,
-        float * step,
-        const float * xp,
-        struct ggml_tensor * f,
-        struct ggml_cgraph * gb,
-        struct ggml_cplan  * cplan,
-        const int np,
-        struct ggml_tensor * ps[],
-        bool * cancel,
-        ggml_opt_callback callback,
-        void * callback_data) {
-    int count = 0;
-
-    float width  = 0.0f;
-    float dg     = 0.0f;
-    float finit  = 0.0f;
-    float dginit = 0.0f;
-    float dgtest = 0.0f;
-
-    const float dec = 0.5f;
-    const float inc = 2.1f;
-
-    const int n_accum = MAX(1, params->n_gradient_accumulation);
-    const float accum_norm = 1.0f / (float) n_accum;
-
-    if (*step <= 0.f) {
-        return GGML_LINESEARCH_INVALID_PARAMETERS;
-    }
-
-    // compute the initial gradient in the search direction
-    ggml_vec_dot_f32(nx, &dginit, g, d);
-
-    // make sure that d points to a descent direction
-    if (0 < dginit) {
-        return GGML_LINESEARCH_FAIL;
-    }
-
-    // initialize local variables
-    finit = *fx;
-    dgtest = params->lbfgs.ftol*dginit;
-
-    while (true) {
-        ggml_vec_cpy_f32(nx, x, xp);
-        ggml_vec_mad_f32(nx, x, d, *step);
-
-        // evaluate the function and gradient values
-        {
-            ggml_opt_set_params(np, ps, x);
-
-            *fx = 0;
-            memset(g, 0, sizeof(float)*nx);
-            for (int accum_step = 0; accum_step < n_accum; ++accum_step) {
-                if (callback) {
-                    // LBFG-S does not support learning rate -> ignore learning schedule
-                    float sched = 0;
-                    callback(callback_data, accum_step, &sched, cancel);
-                    if (*cancel) {
-                        return GGML_OPT_CANCEL;
-                    }
-                }
-                // ggml_graph_reset  (gf);
-                ggml_set_f32      (f->grad, 1.0f);
-                ggml_graph_compute(gb, cplan);
-                ggml_opt_acc_grad(np, ps, g, accum_norm);
-                *fx += ggml_get_f32_1d(f, 0);
-            }
-            *fx *= accum_norm;
-
-        }
-
-        ++count;
-
-        if (*fx > finit + (*step)*dgtest) {
-            width = dec;
-        } else {
-            // Armijo condition is satisfied
-            if (params->lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_ARMIJO) {
-                return count;
-            }
-
-            ggml_vec_dot_f32(nx, &dg, g, d);
-
-            // check the Wolfe condition
-            if (dg < params->lbfgs.wolfe * dginit) {
-                width = inc;
-            } else {
-                if(params->lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_WOLFE) {
-                    // regular Wolfe conditions
-                    return count;
-                }
-
-                if(dg > -params->lbfgs.wolfe*dginit) {
-                    width = dec;
-                } else {
-                    // strong Wolfe condition (GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE)
-                    return count;
-                }
-            }
-        }
-
-        if (*step < params->lbfgs.min_step) {
-            return GGML_LINESEARCH_MINIMUM_STEP;
-        }
-        if (*step > params->lbfgs.max_step) {
-            return GGML_LINESEARCH_MAXIMUM_STEP;
-        }
-        if (params->lbfgs.max_linesearch <= count) {
-            return GGML_LINESEARCH_MAXIMUM_ITERATIONS;
-        }
-
-        (*step) *= width;
-    }
-
-    GGML_UNREACHABLE();
-}
-
-static enum ggml_opt_result ggml_opt_lbfgs(
-        struct ggml_context * ctx,
-        struct ggml_opt_context * opt,
-        struct ggml_opt_params params,
-        struct ggml_tensor * f,
-        struct ggml_cgraph * gf,
-        struct ggml_cgraph * gb,
-        ggml_opt_callback callback,
-        void * callback_data) {
-    if (params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_WOLFE ||
-        params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE) {
-        if (params.lbfgs.wolfe <= params.lbfgs.ftol || 1.f <= params.lbfgs.wolfe) {
-            return GGML_OPT_INVALID_WOLFE;
-        }
-    }
-
-    const int m = params.lbfgs.m;
-
-    // these will store the parameters we want to optimize
-    struct ggml_tensor * ps[GGML_MAX_PARAMS];
-
-    int np = 0;
-    int nx = 0;
-    for (int i = 0; i < gf->n_nodes; ++i) {
-        if (gf->nodes[i]->is_param) {
-            GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op);
-
-            GGML_ASSERT(np < GGML_MAX_PARAMS);
-
-            ps[np++] = gf->nodes[i];
-            nx += ggml_nelements(gf->nodes[i]);
-        }
-    }
-
-    if ((opt->params.type != params.type) || (opt->nx != nx) || (opt->params.past != params.past) || (opt->params.lbfgs.m != params.lbfgs.m)) {
-        int iter = opt->iter;
-        ggml_opt_init(ctx, opt, params, nx);
-        opt->iter = iter;
-    }
-
-    struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads);
-    struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
-    cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
-
-    float * x  = opt->lbfgs.x->data;  // current parameters
-    float * xp = opt->lbfgs.xp->data; // previous parameters
-    float * g  = opt->lbfgs.g->data;  // current gradient
-    float * gp = opt->lbfgs.gp->data; // previous gradient
-    float * d  = opt->lbfgs.d->data;  // search direction
-
-    float * pf = params.past > 0 ? opt->lbfgs.pf->data : NULL; // past function values
-
-    const int n_accum = MAX(1, params.n_gradient_accumulation);
-    const float accum_norm = 1.0f / (float) n_accum;
-
-    float fx    = 0.0f; // cost function value
-    float xnorm = 0.0f; // ||x||
-    float gnorm = 0.0f; // ||g||
-
-    // initialize x from the graph nodes
-    ggml_opt_get_params(np, ps, x);
-
-    // the L-BFGS memory
-    float * lm_alpha = opt->lbfgs.lmal->data;
-    float * lm_ys    = opt->lbfgs.lmys->data;
-    float * lm_s     = opt->lbfgs.lms->data;
-    float * lm_y     = opt->lbfgs.lmy->data;
-
-    bool cancel = false;
-
-    // evaluate the function value and its gradient
-    {
-        ggml_opt_set_params(np, ps, x);
-
-        fx = 0;
-        memset(g, 0, sizeof(float)*nx);
-        for (int accum_step = 0; accum_step < n_accum; ++accum_step) {
-            if (callback) {
-                // LBFG-S does not support learning rate -> ignore learning schedule
-                float sched = 0;
-                callback(callback_data, accum_step, &sched, &cancel);
-                if (cancel) {
-                    return GGML_OPT_CANCEL;
-                }
-            }
-            // ggml_graph_reset  (gf);
-            ggml_set_f32      (f->grad, 1.0f);
-            ggml_graph_compute(gb, &cplan);
-            ggml_opt_acc_grad(np, ps, g, accum_norm);
-            fx += ggml_get_f32_1d(f, 0);
-        }
-        fx *= accum_norm;
-
-        opt->loss_before = fx;
-        opt->loss_after  = fx;
-    }
-
-    // search direction = -gradient
-    ggml_vec_neg_f32(nx, d, g);
-
-    // ||x||, ||g||
-    ggml_vec_norm_f32(nx, &xnorm, x);
-    ggml_vec_norm_f32(nx, &gnorm, g);
-
-    if (xnorm < 1.0f) {
-        xnorm = 1.0f;
-    }
-
-    // already optimized
-    if (gnorm/xnorm <= params.lbfgs.eps) {
-        return GGML_OPT_OK;
-    }
-
-    if (opt->just_initialized) {
-        if (pf) {
-            pf[0] = fx;
-        }
-        opt->lbfgs.fx_best = fx;
-
-        // initial step
-        ggml_vec_norm_inv_f32(nx, &opt->lbfgs.step, d);
-        opt->lbfgs.j                = 0;
-        opt->lbfgs.k                = 1;
-        opt->lbfgs.end              = 0;
-        opt->lbfgs.n_no_improvement = 0;
-        opt->just_initialized       = false;
-    }
-
-    float * fx_best        = &opt->lbfgs.fx_best;
-    float * step           = &opt->lbfgs.step;
-    int * j                = &opt->lbfgs.j;
-    int * k                = &opt->lbfgs.k;
-    int * end              = &opt->lbfgs.end;
-    int * n_no_improvement = &opt->lbfgs.n_no_improvement;
-
-    int ls     = 0;
-    int bound  = 0;
-
-    float ys   = 0.0f;
-    float yy   = 0.0f;
-    float beta = 0.0f;
-
-    int it = 0;
-
-    while (true) {
-        // store the current position and gradient vectors
-        ggml_vec_cpy_f32(nx, xp, x);
-        ggml_vec_cpy_f32(nx, gp, g);
-
-        // TODO: instead of passing &cancel here, use the return code of the linesearch
-        //       to determine if the optimization should be cancelled
-        //       this is a simple change, but not doing this atm, since I don't have a nice
-        //       way to test and don't want to break something with so many changes lined up
-        ls = linesearch_backtracking(&params, nx, x, &fx, g, d, step, xp, f, gb, &cplan, np, ps, &cancel, callback, callback_data);
-        if (cancel) {
-            return GGML_OPT_CANCEL;
-        }
-
-        if (ls < 0) {
-            // linesearch failed - go back to the previous point and return
-            ggml_vec_cpy_f32(nx, x, xp);
-            ggml_vec_cpy_f32(nx, g, gp);
-
-            return ls;
-        }
-
-        opt->loss_after = fx;
-
-        ggml_vec_norm_f32(nx, &xnorm, x);
-        ggml_vec_norm_f32(nx, &gnorm, g);
-
-        GGML_PRINT_DEBUG("f = %10.6f\n", ggml_get_f32_1d(f, 0));
-
-        if (xnorm < 1.0f) {
-            xnorm = 1.0f;
-        }
-        if (gnorm/xnorm <= params.lbfgs.eps) {
-            // converged
-            return GGML_OPT_OK;
-        }
-
-        // delta-based convergence test
-        if (pf != NULL) {
-            // need at least params.past iterations to start checking for convergence
-            if (params.past <= k[0]) {
-                const float rate = (pf[k[0]%params.past] - fx)/fx;
-
-                if (fabsf(rate) < params.delta) {
-                    return GGML_OPT_OK;
-                }
-            }
-
-            pf[k[0]%params.past] = fx;
-        }
-
-        // check for improvement
-        if (params.max_no_improvement > 0) {
-            if (fx < fx_best[0]) {
-                fx_best[0] = fx;
-                n_no_improvement[0] = 0;
-            } else {
-                n_no_improvement[0]++;
-
-                if (n_no_improvement[0] >= params.max_no_improvement) {
-                    return GGML_OPT_OK;
-                }
-            }
-        }
-
-        if (params.lbfgs.n_iter != 0 && params.lbfgs.n_iter < it + 1) {
-            // reached the maximum number of iterations
-            return GGML_OPT_DID_NOT_CONVERGE;
-        }
-
-        // update vectors s and y:
-        //   s_{k+1} = x_{k+1} - x_{k} = \step * d_{k}.
-        //   y_{k+1} = g_{k+1} - g_{k}.
-        //
-        ggml_vec_sub_f32(nx, &lm_s[end[0]*nx], x, xp);
-        ggml_vec_sub_f32(nx, &lm_y[end[0]*nx], g, gp);
-
-        // compute scalars ys and yy:
-        //     ys = y^t \cdot s    -> 1 / \rho.
-        //     yy = y^t \cdot y.
-        //
-        ggml_vec_dot_f32(nx, &ys, &lm_y[end[0]*nx], &lm_s[end[0]*nx]);
-        ggml_vec_dot_f32(nx, &yy, &lm_y[end[0]*nx], &lm_y[end[0]*nx]);
-
-        lm_ys[end[0]] = ys;
-
-        // find new search direction
-        //   ref: https://en.wikipedia.org/wiki/Limited-memory_BFGS
-
-        bound = (m <= k[0]) ? m : k[0];
-        k[0]++;
-        it++;
-        end[0] = (end[0] + 1)%m;
-
-        // initialize search direction with -g
-        ggml_vec_neg_f32(nx, d, g);
-
-        j[0] = end[0];
-        for (int i = 0; i < bound; ++i) {
-            j[0] = (j[0] + m - 1) % m;
-            // \alpha_{j} = \rho_{j} s^{t}_{j} \cdot q_{k+1}
-            ggml_vec_dot_f32(nx, &lm_alpha[j[0]], &lm_s[j[0]*nx], d);
-            lm_alpha[j[0]] /= lm_ys[j[0]];
-            // q_{i} = q_{i+1} - \alpha_{i} y_{i}
-            ggml_vec_mad_f32(nx, d, &lm_y[j[0]*nx], -lm_alpha[j[0]]);
-        }
-
-        ggml_vec_scale_f32(nx, d, ys/yy);
-
-        for (int i = 0; i < bound; ++i) {
-            // \beta_{j} = \rho_{j} y^t_{j} \cdot \gamma_{i}
-            ggml_vec_dot_f32(nx, &beta, &lm_y[j[0]*nx], d);
-            beta /= lm_ys[j[0]];
-            // \gamma_{i+1} = \gamma_{i} + (\alpha_{j} - \beta_{j}) s_{j}
-            ggml_vec_mad_f32(nx, d, &lm_s[j[0]*nx], lm_alpha[j[0]] - beta);
-            j[0] = (j[0] + 1)%m;
-        }
-
-        step[0] = 1.0;
-    }
-
-    GGML_UNREACHABLE();
-}
-
-struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) {
-    struct ggml_opt_params result;
-
-    switch (type) {
-        case GGML_OPT_ADAM:
-            {
-                result = (struct ggml_opt_params) {
-                    .type       = GGML_OPT_ADAM,
-                    .graph_size = GGML_DEFAULT_GRAPH_SIZE,
-                    .n_threads  = 1, // FIXME: GGML_DEFAULT_N_THREADS ?
-                    .past       = 0,
-                    .delta      = 1e-5f,
-
-                    .max_no_improvement = 100,
-
-                    .print_forward_graph  = true,
-                    .print_backward_graph = true,
-
-                    .n_gradient_accumulation = 1,
-
-                    .adam = {
-                        .n_iter = 10000,
-                        .sched  = 1.000f,
-                        .decay  = 0.0f,
-                        .decay_min_ndim = 2,
-                        .alpha  = 0.001f,
-                        .beta1  = 0.9f,
-                        .beta2  = 0.999f,
-                        .eps    = 1e-8f,
-                        .eps_f  = 1e-5f,
-                        .eps_g  = 1e-3f,
-                        .gclip  = 0.0f,
-                    },
-                };
-            } break;
-        case GGML_OPT_LBFGS:
-            {
-                result = (struct ggml_opt_params) {
-                    .type       = GGML_OPT_LBFGS,
-                    .graph_size = GGML_DEFAULT_GRAPH_SIZE,
-                    .n_threads  = 1,
-                    .past       = 0,
-                    .delta      = 1e-5f,
-
-                    .max_no_improvement = 0,
-
-                    .print_forward_graph  = true,
-                    .print_backward_graph = true,
-
-                    .n_gradient_accumulation = 1,
-
-                    .lbfgs = {
-                        .m              = 6,
-                        .n_iter         = 100,
-                        .max_linesearch = 20,
-
-                        .eps      = 1e-5f,
-                        .ftol     = 1e-4f,
-                        .wolfe    = 0.9f,
-                        .min_step = 1e-20f,
-                        .max_step = 1e+20f,
-
-                        .linesearch = GGML_LINESEARCH_DEFAULT,
-                    },
-                };
-            } break;
-    }
-
-    return result;
-}
-
-GGML_API void ggml_opt_init(
-        struct ggml_context * ctx,
-        struct ggml_opt_context * opt,
-        struct ggml_opt_params params,
-        int64_t nx) {
-    opt->ctx = ctx;
-    opt->params = params;
-    opt->iter = 0;
-    opt->nx = nx;
-    opt->just_initialized = true;
-    if (opt->ctx == NULL) {
-        struct ggml_init_params ctx_opt_params;
-        if (opt->params.type == GGML_OPT_ADAM) {
-            ctx_opt_params.mem_size = GGML_MEM_ALIGN*3 + ggml_tensor_overhead()*3 + ggml_type_size(GGML_TYPE_F32)*nx*3;
-            if (opt->params.past > 0) {
-                ctx_opt_params.mem_size += GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_size(GGML_TYPE_F32)*opt->params.past;
-            }
-        } else if (opt->params.type == GGML_OPT_LBFGS) {
-            ctx_opt_params.mem_size = GGML_MEM_ALIGN*9 + ggml_tensor_overhead()*9 + ggml_type_size(GGML_TYPE_F32)*(nx*5 + opt->params.lbfgs.m*2 + nx*opt->params.lbfgs.m*2);
-            if (opt->params.past > 0) {
-                ctx_opt_params.mem_size += GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_size(GGML_TYPE_F32)*opt->params.past;
-            }
-        }
-        ctx_opt_params.mem_buffer = NULL;
-        ctx_opt_params.no_alloc   = false;
-
-        opt->ctx = ggml_init(ctx_opt_params);
-    }
-    switch (opt->params.type) {
-        case GGML_OPT_ADAM:
-            {
-                opt->adam.g  = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx);
-                opt->adam.m  = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx);
-                opt->adam.v  = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx);
-                opt->adam.pf = params.past > 0
-                    ? ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, params.past)
-                    : NULL;
-                ggml_set_zero(opt->adam.m);
-                ggml_set_zero(opt->adam.v);
-                if (opt->adam.pf) {
-                    ggml_set_zero(opt->adam.pf);
-                }
-            } break;
-        case GGML_OPT_LBFGS:
-            {
-                opt->lbfgs.x  = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx);
-                opt->lbfgs.xp = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx);
-                opt->lbfgs.g  = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx);
-                opt->lbfgs.gp = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx);
-                opt->lbfgs.d  = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx);
-                opt->lbfgs.pf = params.past > 0
-                    ? ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, params.past)
-                    : NULL;
-                opt->lbfgs.lmal = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, params.lbfgs.m);
-                opt->lbfgs.lmys = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, params.lbfgs.m);
-                opt->lbfgs.lms  = ggml_new_tensor_2d(opt->ctx, GGML_TYPE_F32, nx, params.lbfgs.m);
-                opt->lbfgs.lmy  = ggml_new_tensor_2d(opt->ctx, GGML_TYPE_F32, nx, params.lbfgs.m);
-                ggml_set_zero(opt->lbfgs.x);
-                ggml_set_zero(opt->lbfgs.xp);
-                ggml_set_zero(opt->lbfgs.g);
-                ggml_set_zero(opt->lbfgs.gp);
-                ggml_set_zero(opt->lbfgs.d);
-                if (opt->lbfgs.pf) {
-                    ggml_set_zero(opt->lbfgs.pf);
-                }
-                ggml_set_zero(opt->lbfgs.lmal);
-                ggml_set_zero(opt->lbfgs.lmys);
-                ggml_set_zero(opt->lbfgs.lms);
-                ggml_set_zero(opt->lbfgs.lmy);
-            } break;
-    }
-}
-
-enum ggml_opt_result ggml_opt(
-        struct ggml_context * ctx,
-        struct ggml_opt_params params,
-        struct ggml_tensor * f) {
-    bool free_ctx = false;
-    if (ctx == NULL) {
-        struct ggml_init_params params_ctx = {
-            .mem_size   = 16*1024*1024,
-            .mem_buffer = NULL,
-            .no_alloc   = false,
-        };
-
-        ctx = ggml_init(params_ctx);
-        if (ctx == NULL) {
-            return GGML_OPT_NO_CONTEXT;
-        }
-
-        free_ctx = true;
-    }
-
-    enum ggml_opt_result result = GGML_OPT_OK;
-
-    struct ggml_opt_context * opt = (struct ggml_opt_context *) alloca(sizeof(struct ggml_opt_context));
-
-    ggml_opt_init(ctx, opt, params, 0);
-    result = ggml_opt_resume(ctx, opt, f);
-
-    if (free_ctx) {
-        ggml_free(ctx);
-    }
-
-    return result;
-}
-
-enum ggml_opt_result ggml_opt_resume(
-        struct ggml_context * ctx,
-        struct ggml_opt_context * opt,
-        struct ggml_tensor * f) {
-
-    // build forward + backward compute graphs
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, opt->params.graph_size, true);
-    ggml_build_forward_expand(gf, f);
-
-    struct ggml_cgraph * gb = ggml_graph_dup(ctx, gf);
-    ggml_build_backward_expand(ctx, gf, gb, true);
-
-    return ggml_opt_resume_g(ctx, opt, f, gf, gb, NULL, NULL);
-}
-
-enum ggml_opt_result ggml_opt_resume_g(
-        struct ggml_context * ctx,
-        struct ggml_opt_context * opt,
-        struct ggml_tensor * f,
-        struct ggml_cgraph * gf,
-        struct ggml_cgraph * gb,
-        ggml_opt_callback callback,
-        void * callback_data) {
-
-    // build forward + backward compute graphs
-    enum ggml_opt_result result = GGML_OPT_OK;
-
-    switch (opt->params.type) {
-        case GGML_OPT_ADAM:
-            {
-                result = ggml_opt_adam(ctx, opt, opt->params, f, gf, gb, callback, callback_data);
-            } break;
-        case GGML_OPT_LBFGS:
-            {
-                result = ggml_opt_lbfgs(ctx, opt, opt->params, f, gf, gb, callback, callback_data);
-            } break;
-    }
-
-    if (opt->params.print_forward_graph) {
-        ggml_graph_print   (gf);
-        ggml_graph_dump_dot(gf, NULL, "opt-forward.dot");
-    }
-
-    if (opt->params.print_backward_graph) {
-        ggml_graph_print   (gb);
-        ggml_graph_dump_dot(gb, gf, "opt-backward.dot");
-    }
-
-    return result;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-
-size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist) {
-    assert(k % QK4_0 == 0);
-    const int nb = k / QK4_0;
-
-    for (int b = 0; b < n; b += k) {
-        block_q4_0 * restrict y = (block_q4_0 *) dst + b/QK4_0;
-
-        quantize_row_q4_0_reference(src + b, y, k);
-
-        for (int i = 0; i < nb; i++) {
-            for (int j = 0; j < QK4_0; j += 2) {
-                const uint8_t vi0 = y[i].qs[j/2] & 0x0F;
-                const uint8_t vi1 = y[i].qs[j/2] >> 4;
-
-                hist[vi0]++;
-                hist[vi1]++;
-            }
-        }
-    }
-
-    return (n/QK4_0*sizeof(block_q4_0));
-}
-
-size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist) {
-    assert(k % QK4_1 == 0);
-    const int nb = k / QK4_1;
-
-    for (int b = 0; b < n; b += k) {
-        block_q4_1 * restrict y = (block_q4_1 *) dst + b/QK4_1;
-
-        quantize_row_q4_1_reference(src + b, y, k);
-
-        for (int i = 0; i < nb; i++) {
-            for (int j = 0; j < QK4_1; j += 2) {
-                const uint8_t vi0 = y[i].qs[j/2] & 0x0F;
-                const uint8_t vi1 = y[i].qs[j/2] >> 4;
-
-                hist[vi0]++;
-                hist[vi1]++;
-            }
-        }
-    }
-
-    return (n/QK4_1*sizeof(block_q4_1));
-}
-
-size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist) {
-    assert(k % QK5_0 == 0);
-    const int nb = k / QK5_0;
-
-    for (int b = 0; b < n; b += k) {
-        block_q5_0 * restrict y = (block_q5_0 *)dst + b/QK5_0;
-
-        quantize_row_q5_0_reference(src + b, y, k);
-
-        for (int i = 0; i < nb; i++) {
-            uint32_t qh;
-            memcpy(&qh, &y[i].qh, sizeof(qh));
-
-            for (int j = 0; j < QK5_0; j += 2) {
-                const uint8_t vh0 = ((qh & (1u << (j/2 + 0 ))) >> (j/2 + 0 )) << 4;
-                const uint8_t vh1 = ((qh & (1u << (j/2 + 16))) >> (j/2 + 12));
-
-                // cast to 16 bins
-                const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2;
-                const uint8_t vi1 = ((y[i].qs[j/2] >>   4) | vh1) / 2;
-
-                hist[vi0]++;
-                hist[vi1]++;
-            }
-        }
-    }
-
-    return (n/QK5_0*sizeof(block_q5_0));
-}
-
-size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist) {
-    assert(k % QK5_1 == 0);
-    const int nb = k / QK5_1;
-
-    for (int b = 0; b < n; b += k) {
-        block_q5_1 * restrict y = (block_q5_1 *)dst + b/QK5_1;
-
-        quantize_row_q5_1_reference(src + b, y, k);
-
-        for (int i = 0; i < nb; i++) {
-            uint32_t qh;
-            memcpy(&qh, &y[i].qh, sizeof(qh));
-
-            for (int j = 0; j < QK5_1; j += 2) {
-                const uint8_t vh0 = ((qh & (1u << (j/2 + 0 ))) >> (j/2 + 0 )) << 4;
-                const uint8_t vh1 = ((qh & (1u << (j/2 + 16))) >> (j/2 + 12));
-
-                // cast to 16 bins
-                const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2;
-                const uint8_t vi1 = ((y[i].qs[j/2] >>   4) | vh1) / 2;
-
-                hist[vi0]++;
-                hist[vi1]++;
-            }
-        }
-    }
-
-    return (n/QK5_1*sizeof(block_q5_1));
-}
-
-size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist) {
-    assert(k % QK8_0 == 0);
-    const int nb = k / QK8_0;
-
-    for (int b = 0; b < n; b += k) {
-        block_q8_0 * restrict y = (block_q8_0 *)dst + b/QK8_0;
-
-        quantize_row_q8_0_reference(src + b, y, k);
-
-        for (int i = 0; i < nb; i++) {
-            for (int j = 0; j < QK8_0; ++j) {
-                const int8_t vi = y[i].qs[j];
-
-                hist[vi/16 + 8]++;
-            }
-        }
-    }
-
-    return (n/QK8_0*sizeof(block_q8_0));
-}
-
-size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist) {
-    size_t result = 0;
-    switch (type) {
-        case GGML_TYPE_Q4_0:
-            {
-                GGML_ASSERT(start % QK4_0 == 0);
-                block_q4_0 * block = (block_q4_0*)dst + start / QK4_0;
-                result = ggml_quantize_q4_0(src + start, block, n, n, hist);
-            } break;
-        case GGML_TYPE_Q4_1:
-            {
-                GGML_ASSERT(start % QK4_1 == 0);
-                block_q4_1 * block = (block_q4_1*)dst + start / QK4_1;
-                result = ggml_quantize_q4_1(src + start, block, n, n, hist);
-            } break;
-        case GGML_TYPE_Q5_0:
-            {
-                GGML_ASSERT(start % QK5_0 == 0);
-                block_q5_0 * block = (block_q5_0*)dst + start / QK5_0;
-                result = ggml_quantize_q5_0(src + start, block, n, n, hist);
-            } break;
-        case GGML_TYPE_Q5_1:
-            {
-                GGML_ASSERT(start % QK5_1 == 0);
-                block_q5_1 * block = (block_q5_1*)dst + start / QK5_1;
-                result = ggml_quantize_q5_1(src + start, block, n, n, hist);
-            } break;
-        case GGML_TYPE_Q8_0:
-            {
-                GGML_ASSERT(start % QK8_0 == 0);
-                block_q8_0 * block = (block_q8_0*)dst + start / QK8_0;
-                result = ggml_quantize_q8_0(src + start, block, n, n, hist);
-            } break;
-        case GGML_TYPE_Q2_K:
-            {
-                GGML_ASSERT(start % QK_K == 0);
-                block_q2_K * block = (block_q2_K*)dst + start / QK_K;
-                result = ggml_quantize_q2_K(src + start, block, n, n, hist);
-            } break;
-        case GGML_TYPE_Q3_K:
-            {
-                GGML_ASSERT(start % QK_K == 0);
-                block_q3_K * block = (block_q3_K*)dst + start / QK_K;
-                result = ggml_quantize_q3_K(src + start, block, n, n, hist);
-            } break;
-        case GGML_TYPE_Q4_K:
-            {
-                GGML_ASSERT(start % QK_K == 0);
-                block_q4_K * block = (block_q4_K*)dst + start / QK_K;
-                result = ggml_quantize_q4_K(src + start, block, n, n, hist);
-            } break;
-        case GGML_TYPE_Q5_K:
-            {
-                GGML_ASSERT(start % QK_K == 0);
-                block_q5_K * block = (block_q5_K*)dst + start / QK_K;
-                result = ggml_quantize_q5_K(src + start, block, n, n, hist);
-            } break;
-        case GGML_TYPE_Q6_K:
-            {
-                GGML_ASSERT(start % QK_K == 0);
-                block_q6_K * block = (block_q6_K*)dst + start / QK_K;
-                result = ggml_quantize_q6_K(src + start, block, n, n, hist);
-            } break;
-        case GGML_TYPE_IQ2_XXS:
-            {
-                GGML_ASSERT(start % QK_K == 0);
-                block_iq2_xxs * block = (block_iq2_xxs*)dst + start / QK_K;
-                result = ggml_quantize_iq2_xxs(src + start, block, n, n, hist);
-            } break;
-        case GGML_TYPE_IQ2_XS:
-            {
-                GGML_ASSERT(start % QK_K == 0);
-                block_iq2_xs * block = (block_iq2_xs*)dst + start / QK_K;
-                result = ggml_quantize_iq2_xs(src + start, block, n, n, hist);
-            } break;
-        case GGML_TYPE_F16:
-            {
-                int elemsize = sizeof(ggml_fp16_t);
-                ggml_fp32_to_fp16_row(src + start, (ggml_fp16_t *)dst + start, n);
-                result = n * elemsize;
-            } break;
-        case GGML_TYPE_F32:
-            {
-                int elemsize = sizeof(float);
-                result = n * elemsize;
-                memcpy((uint8_t *)dst + start * elemsize, src + start, result);
-            } break;
-        default:
-            assert(false);
-    }
-    return result;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-
-struct gguf_str {
-    uint64_t n;  // GGUFv2
-    char * data;
-};
-
-static const size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = {
-    [GGUF_TYPE_UINT8]   = sizeof(uint8_t),
-    [GGUF_TYPE_INT8]    = sizeof(int8_t),
-    [GGUF_TYPE_UINT16]  = sizeof(uint16_t),
-    [GGUF_TYPE_INT16]   = sizeof(int16_t),
-    [GGUF_TYPE_UINT32]  = sizeof(uint32_t),
-    [GGUF_TYPE_INT32]   = sizeof(int32_t),
-    [GGUF_TYPE_FLOAT32] = sizeof(float),
-    [GGUF_TYPE_BOOL]    = sizeof(bool),
-    [GGUF_TYPE_STRING]  = sizeof(struct gguf_str),
-    [GGUF_TYPE_UINT64]  = sizeof(uint64_t),
-    [GGUF_TYPE_INT64]   = sizeof(int64_t),
-    [GGUF_TYPE_FLOAT64] = sizeof(double),
-    [GGUF_TYPE_ARRAY]   = 0, // undefined
-};
-static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13");
-
-static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = {
-    [GGUF_TYPE_UINT8]   = "u8",
-    [GGUF_TYPE_INT8]    = "i8",
-    [GGUF_TYPE_UINT16]  = "u16",
-    [GGUF_TYPE_INT16]   = "i16",
-    [GGUF_TYPE_UINT32]  = "u32",
-    [GGUF_TYPE_INT32]   = "i32",
-    [GGUF_TYPE_FLOAT32] = "f32",
-    [GGUF_TYPE_BOOL]    = "bool",
-    [GGUF_TYPE_STRING]  = "str",
-    [GGUF_TYPE_ARRAY]   = "arr",
-    [GGUF_TYPE_UINT64]  = "u64",
-    [GGUF_TYPE_INT64]   = "i64",
-    [GGUF_TYPE_FLOAT64] = "f64",
-};
-static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13");
-
-union gguf_value {
-    uint8_t  uint8;
-    int8_t   int8;
-    uint16_t uint16;
-    int16_t  int16;
-    uint32_t uint32;
-    int32_t  int32;
-    float    float32;
-    uint64_t uint64;
-    int64_t  int64;
-    double   float64;
-    bool     bool_;
-
-    struct gguf_str str;
-
-    struct {
-        enum gguf_type type;
-
-        uint64_t n;  // GGUFv2
-        void * data;
-    } arr;
-};
-
-struct gguf_kv {
-    struct gguf_str key;
-
-    enum  gguf_type  type;
-    union gguf_value value;
-};
-
-struct gguf_header {
-    char magic[4];
-
-    uint32_t version;
-    uint64_t n_tensors; // GGUFv2
-    uint64_t n_kv;      // GGUFv2
-};
-
-struct gguf_tensor_info {
-    struct gguf_str name;
-
-    uint32_t n_dims;
-    uint64_t ne[GGML_MAX_DIMS];
-
-    enum ggml_type type;
-
-    uint64_t offset; // offset from start of `data`, must be a multiple of `ALIGNMENT`
-
-    // for writing API
-    const void * data;
-    size_t size;
-};
-
-struct gguf_context {
-    struct gguf_header header;
-
-    struct gguf_kv          * kv;
-    struct gguf_tensor_info * infos;
-
-    size_t alignment;
-    size_t offset;    // offset of `data` from beginning of file
-    size_t size;      // size of `data` in bytes
-
-    //uint8_t * padding;
-    void * data;
-};
-
-static bool gguf_fread_el(FILE * file, void * dst, size_t size, size_t * offset) {
-    const size_t n = fread(dst, 1, size, file);
-    *offset += n;
-    return n == size;
-}
-
-static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) {
-    p->n    = 0;
-    p->data = NULL;
-
-    bool ok = true;
-
-    ok = ok && gguf_fread_el(file, &p->n,    sizeof(p->n), offset); p->data = calloc(p->n + 1, 1);
-    ok = ok && gguf_fread_el(file,  p->data, p->n,         offset);
-
-    return ok;
-}
-
-struct gguf_context * gguf_init_empty(void) {
-    struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
-
-    memcpy(ctx->header.magic, GGUF_MAGIC, sizeof(ctx->header.magic));
-    ctx->header.version   = GGUF_VERSION;
-    ctx->header.n_tensors = 0;
-    ctx->header.n_kv      = 0;
-
-    ctx->kv    = NULL;
-    ctx->infos = NULL;
-
-    ctx->alignment = GGUF_DEFAULT_ALIGNMENT;
-    ctx->offset    = 0;
-    ctx->size      = 0;
-
-    ctx->data = NULL;
-
-    return ctx;
-}
-
-struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
-    FILE * file = fopen(fname, "rb");
-    if (!file) {
-        return NULL;
-    }
-
-    // offset from start of file
-    size_t offset = 0;
-
-    char magic[4];
-
-    // check the magic before making allocations
-    {
-        gguf_fread_el(file, &magic, sizeof(magic), &offset);
-
-        for (uint32_t i = 0; i < sizeof(magic); i++) {
-            if (magic[i] != GGUF_MAGIC[i]) {
-                fprintf(stderr, "%s: invalid magic characters '%c%c%c%c'\n", __func__, magic[0], magic[1], magic[2], magic[3]);
-                fclose(file);
-                return NULL;
-            }
-        }
-    }
-
-    bool ok = true;
-
-    struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
-
-    // read the header
-    {
-        strncpy(ctx->header.magic, magic, 4);
-
-        ctx->kv    = NULL;
-        ctx->infos = NULL;
-        ctx->data  = NULL;
-
-        ok = ok && gguf_fread_el(file, &ctx->header.version,   sizeof(ctx->header.version),   &offset);
-        ok = ok && gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset);
-        ok = ok && gguf_fread_el(file, &ctx->header.n_kv,      sizeof(ctx->header.n_kv),      &offset);
-
-        if (ctx->header.version == 1) {
-            fprintf(stderr, "%s: GGUFv1 is no longer supported. please use a more up-to-date version\n", __func__);
-            fclose(file);
-            gguf_free(ctx);
-            return NULL;
-        }
-
-        if (!ok) {
-            fprintf(stderr, "%s: failed to read header\n", __func__);
-            fclose(file);
-            gguf_free(ctx);
-            return NULL;
-        }
-    }
-
-    // read the kv pairs
-    {
-        ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv));
-
-        for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
-            struct gguf_kv * kv = &ctx->kv[i];
-
-            //fprintf(stderr, "%s: reading kv %d\n", __func__, i);
-
-            ok = ok && gguf_fread_str(file, &kv->key,                    &offset);
-            ok = ok && gguf_fread_el (file, &kv->type, sizeof(kv->type), &offset);
-
-            //fprintf(stderr, "%s: reading kv with key %s\n", __func__, kv->key.data);
-
-            switch (kv->type) {
-                case GGUF_TYPE_UINT8:   ok = ok && gguf_fread_el (file, &kv->value.uint8,   sizeof(kv->value.uint8),   &offset); break;
-                case GGUF_TYPE_INT8:    ok = ok && gguf_fread_el (file, &kv->value.int8,    sizeof(kv->value.int8),    &offset); break;
-                case GGUF_TYPE_UINT16:  ok = ok && gguf_fread_el (file, &kv->value.uint16,  sizeof(kv->value.uint16),  &offset); break;
-                case GGUF_TYPE_INT16:   ok = ok && gguf_fread_el (file, &kv->value.int16,   sizeof(kv->value.int16),   &offset); break;
-                case GGUF_TYPE_UINT32:  ok = ok && gguf_fread_el (file, &kv->value.uint32,  sizeof(kv->value.uint32),  &offset); break;
-                case GGUF_TYPE_INT32:   ok = ok && gguf_fread_el (file, &kv->value.int32,   sizeof(kv->value.int32),   &offset); break;
-                case GGUF_TYPE_FLOAT32: ok = ok && gguf_fread_el (file, &kv->value.float32, sizeof(kv->value.float32), &offset); break;
-                case GGUF_TYPE_UINT64:  ok = ok && gguf_fread_el (file, &kv->value.uint64,  sizeof(kv->value.uint64),  &offset); break;
-                case GGUF_TYPE_INT64:   ok = ok && gguf_fread_el (file, &kv->value.int64,   sizeof(kv->value.int64),   &offset); break;
-                case GGUF_TYPE_FLOAT64: ok = ok && gguf_fread_el (file, &kv->value.float64, sizeof(kv->value.float64), &offset); break;
-                case GGUF_TYPE_BOOL:    ok = ok && gguf_fread_el (file, &kv->value.bool_,   sizeof(kv->value.bool_),   &offset); break;
-                case GGUF_TYPE_STRING:  ok = ok && gguf_fread_str(file, &kv->value.str,                                &offset); break;
-                case GGUF_TYPE_ARRAY:
-                    {
-                        ok = ok && gguf_fread_el(file, &kv->value.arr.type, sizeof(kv->value.arr.type), &offset);
-                        ok = ok && gguf_fread_el(file, &kv->value.arr.n,    sizeof(kv->value.arr.n), &offset);
-
-                        switch (kv->value.arr.type) {
-                            case GGUF_TYPE_UINT8:
-                            case GGUF_TYPE_INT8:
-                            case GGUF_TYPE_UINT16:
-                            case GGUF_TYPE_INT16:
-                            case GGUF_TYPE_UINT32:
-                            case GGUF_TYPE_INT32:
-                            case GGUF_TYPE_FLOAT32:
-                            case GGUF_TYPE_UINT64:
-                            case GGUF_TYPE_INT64:
-                            case GGUF_TYPE_FLOAT64:
-                            case GGUF_TYPE_BOOL:
-                                {
-                                    kv->value.arr.data = malloc(kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
-                                    ok = ok && gguf_fread_el(file, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type], &offset);
-                                } break;
-                            case GGUF_TYPE_STRING:
-                                {
-                                    kv->value.arr.data = malloc(kv->value.arr.n * sizeof(struct gguf_str));
-                                    for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
-                                        ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset);
-                                    }
-                                } break;
-                            case GGUF_TYPE_ARRAY:
-                            case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
-                        }
-                    } break;
-                case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type");
-            }
-
-            if (!ok) {
-                break;
-            }
-        }
-
-        if (!ok) {
-            fprintf(stderr, "%s: failed to read key-value pairs\n", __func__);
-            fclose(file);
-            gguf_free(ctx);
-            return NULL;
-        }
-    }
-
-    // read the tensor infos
-    {
-        ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
-
-        for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
-            struct gguf_tensor_info * info = &ctx->infos[i];
-
-            for (int j = 0; j < GGML_MAX_DIMS; ++j) {
-                info->ne[j] = 1;
-            }
-
-            ok = ok && gguf_fread_str(file, &info->name,                          &offset);
-            ok = ok && gguf_fread_el (file, &info->n_dims, sizeof(info->n_dims),  &offset);
-            for (uint32_t j = 0; j < info->n_dims; ++j) {
-                ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset);
-            }
-            ok = ok && gguf_fread_el (file, &info->type,   sizeof(info->type),    &offset);
-            ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset),  &offset);
-
-            if (!ok) {
-                fprintf(stderr, "%s: failed to read tensor info\n", __func__);
-                fclose(file);
-                gguf_free(ctx);
-                return NULL;
-            }
-        }
-    }
-
-    ctx->alignment = GGUF_DEFAULT_ALIGNMENT;
-
-    int alignment_idx = gguf_find_key(ctx, "general.alignment");
-    if (alignment_idx != -1) {
-        ctx->alignment = gguf_get_val_u32(ctx, alignment_idx);
-    }
-
-    // we require the data section to be aligned, so take into account any padding
-    {
-        const size_t offset_pad = offset % ctx->alignment;
-
-        if (offset_pad != 0) {
-            offset += ctx->alignment - offset_pad;
-            fseek(file, offset, SEEK_SET);
-        }
-    }
-
-    // store the current file offset - this is where the data section starts
-    ctx->offset = offset;
-
-    // compute the total size of the data section, taking into account the alignment
-    {
-        ctx->size = 0;
-        for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
-            struct gguf_tensor_info * info = &ctx->infos[i];
-
-            const int64_t ne =
-                (int64_t) info->ne[0] *
-                (int64_t) info->ne[1] *
-                (int64_t) info->ne[2] *
-                (int64_t) info->ne[3];
-
-            if (ne % ggml_blck_size(info->type) != 0) {
-                fprintf(stderr, "%s: tensor '%s' of type %d (%s) number of elements (%" PRId64 ") is not a multiple of block size (%d)\n",
-                        __func__, info->name.data, (int)info->type, ggml_type_name(info->type), ne, ggml_blck_size(info->type));
-                fclose(file);
-                gguf_free(ctx);
-                return NULL;
-            }
-
-            const size_t size_cur = ggml_row_size(info->type, ne);
-
-            ctx->size += GGML_PAD(size_cur, ctx->alignment);
-        }
-    }
-
-    // load the tensor data only if requested
-    if (params.ctx != NULL) {
-        // if the provided gguf_context is no_alloc, then we create "empty" tensors and do not read the binary blob
-        // otherwise, we load the binary blob into the created ggml_context as well, and point the "data" members of
-        // the ggml_tensor structs to the appropriate locations in the binary blob
-
-        // compute the exact size needed for the new ggml_context
-        const size_t mem_size =
-            params.no_alloc ?
-            (ctx->header.n_tensors    )*ggml_tensor_overhead() :
-            (ctx->header.n_tensors + 1)*ggml_tensor_overhead() + ctx->size;
-
-        struct ggml_init_params pdata = {
-            .mem_size   = mem_size,
-            .mem_buffer = NULL,
-            .no_alloc   = params.no_alloc,
-        };
-
-        *params.ctx = ggml_init(pdata);
-
-        struct ggml_context * ctx_data = *params.ctx;
-
-        struct ggml_tensor * data = NULL;
-
-        if (!params.no_alloc) {
-            data = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I8, ctx->size);
-
-            ok = ok && data != NULL;
-
-            // read the binary blob with the tensor data
-            ok = ok && gguf_fread_el(file, data->data, ctx->size, &offset);
-
-            if (!ok) {
-                fprintf(stderr, "%s: failed to read tensor data\n", __func__);
-                fclose(file);
-                ggml_free(ctx_data);
-                gguf_free(ctx);
-                return NULL;
-            }
-
-            ctx->data = data->data;
-        }
-
-        ggml_set_no_alloc(ctx_data, true);
-
-        // create the tensors
-        for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
-            const int64_t ne[GGML_MAX_DIMS] = {
-                ctx->infos[i].ne[0],
-                ctx->infos[i].ne[1],
-                ctx->infos[i].ne[2],
-                ctx->infos[i].ne[3],
-            };
-
-            struct ggml_tensor * cur = ggml_new_tensor(ctx_data, ctx->infos[i].type, ctx->infos[i].n_dims, ne);
-
-            ok = ok && cur != NULL;
-
-            ggml_set_name(cur, ctx->infos[i].name.data);
-
-            if (!ok) {
-                break;
-            }
-
-            // point the data member to the appropriate location in the binary blob using the tensor infos
-            if (!params.no_alloc) {
-              //cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file
-                cur->data = (char *) data->data + ctx->infos[i].offset;               // offset from data
-            }
-        }
-
-        if (!ok) {
-            fprintf(stderr, "%s: failed to read the tensor data\n", __func__);
-            fclose(file);
-            ggml_free(ctx_data);
-            gguf_free(ctx);
-            return NULL;
-        }
-
-        ggml_set_no_alloc(ctx_data, params.no_alloc);
-    }
-
-    fclose(file);
-
-    return ctx;
-}
-
-void gguf_free(struct gguf_context * ctx) {
-    if (ctx == NULL) {
-        return;
-    }
-
-    if (ctx->kv) {
-        // free string memory - not great..
-        for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
-            struct gguf_kv * kv = &ctx->kv[i];
-
-            if (kv->key.data) {
-                free(kv->key.data);
-            }
-
-            if (kv->type == GGUF_TYPE_STRING) {
-                if (kv->value.str.data) {
-                    free(kv->value.str.data);
-                }
-            }
-
-            if (kv->type == GGUF_TYPE_ARRAY) {
-                if (kv->value.arr.data) {
-                    if (kv->value.arr.type == GGUF_TYPE_STRING) {
-                        for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
-                            struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[j];
-                            if (str->data) {
-                                free(str->data);
-                            }
-                        }
-                    }
-                    free(kv->value.arr.data);
-                }
-            }
-        }
-
-        free(ctx->kv);
-    }
-
-    if (ctx->infos) {
-        for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
-            struct gguf_tensor_info * info = &ctx->infos[i];
-
-            if (info->name.data) {
-                free(info->name.data);
-            }
-        }
-
-        free(ctx->infos);
-    }
-
-    GGML_ALIGNED_FREE(ctx);
-}
-
-const char * gguf_type_name(enum gguf_type type) {
-    return GGUF_TYPE_NAME[type];
-}
-
-int gguf_get_version(const struct gguf_context * ctx) {
-    return ctx->header.version;
-}
-
-size_t gguf_get_alignment(const struct gguf_context * ctx) {
-    return ctx->alignment;
-}
-
-size_t gguf_get_data_offset(const struct gguf_context * ctx) {
-    return ctx->offset;
-}
-
-void * gguf_get_data(const struct gguf_context * ctx) {
-    return ctx->data;
-}
-
-int gguf_get_n_kv(const struct gguf_context * ctx) {
-    return ctx->header.n_kv;
-}
-
-int gguf_find_key(const struct gguf_context * ctx, const char * key) {
-    // return -1 if key not found
-    int keyfound = -1;
-
-    const int n_kv = gguf_get_n_kv(ctx);
-
-    for (int i = 0; i < n_kv; ++i) {
-        if (strcmp(key, gguf_get_key(ctx, i)) == 0) {
-            keyfound = i;
-            break;
-        }
-    }
-
-    return keyfound;
-}
-
-const char * gguf_get_key(const struct gguf_context * ctx, int key_id) {
-    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
-    return ctx->kv[key_id].key.data;
-}
-
-enum gguf_type gguf_get_kv_type(const struct gguf_context * ctx, int key_id) {
-    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
-    return ctx->kv[key_id].type;
-}
-
-enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int key_id) {
-    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
-    GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
-    return ctx->kv[key_id].value.arr.type;
-}
-
-const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id) {
-    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
-    GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
-    return ctx->kv[key_id].value.arr.data;
-}
-
-const char * gguf_get_arr_str(const struct gguf_context * ctx, int key_id, int i) {
-    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
-    GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
-    struct gguf_kv * kv = &ctx->kv[key_id];
-    struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[i];
-    return str->data;
-}
-
-int gguf_get_arr_n(const struct gguf_context * ctx, int key_id) {
-    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
-    GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
-    return ctx->kv[key_id].value.arr.n;
-}
-
-uint8_t gguf_get_val_u8(const struct gguf_context * ctx, int key_id) {
-    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
-    GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT8);
-    return ctx->kv[key_id].value.uint8;
-}
-
-int8_t gguf_get_val_i8(const struct gguf_context * ctx, int key_id) {
-    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
-    GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT8);
-    return ctx->kv[key_id].value.int8;
-}
-
-uint16_t gguf_get_val_u16(const struct gguf_context * ctx, int key_id) {
-    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
-    GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT16);
-    return ctx->kv[key_id].value.uint16;
-}
-
-int16_t gguf_get_val_i16(const struct gguf_context * ctx, int key_id) {
-    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
-    GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT16);
-    return ctx->kv[key_id].value.int16;
-}
-
-uint32_t gguf_get_val_u32(const struct gguf_context * ctx, int key_id) {
-    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
-    GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT32);
-    return ctx->kv[key_id].value.uint32;
-}
-
-int32_t gguf_get_val_i32(const struct gguf_context * ctx, int key_id) {
-    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
-    GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT32);
-    return ctx->kv[key_id].value.int32;
-}
-
-float gguf_get_val_f32(const struct gguf_context * ctx, int key_id) {
-    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
-    GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_FLOAT32);
-    return ctx->kv[key_id].value.float32;
-}
-
-uint64_t gguf_get_val_u64(const struct gguf_context * ctx, int key_id) {
-    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
-    GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT64);
-    return ctx->kv[key_id].value.uint64;
-}
-
-int64_t gguf_get_val_i64(const struct gguf_context * ctx, int key_id) {
-    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
-    GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT64);
-    return ctx->kv[key_id].value.int64;
-}
-
-double gguf_get_val_f64(const struct gguf_context * ctx, int key_id) {
-    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
-    GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_FLOAT64);
-    return ctx->kv[key_id].value.float64;
-}
-
-bool gguf_get_val_bool(const struct gguf_context * ctx, int key_id) {
-    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
-    GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_BOOL);
-    return ctx->kv[key_id].value.bool_;
-}
-
-const char * gguf_get_val_str(const struct gguf_context * ctx, int key_id) {
-    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
-    GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_STRING);
-    return ctx->kv[key_id].value.str.data;
-}
-
-const void * gguf_get_val_data(const struct gguf_context * ctx, int key_id) {
-    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
-    GGML_ASSERT(ctx->kv[key_id].type != GGUF_TYPE_ARRAY);
-    GGML_ASSERT(ctx->kv[key_id].type != GGUF_TYPE_STRING);
-    return &ctx->kv[key_id].value;
-}
-
-int gguf_get_n_tensors(const struct gguf_context * ctx) {
-    return ctx->header.n_tensors;
-}
-
-int gguf_find_tensor(const struct gguf_context * ctx, const char * name) {
-    // return -1 if tensor not found
-    int tensorfound = -1;
-
-    const int n_tensors = gguf_get_n_tensors(ctx);
-
-    for (int i = 0; i < n_tensors; ++i) {
-        if (strcmp(name, gguf_get_tensor_name(ctx, i)) == 0) {
-            tensorfound = i;
-            break;
-        }
-    }
-
-    return tensorfound;
-}
-
-size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i) {
-    return ctx->infos[i].offset;
-}
-
-char * gguf_get_tensor_name(const struct gguf_context * ctx, int i) {
-    return ctx->infos[i].name.data;
-}
-
-enum ggml_type gguf_get_tensor_type(const struct gguf_context * ctx, int i) {
-    return ctx->infos[i].type;
-}
-
-// returns the index
-static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
-    const int idx = gguf_find_key(ctx, key);
-    if (idx >= 0) {
-        return idx;
-    }
-
-    const int n_kv = gguf_get_n_kv(ctx);
-
-    ctx->kv = realloc(ctx->kv, (n_kv + 1) * sizeof(struct gguf_kv));
-    ctx->kv[n_kv].key.n    = strlen(key);
-    ctx->kv[n_kv].key.data = strdup(key);
-    ctx->header.n_kv++;
-
-    return n_kv;
-}
-
-void gguf_set_val_u8(struct gguf_context * ctx, const char * key, uint8_t val) {
-    const int idx = gguf_get_or_add_key(ctx, key);
-
-    ctx->kv[idx].type        = GGUF_TYPE_UINT8;
-    ctx->kv[idx].value.uint8 = val;
-}
-
-void gguf_set_val_i8(struct gguf_context * ctx, const char * key, int8_t val) {
-    const int idx = gguf_get_or_add_key(ctx, key);
-
-    ctx->kv[idx].type       = GGUF_TYPE_INT8;
-    ctx->kv[idx].value.int8 = val;
-}
-
-void gguf_set_val_u16(struct gguf_context * ctx, const char * key, uint16_t val) {
-    const int idx = gguf_get_or_add_key(ctx, key);
-
-    ctx->kv[idx].type         = GGUF_TYPE_UINT16;
-    ctx->kv[idx].value.uint16 = val;
-}
-
-void gguf_set_val_i16(struct gguf_context * ctx, const char * key, int16_t val) {
-    const int idx = gguf_get_or_add_key(ctx, key);
-
-    ctx->kv[idx].type        = GGUF_TYPE_INT16;
-    ctx->kv[idx].value.int16 = val;
-}
-
-void gguf_set_val_u32(struct gguf_context * ctx, const char * key, uint32_t val) {
-    const int idx = gguf_get_or_add_key(ctx, key);
-
-    ctx->kv[idx].type         = GGUF_TYPE_UINT32;
-    ctx->kv[idx].value.uint32 = val;
-}
-
-void gguf_set_val_i32(struct gguf_context * ctx, const char * key, int32_t val) {
-    const int idx = gguf_get_or_add_key(ctx, key);
-
-    ctx->kv[idx].type        = GGUF_TYPE_INT32;
-    ctx->kv[idx].value.int32 = val;
-}
-
-void gguf_set_val_f32(struct gguf_context * ctx, const char * key, float val) {
-    const int idx = gguf_get_or_add_key(ctx, key);
-
-    ctx->kv[idx].type          = GGUF_TYPE_FLOAT32;
-    ctx->kv[idx].value.float32 = val;
-}
-
-void gguf_set_val_u64(struct gguf_context * ctx, const char * key, uint64_t val) {
-    const int idx = gguf_get_or_add_key(ctx, key);
-
-    ctx->kv[idx].type         = GGUF_TYPE_UINT64;
-    ctx->kv[idx].value.uint64 = val;
-}
-
-void gguf_set_val_i64(struct gguf_context * ctx, const char * key, int64_t val) {
-    const int idx = gguf_get_or_add_key(ctx, key);
-
-    ctx->kv[idx].type        = GGUF_TYPE_INT64;
-    ctx->kv[idx].value.int64 = val;
-}
-
-void gguf_set_val_f64(struct gguf_context * ctx, const char * key, double val) {
-    const int idx = gguf_get_or_add_key(ctx, key);
-
-    ctx->kv[idx].type          = GGUF_TYPE_FLOAT64;
-    ctx->kv[idx].value.float64 = val;
-}
-
-void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val) {
-    const int idx = gguf_get_or_add_key(ctx, key);
-
-    ctx->kv[idx].type        = GGUF_TYPE_BOOL;
-    ctx->kv[idx].value.bool_ = val;
-}
-
-void gguf_set_val_str(struct gguf_context * ctx, const char * key, const char * val) {
-    const int idx = gguf_get_or_add_key(ctx, key);
-
-    ctx->kv[idx].type           = GGUF_TYPE_STRING;
-    ctx->kv[idx].value.str.n    = strlen(val);
-    ctx->kv[idx].value.str.data = strdup(val);
-}
-
-void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n) {
-    const int idx = gguf_get_or_add_key(ctx, key);
-
-    ctx->kv[idx].type           = GGUF_TYPE_ARRAY;
-    ctx->kv[idx].value.arr.type = type;
-    ctx->kv[idx].value.arr.n    = n;
-    ctx->kv[idx].value.arr.data = malloc(n*GGUF_TYPE_SIZE[type]);
-    memcpy(ctx->kv[idx].value.arr.data, data, n*GGUF_TYPE_SIZE[type]);
-}
-
-void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char ** data, int n) {
-    const int idx = gguf_get_or_add_key(ctx, key);
-
-    ctx->kv[idx].type           = GGUF_TYPE_ARRAY;
-    ctx->kv[idx].value.arr.type = GGUF_TYPE_STRING;
-    ctx->kv[idx].value.arr.n    = n;
-    ctx->kv[idx].value.arr.data = malloc(n*sizeof(struct gguf_str));
-    for (int i = 0; i < n; i++) {
-        struct gguf_str * str = &((struct gguf_str *)ctx->kv[idx].value.arr.data)[i];
-        str->n    = strlen(data[i]);
-        str->data = strdup(data[i]);
-    }
-}
-
-// set or add KV pairs from another context
-void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
-    for (uint32_t i = 0; i < src->header.n_kv; i++) {
-        switch (src->kv[i].type) {
-            case GGUF_TYPE_UINT8:   gguf_set_val_u8  (ctx, src->kv[i].key.data, src->kv[i].value.uint8);    break;
-            case GGUF_TYPE_INT8:    gguf_set_val_i8  (ctx, src->kv[i].key.data, src->kv[i].value.int8);     break;
-            case GGUF_TYPE_UINT16:  gguf_set_val_u16 (ctx, src->kv[i].key.data, src->kv[i].value.uint16);   break;
-            case GGUF_TYPE_INT16:   gguf_set_val_i16 (ctx, src->kv[i].key.data, src->kv[i].value.int16);    break;
-            case GGUF_TYPE_UINT32:  gguf_set_val_u32 (ctx, src->kv[i].key.data, src->kv[i].value.uint32);   break;
-            case GGUF_TYPE_INT32:   gguf_set_val_i32 (ctx, src->kv[i].key.data, src->kv[i].value.int32);    break;
-            case GGUF_TYPE_FLOAT32: gguf_set_val_f32 (ctx, src->kv[i].key.data, src->kv[i].value.float32);  break;
-            case GGUF_TYPE_UINT64:  gguf_set_val_u64 (ctx, src->kv[i].key.data, src->kv[i].value.uint64);   break;
-            case GGUF_TYPE_INT64:   gguf_set_val_i64 (ctx, src->kv[i].key.data, src->kv[i].value.int64);    break;
-            case GGUF_TYPE_FLOAT64: gguf_set_val_f64 (ctx, src->kv[i].key.data, src->kv[i].value.float64);  break;
-            case GGUF_TYPE_BOOL:    gguf_set_val_bool(ctx, src->kv[i].key.data, src->kv[i].value.bool_);    break;
-            case GGUF_TYPE_STRING:  gguf_set_val_str (ctx, src->kv[i].key.data, src->kv[i].value.str.data); break;
-            case GGUF_TYPE_ARRAY:
-                {
-                    if (src->kv[i].value.arr.type == GGUF_TYPE_STRING) {
-                        const char ** data = malloc(src->kv[i].value.arr.n*sizeof(char *));
-                        for (uint32_t j = 0; j < src->kv[i].value.arr.n; j++) {
-                            data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data;
-                        }
-                        gguf_set_arr_str(ctx, src->kv[i].key.data, data, src->kv[i].value.arr.n);
-                        free((void *)data);
-                    } else if (src->kv[i].value.arr.type == GGUF_TYPE_ARRAY) {
-                        GGML_ASSERT(false && "nested arrays not supported");
-                    } else {
-                        gguf_set_arr_data(ctx, src->kv[i].key.data, src->kv[i].value.arr.type, src->kv[i].value.arr.data, src->kv[i].value.arr.n);
-                    }
-                } break;
-            case GGUF_TYPE_COUNT:  GGML_ASSERT(false && "invalid type"); break;
-        }
-    }
-}
-
-void gguf_add_tensor(
-             struct gguf_context * ctx,
-        const struct ggml_tensor * tensor) {
-    const int idx = ctx->header.n_tensors;
-    ctx->infos = realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info));
-
-    ctx->infos[idx].name.n    = strlen(tensor->name);
-    ctx->infos[idx].name.data = strdup(tensor->name);
-
-    for (int i = 0; i < GGML_MAX_DIMS; ++i) {
-        ctx->infos[idx].ne[i] = 1;
-    }
-
-    ctx->infos[idx].n_dims = ggml_n_dims(tensor);
-    for (uint32_t i = 0; i < ctx->infos[idx].n_dims; i++) {
-        ctx->infos[idx].ne[i] = tensor->ne[i];
-    }
-
-    ctx->infos[idx].type   = tensor->type;
-    ctx->infos[idx].offset = 0;
-    ctx->infos[idx].data   = tensor->data;
-    ctx->infos[idx].size   = ggml_nbytes(tensor);
-
-    if (ctx->header.n_tensors > 0) {
-        ctx->infos[idx].offset = ctx->infos[idx - 1].offset + GGML_PAD(ctx->infos[idx - 1].size, ctx->alignment);
-    }
-
-    ctx->header.n_tensors++;
-}
-
-void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type) {
-    const int idx = gguf_find_tensor(ctx, name);
-    if (idx < 0) {
-        GGML_ASSERT(false && "tensor not found");
-    }
-
-    ctx->infos[idx].type = type;
-}
-
-void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size) {
-    const int idx = gguf_find_tensor(ctx, name);
-    if (idx < 0) {
-        GGML_ASSERT(false && "tensor not found");
-    }
-
-    ctx->infos[idx].data = data;
-    ctx->infos[idx].size = size;
-
-    // update offsets
-    for (uint32_t i = idx + 1; i < ctx->header.n_tensors; ++i) {
-        ctx->infos[i].offset = ctx->infos[i - 1].offset + GGML_PAD(ctx->infos[i - 1].size, ctx->alignment);
-    }
-}
-
-//static void gguf_fwrite_str(FILE * file, const struct gguf_str * val) {
-//    fwrite(&val->n,   sizeof(val->n),    1, file);
-//    fwrite(val->data, sizeof(char), val->n, file);
-//}
-//
-//static void gguf_fwrite_el(FILE * file, const void * val, size_t size) {
-//    fwrite(val, sizeof(char), size, file);
-//}
-
-struct gguf_buf {
-    void * data;
-    size_t size;
-    size_t offset;
-};
-
-static struct gguf_buf gguf_buf_init(size_t size) {
-    struct gguf_buf buf = {
-        /*buf.data   =*/ size == 0 ? NULL : malloc(size),
-        /*buf.size   =*/ size,
-        /*buf.offset =*/ 0,
-    };
-
-    return buf;
-}
-
-static void gguf_buf_free(struct gguf_buf buf) {
-    if (buf.data) {
-        free(buf.data);
-    }
-}
-
-static void gguf_buf_grow(struct gguf_buf * buf, size_t size) {
-    if (buf->offset + size > buf->size) {
-        buf->size = 1.5*(buf->offset + size);
-        if (buf->data) {
-            buf->data = realloc(buf->data, buf->size);
-        }
-    }
-}
-
-static void gguf_bwrite_str(struct gguf_buf * buf, const struct gguf_str * val) {
-    gguf_buf_grow(buf, sizeof(val->n) + val->n);
-
-    if (buf->data) {
-        memcpy((char *) buf->data + buf->offset, &val->n, sizeof(val->n));
-    }
-    buf->offset += sizeof(val->n);
-
-    if (buf->data) {
-        memcpy((char *) buf->data + buf->offset, val->data, val->n);
-    }
-    buf->offset += val->n;
-}
-
-static void gguf_bwrite_el(struct gguf_buf * buf, const void * val, size_t el_size) {
-    gguf_buf_grow(buf, el_size);
-
-    if (buf->data) {
-        memcpy((char *) buf->data + buf->offset, val, el_size);
-    }
-    buf->offset += el_size;
-}
-
-static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) {
-    // write header
-    gguf_bwrite_el(buf, &ctx->header.magic,     sizeof(ctx->header.magic));
-    gguf_bwrite_el(buf, &ctx->header.version,   sizeof(ctx->header.version));
-    gguf_bwrite_el(buf, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors));
-    gguf_bwrite_el(buf, &ctx->header.n_kv,      sizeof(ctx->header.n_kv));
-
-    // write key-value pairs
-    for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
-        struct gguf_kv * kv = &ctx->kv[i];
-
-        gguf_bwrite_str(buf, &kv->key);
-        gguf_bwrite_el (buf, &kv->type, sizeof(kv->type));
-
-        switch (kv->type) {
-            case GGUF_TYPE_UINT8:   gguf_bwrite_el( buf, &kv->value.uint8,   sizeof(kv->value.uint8)  ); break;
-            case GGUF_TYPE_INT8:    gguf_bwrite_el (buf, &kv->value.int8,    sizeof(kv->value.int8)   ); break;
-            case GGUF_TYPE_UINT16:  gguf_bwrite_el (buf, &kv->value.uint16,  sizeof(kv->value.uint16) ); break;
-            case GGUF_TYPE_INT16:   gguf_bwrite_el (buf, &kv->value.int16,   sizeof(kv->value.int16)  ); break;
-            case GGUF_TYPE_UINT32:  gguf_bwrite_el (buf, &kv->value.uint32,  sizeof(kv->value.uint32) ); break;
-            case GGUF_TYPE_INT32:   gguf_bwrite_el (buf, &kv->value.int32,   sizeof(kv->value.int32)  ); break;
-            case GGUF_TYPE_FLOAT32: gguf_bwrite_el (buf, &kv->value.float32, sizeof(kv->value.float32)); break;
-            case GGUF_TYPE_UINT64:  gguf_bwrite_el (buf, &kv->value.uint64,  sizeof(kv->value.uint64) ); break;
-            case GGUF_TYPE_INT64:   gguf_bwrite_el (buf, &kv->value.int64,   sizeof(kv->value.int64)  ); break;
-            case GGUF_TYPE_FLOAT64: gguf_bwrite_el (buf, &kv->value.float64, sizeof(kv->value.float64)); break;
-            case GGUF_TYPE_BOOL:    gguf_bwrite_el (buf, &kv->value.bool_,   sizeof(kv->value.bool_)  ); break;
-            case GGUF_TYPE_STRING:  gguf_bwrite_str(buf, &kv->value.str                               ); break;
-            case GGUF_TYPE_ARRAY:
-                {
-                    gguf_bwrite_el(buf, &kv->value.arr.type, sizeof(kv->value.arr.type));
-                    gguf_bwrite_el(buf, &kv->value.arr.n,    sizeof(kv->value.arr.n)   );
-
-                    switch (kv->value.arr.type) {
-                        case GGUF_TYPE_UINT8:
-                        case GGUF_TYPE_INT8:
-                        case GGUF_TYPE_UINT16:
-                        case GGUF_TYPE_INT16:
-                        case GGUF_TYPE_UINT32:
-                        case GGUF_TYPE_INT32:
-                        case GGUF_TYPE_FLOAT32:
-                        case GGUF_TYPE_UINT64:
-                        case GGUF_TYPE_INT64:
-                        case GGUF_TYPE_FLOAT64:
-                        case GGUF_TYPE_BOOL:
-                            {
-                                gguf_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
-                            } break;
-                        case GGUF_TYPE_STRING:
-                            {
-                                for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
-                                    gguf_bwrite_str(buf, &((struct gguf_str *) kv->value.arr.data)[j]);
-                                }
-                            } break;
-                        case GGUF_TYPE_ARRAY:
-                        case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
-                    }
-                } break;
-            case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type");
-        }
-    }
-
-    // write tensor infos
-    for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
-        struct gguf_tensor_info * info = &ctx->infos[i];
-
-        gguf_bwrite_str(buf, &info->name);
-        gguf_bwrite_el (buf, &info->n_dims, sizeof(info->n_dims));
-        for (uint32_t j = 0; j < info->n_dims; ++j) {
-            gguf_bwrite_el(buf, &info->ne[j], sizeof(info->ne[j]));
-        }
-        gguf_bwrite_el(buf, &info->type,   sizeof(info->type));
-        gguf_bwrite_el(buf, &info->offset, sizeof(info->offset));
-    }
-
-    // we require the data section to be aligned, so take into account any padding
-    {
-        const size_t offset     = buf->offset;
-        const size_t offset_pad = GGML_PAD(offset, ctx->alignment);
-
-        if (offset_pad != offset) {
-            uint8_t pad = 0;
-            for (size_t i = 0; i < offset_pad - offset; ++i) {
-                gguf_bwrite_el(buf, &pad, sizeof(pad));
-            }
-        }
-    }
-
-    if (only_meta) {
-        return;
-    }
-
-    size_t offset = 0;
-
-    // write tensor data
-    for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
-        struct gguf_tensor_info * info = &ctx->infos[i];
-
-        const size_t size     = info->size;
-        const size_t size_pad = GGML_PAD(size, ctx->alignment);
-
-        gguf_bwrite_el(buf, info->data, size);
-
-        if (size_pad != size) {
-            uint8_t pad = 0;
-            for (size_t j = 0; j < size_pad - size; ++j) {
-                gguf_bwrite_el(buf, &pad, sizeof(pad));
-            }
-        }
-
-        GGML_ASSERT(offset == info->offset);
-
-        offset += size_pad;
-    }
-}
-
-void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta) {
-    FILE * file = fopen(fname, "wb");
-    if (!file) {
-        GGML_ASSERT(false && "failed to open file for writing");
-    }
-
-    struct gguf_buf buf = gguf_buf_init(16*1024);
-
-    gguf_write_to_buf(ctx, &buf, only_meta);
-
-    fwrite(buf.data, 1, buf.offset, file);
-
-    gguf_buf_free(buf);
-
-    fclose(file);
-}
-
-size_t gguf_get_meta_size(const struct gguf_context * ctx) {
-    // no allocs - only compute size
-    struct gguf_buf buf = gguf_buf_init(0);
-
-    gguf_write_to_buf(ctx, &buf, true);
-
-    return buf.offset;
-}
-
-void gguf_get_meta_data(const struct gguf_context * ctx, void * data) {
-    struct gguf_buf buf = gguf_buf_init(16*1024);
-
-    gguf_write_to_buf(ctx, &buf, true);
-
-    memcpy(data, buf.data, buf.offset);
-
-    gguf_buf_free(buf);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-
-int ggml_cpu_has_avx(void) {
-#if defined(__AVX__)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_avx_vnni(void) {
-#if defined(__AVXVNNI__)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_avx2(void) {
-#if defined(__AVX2__)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_avx512(void) {
-#if defined(__AVX512F__)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_avx512_vbmi(void) {
-#if defined(__AVX512VBMI__)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_avx512_vnni(void) {
-#if defined(__AVX512VNNI__)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_fma(void) {
-#if defined(__FMA__)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_neon(void) {
-#if defined(__ARM_NEON)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_arm_fma(void) {
-#if defined(__ARM_FEATURE_FMA)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_metal(void) {
-#if defined(GGML_USE_METAL)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_f16c(void) {
-#if defined(__F16C__)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_fp16_va(void) {
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_wasm_simd(void) {
-#if defined(__wasm_simd128__)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_blas(void) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_cublas(void) {
-#if defined(GGML_USE_CUBLAS)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_clblast(void) {
-#if defined(GGML_USE_CLBLAST)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_gpublas(void) {
-    return ggml_cpu_has_cublas() || ggml_cpu_has_clblast();
-}
-
-int ggml_cpu_has_sse3(void) {
-#if defined(__SSE3__)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_ssse3(void) {
-#if defined(__SSSE3__)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_vsx(void) {
-#if defined(__POWER9_VECTOR__)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/ggml/tests/CMakeLists.txt b/ggml/tests/CMakeLists.txt
deleted file mode 100644
index 754d637..0000000
--- a/ggml/tests/CMakeLists.txt
+++ /dev/null
@@ -1,414 +0,0 @@
-# check systems
-if (NOT UNAME_S)
-    execute_process(COMMAND uname -s OUTPUT_VARIABLE UNAME_S)
-endif()
-if (NOT UNAME_P)
-    execute_process(COMMAND uname -p OUTPUT_VARIABLE UNAME_P)
-endif()
-if (NOT UNAME_M)
-    execute_process(COMMAND uname -m OUTPUT_VARIABLE UNAME_M)
-endif()
-#message(STATUS "UNAME_S: ${UNAME_S}  UNAME_P: ${UNAME_P}  UNAME_M: ${UNAME_M}")
-
-# Mac OS + Arm can report x86_64
-# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
-if (UNAME_S MATCHES "Darwin")
-    if (NOT UNAME_P MATCHES "arm")
-        execute_process(COMMAND sysctl -n hw.optional.arm64 OUTPUT_VARIABLE SYSCTL_M)
-        if (SYSCTL_M MATCHES "1")
-            #set(UNAME_P "arm")
-            #set(UNAME_M "arm64")
-            message(WARNING "Your arch is announced as x86_64, but it seems to actually be ARM64. Not fixing that can lea
-d to bad performance. For more info see: https://github.com/ggerganov/whisper.cpp/issues/66\#issuecomment-#1282546789")
-        endif()
-    endif()
-endif()
-
-if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
-    message(STATUS "ARM detected")
-    #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mcpu=apple-m1")
-elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
-    message(STATUS "PPC64 detected")
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mpower9-vector")
-else()
-    message(STATUS "x86 detected")
-    #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx -mavx2 -mfma -mf16c")
-    if (UNAME_S MATCHES "Darwin")
-        execute_process(COMMAND sysctl machdep.cpu.features OUTPUT_VARIABLE AVX1_M)
-        if (AVX1_M MATCHES "AVX1.0")
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx")
-        endif()
-        execute_process(COMMAND sysctl machdep.cpu.leaf7_features OUTPUT_VARIABLE AVX2_M)
-        if (AVX2_M MATCHES "AVX2")
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx2")
-        endif()
-        if (AVX1_M MATCHES "FMA")
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma")
-        endif()
-        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c")
-    elseif (UNAME_S MATCHES "Linux")
-        message(STATUS "Linux detected")
-        execute_process(COMMAND grep "avx " /proc/cpuinfo OUTPUT_VARIABLE AVX1_M)
-        if (AVX1_M MATCHES "avx")
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx")
-        endif()
-        execute_process(COMMAND grep "avx2 " /proc/cpuinfo OUTPUT_VARIABLE AVX2_M)
-        if (AVX2_M MATCHES "avx2")
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx2")
-        endif()
-        execute_process(COMMAND grep "fma " /proc/cpuinfo OUTPUT_VARIABLE FMA_M)
-        if (FMA_M MATCHES "fma")
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma")
-        endif()
-        execute_process(COMMAND grep "f16c " /proc/cpuinfo OUTPUT_VARIABLE F16C_M)
-        if (F16C_M MATCHES "f16c")
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c")
-        endif()
-        execute_process(COMMAND grep "sse3 " /proc/cpuinfo OUTPUT_VARIABLE SSE3_M)
-        if (SSE3_M MATCHES "sse3")
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse3")
-        endif()
-    elseif (UNAME_S MATCHES "Haiku")
-        message(STATUS "Haiku detected")
-	execute_process(COMMAND sysinfo -cpu COMMAND grep "AVX " OUTPUT_VARIABLE AVX1_M)
-        if (AVX1_M MATCHES "avx")
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx")
-        endif()
-	execute_process(COMMAND sysinfo -cpu COMMAND grep "AVX2 " OUTPUT_VARIABLE AVX2_M)
-        if (AVX2_M MATCHES "avx2")
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx2")
-        endif()
-	execute_process(COMMAND sysinfo -cpu COMMAND grep "FMA " OUTPUT_VARIABLE FMA_M)
-        if (FMA_M MATCHES "fma")
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma")
-        endif()
-	execute_process(COMMAND sysinfo -cpu COMMAND grep "F16C " OUTPUT_VARIABLE F16C_M)
-        if (F16C_M MATCHES "f16c")
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c")
-        endif()
-    elseif (MSVC)
-        if (GGML_AVX512)
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX512")
-            # MSVC has no compile-time flags enabling specific
-            # AVX512 extensions, neither it defines the
-            # macros corresponding to the extensions.
-            # Do it manually.
-            if (GGML_AVX512_VBMI)
-                add_compile_definitions(__AVX512VBMI__)
-            endif()
-            if (GGML_AVX512_VNNI)
-                add_compile_definitions(__AVX512VNNI__)
-            endif()
-        elseif (GGML_AVX2)
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX2")
-        elseif (GGML_AVX)
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX")
-        endif()
-    else()
-        set(CMAKE_C_FLAGS  "${CMAKE_C_FLAGS} -mfma -mf16c -mavx -mavx2")
-    endif()
-endif()
-
-# on APPLE - include Accelerate framework
-if (APPLE AND NOT GGML_NO_ACCELERATE)
-    find_library(ACCELERATE_FRAMEWORK Accelerate)
-    if (ACCELERATE_FRAMEWORK)
-        message(STATUS "Accelerate framework found")
-
-        set(GGML_EXTRA_LIBS  ${GGML_EXTRA_LIBS}  ${ACCELERATE_FRAMEWORK})
-        set(GGML_EXTRA_FLAGS ${GGML_EXTRA_FLAGS} -DGGML_USE_ACCELERATE)
-    else()
-        message(WARNING "Accelerate framework not found")
-    endif()
-endif()
-
-if (GGML_OPENBLAS)
-    set(OPENBLAS_INCLUDE_SEARCH_PATHS
-        /usr/include
-        /usr/include/openblas
-        /usr/include/openblas-base
-        /usr/local/include
-        /usr/local/include/openblas
-        /usr/local/include/openblas-base
-        /opt/OpenBLAS/include
-        $ENV{OpenBLAS_HOME}
-        $ENV{OpenBLAS_HOME}/include
-        )
-    find_path(OPENBLAS_INC NAMES cblas.h PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS})
-    find_library(OPENBLAS_LIB NAMES openblas libopenblas)
-    if (OPENBLAS_LIB)
-        message(STATUS "OpenBLAS found")
-
-        set(GGML_EXTRA_LIBS  ${GGML_EXTRA_LIBS}  ${OPENBLAS_LIB})
-        set(GGML_EXTRA_INCS  ${GGML_EXTRA_INCS}  ${OPENBLAS_INC})
-        set(GGML_EXTRA_FLAGS ${GGML_EXTRA_FLAGS} -DGGML_USE_OPENBLAS)
-    else()
-        message(WARNING "OpenBLAS not found")
-    endif()
-endif()
-
-# undefine NDEBUG so asserts don't get disabled in tests
-add_definitions(-UNDEBUG)
-
-#
-# test-vec0
-
-set(TEST_TARGET test-vec0)
-add_executable(${TEST_TARGET} ${TEST_TARGET}.c)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml)
-
-#
-# test-vec1 (x86)
-if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86" AND "${CMAKE_C_FLAGS}" MATCHES "avx")
-    set(TEST_TARGET test-vec1)
-    add_executable(${TEST_TARGET} ${TEST_TARGET}.c)
-    target_link_libraries(${TEST_TARGET} PRIVATE ggml)
-endif()
-
-#
-# test-vec2 (arm)
-if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm")
-    set(TEST_TARGET test-vec2)
-    add_executable(${TEST_TARGET} ${TEST_TARGET}.c)
-    target_link_libraries(${TEST_TARGET} PRIVATE ggml)
-endif()
-
-#
-# test-grad0
-
-set(TEST_TARGET test-grad0)
-add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml)
-add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
-set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw")
-
-#
-# test-opt
-
-set(TEST_TARGET test-opt)
-add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml)
-add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
-set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw")
-
-#
-# test-quantize-fns
-
-set(TEST_TARGET test-quantize-fns)
-add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml)
-add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
-set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw")
-
-#
-# test-quantize-perf
-
-set(TEST_TARGET test-quantize-perf)
-add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml)
-add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
-set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw")
-
-#
-# test-mul-mat0
-
-set(TEST_TARGET test-mul-mat0)
-add_executable(${TEST_TARGET} ${TEST_TARGET}.c)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml ${GGML_EXTRA_LIBS})
-if (MSVC)
-    target_link_options(${TEST_TARGET} PRIVATE "/STACK: 8388608") # 8MB
-endif()
-target_compile_options(${TEST_TARGET} PRIVATE ${GGML_EXTRA_FLAGS})
-add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
-set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw")
-
-#
-# test-mul-mat1 (arm)
-
-if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" AND NOT GGML_NO_ACCELERATE)
-    set(TEST_TARGET test-mul-mat1)
-    add_executable(${TEST_TARGET} ${TEST_TARGET}.c)
-    target_link_libraries(${TEST_TARGET} PRIVATE ggml ${GGML_EXTRA_LIBS})
-    target_compile_options(${TEST_TARGET} PRIVATE ${GGML_EXTRA_FLAGS})
-endif()
-
-#
-# test-blas0 (arm)
-
-if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" AND NOT GGML_NO_ACCELERATE)
-    set(TEST_TARGET test-blas0)
-    add_executable(${TEST_TARGET} ${TEST_TARGET}.c)
-    target_link_libraries(${TEST_TARGET} PRIVATE ggml ${GGML_EXTRA_LIBS})
-    target_compile_options(${TEST_TARGET} PRIVATE ${GGML_EXTRA_FLAGS})
-    add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> 128 128 128)
-    set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw")
-endif()
-
-#
-# test-mul-mat2
-
-set(TEST_TARGET test-mul-mat2)
-add_executable(${TEST_TARGET} ${TEST_TARGET}.c)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml)
-add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
-set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw")
-
-#
-# test0
-
-set(TEST_TARGET test0)
-add_executable(${TEST_TARGET} ${TEST_TARGET}.c)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml)
-add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
-set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw")
-
-#
-# test1
-
-set(TEST_TARGET test1)
-add_executable(${TEST_TARGET} ${TEST_TARGET}.c)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml)
-if (MSVC)
-    target_link_options(${TEST_TARGET} PRIVATE "/STACK: 8388608") # 8MB
-endif()
-add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
-set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw")
-
-#
-# test2
-
-set(TEST_TARGET test2)
-add_executable(${TEST_TARGET} ${TEST_TARGET}.c)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml)
-add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
-set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw")
-
-#
-# test3
-
-set(TEST_TARGET test3)
-add_executable(${TEST_TARGET} ${TEST_TARGET}.c)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml)
-add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
-set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw")
-
-#
-# test-pool
-
-set(TEST_TARGET test-pool)
-add_executable(${TEST_TARGET} ${TEST_TARGET}.c)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml)
-if (MSVC)
-    target_link_options(${TEST_TARGET} PRIVATE "/STACK: 8388608") # 8MB
-endif()
-add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
-set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw")
-
-#
-# test-conv-transpose
-
-set(TEST_TARGET test-conv-transpose)
-add_executable(${TEST_TARGET} ${TEST_TARGET}.c)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml)
-add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
-
-#
-# test-dup
-
-set(TEST_TARGET test-dup)
-add_executable(${TEST_TARGET} ${TEST_TARGET}.c)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml)
-add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
-
-#
-# test-rel-pos
-
-set(TEST_TARGET test-rel-pos)
-add_executable(${TEST_TARGET} ${TEST_TARGET}.c)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml)
-add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
-
-#
-# test-svd0 (arm/x86)
-
-if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" AND NOT GGML_NO_ACCELERATE)
-    set(TEST_TARGET test-svd0)
-    add_executable(${TEST_TARGET} ${TEST_TARGET}.c)
-    target_link_libraries(${TEST_TARGET} PRIVATE ggml ${GGML_EXTRA_LIBS})
-    target_compile_options(${TEST_TARGET} PRIVATE ${GGML_EXTRA_FLAGS})
-elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86" AND GGML_OPENBLAS)
-    set(TEST_TARGET test-svd0)
-    add_executable(${TEST_TARGET} ${TEST_TARGET}.c)
-    target_link_libraries(${TEST_TARGET} PRIVATE ggml ${GGML_EXTRA_LIBS})
-    target_compile_options(${TEST_TARGET} PRIVATE ${GGML_EXTRA_FLAGS})
-endif()
-
-#
-# test-customop
-
-set(TEST_TARGET test-customop)
-add_executable(${TEST_TARGET} ${TEST_TARGET}.c)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml)
-if (MSVC)
-    target_link_options(${TEST_TARGET} PRIVATE "/STACK: 8388608") # 8MB
-endif()
-add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
-set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw")
-
-#
-# test-xpos
-
-set(TEST_TARGET test-xpos)
-add_executable(${TEST_TARGET} ${TEST_TARGET}.c)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml)
-add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
-set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw")
-
-#
-# test-conv1d
-
-set(TEST_TARGET test-conv1d)
-add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml)
-add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
-set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw")
-
-
-#
-# test-conv2d
-
-set(TEST_TARGET test-conv2d)
-add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml)
-add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
-set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw")
-
-
-#
-# test-mul-mat
-
-set(TEST_TARGET test-mul-mat)
-add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml)
-add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
-set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw")
-
-
-#
-# test-backend-buffer
-
-set(TEST_TARGET test-backend-buffer)
-add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml)
-add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
-set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw")
-
-
-#
-# test-backend-ops
-
-set(TEST_TARGET test-backend-ops)
-add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml)
-add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
-set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw")
diff --git a/ggml/tests/test-backend-buffer.cpp b/ggml/tests/test-backend-buffer.cpp
deleted file mode 100644
index 0110144..0000000
--- a/ggml/tests/test-backend-buffer.cpp
+++ /dev/null
@@ -1,84 +0,0 @@
-#include <cstring>
-#include <ggml.h>
-#include <ggml-alloc.h>
-#include <ggml-backend.h>
-#include <ggml-backend-impl.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-
-static bool is_pow2(size_t x) {
-    return (x & (x - 1)) == 0;
-}
-
-static void test_buffer(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
-    GGML_ASSERT(ggml_backend_get_default_buffer_type(backend) == buft);
-
-    GGML_ASSERT(ggml_backend_buft_supports_backend(buft, backend));
-
-    //ggml_backend_buffer_t buffer = ggml_backend_alloc_buffer(backend, 1024);
-    ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, 1024);
-
-    GGML_ASSERT(buffer != NULL);
-
-    GGML_ASSERT(is_pow2(ggml_backend_buffer_get_alignment(buffer)));
-
-    GGML_ASSERT(ggml_backend_buffer_get_base(buffer) != NULL);
-
-    GGML_ASSERT(ggml_backend_buffer_get_size(buffer) >= 1024);
-
-    struct ggml_init_params params = {
-        /* .mem_size = */ 1024,
-        /* .mem_base = */ NULL,
-        /* .no_alloc = */ true,
-    };
-    struct ggml_context * ctx = ggml_init(params);
-
-    static const size_t n = 10;
-
-    struct ggml_tensor * tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n);
-
-    GGML_ASSERT(ggml_backend_buffer_get_alloc_size(buffer, tensor) >= n * sizeof(float));
-
-    ggml_tallocr_t allocr = ggml_tallocr_new_from_buffer(buffer);
-    ggml_tallocr_alloc(allocr, tensor);
-
-    GGML_ASSERT(tensor->data != NULL);
-
-    GGML_ASSERT(tensor->data >= ggml_backend_buffer_get_base(buffer));
-
-    float data[n];
-    for (size_t i = 0; i < n; i++) {
-        data[i] = (float) i;
-    }
-
-    ggml_backend_tensor_set(tensor, data, 0, sizeof(data));
-
-    float data2[n];
-    ggml_backend_tensor_get(tensor, data2, 0, sizeof(data2));
-
-    GGML_ASSERT(memcmp(data, data2, sizeof(data)) == 0);
-
-    ggml_tallocr_free(allocr);
-    ggml_backend_buffer_free(buffer);
-    ggml_free(ctx);
-}
-
-int main() {
-    // enumerate backends
-    printf("Testing %zu backends\n\n", ggml_backend_reg_get_count());
-
-    for (size_t i = 0; i < ggml_backend_reg_get_count(); i++) {
-        printf("Backend %zu/%zu (%s)\n", i + 1, ggml_backend_reg_get_count(), ggml_backend_reg_get_name(i));
-
-        ggml_backend_t backend = ggml_backend_reg_init_backend(i, NULL);
-        GGML_ASSERT(backend != NULL);
-        printf("  Backend name: %s\n", ggml_backend_name(backend));
-
-        test_buffer(backend, ggml_backend_reg_get_default_buffer_type(i));
-
-        ggml_backend_free(backend);
-
-        printf("  OK\n\n");
-    }
-}
diff --git a/ggml/tests/test-backend-ops.cpp b/ggml/tests/test-backend-ops.cpp
deleted file mode 100644
index d9b8b10..0000000
--- a/ggml/tests/test-backend-ops.cpp
+++ /dev/null
@@ -1,1757 +0,0 @@
-#include <ggml.h>
-#include <ggml-alloc.h>
-#include <ggml-backend.h>
-#include <ggml-backend-impl.h>
-#include <algorithm>
-#include <array>
-#include <cfloat>
-#include <cstring>
-#include <functional>
-#include <memory>
-#include <random>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string>
-#include <thread>
-#include <vector>
-
-static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float max = 1.0f) {
-    size_t size = ggml_nelements(tensor);
-    std::vector<float> data(size);
-
-#if 0
-    static std::default_random_engine generator(1234);
-    std::uniform_real_distribution<float> distribution(min, max);
-
-    for (size_t i = 0; i < size; i++) {
-        data[i] = distribution(generator);
-    }
-#else
-    auto init_thread = [&](size_t start, size_t end) {
-        std::random_device rd;
-        std::default_random_engine generator(rd());
-        std::uniform_real_distribution<float> distribution(min, max);
-
-        for (size_t i = start; i < end; i++) {
-            data[i] = distribution(generator);
-        }
-    };
-
-    size_t n_threads = std::thread::hardware_concurrency();
-    std::vector<std::thread> threads;
-    threads.reserve(n_threads);
-    for (size_t i = 0; i < n_threads; i++) {
-        size_t start =     i*size/n_threads;
-        size_t end   = (i+1)*size/n_threads;
-        threads.emplace_back(init_thread, start, end);
-    }
-    for (auto & t : threads) {
-        t.join();
-    }
-#endif
-
-    if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_I32) {
-        ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float));
-    } else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16) {
-        GGML_ASSERT(size % ggml_blck_size(tensor->type) == 0);
-        std::vector<uint8_t> dataq(ggml_row_size(tensor->type, size));
-        int64_t hist[16];
-        ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size, hist);
-        ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size());
-    } else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) {
-        // This is going to create some weird integers though.
-        ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor));
-    } else {
-        GGML_ASSERT(false);
-    }
-}
-
-static std::vector<float> tensor_to_float(const ggml_tensor * t) {
-    std::vector<float> tv;
-    tv.reserve(ggml_nelements(t));
-
-    std::vector<uint8_t> buf(ggml_nbytes(t));
-    ggml_backend_tensor_get(t, buf.data(), 0, ggml_nbytes(t));
-
-    ggml_type_traits_t tt = ggml_internal_get_type_traits(t->type);
-    size_t bs = ggml_blck_size(t->type);
-    std::vector<float> vq(ggml_blck_size(t->type));
-    bool quantized = ggml_is_quantized(t->type);
-
-    // access elements by index to avoid gaps in views
-    for (int64_t i3 = 0; i3 < t->ne[3]; i3++) {
-        for (int64_t i2 = 0; i2 < t->ne[2]; i2++) {
-            for (int64_t i1 = 0; i1 < t->ne[1]; i1++) {
-                for (int64_t i0 = 0; i0 < t->ne[0]; i0 += bs) {
-                    size_t i = i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0/bs*t->nb[0];
-                    if (t->type == GGML_TYPE_F16) {
-                        tv.push_back(ggml_fp16_to_fp32(*(ggml_fp16_t*)&buf[i]));
-                    } else if (t->type == GGML_TYPE_F32) {
-                        tv.push_back(*(float *) &buf[i]);
-                    } else if (t->type == GGML_TYPE_I32) {
-                        tv.push_back((float)*(int32_t *) &buf[i]);
-                    } else if (t->type == GGML_TYPE_I16) {
-                        tv.push_back((float)*(int16_t *) &buf[i]);
-                    } else if (t->type == GGML_TYPE_I8) {
-                        tv.push_back((float)*(int8_t *) &buf[i]);
-                    } else if (quantized) {
-                        std::vector<float> vq(ggml_blck_size(t->type));
-                        tt.to_float(&buf[i], vq.data(), ggml_blck_size(t->type));
-                        tv.insert(tv.end(), vq.begin(), vq.end());
-                    } else {
-                        GGML_ASSERT(false);
-                    }
-                }
-            }
-        }
-    }
-
-    return tv;
-}
-
-/*
-static double cosine_similarity(const float * v1, const float * v2, size_t n) {
-    double dot = 0.0;
-    double mag1 = 0.0;
-    double mag2 = 0.0;
-
-    for (size_t i = 0; i < n; i++) {
-        if (std::isnan(v1[i]) || std::isnan(v2[i])) {
-            return -1.0f;
-        }
-        if (std::isinf(v1[i]) && std::isinf(v2[i])) {
-            continue;
-        }
-        dot  += v1[i]*v2[i];
-        mag1 += v1[i]*v1[i];
-        mag2 += v2[i]*v2[i];
-    }
-
-    return dot/sqrt(mag1*mag2);
-}
-
-static float distance(const float * v1, const float * v2, size_t n) {
-    double d = 0.0;
-
-    for (size_t i = 0; i < n; i++) {
-        if (std::isnan(v1[i]) || std::isnan(v2[i])) {
-            return INFINITY;
-        }
-        if (std::isinf(v1[i]) && std::isinf(v2[i])) {
-            continue;
-        }
-        d += (v1[i] - v2[i])*(v1[i] - v2[i]);
-    }
-
-    return sqrt(d);
-}
-
-static float vec_len(const float * v, size_t n) {
-    double d = 0.0;
-
-    for (size_t i = 0; i < n; i++) {
-        if (std::isnan(v[i])) {
-            return INFINITY;
-        }
-        if (std::isinf(v[i])) {
-            continue;
-        }
-        d += v[i]*v[i];
-    }
-
-    return sqrt(d);
-}
-*/
-
-// normalized mean squared error = mse(a, b) / mse(a, 0)
-static double nmse(const float * a, const float * b, size_t n) {
-    double mse_a_b = 0.0;
-    double mse_a_0 = 0.0;
-
-    for (size_t i = 0; i < n; i++) {
-        float a_i = a[i];
-        float b_i = b[i];
-
-        mse_a_b += (a_i - b_i) * (a_i - b_i);
-        mse_a_0 += a_i * a_i;
-    }
-
-    return mse_a_b / mse_a_0;
-}
-
-// utils for printing the variables of the test cases
-#define VAR_TO_STR(x) (#x "=" + var_to_str(x))
-
-template<typename T>
-static std::string var_to_str(const T & x) {
-    return std::to_string(x);
-}
-
-template<typename T, size_t N>
-static std::string var_to_str(const T (&x)[N]) {
-    std::string s = "[";
-    for (size_t i = 0; i < N; i++) {
-        if (i > 0) {
-            s += ",";
-        }
-        s += var_to_str(x[i]);
-    }
-    s += "]";
-    return s;
-}
-
-template<typename T, size_t N>
-static std::string var_to_str(const std::array<T, N> & x) {
-    std::string s = "[";
-    for (size_t i = 0; i < N; i++) {
-        if (i > 0) {
-            s += ",";
-        }
-        s += var_to_str(x[i]);
-    }
-    s += "]";
-    return s;
-}
-
-//static std::string var_to_str(ggml_unary_op unary_op) {
-//    return ggml_unary_op_name(unary_op);
-//}
-
-static std::string var_to_str(ggml_type type) {
-    return ggml_type_name(type);
-}
-
-#define VARS_TO_STR1(a) VAR_TO_STR(a)
-#define VARS_TO_STR2(a, b) VAR_TO_STR(a) + "," + VAR_TO_STR(b)
-#define VARS_TO_STR3(a, b, c) VAR_TO_STR(a) + "," + VARS_TO_STR2(b, c)
-#define VARS_TO_STR4(a, b, c, d) VAR_TO_STR(a) + "," + VARS_TO_STR3(b, c, d)
-#define VARS_TO_STR5(a, b, c, d, e) VAR_TO_STR(a) + "," + VARS_TO_STR4(b, c, d, e)
-#define VARS_TO_STR6(a, b, c, d, e, f) VAR_TO_STR(a) + "," + VARS_TO_STR5(b, c, d, e, f)
-#define VARS_TO_STR7(a, b, c, d, e, f, g) VAR_TO_STR(a) + "," + VARS_TO_STR6(b, c, d, e, f, g)
-#define VARS_TO_STR8(a, b, c, d, e, f, g, h) VAR_TO_STR(a) + "," + VARS_TO_STR7(b, c, d, e, f, g, h)
-#define VARS_TO_STR9(a, b, c, d, e, f, g, h, i) VAR_TO_STR(a) + "," + VARS_TO_STR8(b, c, d, e, f, g, h, i)
-#define VARS_TO_STR10(a, b, c, d, e, f, g, h, i, j) VAR_TO_STR(a) + "," + VARS_TO_STR9(b, c, d, e, f, g, h, i, j)
-#define VARS_TO_STR11(a, b, c, d, e, f, g, h, i, j, k) VAR_TO_STR(a) + "," + VARS_TO_STR10(b, c, d, e, f, g, h, i, j, k)
-
-
-// accept FLT_MAX as infinity
-static bool isinf_or_max(float f) {
-    return std::isinf(f) || f == FLT_MAX || f == -FLT_MAX;
-}
-
-static bool ggml_is_view_op(enum ggml_op op) {
-    return op == GGML_OP_VIEW || op == GGML_OP_RESHAPE || op == GGML_OP_PERMUTE || op == GGML_OP_TRANSPOSE;
-}
-
-enum test_mode {
-    MODE_TEST,
-    MODE_PERF,
-};
-
-struct test_case {
-    virtual ~test_case() {}
-
-    virtual std::string op_desc(ggml_tensor * t) {
-        return ggml_op_desc(t);
-    }
-
-    virtual std::string vars() {
-        return "";
-    }
-
-    virtual ggml_tensor * build_graph(ggml_context * ctx) = 0;
-
-    virtual double max_nmse_err() {
-        return 1e-7;
-    }
-
-    virtual void initialize_tensors(ggml_context * ctx) {
-        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
-            init_tensor_uniform(t);
-        }
-    }
-
-    virtual size_t op_size(ggml_tensor * t) {
-        size_t size = ggml_nbytes(t);
-        // add source tensors
-        for (int i = 0; i < GGML_MAX_SRC; i++) {
-            if (t->src[i] != NULL) {
-                size += ggml_nbytes(t->src[i]);
-            }
-        }
-        return size;
-    }
-
-    ggml_cgraph * gf = nullptr;
-
-    static const int sentinel_size = 1024;
-
-    test_mode mode;
-
-    std::vector<ggml_tensor *> sentinels;
-
-    void add_sentinel(ggml_context * ctx) {
-        if (mode == MODE_PERF) {
-            return;
-        }
-        ggml_tensor * sentinel = ::ggml_new_tensor_1d(ctx, GGML_TYPE_F32, sentinel_size);
-        ggml_format_name(sentinel, "sent_%zu", sentinels.size());
-        sentinels.push_back(sentinel);
-    }
-
-    // hijack ggml_new_tensor to add sentinels after each tensor to check for overflows in the backend
-
-    ggml_tensor * ggml_new_tensor(ggml_context * ctx, ggml_type type, int n_dims, const int64_t * ne) {
-        ggml_tensor * t = ::ggml_new_tensor(ctx, type, n_dims, ne);
-        add_sentinel(ctx);
-        return t;
-    }
-
-    ggml_tensor * ggml_new_tensor_1d(ggml_context * ctx, ggml_type type, int64_t ne0) {
-        ggml_tensor * t = ::ggml_new_tensor_1d(ctx, type, ne0);
-        add_sentinel(ctx);
-        return t;
-    }
-
-    ggml_tensor * ggml_new_tensor_2d(ggml_context * ctx, ggml_type type, int64_t ne0, int64_t ne1) {
-        ggml_tensor * t = ::ggml_new_tensor_2d(ctx, type, ne0, ne1);
-        add_sentinel(ctx);
-        return t;
-    }
-
-    ggml_tensor * ggml_new_tensor_3d(ggml_context * ctx, ggml_type type, int64_t ne0, int64_t ne1, int64_t ne2) {
-        ggml_tensor * t = ::ggml_new_tensor_3d(ctx, type, ne0, ne1, ne2);
-        add_sentinel(ctx);
-        return t;
-    }
-
-    ggml_tensor * ggml_new_tensor_4d(ggml_context * ctx, ggml_type type, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
-        ggml_tensor * t = ::ggml_new_tensor_4d(ctx, type, ne0, ne1, ne2, ne3);
-        add_sentinel(ctx);
-        return t;
-    }
-
-    bool eval(ggml_backend_t backend1, ggml_backend_t backend2, const char * op_name) {
-        mode = MODE_TEST;
-
-        ggml_init_params params = {
-            /* .mem_size = */ ggml_tensor_overhead()*128 + ggml_graph_overhead(),
-            /* .mem_base = */ NULL,
-            /* .no_alloc = */ true,
-        };
-        ggml_context * ctx = ggml_init(params);
-
-        gf = ggml_new_graph(ctx);
-
-        // pre-graph sentinel
-        add_sentinel(ctx);
-
-        ggml_tensor * out = build_graph(ctx);
-
-        if (op_name != nullptr && op_desc(out) != op_name) {
-            //printf("  %s: skipping\n", op_desc(out).c_str());
-            ggml_free(ctx);
-            return true;
-        }
-
-        printf("  %s(%s): ", op_desc(out).c_str(), vars().c_str());
-        fflush(stdout);
-
-        // check if backends support op
-        bool supported = true;
-        for (ggml_backend_t backend : {backend1, backend2}) {
-            if (!ggml_backend_supports_op(backend, out)) {
-                printf("not supported [%s] ", ggml_backend_name(backend));
-                supported = false;
-            }
-        }
-        if (!supported) {
-            printf("\n");
-            ggml_free(ctx);
-            return true;
-        }
-
-        // post-graph sentinel
-        add_sentinel(ctx);
-
-        // allocate
-        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend1);
-        if (buf == NULL) {
-            printf("failed to allocate tensors [%s] ", ggml_backend_name(backend1));
-            ggml_free(ctx);
-            return false;
-        }
-
-        // build graph
-        ggml_build_forward_expand(gf, out);
-
-        // add sentinels as graph nodes so that they are checked in the callback
-        for (ggml_tensor * sentinel : sentinels) {
-            gf->nodes[gf->n_nodes++] = sentinel;
-        }
-
-        // randomize tensors
-        initialize_tensors(ctx);
-
-        // compare
-        struct callback_userdata {
-            bool   ok;
-            double max_err;
-            ggml_backend_t backend1;
-            ggml_backend_t backend2;
-        };
-
-        callback_userdata ud {
-            true,
-            max_nmse_err(),
-            backend1,
-            backend2
-        };
-
-        auto callback = [](int index, ggml_tensor * t1, ggml_tensor * t2, void * user_data) -> bool {
-            callback_userdata * ud = (callback_userdata *) user_data;
-            const char * bn1 = ggml_backend_name(ud->backend1);
-            const char * bn2 = ggml_backend_name(ud->backend2);
-
-            if (t1->op == GGML_OP_NONE) {
-                // sentinels must be unchanged
-                std::vector<uint8_t> t1_data(ggml_nbytes(t1));
-                std::vector<uint8_t> t2_data(ggml_nbytes(t2));
-                ggml_backend_tensor_get(t1, t1_data.data(), 0, ggml_nbytes(t1));
-                ggml_backend_tensor_get(t2, t2_data.data(), 0, ggml_nbytes(t2));
-
-                if (memcmp(t1_data.data(), t2_data.data(), ggml_nbytes(t1)) != 0) {
-                    printf("sentinel mismatch: %s ", t1->name);
-                    ud->ok = false;
-                    return true;
-                }
-            }
-
-            std::vector<float> f1 = tensor_to_float(t1);
-            std::vector<float> f2 = tensor_to_float(t2);
-
-            for (size_t i = 0; i < f1.size(); i++) {
-                // check for nans
-                if (std::isnan(f1[i]) || std::isnan(f2[i])) {
-                    printf("[%s] NaN at index %zu (%s=%f %s=%f) ", ggml_op_desc(t1), i, bn1, f1[i], bn2, f2[i]);
-                    ud->ok = false;
-                    return true;
-                }
-                // check for infs: both must be inf of the same sign, or both must be finite
-                if (isinf_or_max(f1[i]) || isinf_or_max(f2[i])) {
-                    if (isinf_or_max(f1[i]) && isinf_or_max(f2[i])) {
-                        if (std::signbit(f1[i]) != std::signbit(f2[i])) {
-                            printf("[%s] inf sign mismatch: %s=%f %s=%f ", ggml_op_desc(t1), bn1, f1[i], bn2, f2[i]);
-                            ud->ok = false;
-                            return true;
-                        }
-                    } else {
-                        printf("[%s] inf mismatch: %s=%f %s=%f ", ggml_op_desc(t1), bn1, f1[i], bn2, f2[i]);
-                        ud->ok = false;
-                        return true;
-                    }
-                }
-            }
-
-            double err = nmse(f1.data(), f2.data(), f1.size());
-            if (err > ud->max_err) {
-                printf("[%s] NMSE = %.9f > %.9f ", ggml_op_desc(t1), err, ud->max_err);
-                //for (int i = 0; i < (int) f1.size(); i++) {
-                //    printf("%5d %9.6f %9.6f, diff = %9.6f\n", i, f1[i], f2[i], f1[i] - f2[i]);
-                //}
-                //printf("\n");
-                //exit(1);
-                ud->ok = false;
-            }
-            return true;
-
-            GGML_UNUSED(index);
-        };
-
-        const bool cmp_ok = ggml_backend_compare_graph_backend(backend1, backend2, gf, callback, &ud);
-
-        if (!cmp_ok) {
-            printf("compare failed ");
-        }
-
-        ggml_backend_buffer_free(buf);
-
-        ggml_free(ctx);
-
-        if (ud.ok && cmp_ok) {
-            printf("\033[1;32mOK\033[0m\n");
-            return true;
-        }
-
-        printf("\033[1;31mFAIL\033[0m\n");
-        return false;
-    }
-
-    bool eval_perf(ggml_backend_t backend, const char * op_name) {
-        mode = MODE_PERF;
-
-        static const size_t graph_nodes = 8192;
-
-        ggml_init_params params = {
-            /* .mem_size = */ ggml_tensor_overhead()*128 + ggml_graph_overhead_custom(graph_nodes, false),
-            /* .mem_base = */ NULL,
-            /* .no_alloc = */ true,
-        };
-        ggml_context * ctx = ggml_init(params);
-
-        ggml_tensor * out = build_graph(ctx);
-
-        if (op_name != nullptr && op_desc(out) != op_name) {
-            //printf("  %s: skipping\n", op_desc(out).c_str());
-            ggml_free(ctx);
-            return true;
-        }
-
-        int len = printf("  %s(%s): ", op_desc(out).c_str(), vars().c_str());
-        fflush(stdout);
-
-        // check if backends support op
-        if (!ggml_backend_supports_op(backend, out)) {
-            printf("not supported\n");
-            ggml_free(ctx);
-            return true;
-        }
-
-        // align while also leaving some margin for variations in parameters
-        int align = 20;
-        int last = (len + align - 1) / align * align;
-        if (last - len < 5) {
-            last += align;
-        }
-        last = std::max(last, 60);
-        printf("%*s", last - len, "");
-
-        // allocate
-        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);
-        if (buf == NULL) {
-            printf("failed to allocate tensors\n");
-            ggml_free(ctx);
-            return false;
-        }
-
-        // randomize tensors
-        initialize_tensors(ctx);
-
-        // build graph
-        ggml_cgraph * gf = ggml_new_graph_custom(ctx, graph_nodes, false);
-        ggml_build_forward_expand(gf, out);
-
-        // warmup run
-        ggml_backend_graph_compute(backend, gf);
-
-        // duplicate the op
-        size_t target_size = ggml_backend_is_cpu(backend) ? 1ULL << 33 : 1ULL << 35; // 8 GB CPU, 32 GB GPU
-        int n_runs = std::min((size_t)gf->size - gf->n_nodes, target_size / op_size(out)) + 1;
-        for (int i = 1; i < n_runs; i++) {
-            gf->nodes[gf->n_nodes++] = out;
-        }
-
-        // calculate memory
-        size_t mem = n_runs * op_size(out);
-        auto tensor_op_size = [](ggml_tensor * t) {
-            size_t size = ggml_nbytes(t);
-            // add source tensors
-            for (int i = 0; i < GGML_MAX_SRC; i++) {
-                if (t->src[i] != NULL) {
-                    size += ggml_nbytes(t->src[i]);
-                }
-            }
-            return size;
-        };
-        for (int i = 0; i < gf->n_nodes; i++) {
-            if (ggml_is_view_op(gf->nodes[i]->op) || gf->nodes[i] == out) {
-                continue;
-            }
-            mem += tensor_op_size(gf->nodes[i]);
-        }
-
-        // run
-        ggml_backend_synchronize(backend);
-
-        int64_t start_time = ggml_time_us();
-        ggml_backend_graph_compute(backend, gf);
-        ggml_backend_synchronize(backend);
-        int64_t end_time = ggml_time_us();
-        double time_us = end_time - start_time;
-
-        printf("    %5d runs - %8.2f us/run - %8zu kB/run - \033[1;34m%7.2f GB/s\033[0m\n",
-            n_runs,
-            time_us / n_runs,
-            op_size(out) / 1024,
-            mem / (time_us/1e6) / 1024.0 / 1024.0 / 1024.0);
-
-        ggml_backend_buffer_free(buf);
-
-        ggml_free(ctx);
-
-        return true;
-    }
-};
-
-// GGML_OP_UNARY
-struct test_unary : public test_case {
-    const ggml_unary_op op;
-    const ggml_type type;
-    const std::array<int64_t, 4> ne;
-
-    std::string vars() override {
-        return VARS_TO_STR2(type, ne);
-    }
-
-    test_unary(ggml_unary_op op,
-            ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {128, 10, 10, 10})
-        : op(op), type(type), ne(ne) {}
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * in = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_tensor * out = ggml_unary(ctx, in, op);
-        return out;
-    }
-};
-
-// GGML_OP_GET_ROWS
-struct test_get_rows : public test_case {
-    const ggml_type type;
-    const int n; // cols
-    const int m; // rows
-    const int r; // rows to get
-    const int b; // batch size
-    const bool v; // view (non-contiguous src1)
-
-    std::string vars() override {
-        return VARS_TO_STR6(type, n, m, r, b, v);
-    }
-
-    test_get_rows(ggml_type type = GGML_TYPE_F32, int n = 10, int m = 5, int r = 3, int b = 1, bool v = false)
-        : type(type), n(n), m(m), r(r), b(b), v(v) {}
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * in = ggml_new_tensor_3d(ctx, type, n, m, b);
-        ggml_tensor * rows = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, r, b);
-        if (v) {
-            rows = ggml_view_2d(ctx, rows, r/2, b, rows->nb[1], 0);
-        }
-        ggml_tensor * out = ggml_get_rows(ctx, in, rows);
-        return out;
-    }
-
-    void initialize_tensors(ggml_context * ctx) override {
-        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
-            if (t->type == GGML_TYPE_I32) {
-                if (ggml_is_view_op(t->op)) { continue; }
-                // rows
-                std::vector<int> data(r*b);
-                for (int i = 0; i < r*b; i++) {
-                    data[i] = rand() % m;
-                }
-                ggml_backend_tensor_set(t, data.data(), 0, r * b * sizeof(int));
-            } else {
-                init_tensor_uniform(t);
-            }
-        }
-    }
-};
-
-// GGML_OP_REPEAT
-struct test_repeat : public test_case {
-    const ggml_type type;
-    const std::array<int64_t, 4> ne;
-    const std::array<int, 4> nr;
-
-    std::string vars() override {
-        return VARS_TO_STR3(type, ne, nr);
-    }
-
-    size_t op_size(ggml_tensor * t) override {
-        return ggml_nbytes(t) * 2;
-    }
-
-    test_repeat(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {10, 10, 10, 10},
-            std::array<int, 4> nr = {2, 2, 2, 2})
-        : type(type), ne(ne), nr(nr) {}
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * target = ggml_new_tensor_4d(ctx, type, ne[0]*nr[0], ne[1]*nr[1], ne[2]*nr[2], ne[3]*nr[3]);
-        ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_tensor * out = ggml_repeat(ctx, src, target);
-        return out;
-    }
-};
-
-// GGML_OP_DUP
-struct test_dup : public test_case {
-    const ggml_type type;
-    const std::array<int64_t, 4> ne;
-    const std::array<int64_t, 4> permute;
-    bool _use_permute;
-
-    std::string vars() override {
-        std::string v = VARS_TO_STR2(type, ne);
-        if (_use_permute) v += "," + VAR_TO_STR(permute);
-        return v;
-    }
-
-    test_dup(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {10, 10, 10, 1},
-            std::array<int64_t, 4> permute = {0, 0, 0, 0})
-        : type(type), ne(ne), permute(permute),
-            _use_permute(permute[0] + permute[1] + permute[2] + permute[3] > 0) {}
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data());
-        if (_use_permute) {
-            src = ggml_permute(ctx, src, permute[0], permute[1], permute[2], permute[3]);
-        }
-        ggml_tensor * out = ggml_dup(ctx, src);
-        return out;
-    }
-};
-
-// GGML_OP_CPY
-struct test_cpy : public test_case {
-    const ggml_type type_src;
-    const ggml_type type_dst;
-    const std::array<int64_t, 4> ne;
-
-    std::string vars() override {
-        return VARS_TO_STR3(type_src, type_dst, ne);
-    }
-
-    size_t op_size(ggml_tensor * t) override {
-        return ggml_nbytes(t) + ggml_nbytes(t->src[0]);
-    }
-
-    test_cpy(ggml_type type_src = GGML_TYPE_F32, ggml_type type_dst = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {10, 10, 10, 1})
-        : type_src(type_src), type_dst(type_dst), ne(ne) {}
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne.data());
-        ggml_tensor * dst = ggml_new_tensor(ctx, type_dst, 4, ne.data());
-        ggml_tensor * out = ggml_cpy(ctx, src, dst);
-        return out;
-    }
-};
-
-// GGML_OP_CONT
-struct test_cont : public test_case {
-    const ggml_type type;
-    const std::array<int64_t, 4> ne;
-
-    std::string vars() override {
-        return VARS_TO_STR2(type, ne);
-    }
-
-    test_cont(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {10, 10, 10, 1})
-        : type(type), ne(ne) {}
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data());
-        src = ggml_transpose(ctx, src);
-        ggml_tensor * out = ggml_cont(ctx, src);
-
-        return out;
-    }
-};
-
-// GGML_OP_ADD
-// GGML_OP_MUL
-// GGML_OP_DIV
-struct test_bin_bcast : public test_case {
-    using op_t = ggml_tensor * (*) (ggml_context *, ggml_tensor *, ggml_tensor *);
-    op_t op;
-    const ggml_type type;
-    const std::array<int64_t, 4> ne;
-    const std::array<int, 4> nr;
-
-    std::string vars() override {
-        return VARS_TO_STR3(type, ne, nr);
-    }
-
-    size_t op_size(ggml_tensor * t) override {
-        return ggml_nbytes(t) * 3;
-    }
-
-    test_bin_bcast(op_t op, ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {10, 10, 1, 1},
-            std::array<int, 4> nr = {1, 2, 1, 1})
-        : op(op), type(type), ne(ne), nr(nr) {}
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * a = ggml_new_tensor_4d(ctx, type, ne[0]*nr[0], ne[1]*nr[1], ne[2]*nr[2], ne[3]*nr[3]);
-        ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_tensor * out = op(ctx, a, b);
-        return out;
-    }
-
-    void initialize_tensors(ggml_context * ctx) override {
-        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
-            if (op == ggml_div) {
-                // avoid division by zero
-                init_tensor_uniform(t, 1.0f, 2.0f);
-            } else {
-                init_tensor_uniform(t);
-            }
-        }
-    }
-};
-
-// GGML_OP_SCALE
-struct test_scale : public test_case {
-    const ggml_type type;
-    const std::array<int64_t, 4> ne;
-    float scale;
-
-    std::string vars() override {
-        return VARS_TO_STR3(type, ne, scale);
-    }
-
-    test_scale(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {10, 10, 10, 10},
-            float scale = 2.0f)
-        : type(type), ne(ne), scale(scale) {}
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_tensor * out = ggml_scale(ctx, a, scale);
-        return out;
-    }
-};
-
-// GGML_OP_NORM
-struct test_norm : public test_case {
-    const ggml_type type;
-    const std::array<int64_t, 4> ne;
-    float eps;
-
-    std::string vars() override {
-        return VARS_TO_STR3(type, ne, eps);
-    }
-
-    test_norm(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {64, 10, 10, 10},
-            float eps = 1e-6f)
-        : type(type), ne(ne), eps(eps) {}
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_tensor * out = ggml_norm(ctx, a, eps);
-        return out;
-    }
-};
-
-// GGML_OP_RMS_NORM
-struct test_rms_norm : public test_case {
-    const ggml_type type;
-    const std::array<int64_t, 4> ne;
-    float eps;
-
-    std::string vars() override {
-        return VARS_TO_STR3(type, ne, eps);
-    }
-
-    test_rms_norm(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {64, 10, 10, 10},
-            float eps = 1e-6f)
-        : type(type), ne(ne), eps(eps) {}
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_tensor * out = ggml_rms_norm(ctx, a, eps);
-        return out;
-    }
-};
-
-// GGML_OP_MUL_MAT
-struct test_mul_mat : public test_case {
-    const ggml_type type_a;
-    const ggml_type type_b;
-    const int64_t m;
-    const int64_t n;
-    const int64_t k;
-    const std::array<int64_t, 2> bs; // dims 3 and 4
-    const std::array<int64_t, 2> nr; // repeat in dims 3 and 4
-
-    std::string vars() override {
-        return VARS_TO_STR7(type_a, type_b, m, n, k, bs, nr);
-    }
-
-    double max_nmse_err() override {
-        return 5e-4;
-    }
-
-    size_t op_size(ggml_tensor * t) override {
-        size_t a = ggml_nbytes(t->src[0]) * n * nr[0] * nr[1];
-        size_t b = ggml_nbytes(t->src[1]) * m;
-        size_t c  = ggml_nbytes(t);
-        return a + b + c;
-
-        GGML_UNUSED(t);
-    }
-
-    test_mul_mat(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32,
-            int64_t m = 32, int64_t n = 32, int64_t k = 32,
-            std::array<int64_t, 2> bs = {10, 10},
-            std::array<int64_t, 2> nr = {2, 2})
-        : type_a(type_a), type_b(type_b), m(m), n(n), k(k), bs(bs), nr(nr) {}
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        // C^T = A * B^T: (k, m) * (k, n) => (m, n)
-        ggml_tensor * a = ggml_new_tensor_4d(ctx, type_a, k, m, bs[0]      , bs[1]);
-        ggml_tensor * b = ggml_new_tensor_4d(ctx, type_b, k, n, bs[0]*nr[0], bs[1]*nr[1]);
-        ggml_tensor * out = ggml_mul_mat(ctx, a, b);
-        return out;
-    }
-};
-
-// GGML_OP_MUL_MAT_ID
-struct test_mul_mat_id : public test_case {
-    const ggml_type type_a;
-    const ggml_type type_b;
-    const int n_mats;
-    const int id;
-    const int64_t m;
-    const int64_t n;
-    const int64_t k;
-    const bool v; // view (non-contiguous ids)
-
-    std::string vars() override {
-        return VARS_TO_STR8(type_a, type_b, n_mats, id, m, n, k, v);
-    }
-
-    double max_nmse_err() override {
-        return 5e-4;
-    }
-
-    size_t op_size(ggml_tensor * t) override {
-        size_t a = ggml_nbytes(t->src[2]) * n;
-        size_t b = ggml_nbytes(t->src[1]) * m;
-        size_t c  = ggml_nbytes(t);
-        return a + b + c;
-
-        GGML_UNUSED(t);
-    }
-
-    test_mul_mat_id(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32,
-            int n_mats = 2, int id = 0,
-            int64_t m = 32, int64_t n = 32, int64_t k = 32, bool v = false)
-        : type_a(type_a), type_b(type_b), n_mats(n_mats), id(id),
-            m(m), n(n), k(k), v(v) {}
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        // C^T = A * B^T: (k, m) * (k, n) => (m, n)
-        std::vector<ggml_tensor *> mats;
-        for (int i = 0; i < n_mats; i++) {
-            ggml_tensor * a = ggml_new_tensor_2d(ctx, type_a, k, m);
-            mats.push_back(a);
-        }
-        ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_mats, n);
-        if (v) {
-            ids = ggml_view_2d(ctx, ids, n_mats/2, ids->ne[1], ids->nb[1], 0);
-        }
-        ggml_tensor * b = ggml_new_tensor_2d(ctx, type_b, k, n);
-        ggml_tensor * out = ggml_mul_mat_id(ctx, mats.data(), n_mats, ids, v ? id/2 : id, b);
-        return out;
-    }
-
-    void initialize_tensors(ggml_context * ctx) override {
-        std::random_device rd;
-        std::default_random_engine rng(rd());
-        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
-            if (t->type == GGML_TYPE_I32) {
-                if (ggml_is_view_op(t->op)) { continue; }
-                // ids
-                for (int64_t r = 0; r < ggml_nrows(t); r++) {
-                    std::vector<int32_t> data(t->ne[0]);
-                    for (int i = 0; i < t->ne[0]; i++) {
-                        data[i] = i % n_mats;
-                    }
-                    std::shuffle(data.begin(), data.end(), rng);
-                    ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(int32_t));
-                }
-            } else {
-                init_tensor_uniform(t);
-            }
-        }
-    }
-};
-
-// GGML_OP_SQR
-struct test_sqr : public test_case {
-    const ggml_type type;
-    const std::array<int64_t, 4> ne;
-
-    std::string vars() override {
-        return VARS_TO_STR2(type, ne);
-    }
-
-    test_sqr(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {10, 10, 10, 10})
-        : type(type), ne(ne) {}
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_tensor * out = ggml_sqr(ctx, a);
-        return out;
-    }
-};
-
-// GGML_OP_CLAMP
-struct test_clamp : public test_case {
-    const ggml_type type;
-    const std::array<int64_t, 4> ne;
-    float min;
-    float max;
-
-    std::string vars() override {
-        return VARS_TO_STR4(type, ne, min, max);
-    }
-
-    test_clamp(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {10, 10, 10, 10},
-            float min = -0.5f, float max = 0.5f)
-        : type(type), ne(ne), min(min), max(max) {}
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_tensor * out = ggml_clamp(ctx, a, min, max);
-        return out;
-    }
-};
-
-// GGML_OP_DIAG_MASK_INF
-struct test_diag_mask_inf : public test_case {
-    const ggml_type type;
-    const std::array<int64_t, 4> ne;
-    const int n_past;
-
-    std::string vars() override {
-        return VARS_TO_STR3(type, ne, n_past);
-    }
-
-    test_diag_mask_inf(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {10, 10, 10, 10},
-            int n_past = 5)
-        : type(type), ne(ne), n_past(n_past) {}
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_tensor * out = ggml_diag_mask_inf(ctx, a, n_past);
-        return out;
-    }
-};
-
-// GGML_OP_SOFT_MAX
-struct test_soft_max : public test_case {
-    const ggml_type type;
-    const std::array<int64_t, 4> ne;
-
-    std::string vars() override {
-        return VARS_TO_STR2(type, ne);
-    }
-
-    test_soft_max(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {10, 10, 10, 10})
-        : type(type), ne(ne) {}
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_tensor * out = ggml_soft_max(ctx, a);
-        return out;
-    }
-};
-
-// GGML_OP_ROPE
-struct test_rope : public test_case {
-    const ggml_type type;
-    const std::array<int64_t, 4> ne;
-    int n_dims;
-    int mode;
-    int n_ctx;
-
-    std::string vars() override {
-        return VARS_TO_STR5(type, ne, n_dims, mode, n_ctx);
-    }
-
-    test_rope(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {10, 10, 10, 1},
-            int n_dims = 10, int mode = 0, int n_ctx = 512)
-        : type(type), ne(ne), n_dims(n_dims), mode(mode), n_ctx(n_ctx) {}
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne[2]);
-        ggml_tensor * out = ggml_rope(ctx, a, pos, n_dims, mode, n_ctx);
-        return out;
-    }
-
-    void initialize_tensors(ggml_context * ctx) override {
-        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
-            if (t->type == GGML_TYPE_I32) {
-                // pos
-                std::vector<int> data(ne[2]);
-                for (int i = 0; i < ne[2]; i++) {
-                    data[i] = rand() % n_ctx;
-                }
-                ggml_backend_tensor_set(t, data.data(), 0, ne[2] * sizeof(int));
-            } else {
-                init_tensor_uniform(t);
-            }
-        }
-    }
-};
-
-// GGML_OP_ALIBI
-struct test_alibi : public test_case {
-    const ggml_type type;
-    const std::array<int64_t, 4> ne;
-    int n_past;
-    int n_head;
-    float bias_max;
-
-    std::string vars() override {
-        return VARS_TO_STR5(type, ne, n_past, n_head, bias_max);
-    }
-
-    test_alibi(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {10, 10, 10, 10},
-            int n_past = 512, int n_head = 10, float bias_max = 0.5f)
-        : type(type), ne(ne), n_past(n_past), n_head(n_head), bias_max(bias_max) {}
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_tensor * out = ggml_alibi(ctx, a, n_past, n_head, bias_max);
-        return out;
-    }
-};
-
-// GGML_OP_IM2COL
-struct test_im2col : public test_case {
-    const ggml_type type_input;
-    const ggml_type type_kernel;
-    const std::array<int64_t, 4> ne_input;
-    const std::array<int64_t, 4> ne_kernel;
-    // stride
-    const int s0;
-    const int s1;
-    // padding
-    const int p0;
-    const int p1;
-    // dilatation
-    const int d0;
-    const int d1;
-    // mode
-    const bool is_2D;
-
-    std::string vars() override {
-        return VARS_TO_STR11(type_input, type_kernel, ne_input, ne_kernel, s0, s1, p0, p1, d0, d1, is_2D);
-    }
-
-    test_im2col(ggml_type type_input = GGML_TYPE_F32, ggml_type type_kernel = GGML_TYPE_F16,
-            std::array<int64_t, 4> ne_input = {10, 10, 3, 1}, // [input_width, input_height, input_channels, 1]
-            std::array<int64_t, 4> ne_kernel = {3, 3, 3, 1}, // [kernel_width, kernel_height, input_channels, 1]
-            int s0 = 1, int s1 = 1,
-            int p0 = 1, int p1 = 1,
-            int d0 = 1, int d1 = 1,
-            bool is_2D = true)
-        : type_input(type_input), type_kernel(type_kernel), ne_input(ne_input), ne_kernel(ne_kernel), s0(s0), s1(s1), p0(p0), p1(p1), d0(d0), d1(d1), is_2D(is_2D) {}
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * input = ggml_new_tensor(ctx, type_input, 4, ne_input.data());
-        ggml_tensor * kernel = ggml_new_tensor(ctx, type_kernel, 4, ne_kernel.data());
-        ggml_tensor * out = ggml_im2col(ctx, kernel, input, s0, s1, p0, p1, d0, d1, is_2D);
-        return out;
-    }
-};
-
-// GGML_OP_CONCAT
-struct test_concat : public test_case {
-    const ggml_type type;
-    const std::array<int64_t, 4> ne;
-    const int64_t b_ne2;
-
-    std::string vars() override {
-        return VARS_TO_STR3(type, ne, b_ne2);
-    }
-
-    test_concat(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {10, 10, 10, 10},
-            int64_t b_ne2 = 10)
-        : type(type), ne(ne), b_ne2(b_ne2) {}
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_tensor * b = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], b_ne2, ne[3]);
-        ggml_tensor * out = ggml_concat(ctx, a, b);
-        return out;
-    }
-};
-
-// GGML_OP_ARGSORT
-struct test_argsort : public test_case {
-    const ggml_type type;
-    const std::array<int64_t, 4> ne;
-    ggml_sort_order order;
-
-    std::string vars() override {
-        return VARS_TO_STR3(type, ne, order);
-    }
-
-    test_argsort(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {16, 10, 10, 10},
-            ggml_sort_order order = GGML_SORT_ASC)
-        : type(type), ne(ne), order(order) {}
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_tensor * out = ggml_argsort(ctx, a, order);
-        return out;
-    }
-
-    void initialize_tensors(ggml_context * ctx) override {
-        std::random_device rd;
-        std::default_random_engine rng(rd());
-        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
-            if (t->type == GGML_TYPE_I32) {
-                // indices
-                std::vector<int> data(ggml_nelements(t));
-                for (int i = 0; i < ggml_nelements(t); i++) {
-                    data[i] = rand();
-                }
-                std::shuffle(data.begin(), data.end(), rng);
-                ggml_backend_tensor_set(t, data.data(), 0, ne[0]*ne[1]*ne[2]*ne[3] * sizeof(int));
-            } else if (t->type == GGML_TYPE_F32) {
-                // initialize with unique values to avoid ties
-                for (int64_t r = 0; r < ggml_nrows(t); r++) {
-                    std::vector<float> data(t->ne[0]);
-                    for (int i = 0; i < t->ne[0]; i++) {
-                        data[i] = i;
-                    }
-                    std::shuffle(data.begin(), data.end(), rng);
-                    ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(float));
-                }
-            } else {
-                GGML_ASSERT(false);
-            }
-        }
-    }
-};
-
-// GGML_OP_SUM_ROWS
-struct test_sum_rows : public test_case {
-    const ggml_type type;
-    const std::array<int64_t, 4> ne;
-
-    std::string vars() override {
-        return VARS_TO_STR2(type, ne);
-    }
-
-    test_sum_rows(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {10, 10, 10, 10})
-        : type(type), ne(ne) {}
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_tensor * out = ggml_sum_rows(ctx, a);
-        return out;
-    }
-};
-
-// GGML_OP_UPSCALE
-struct test_upscale : public test_case {
-    const ggml_type type;
-    const std::array<int64_t, 4> ne;
-    const int32_t scale_factor;
-
-    std::string vars() override {
-        return VARS_TO_STR3(type, ne, scale_factor);
-    }
-
-    test_upscale(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {512, 512, 3, 1},
-            int32_t scale_factor = 2)
-        : type(type), ne(ne), scale_factor(scale_factor) {}
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_tensor * out = ggml_upscale(ctx, a, scale_factor);
-        return out;
-    }
-};
-
-// GGML_OP_GROUP_NORM
-struct test_group_norm : public test_case {
-    const ggml_type type;
-    const std::array<int64_t, 4> ne;
-    const int32_t num_groups;
-
-    std::string vars() override {
-        return VARS_TO_STR3(type, ne, num_groups);
-    }
-
-    test_group_norm(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {64, 64, 320, 1},
-            int32_t num_groups = 32)
-        : type(type), ne(ne), num_groups(num_groups) {}
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_tensor * out = ggml_group_norm(ctx, a, num_groups);
-        return out;
-    }
-};
-
-// GGML_OP_ACC
-struct test_acc : public test_case {
-    const ggml_type type;
-    const std::array<int64_t, 4> ne_a;
-    const std::array<int64_t, 4> ne_b;
-
-    std::string vars() override {
-        return VARS_TO_STR3(type, ne_a, ne_b);
-    }
-
-    test_acc(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne_a = {1024, 577, 1, 1},
-            std::array<int64_t, 4> ne_b = {1024, 576, 1, 1})
-        : type(type), ne_a(ne_a), ne_b(ne_b) {}
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
-        ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne_b.data());
-        ggml_tensor * out = ggml_acc(ctx, a, b, a->nb[1], a->nb[2], a->nb[3], b->nb[1]);
-        return out;
-    }
-};
-
-// GGML_OP_PAD
-struct test_pad : public test_case {
-    const ggml_type type;
-    const std::array<int64_t, 4> ne_a;
-    const int pad_0;
-    const int pad_1;
-
-    std::string vars() override {
-        return VARS_TO_STR4(type, ne_a, pad_0, pad_1);
-    }
-
-    test_pad(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne_a = {512, 512, 1, 1},
-            int pad_0 = 1, int pad_1 = 1)
-        : type(type), ne_a(ne_a), pad_0(pad_0), pad_1(pad_1)  {}
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
-        ggml_tensor * out = ggml_pad(ctx, a, pad_0, pad_1, 0, 0);
-        return out;
-    }
-};
-
-// GGML_OP_LEAKY_RELU
-struct test_leaky_relu : public test_case {
-    const ggml_type type;
-    const std::array<int64_t, 4> ne_a;
-    const float negative_slope;
-
-    std::string vars() override {
-        return VARS_TO_STR3(type, ne_a, negative_slope);
-    }
-
-    test_leaky_relu(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne_a = {10, 10, 10, 10},
-            float negative_slope = 0.1f)
-        : type(type), ne_a(ne_a), negative_slope(negative_slope)  {}
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
-        ggml_tensor * out = ggml_leaky_relu(ctx, a, negative_slope, true);
-        return out;
-    }
-};
-
-// Mixtral MOE
-struct test_moe : public test_case {
-    const int n_experts;
-    const int n_experts_per_tok;
-    const int n_tokens;
-    const int n_embd;
-    const int n_ff;
-
-    std::string op_desc(ggml_tensor * t) override {
-        return "MOE";
-
-        GGML_UNUSED(t);
-    }
-
-    std::string vars() override {
-        return VARS_TO_STR5(n_experts, n_experts_per_tok, n_tokens, n_embd, n_ff);
-    }
-
-    test_moe(int n_experts = 8, int n_experts_per_tok = 2, int n_tokens = 1, int n_embd = 4096, int n_ff = 14336)
-        : n_experts(n_experts), n_experts_per_tok(n_experts_per_tok), n_tokens(n_tokens), n_embd(n_embd), n_ff(n_ff) {
-    }
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * ffn_gate_inp = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_experts);
-
-        std::vector<ggml_tensor *> ffn_up_exp(n_experts);
-        std::vector<ggml_tensor *> ffn_gate_exp(n_experts);
-        std::vector<ggml_tensor *> ffn_down_exp(n_experts);
-
-        for (int i = 0; i < n_experts; ++i) {
-            ffn_up_exp[i] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
-            ffn_gate_exp[i] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
-            ffn_down_exp[i] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd);
-        }
-
-        ggml_tensor * cur = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);
-
-        ggml_tensor * logits = ggml_mul_mat(ctx, ffn_gate_inp, cur);
-        ggml_tensor * probs = ggml_soft_max_ext(ctx, logits, nullptr, 1.0f/sqrtf(n_embd));
-
-        // select experts
-        ggml_tensor * selected_experts = ggml_top_k(ctx, probs, n_experts_per_tok);
-
-        ggml_tensor * weights = ggml_get_rows(ctx,
-                ggml_reshape_3d(ctx, probs, 1, n_experts, n_tokens), selected_experts);
-
-        weights = ggml_reshape_2d(ctx, weights, n_experts_per_tok, n_tokens);
-
-        ggml_tensor * weights_sum = ggml_sum_rows(ctx, weights);
-
-        weights = ggml_div(ctx, weights, weights_sum);
-
-        // compute expert outputs
-        ggml_tensor * moe_out = nullptr;
-
-        for (int i = 0; i < n_experts_per_tok; ++i) {
-            ggml_tensor * cur_expert;
-
-            ggml_tensor * cur_up = ggml_mul_mat_id(ctx, ffn_up_exp.data(), n_experts, selected_experts, i, cur);
-
-            ggml_tensor * cur_gate = ggml_mul_mat_id(ctx, ffn_gate_exp.data(), n_experts, selected_experts, i, cur);
-
-            cur_gate = ggml_silu(ctx, cur_gate);
-
-            cur_expert = ggml_mul(ctx, cur_up, cur_gate);
-
-            cur_expert = ggml_mul_mat_id(ctx, ffn_down_exp.data(), n_experts, selected_experts, i, cur_expert);
-
-            cur_expert = ggml_mul(ctx, cur_expert,
-                    ggml_view_2d(ctx, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));
-
-            if (i == 0) {
-                moe_out = cur_expert;
-            } else {
-                moe_out = ggml_add(ctx, moe_out, cur_expert);
-            }
-        }
-
-        cur = moe_out;
-
-        return cur;
-    }
-};
-
-static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_name) {
-    std::vector<std::unique_ptr<test_case>> test_cases;
-    std::default_random_engine rng(0);
-
-    const ggml_type all_types[] = {
-        GGML_TYPE_F32, GGML_TYPE_F16,
-        GGML_TYPE_Q4_0, GGML_TYPE_Q4_1,
-        GGML_TYPE_Q5_0, GGML_TYPE_Q5_1,
-        GGML_TYPE_Q8_0,
-        GGML_TYPE_Q2_K, GGML_TYPE_Q3_K,
-        GGML_TYPE_Q4_K, GGML_TYPE_Q5_K,
-        GGML_TYPE_Q6_K
-    };
-
-    // unary ops
-    for (int op = 0; op < GGML_UNARY_OP_COUNT; op++) {
-        test_cases.emplace_back(new test_unary((ggml_unary_op) op));
-    }
-
-    test_cases.emplace_back(new test_get_rows(GGML_TYPE_F32, 1, 8, 2, 1, false));
-    for (ggml_type type : all_types) {
-        for (int b : {1, 7}) {
-            for (bool v : {false, true}) {
-                test_cases.emplace_back(new test_get_rows(type, 256, 5, 4, b, v));
-            }
-        }
-    }
-    for (int b : {1, 7}) {
-        for (bool v : {false, true}) {
-            test_cases.emplace_back(new test_get_rows(GGML_TYPE_I32, 256, 5, 4, b, v));
-        }
-    }
-
-    test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 1, 1, 1}));
-    test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {2, 1, 1, 1}));
-    test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 2, 1, 1}));
-    test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 1, 2, 1}));
-    test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 1, 1, 2}));
-    test_cases.emplace_back(new test_repeat(GGML_TYPE_I32, {10, 10, 10, 10}, {2, 1, 1, 1}));
-    test_cases.emplace_back(new test_repeat(GGML_TYPE_I16, {10, 10, 10, 10}, {1, 1, 1, 2}));
-
-    test_cases.emplace_back(new test_dup(GGML_TYPE_F32));
-    test_cases.emplace_back(new test_dup(GGML_TYPE_F16));
-    test_cases.emplace_back(new test_dup(GGML_TYPE_I32));
-    test_cases.emplace_back(new test_dup(GGML_TYPE_I16));
-    test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {0, 2, 1, 3}));
-    test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {1, 2, 0, 3}));
-
-    for (ggml_type type : all_types) {
-       test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, type, {256, 10, 10, 1}));
-    }
-
-    test_cases.emplace_back(new test_cont());
-
-    auto add_test_bin_bcast = [&](ggml_type type, std::array<int64_t, 4> ne, std::array<int, 4> nr) {
-        for (auto op : {ggml_add, ggml_mul, ggml_div}) {
-            test_cases.emplace_back(new test_bin_bcast(op, type, ne, nr));
-        }
-    };
-
-    add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 8, 1}, {1, 1, 1, 1});
-    add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 1, 1}, {32, 1, 1, 1});
-    add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 320, 320}, {1, 1, 1, 1});
-    add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 1, 1}, {1, 1, 1, 1});
-    add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 1}, {1, 1, 1, 1});
-    add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {1, 1, 1, 1});
-    add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {2, 1, 1, 1});
-    add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {1, 2, 1, 1});
-    add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {1, 1, 2, 1});
-    add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {1, 1, 1, 2});
-    add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {1, 1, 2, 2});
-    add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {1, 2, 2, 2});
-    add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {2, 2, 2, 2});
-
-    // stable diffusion
-    add_test_bin_bcast(GGML_TYPE_F32, {1280, 1, 1, 1}, {1, 1, 1, 1});
-    add_test_bin_bcast(GGML_TYPE_F32, {1280, 1, 1, 1}, {1, 16, 16, 1});
-    add_test_bin_bcast(GGML_TYPE_F32, {1280, 16, 16, 1}, {1, 1, 1, 1});
-    add_test_bin_bcast(GGML_TYPE_F32, {1280, 1, 1, 1}, {1, 256, 1, 1});
-    add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 1280, 1}, {16, 16, 1, 1});
-    add_test_bin_bcast(GGML_TYPE_F32, {16, 16, 1280, 1}, {1, 1, 1, 1});
-    add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 1920, 1}, {16, 16, 1, 1});
-    add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 2560, 1}, {16, 16, 1, 1});
-    add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 1280, 1}, {32, 32, 1, 1});
-    add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 1920, 1}, {32, 32, 1, 1});
-    add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 640, 1}, {32, 32, 1, 1});
-    add_test_bin_bcast(GGML_TYPE_F32, {5120, 1, 1, 1}, {1, 256, 1, 1});
-    add_test_bin_bcast(GGML_TYPE_F32, {640, 1, 1, 1}, {1, 1, 1, 1});
-    //add_test_bin_bcast(GGML_TYPE_F32, {3, 3, 2560, 1280}, {1, 1, 1, 1});
-    //add_test_bin_bcast(GGML_TYPE_F32, {3, 3, 2560, 1280}, {2, 1, 1, 1});
-
-    test_cases.emplace_back(new test_scale());
-
-    for (float eps : {1e-6f, 1e-5f, 1e-3f, 1e-1f}) {
-        test_cases.emplace_back(new test_norm(GGML_TYPE_F32, {64, 10, 10, 10}, eps));
-        test_cases.emplace_back(new test_rms_norm(GGML_TYPE_F32, {64, 10, 10, 10}, eps));
-    }
-
-    for (ggml_type type_a : all_types) {
-        for (ggml_type type_b : {GGML_TYPE_F32, GGML_TYPE_F16}) {
-            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, { 1,  1}, {1, 1}));
-            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10,  1}, {1, 1}));
-            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10,  1}, {2, 1}));
-            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 10}, {1, 1}));
-            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 10}, {2, 1}));
-            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 10}, {1, 2}));
-            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 10}, {2, 2}));
-
-            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, { 1,  1}, {1, 1}));
-            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10,  1}, {1, 1}));
-            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10,  1}, {2, 1}));
-            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {1, 1}));
-            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {2, 1}));
-            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {1, 2}));
-            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {2, 2}));
-        }
-    }
-
-    for (ggml_type type_a : all_types) {
-        for (ggml_type type_b : {GGML_TYPE_F32 /*, GGML_TYPE_F16 */}) {
-            for (int n_mats : {2, 4, 8}) {
-                for (int id = 0; id < n_mats; id++) {
-                    for (bool v : {false, true}) {
-                        test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, n_mats, id, 16, 16, 256, v));
-                    }
-                }
-            }
-        }
-    }
-
-    test_cases.emplace_back(new test_sqr());
-    test_cases.emplace_back(new test_clamp());
-
-    test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10,  1,  1}, 5));
-    test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 10,  1}, 5));
-    test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 10, 10}, 5));
-
-    std::uniform_int_distribution<> dist_ne1(1, 50);
-    int exponent = 1;
-    while (exponent < (1 << 17)) {
-        std::uniform_int_distribution<> dist_ne0(exponent, 2*exponent);
-
-        for (int n = 0; n < 10; ++n) {
-            int64_t ne0 = dist_ne0(rng);
-            int64_t ne1 = dist_ne1(rng);
-            test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0, ne1, 1, 1}));
-        }
-
-        exponent <<= 1;
-    }
-
-    for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
-        test_cases.emplace_back(new test_rope(type, {128,  32, 10, 1}, 128, 0, 512)); // llama 7B
-        test_cases.emplace_back(new test_rope(type, {128,  40, 10, 1}, 128, 0, 512)); // llama 13B
-        test_cases.emplace_back(new test_rope(type, {128,  52, 10, 1}, 128, 0, 512)); // llama 30B
-        test_cases.emplace_back(new test_rope(type, {128,  64, 10, 1}, 128, 0, 512)); // llama 65B
-        test_cases.emplace_back(new test_rope(type, { 64,   1, 10, 1},  64, 2, 512)); // neox (falcon 7B)
-        test_cases.emplace_back(new test_rope(type, { 64,  71, 10, 1},  64, 2, 512)); // neox (falcon 7B)
-        test_cases.emplace_back(new test_rope(type, { 64,   8, 10, 1},  64, 2, 512)); // neox (falcon 40B)
-        test_cases.emplace_back(new test_rope(type, { 64, 128, 10, 1},  64, 2, 512)); // neox (falcon 40B)
-        test_cases.emplace_back(new test_rope(type, { 80,  32, 10, 1},  20, 2, 512)); // neox (stablelm)
-        test_cases.emplace_back(new test_rope(type, { 80,  32, 10, 1},  32, 2, 512)); // neox (phi-2)
-    }
-
-    test_cases.emplace_back(new test_alibi());
-    test_cases.emplace_back(new test_im2col());
-    test_cases.emplace_back(new test_concat(GGML_TYPE_F32));
-    test_cases.emplace_back(new test_concat(GGML_TYPE_I32));
-
-    for (ggml_sort_order order : {GGML_SORT_ASC, GGML_SORT_DESC}) {
-        test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {8, 1, 1, 1}, order));
-        test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {16, 10, 10, 10}, order));
-    }
-
-    test_cases.emplace_back(new test_sum_rows());
-    test_cases.emplace_back(new test_upscale());
-    test_cases.emplace_back(new test_group_norm());
-    test_cases.emplace_back(new test_acc());
-    test_cases.emplace_back(new test_pad());
-    test_cases.emplace_back(new test_leaky_relu());
-
-#if !defined(__SANITIZE_THREAD__)
-    // FIXME: these tests use too much memory with thread sanitizer
-    test_cases.emplace_back(new test_moe(8, 2, 1, 4096, 8*1024));
-    //test_cases.emplace_back(new test_moe(8, 2, 8, 4096, 14336));
-#endif
-
-    // run tests
-    if (mode == MODE_TEST) {
-        ggml_backend_t backend_cpu = ggml_backend_cpu_init();
-
-        size_t n_ok = 0;
-        for (auto & test : test_cases) {
-            if (test->eval(backend, backend_cpu, op_name)) {
-                n_ok++;
-            }
-        }
-        printf("  %zu/%zu tests passed\n", n_ok, test_cases.size());
-
-        ggml_backend_free(backend_cpu);
-
-        return n_ok == test_cases.size();
-    }
-
-    if (mode == MODE_PERF) {
-        for (auto & test : test_cases) {
-            test->eval_perf(backend, op_name);
-        }
-        return true;
-    }
-
-    GGML_ASSERT(false);
-    return false;
-}
-
-static void usage(char ** argv) {
-    printf("Usage: %s [mode] [-o op] [-b backend]\n", argv[0]);
-    printf("  valid modes are: test (compare with CPU backend for correctness) or perf (performance evaluation)\n");
-    printf("  op names are as given by ggml_op_desc()\n");
-}
-
-int main(int argc, char ** argv) {
-    test_mode mode = MODE_TEST;
-    const char * op_name = NULL;
-    const char * backend = NULL;
-
-    for (int i = 1; i < argc; i++) {
-        if (strcmp(argv[i], "test") == 0) {
-            mode = MODE_TEST;
-        } else if (strcmp(argv[i], "perf") == 0) {
-            mode = MODE_PERF;
-        } else if (strcmp(argv[i], "-o") == 0) {
-            if (i + 1 < argc) {
-                op_name = argv[++i];
-            } else {
-                usage(argv);
-                return 1;
-            }
-        } else if (strcmp(argv[i], "-b") == 0) {
-            if (i + 1 < argc) {
-                backend = argv[++i];
-            } else {
-                usage(argv);
-                return 1;
-            }
-        } else {
-            usage(argv);
-            return 1;
-        }
-    }
-
-    // enumerate backends
-    printf("Testing %zu backends\n\n", ggml_backend_reg_get_count());
-
-    size_t n_ok = 0;
-
-    for (size_t i = 0; i < ggml_backend_reg_get_count(); i++) {
-        printf("Backend %zu/%zu (%s)\n", i + 1, ggml_backend_reg_get_count(), ggml_backend_reg_get_name(i));
-
-        if (backend != NULL && strcmp(backend, ggml_backend_reg_get_name(i)) != 0) {
-            printf("  Skipping\n");
-            n_ok++;
-            continue;
-        }
-
-        ggml_backend_t backend = ggml_backend_reg_init_backend(i, NULL);
-        GGML_ASSERT(backend != NULL);
-        printf("  Backend name: %s\n", ggml_backend_name(backend));
-
-        bool ok = test_backend(backend, mode, op_name);
-
-        printf("  Backend %s: ", ggml_backend_name(backend));
-        if (ok) {
-            printf("\033[1;32mOK\033[0m\n");
-            n_ok++;
-        } else {
-            printf("\033[1;31mFAIL\033[0m\n");
-        }
-
-        printf("\n");
-
-        ggml_backend_free(backend);
-    }
-
-    printf("%zu/%zu backends passed\n", n_ok, ggml_backend_reg_get_count());
-
-    if (n_ok != ggml_backend_reg_get_count()) {
-        printf("\033[1;31mFAIL\033[0m\n");
-        return 1;
-    }
-
-    printf("\033[1;32mOK\033[0m\n");
-    return 0;
-}
diff --git a/ggml/tests/test-blas0.c b/ggml/tests/test-blas0.c
deleted file mode 100644
index 2050ad2..0000000
--- a/ggml/tests/test-blas0.c
+++ /dev/null
@@ -1,269 +0,0 @@
-#include "ggml.h"
-
-#include <stdint.h>
-#include <stdio.h>
-#include <assert.h>
-#include <stdlib.h>
-#include <string.h>
-#include <time.h>
-#include <math.h>
-
-#include <sys/time.h>
-
-#include <arm_neon.h>
-
-#include <Accelerate/Accelerate.h>
-
-uint64_t get_time_us(void) {
-    struct timeval tv;
-    gettimeofday(&tv, NULL);
-    return tv.tv_sec * 1000000 + tv.tv_usec;
-}
-
-//
-// naive implementation
-//
-
-void mul_mat_f32_0(
-    const float * restrict src0, // M x K
-    const float * restrict src1, // N x K (transposed)
-    float * dst,
-    int m, int n, int k) {
-    for (int i = 0; i < m; i++) {
-        for (int j = 0; j < n; j++) {
-            float sum = 0;
-            for (int l = 0; l < k; l++) {
-                sum += src0[i*k + l] * src1[j*k + l];
-            }
-            dst[j*m + i] = sum;
-        }
-    }
-}
-
-int main(int argc, const char ** argv) {
-    if (argc < 4) {
-        printf("Usage: %s M N K\n", argv[0]);
-        return 1;
-    }
-
-    const int n_threads = 1;
-
-    int M = atoi(argv[1]);
-    int N = atoi(argv[2]);
-    int K = atoi(argv[3]);
-
-    srand(time(NULL));
-
-    if (M == 0) M = rand() % 1000 + 1;
-    if (N == 0) N = rand() % 1000 + 1;
-    if (K == 0) K = rand() % 1000 + 1;
-
-    printf("M = %d, N = %d, K = %d\n", M, N, K);
-
-    float * src0 = malloc(sizeof(float)*M*K);
-    float * src1 = malloc(sizeof(float)*N*K);
-    float * dst0 = malloc(sizeof(float)*M*N); // naive
-    float * dst1 = malloc(sizeof(float)*M*N); // blas
-
-    struct ggml_init_params params = {
-        .mem_size   = 2048ul*1024*1024,
-        .mem_buffer = NULL,
-        .no_alloc   = false,
-    };
-
-    struct ggml_context * ctx0 = ggml_init(params);
-
-    struct ggml_tensor * s0_f32 = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, K, M);
-    struct ggml_tensor * s1_f32 = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, K, N);
-
-    struct ggml_tensor * s0_f16 = ggml_new_tensor_2d(ctx0, GGML_TYPE_F16, K, M);
-    struct ggml_tensor * s1_f16 = ggml_new_tensor_2d(ctx0, GGML_TYPE_F16, K, N);
-
-    for (int j = 0; j < M; j++) {
-        for (int i = 0; i < K; i++) {
-            //src0[j*K + i] = j;
-            src0[j*K + i] = 1e-3*(rand() % 1000);
-        }
-    }
-
-    for (int j = 0; j < N; j++) {
-        for (int i = 0; i < K; i++) {
-            //src1[j*K + i] = j + 1;
-            src1[j*K + i] = 1e-3*(rand() % 1000);
-        }
-    }
-
-    // copy src0 to s0_f32
-    {
-        float       * p_f32 = s0_f32->data;
-        ggml_fp16_t * p_f16 = s0_f16->data;
-        for (int i = 0; i < M; i++) {
-            for (int j = 0; j < K; j++) {
-                p_f32[i*K + j] = src0[i*K + j];
-                p_f16[i*K + j] = ggml_fp32_to_fp16(src0[i*K + j]);
-            }
-        }
-    }
-
-    // copy src1 to s1_f32
-    {
-        float       * p_f32 = s1_f32->data;
-        ggml_fp16_t * p_f16 = s1_f16->data;
-        for (int i = 0; i < N; i++) {
-            for (int j = 0; j < K; j++) {
-                p_f32[i*K + j] = src1[i*K + j];
-                p_f16[i*K + j] = ggml_fp32_to_fp16(src1[i*K + j]);
-            }
-        }
-    }
-
-    const clock_t start = clock();
-    const uint64_t start_us = get_time_us();
-
-    double iM = 1.0/M;
-    mul_mat_f32_0(src0, src1, dst0, M, N, K);
-
-    // Use BLAS sgemm from Accelerate framework
-    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, N, M, K, 1.0f, src1, K, src0, K, 0.0f, dst1, M);
-
-    struct ggml_tensor * dst2 = NULL;
-    struct ggml_tensor * dst3 = NULL;
-
-    {
-        dst2 = ggml_mul_mat(ctx0, s0_f32, s1_f32);
-
-        struct ggml_cgraph * gf = ggml_new_graph(ctx0);
-        ggml_build_forward_expand(gf, dst2);
-        ggml_graph_compute_with_ctx(ctx0, gf, n_threads);
-    }
-
-    {
-        dst3 = ggml_mul_mat(ctx0, s0_f16, s1_f32);
-
-        struct ggml_cgraph * gf = ggml_new_graph(ctx0);
-        ggml_build_forward_expand(gf, dst3);
-        ggml_graph_compute_with_ctx(ctx0, gf, n_threads);
-    }
-
-    bool ok_blas = true;
-    bool ok_ggml_f32 = true;
-    bool ok_ggml_f16 = true;
-
-    // check BLAS
-    for (int i = 0; i < M*N; i++) {
-        if (fabs(dst0[i] - dst1[i])/fabs(dst0[i]) > 0.0001) {
-            printf("dst0[%d] = %f, dst1[%d] = %f\n", i, dst0[i], i, dst1[i]);
-            ok_blas = false;
-        }
-    }
-
-    // check ggml (f32)
-    {
-        float * p = dst2->data;
-        for (int i = 0; i < M*N; i++) {
-            if (fabs(dst0[i] - p[i])/fabs(dst0[i]) > 0.0001) {
-                printf("dst0[%d] = %f, dst2[%d] = %f\n", i, dst0[i], i, p[i]);
-                ok_ggml_f32 = false;
-            }
-        }
-    }
-
-    // check ggml (f16)
-    {
-        float * p = dst3->data;
-        for (int i = 0; i < M*N; i++) {
-            if (fabs(dst0[i] - p[i])/fabs(dst0[i]) > 0.01) {
-                printf("dst0[%d] = %f, dst3[%d] = %f\n", i, dst0[i], i, p[i]);
-                ok_ggml_f16 = false;
-            }
-        }
-    }
-
-    {
-        const clock_t end = clock();
-        const uint64_t end_us = get_time_us();
-        printf("%s: elapsed ticks: %ld\n",  __func__, end - start);
-    }
-
-#if 0
-    // print src0
-    printf("src0:\n");
-    for (int i = 0; i < M; i++) {
-        for (int j = 0; j < K; j++) {
-            printf("%4.1f ", src0[i*K+j]);
-        }
-        printf("\n");
-    }
-
-    // print src1
-    printf("src1:\n");
-    for (int i = 0; i < N; i++) {
-        for (int j = 0; j < K; j++) {
-            printf("%4.1f ", src1[i*K+j]);
-        }
-        printf("\n");
-    }
-
-    printf("\n");
-    printf("dst0 (naive):\n");
-    for (int j = 0; j < N; j++) {
-        for (int i = 0; i < M; i++) {
-            printf("%4.1f ", dst0[j*M+i]);
-        }
-        printf("\n");
-    }
-
-    printf("\n");
-    printf("dst1 (BLAS):\n");
-    for (int j = 0; j < N; j++) {
-        for (int i = 0; i < M; i++) {
-            printf("%4.1f ", dst1[j*M+i]);
-        }
-        printf("\n");
-    }
-
-    printf("\n");
-    printf("dst2 (ggml f32):\n");
-    for (int j = 0; j < N; j++) {
-        for (int i = 0; i < M; i++) {
-            printf("%4.1f ", ((float *)dst2->data)[j*M+i]);
-        }
-        printf("\n");
-    }
-
-    printf("\n");
-    printf("dst3 (ggml f16):\n");
-    for (int j = 0; j < N; j++) {
-        for (int i = 0; i < M; i++) {
-            printf("%4.1f ", ((float *)dst3->data)[j*M+i]);
-        }
-        printf("\n");
-    }
-
-    printf("\n");
-#endif
-
-    free(src0);
-    free(src1);
-    free(dst0);
-    free(dst1);
-
-    ggml_free(ctx0);
-
-    printf("ok_blas = %d\n", ok_blas);
-    if (!ok_blas) {
-        printf("ERROR: BLAS failed\n");
-    }
-
-    printf("ok_ggml_f32 = %d\n", ok_ggml_f32);
-    if (!ok_ggml_f32) {
-        printf("ERROR: ggml failed\n");
-    }
-
-    printf("ok_ggml_f16 = %d\n", ok_ggml_f16);
-    if (!ok_ggml_f16) {
-        printf("ERROR: ggml failed\n");
-    }
-
-    return (ok_blas && ok_ggml_f32 && ok_ggml_f16) ? 0 : 1;
-}
diff --git a/ggml/tests/test-conv-transpose.c b/ggml/tests/test-conv-transpose.c
deleted file mode 100644
index 116266e..0000000
--- a/ggml/tests/test-conv-transpose.c
+++ /dev/null
@@ -1,247 +0,0 @@
-#include "ggml/ggml.h"
-
-#include <string.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-struct ggml_context* make_ctx(void) {
-    struct ggml_init_params params = {
-        .mem_size = 2 * 1024 * 1024,
-    };
-
-    return ggml_init(params);
-}
-
-void printf_tensor(struct ggml_tensor * t) {
-    if (t->type == GGML_TYPE_F32) {
-        const float * t_d = ggml_get_data_f32(t);
-        for (int i = 0; i < t->ne[2]; ++i) {
-            for (int j = 0; j < t->ne[1]; ++j) {
-                for (int k = 0; k < t->ne[0]; ++k) {
-                    printf("%.1f ", t_d[i * t->ne[1] * t->ne[0] + j * t->ne[0] + k]);
-                }
-                printf("\n");
-            }
-            printf("---\n");
-        }
-    }
-    else if (t->type == GGML_TYPE_F16) {
-        const ggml_fp16_t * t_d = ggml_get_data(t);
-        for (int i = 0; i < t->ne[2]; ++i) {
-            for (int j = 0; j < t->ne[1]; ++j) {
-                for (int k = 0; k < t->ne[0]; ++k) {
-                    printf("%.1f ", ggml_fp16_to_fp32(t_d[i * t->ne[1] * t->ne[0] + j * t->ne[0] + k]));
-                }
-                printf("\n");
-            }
-            printf("---\n");
-        }
-    }
-    else {
-        printf("unknown type\n");
-    }
-}
-
-void check_tensor(struct ggml_tensor * t, float * expected_t_d, int ne0, int ne1, int ne2) {
-    GGML_ASSERT(t->type == GGML_TYPE_F32);
-    GGML_ASSERT(t->ne[0] == ne0);
-    GGML_ASSERT(t->ne[1] == ne1);
-    GGML_ASSERT(t->ne[2] == ne2);
-    for (int i2 = 0; i2 < ne2; ++i2) {
-        for (int i1 = 0; i1 < ne1; ++i1) {
-            for (int i0 = 0; i0 < ne0; ++i0) {
-                float expected = *(expected_t_d + i2 * ne1 * ne0 + i1 * ne0 + i0);
-                float actual = ggml_get_data_f32(t)[i2 * ne1 * ne0 + i1 * ne0 + i0];
-                if (expected != actual) {
-                    printf("expected %.1f, got %.1f\n", expected, actual);
-                }
-                GGML_ASSERT(expected == actual);
-            }
-        }
-    }
-}
-
-void test_conv_transpose_1d(void) {
-
-    float buf_f32[1024];
-    for (int i = 0; i < 1024; ++i) {
-        buf_f32[i] = (float)i;
-    }
-
-    ggml_fp16_t buf_f16[1024];
-    for (int i = 0; i < 1024; ++i) {
-        buf_f16[i] = ggml_fp32_to_fp16((float)i);
-    }
-
-    float expected_out_1[3][4] = {
-        {18.0, 45.0, 59.0, 37.0},
-        {24.0, 61.0, 83.0, 51.0},
-        {30.0, 77.0, 107.0, 65.0},
-    };
-    float expected_out_2[3][6] = {
-        {18.0, 21.0, 24.0, 29.0, 30.0, 37.0},
-        {24.0, 27.0, 34.0, 39.0, 44.0, 51.0},
-        {30.0, 33.0, 44.0, 49.0, 58.0, 65.0},
-    };
-    float expected_out_3[3][8] = {
-        {18.0, 21.0, 0.0, 24.0, 29.0, 0.0, 30.0, 37.0},
-        {24.0, 27.0, 0.0, 34.0, 39.0, 0.0, 44.0, 51.0},
-        {30.0, 33.0, 0.0, 44.0, 49.0, 0.0, 58.0, 65.0},
-    };
-
-    // conv transpose 1d with stride 1, 2 & 3
-    {
-        struct ggml_context * ctx = make_ctx();
-
-        struct ggml_tensor * t = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 3, 2); // l x cin
-        memcpy(t->data, buf_f32, ggml_nbytes(t));
-
-        struct ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, 2, 3, 2); // k x cout x cin
-        memcpy(k->data, buf_f16, ggml_nbytes(k));
-
-        struct ggml_tensor * out_1 = ggml_conv_transpose_1d(ctx, k, t, 1 /* s0 */, 0 /* p0 */, 1 /* d0 */);
-        struct ggml_tensor * out_2 = ggml_conv_transpose_1d(ctx, k, t, 2 /* s0 */, 0 /* p0 */, 1 /* d0 */);
-        struct ggml_tensor * out_3 = ggml_conv_transpose_1d(ctx, k, t, 3 /* s0 */, 0 /* p0 */, 1 /* d0 */);
-
-        struct ggml_cgraph * gf_1 = ggml_new_graph(ctx);
-        struct ggml_cgraph * gf_2 = ggml_new_graph(ctx);
-        struct ggml_cgraph * gf_3 = ggml_new_graph(ctx);
-
-        ggml_build_forward_expand(gf_1, out_1);
-        ggml_build_forward_expand(gf_2, out_2);
-        ggml_build_forward_expand(gf_3, out_3);
-
-        ggml_graph_compute_with_ctx(ctx, gf_1, 1);
-        ggml_graph_compute_with_ctx(ctx, gf_2, 1);
-        ggml_graph_compute_with_ctx(ctx, gf_3, 1);
-
-        check_tensor(out_1, (float*)expected_out_1, 4, 3, 1);
-        check_tensor(out_2, (float*)expected_out_2, 6, 3, 1);
-        check_tensor(out_3, (float*)expected_out_3, 8, 3, 1);
-    }
-}
-
-void test_conv_transpose_2d(void) {
-
-    float buf_f32[1024];
-    for (int i = 0; i < 1024; ++i) {
-        buf_f32[i] = (float)i;
-    }
-
-    ggml_fp16_t buf_f16[1024];
-    for (int i = 0; i < 1024; ++i) {
-        buf_f16[i] = ggml_fp32_to_fp16((float)i);
-    }
-
-    float expected_out_1[3][3][4] = {
-        {
-            {72.0, 162.0, 188.0, 106.0},
-            {192.0, 430.0, 490.0, 274.0},
-            {132.0, 292.0, 326.0, 180.0},
-        },
-        {
-            {96.0, 218.0, 260.0, 146.0},
-            {264.0, 590.0, 682.0, 378.0},
-            {180.0, 396.0, 446.0, 244.0},
-        },
-        {
-            {120.0, 274.0, 332.0, 186.0},
-            {336.0, 750.0, 874.0, 482.0},
-            {228.0, 500.0, 566.0, 308.0},
-        },
-    };
-
-    float expected_out_2[3][4][6] = {
-        {
-            {72.0, 78.0, 84.0, 92.0, 96.0, 106.0},
-            {84.0, 90.0, 100.0, 108.0, 116.0, 126.0},
-            {108.0, 120.0, 120.0, 134.0, 132.0, 148.0},
-            {132.0, 144.0, 148.0, 162.0, 164.0, 180.0},
-        },
-        {
-            {96.0, 102.0, 116.0, 124.0, 136.0, 146.0},
-            {108.0, 114.0, 132.0, 140.0, 156.0, 166.0},
-            {156.0, 168.0, 176.0, 190.0, 196.0, 212.0},
-            {180.0, 192.0, 204.0, 218.0, 228.0, 244.0},
-        },
-        {
-            {120.0, 126.0, 148.0, 156.0, 176.0, 186.0},
-            {132.0, 138.0, 164.0, 172.0, 196.0, 206.0},
-            {204.0, 216.0, 232.0, 246.0, 260.0, 276.0},
-            {228.0, 240.0, 260.0, 274.0, 292.0, 308.0},
-        },
-    };
-
-    float expected_out_3[3][5][8] = {
-        {
-            {72.0, 78.0, 0.0, 84.0, 92.0, 0.0, 96.0, 106.0},
-            {84.0, 90.0, 0.0, 100.0, 108.0, 0.0, 116.0, 126.0},
-            {0.0, 0.0, 0.0, 0.0, 0.0, 0.0},
-            {108.0, 120.0, 0.0, 120.0, 134.0, 0.0, 132.0, 148.0},
-            {132.0, 144.0, 0.0, 148.0, 162.0, 0.0, 164.0, 180.0},
-        },
-        {
-            {96.0, 102.0, 0.0, 116.0, 124.0, 0.0, 136.0, 146.0},
-            {108.0, 114.0, 0.0, 132.0, 140.0, 0.0, 156.0, 166.0},
-            {0.0, 0.0, 0.0, 0.0, 0.0, 0.0},
-            {156.0, 168.0, 0.0, 176.0, 190.0, 0.0, 196.0, 212.0},
-            {180.0, 192.0, 0.0, 204.0, 218.0, 0.0, 228.0, 244.0},
-        },
-        {
-            {120.0, 126.0, 0.0, 148.0, 156.0, 0.0, 176.0, 186.0},
-            {132.0, 138.0, 0.0, 164.0, 172.0, 0.0, 196.0, 206.0},
-            {0.0, 0.0, 0.0, 0.0, 0.0, 0.0},
-            {204.0, 216.0, 0.0, 232.0, 246.0, 0.0, 260.0, 276.0},
-            {228.0, 240.0, 0.0, 260.0, 274.0, 0.0, 292.0, 308.0},
-        },
-    };
-
-    // conv transpose 2d with stride 1, 2 & 3
-    {
-        struct ggml_context * ctx = make_ctx();
-
-        struct ggml_tensor * t = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 3, 2, 2, 1); // w x h x cin
-        memcpy(t->data, buf_f32, ggml_nbytes(t));
-
-        struct ggml_tensor * k = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 2, 2, 3, 2); // w x h cin x cout
-        memcpy(k->data, buf_f16, ggml_nbytes(k));
-
-        struct ggml_tensor * out_1 = ggml_conv_transpose_2d_p0(ctx, k, t, 1);
-        struct ggml_tensor * out_2 = ggml_conv_transpose_2d_p0(ctx, k, t, 2);
-        struct ggml_tensor * out_3 = ggml_conv_transpose_2d_p0(ctx, k, t, 3);
-
-        struct ggml_cgraph * gf_1 = ggml_new_graph(ctx);
-        struct ggml_cgraph * gf_2 = ggml_new_graph(ctx);
-        struct ggml_cgraph * gf_3 = ggml_new_graph(ctx);
-
-        ggml_build_forward_expand(gf_1, out_1);
-        ggml_build_forward_expand(gf_2, out_2);
-        ggml_build_forward_expand(gf_3, out_3);
-
-        ggml_graph_compute_with_ctx(ctx, gf_1, 1);
-        ggml_graph_compute_with_ctx(ctx, gf_2, 1);
-        ggml_graph_compute_with_ctx(ctx, gf_3, 1);
-
-        // printf("in\n");
-        // printf_tensor(t);
-        // printf("\n\nkernel\n");
-        // printf_tensor(k);
-        // printf("\n\nout\n");
-        // printf_tensor(out);
-        // printf("\n\nout_2\n");
-        // printf_tensor(out_2);
-        // printf("\n\nout_3\n");
-        // printf_tensor(out_3);
-
-        check_tensor(out_1, (float*)expected_out_1, 4, 3, 3);
-        check_tensor(out_2, (float*)expected_out_2, 6, 4, 3);
-        check_tensor(out_3, (float*)expected_out_3, 8, 5, 3);
-
-    }
-}
-
-int main(int argc, const char * argv[]) {
-    test_conv_transpose_1d();
-    test_conv_transpose_2d();
-    return 0;
-}
diff --git a/ggml/tests/test-conv1d.cpp b/ggml/tests/test-conv1d.cpp
deleted file mode 100644
index 936f966..0000000
--- a/ggml/tests/test-conv1d.cpp
+++ /dev/null
@@ -1,303 +0,0 @@
-#include "ggml.h"
-#include "ggml/ggml-alloc.h"
-#include "ggml/ggml-backend.h"
-
-// #define GGML_USE_CUBLAS
-
-#ifdef GGML_USE_CUBLAS
-#include "ggml-cuda.h"
-#endif
-
-#ifdef GGML_USE_METAL
-#include "ggml-metal.h"
-#endif
-
-#include <cassert>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <fstream>
-#include <map>
-#include <string>
-#include <vector>
-
-static void ggml_log_callback_default(ggml_log_level level, const char * text, void * user_data) {
-    (void) level;
-    (void) user_data;
-    fputs(text, stderr);
-    fflush(stderr);
-}
-
-struct test_model {
-    struct ggml_tensor * a;
-    struct ggml_tensor * b;
-    ggml_backend_t backend = NULL;
-    ggml_backend_buffer_t buffer;
-    struct ggml_context * ctx;
-};
-
-void load_model(test_model & model, bool use_gpu = false) {
-    // create data
-    int K = 3, IC = 10, OC = 10;
-    int IL = 8, N = 1;
-
-    // Initialize adata
-    float * adata = new float[K * IC * OC];
-    for (int i = 0; i < K * IC * OC; i++) {
-        adata[i] = 4.5f;
-    }
-
-    // Convert adata to fp16 format
-    std::vector<ggml_fp16_t> hadata(K * IC * OC);
-    ggml_fp32_to_fp16_row(adata, hadata.data(), K * IC * OC);
-
-    // Initialize bdata
-    float * bdata =  new float[IL * IC * N];
-    for (int i = 0; i < IL * IC * N; i++) {
-        bdata[i] = 2.5f;
-    }
-
-    size_t buffer_size = 0;
-    {
-        buffer_size += K * IC * OC * ggml_type_size(GGML_TYPE_F16); // tensor a
-        buffer_size += IL * IC * N * ggml_type_size(GGML_TYPE_F32); // tensor b
-        buffer_size += 1024; // overhead
-    }
-
-    printf("%s: ggml tensor size    = %d bytes\n", __func__, (int) sizeof(ggml_tensor));
-    printf("%s: backend buffer size = %0.2f MB\n", __func__, (buffer_size/ 1024.f/ 1024.f));
-
-    int num_tensors = 2;
-    struct ggml_init_params params {
-            /*.mem_size   =*/ ggml_tensor_overhead() * num_tensors,
-            /*.mem_buffer =*/ NULL,
-            /*.no_alloc   =*/ true,
-    };
-
-    // initialize the backend
-#ifdef GGML_USE_CUBLAS
-    if (use_gpu) {
-        fprintf(stderr, "%s: using CUDA backend\n", __func__);
-        model.backend = ggml_backend_cuda_init(0);
-        if (!model.backend) {
-            fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
-        }
-    }
-#endif
-
-#ifdef GGML_USE_METAL
-    if (use_gpu) {
-        fprintf(stderr, "%s: using Metal backend\n", __func__);
-        ggml_backend_metal_log_set_callback(ggml_log_callback_default, nullptr);
-        model.backend = ggml_backend_metal_init();
-        if (!model.backend) {
-            fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
-        }
-    }
-#endif
-
-    if(!model.backend) {
-        // fallback to CPU backend
-        model.backend = ggml_backend_cpu_init();
-    }
-
-    model.buffer = ggml_backend_alloc_buffer(model.backend, buffer_size);
-
-    // create context
-    model.ctx = ggml_init(params);
-
-    // create tensors
-    model.a = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F16,  K, IC, OC);
-    model.b = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, IL, IC, N);
-
-    // create a allocator
-    ggml_allocr * alloc = ggml_allocr_new_from_buffer(model.buffer);
-
-    // alloc memory
-    ggml_allocr_alloc(alloc, model.a);
-
-    // load data to buffer
-    if(ggml_backend_is_cpu(model.backend)) {
-        memcpy(model.a->data, hadata.data(), ggml_nbytes(model.a));
-    } else {
-        ggml_backend_tensor_set(model.a, hadata.data(), 0, ggml_nbytes(model.a));
-    }
-
-    // alloc memory
-    ggml_allocr_alloc(alloc, model.b);
-
-    if(ggml_backend_is_cpu(model.backend)
-#ifdef GGML_USE_METAL
-                || ggml_backend_is_metal(model.backend)
-#endif
-    ) {
-        memcpy(model.b->data, bdata, ggml_nbytes(model.b));
-    } else {
-        ggml_backend_tensor_set(model.b, bdata, 0, ggml_nbytes(model.b));
-    }
-
-    ggml_allocr_free(alloc);
-}
-
-struct ggml_cgraph * build_graph(const test_model& model, struct ggml_allocr * allocr) {
-    static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
-    static std::vector<uint8_t> buf(buf_size);
-
-    struct ggml_init_params params0 = {
-        /*.mem_size   =*/ buf_size,
-        /*.mem_buffer =*/ buf.data(),
-        /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph()
-    };
-
-    // create a temporally context to build the graph
-    struct ggml_context * ctx0 = ggml_init(params0);
-
-    struct ggml_cgraph  * gf = ggml_new_graph(ctx0);
-
-    int s0 = 1;
-    int p0 = 1;
-    int d0 = 1;
-
-    // split conv1d in fundamental methods for test unit
-    struct ggml_tensor* im2col_0 = ggml_im2col(ctx0, model.a, model.b, s0, 0, p0, 0, d0, 0, false);
-    ggml_set_name(im2col_0, "im2col_res");
-    ggml_build_forward_expand(gf, im2col_0);
-
-    struct ggml_tensor* conv1d_res = ggml_conv_1d(ctx0, model.a, model.b, s0, p0, d0);
-    ggml_set_name(conv1d_res, "conv1d_res");
-    ggml_build_forward_expand(gf, conv1d_res);
-
-    // delete the temporally context used to build the graph
-    ggml_free(ctx0);
-    return gf;
-}
-
-struct ggml_cgraph* compute_graph(const test_model & model, struct ggml_allocr * allocr) {
-    // reset the allocator to free all the memory allocated during the previous inference
-    ggml_allocr_reset(allocr);
-
-    struct ggml_cgraph * gf = build_graph(model, allocr);
-
-    // allocate tensors
-    ggml_allocr_alloc_graph(allocr, gf);
-    int n_threads = 1;
-
-    if (ggml_backend_is_cpu(model.backend)) {
-        ggml_backend_cpu_set_n_threads(model.backend, n_threads);
-    }
-
-#ifdef GGML_USE_METAL
-    if (ggml_backend_is_metal(model.backend)) {
-        ggml_backend_metal_set_n_cb(model.backend, n_threads);
-    }
-#endif
-
-    ggml_backend_graph_compute(model.backend, gf);
-
-    //ggml_graph_print(gf);
-
-    return gf;
-}
-
-int main(void)
-{
-    ggml_time_init();
-
-    test_model model;
-    load_model(model, true);
-
-    ggml_backend_buffer_t buf_compute; // for compute
-    struct ggml_allocr * allocr = NULL;
-
-    {
-        allocr = ggml_allocr_new_measure_from_backend(model.backend);
-
-        //create the worst case graph for memory usage estimation
-        struct ggml_cgraph * gf = build_graph(model, allocr);
-        size_t mem_size = ggml_allocr_alloc_graph(allocr, gf);
-        ggml_allocr_free(allocr);
-
-        // compute the required memory
-        buf_compute = ggml_backend_alloc_buffer(model.backend, mem_size);
-        allocr = ggml_allocr_new_from_buffer(buf_compute);
-        fprintf(stderr, "%s: compute buffer size: %.2f MB\n", __func__, mem_size/1024.0f/1024.0f);
-    }
-
-    struct ggml_cgraph * gf_res = compute_graph(model, allocr);
-
-     struct ggml_tensor * im2col_res = NULL;
-    struct ggml_tensor * conv1d_res = NULL;
-
-    for(int i = 0; i < gf_res->n_nodes; i++) {
-        if(strcmp(ggml_get_name(gf_res->nodes[i]), "im2col_res") == 0) {
-            im2col_res = gf_res->nodes[i];
-        } else if(strcmp(ggml_get_name(gf_res->nodes[i]), "conv1d_res") == 0) {
-            conv1d_res = gf_res->nodes[i];
-        }
-    }
-
-    uint16_t* im2col_data = new uint16_t[ggml_nelements(im2col_res)];
-    float* conv2d_data = new float[ggml_nelements(conv1d_res)];
-
-    ggml_backend_tensor_get(im2col_res, im2col_data, 0, ggml_nbytes(im2col_res));
-    ggml_backend_tensor_get(conv1d_res, conv2d_data, 0, ggml_nbytes(conv1d_res));
-
-    const int n_conv1d_test = 80;
-    const int n_im2col_test = 240;
-
-    float expected_conv1d[n_conv1d_test] = {
-        225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f,
-        225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f,
-        225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f,
-        225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f,
-        225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f,
-        225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f,
-        225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f,
-        225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f,
-        225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f,
-        225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f
-    };
-    // first im2col test
-
-    uint16_t expected_im2col[n_conv1d_test] = {
-        0, 16640, 16640, 0, 16640, 16640, 0, 16640,
-        16640, 0, 16640, 16640, 0, 16640, 16640, 0,
-        16640, 16640, 0, 16640, 16640, 0, 16640, 16640,
-        0, 16640, 16640, 0, 16640, 16640, 16640, 16640,
-        16640, 16640, 16640, 16640, 16640, 16640, 16640, 16640,
-        16640, 16640, 16640, 16640, 16640, 16640, 16640, 16640,
-        16640, 16640, 16640, 16640, 16640, 16640, 16640, 16640,
-        16640, 16640, 16640, 16640, 16640, 16640, 16640, 16640,
-        16640, 16640, 16640, 16640, 16640, 16640, 16640, 16640,
-        16640, 16640, 16640, 16640, 16640, 16640, 16640, 16640
-    };
-
-    printf("\nPerforming test:\n");
-
-    bool passed = true;
-    for(int i = 0; i < n_conv1d_test; i++) {
-        if(
-            im2col_data[i] != expected_im2col[i]) {
-            passed = false;
-            break;
-        }
-    }
-
-    printf("ggml_im2col (%d): %s\n", (int) ggml_nelements(im2col_res), passed && (ggml_nelements(im2col_res) == n_im2col_test) ? "\033[32mPASSED\033[0m" : "\033[31mFAILED\033[0m");
-
-    passed = true;
-    for(int i = 0; i < n_conv1d_test; i++) {
-        if(conv2d_data[i] != expected_conv1d[i]) {
-            passed = false;
-            break;
-        }
-    }
-
-    printf("ggml_conv1d (%d): %s\n", (int) ggml_nelements(conv1d_res), passed && (ggml_nelements(conv1d_res) == n_conv1d_test) ? "\033[32mPASSED\033[0m" : "\033[31mFAILED\033[0m");
-    ggml_free(model.ctx);
-
-    ggml_backend_buffer_free(model.buffer);
-    ggml_backend_buffer_free(buf_compute);
-    ggml_backend_free(model.backend);
-    return 0;
-}
diff --git a/ggml/tests/test-conv2d.cpp b/ggml/tests/test-conv2d.cpp
deleted file mode 100644
index 06398d0..0000000
--- a/ggml/tests/test-conv2d.cpp
+++ /dev/null
@@ -1,405 +0,0 @@
-#include "ggml.h"
-#include "ggml/ggml-alloc.h"
-#include "ggml/ggml-backend.h"
-
-// #define GGML_USE_CUBLAS
-
-#ifdef GGML_USE_CUBLAS
-#include "ggml-cuda.h"
-#endif
-
-#ifdef GGML_USE_METAL
-#include "ggml-metal.h"
-#endif
-
-#include <cassert>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <fstream>
-#include <map>
-#include <string>
-#include <vector>
-
-static void ggml_log_callback_default(ggml_log_level level, const char * text, void * user_data) {
-    (void) level;
-    (void) user_data;
-    fputs(text, stderr);
-    fflush(stderr);
-}
-
-struct test_model {
-    struct ggml_tensor * a;
-    struct ggml_tensor * b;
-    ggml_backend_t backend = NULL;
-    ggml_backend_buffer_t buffer;
-    struct ggml_context * ctx;
-};
-
-void load_model(test_model & model, bool use_gpu = false) {
-    // create data
-    int KW = 3, KH = 3, IC = 10, OC = 10;
-    int IW = 8, IH = 6, N = 1;
-
-    // Initialize adata
-    float * adata = new float[KW * KH * IC * OC];
-    for (int i = 0; i < KW * KH * IC * OC; i++) {
-        adata[i] = 2.5f;
-    }
-
-    // Convert adata to fp16 format
-    std::vector<ggml_fp16_t> hadata(KW * KH * IC * OC);
-    ggml_fp32_to_fp16_row(adata, hadata.data(), KW * KH * IC * OC);
-
-    // Initialize bdata
-    float * bdata =  new float[IW * IH * IC * N];
-    for (int i = 0; i < IW * IH * IC * N; i++) {
-        bdata[i] = 1.5f;
-    }
-
-    size_t buffer_size = 0;
-    {
-        buffer_size += KW * KH * IC * OC * ggml_type_size(GGML_TYPE_F16); // tensor a
-        buffer_size += IW * IH * IC * N  * ggml_type_size(GGML_TYPE_F32); // tensor b
-        buffer_size += 1024; // overhead
-    }
-
-    printf("%s: ggml tensor size    = %d bytes\n", __func__, (int) sizeof(ggml_tensor));
-    printf("%s: backend buffer size = %0.2f MB\n", __func__, (buffer_size/ 1024.f/ 1024.f));
-
-    int num_tensors = 2;
-    struct ggml_init_params params {
-            /*.mem_size   =*/ ggml_tensor_overhead() * num_tensors,
-            /*.mem_buffer =*/ NULL,
-            /*.no_alloc   =*/ true,
-    };
-
-    // initialize the backend
-#ifdef GGML_USE_CUBLAS
-    if (use_gpu) {
-        fprintf(stderr, "%s: using CUDA backend\n", __func__);
-        model.backend = ggml_backend_cuda_init(0);
-        if (!model.backend) {
-            fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
-        }
-    }
-#endif
-
-#ifdef GGML_USE_METAL
-    if (use_gpu) {
-        fprintf(stderr, "%s: using Metal backend\n", __func__);
-        ggml_backend_metal_log_set_callback(ggml_log_callback_default, nullptr);
-        model.backend = ggml_backend_metal_init();
-        if (!model.backend) {
-            fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
-        }
-    }
-#endif
-
-    if(!model.backend) {
-        // fallback to CPU backend
-        model.backend = ggml_backend_cpu_init();
-    }
-
-    model.buffer = ggml_backend_alloc_buffer(model.backend, buffer_size);
-
-    // create context
-    model.ctx = ggml_init(params);
-
-    // create tensors
-    model.a = ggml_new_tensor_4d(model.ctx, GGML_TYPE_F16,  KW, KH, IC, OC);
-    model.b = ggml_new_tensor_4d(model.ctx, GGML_TYPE_F32, IW, IH, IC, N);
-
-    // create a allocator
-    ggml_allocr * alloc = ggml_allocr_new_from_buffer(model.buffer);
-
-    // alloc memory
-    ggml_allocr_alloc(alloc, model.a);
-
-    // load data to buffer
-    if(ggml_backend_is_cpu(model.backend)) {
-        memcpy(model.a->data, hadata.data(), ggml_nbytes(model.a));
-    } else {
-        ggml_backend_tensor_set(model.a, hadata.data(), 0, ggml_nbytes(model.a));
-    }
-
-    // alloc memory
-    ggml_allocr_alloc(alloc, model.b);
-
-    if(ggml_backend_is_cpu(model.backend)
-#ifdef GGML_USE_METAL
-                || ggml_backend_is_metal(model.backend)
-#endif
-    ) {
-        memcpy(model.b->data, bdata, ggml_nbytes(model.b));
-    } else {
-        ggml_backend_tensor_set(model.b, bdata, 0, ggml_nbytes(model.b));
-    }
-
-    ggml_allocr_free(alloc);
-}
-
-struct ggml_cgraph * build_graph(const test_model& model, struct ggml_allocr * allocr) {
-    static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
-    static std::vector<uint8_t> buf(buf_size);
-
-    struct ggml_init_params params0 = {
-        /*.mem_size   =*/ buf_size,
-        /*.mem_buffer =*/ buf.data(),
-        /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph()
-    };
-
-    // create a temporally context to build the graph
-    struct ggml_context * ctx0 = ggml_init(params0);
-
-    struct ggml_cgraph  * gf = ggml_new_graph(ctx0);
-
-    int s0 = 1;
-    int s1 = 1;
-    int p0 = 1;
-    int p1 = 1;
-    int d0 = 1;
-    int d1 = 1;
-
-    // split conv2d in fundamental methods for test unit
-    struct ggml_tensor* im2col_0 = ggml_im2col(ctx0, model.a, model.b, s0, s1, p0, p1, d0, d1, true);
-    ggml_set_name(im2col_0, "im2col_res");
-    ggml_build_forward_expand(gf, im2col_0);
-
-    // recalculate for avoid fragmentation
-    struct ggml_tensor* conv2d_res = ggml_conv_2d(ctx0, model.a, model.b, s0, s1, p0, p1, d0, d1);
-    ggml_set_name(conv2d_res, "conv2d_res");
-    ggml_build_forward_expand(gf, conv2d_res);
-
-    ggml_free(ctx0);
-    return gf;
-}
-
-struct ggml_cgraph * compute_graph(const test_model & model, struct ggml_allocr * allocr) {
-    // reset the allocator to free all the memory allocated during the previous inference
-    ggml_allocr_reset(allocr);
-
-    struct ggml_cgraph * gf = build_graph(model, allocr);
-
-    // allocate tensors
-    ggml_allocr_alloc_graph(allocr, gf);
-    int n_threads = 1;
-
-    if (ggml_backend_is_cpu(model.backend)) {
-        ggml_backend_cpu_set_n_threads(model.backend, n_threads);
-    }
-
-#ifdef GGML_USE_METAL
-    if (ggml_backend_is_metal(model.backend)) {
-        ggml_backend_metal_set_n_cb(model.backend, n_threads);
-    }
-#endif
-
-    ggml_backend_graph_compute(model.backend, gf);
-
-    //ggml_graph_print(gf);
-
-    return gf;
-}
-
-int main(void)
-{
-    ggml_time_init();
-
-    test_model model;
-    load_model(model, true);
-
-    ggml_backend_buffer_t buf_compute; // for compute
-    struct ggml_allocr * allocr = NULL;
-
-    {
-        allocr = ggml_allocr_new_measure_from_backend(model.backend);
-
-        //create the worst case graph for memory usage estimation
-        struct ggml_cgraph * gf = build_graph(model, allocr);
-        size_t mem_size = ggml_allocr_alloc_graph(allocr, gf);
-        ggml_allocr_free(allocr);
-
-        // compute the required memory
-        buf_compute = ggml_backend_alloc_buffer(model.backend, mem_size);
-        allocr = ggml_allocr_new_from_buffer(buf_compute);
-        fprintf(stderr, "%s: compute buffer size: %.2f MB\n", __func__, mem_size/1024.0f/1024.0f);
-    }
-
-    struct ggml_cgraph * gf_res = compute_graph(model, allocr);
-
-    struct ggml_tensor * im2col_res = NULL;
-    struct ggml_tensor * conv2d_res = NULL;
-
-    for(int i = 0; i < gf_res->n_nodes; i++) {
-        if(strcmp(ggml_get_name(gf_res->nodes[i]), "im2col_res") == 0) {
-            im2col_res = gf_res->nodes[i];
-        } else if(strcmp(ggml_get_name(gf_res->nodes[i]), "conv2d_res") == 0) {
-            conv2d_res = gf_res->nodes[i];
-        }
-    }
-
-    uint16_t* im2col_data = new uint16_t[ggml_nelements(im2col_res)];
-    float* conv2d_data = new float[ggml_nelements(conv2d_res)];
-
-    ggml_backend_tensor_get(im2col_res, im2col_data, 0, ggml_nbytes(im2col_res));
-    ggml_backend_tensor_get(conv2d_res, conv2d_data, 0, ggml_nbytes(conv2d_res));
-
-    const int n_conv2d_test = 480;
-    const int n_im2col_test = 4320;
-
-    float expected_conv2d [n_conv2d_test] = {
-        150.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 150.00f,
-        225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f,
-        225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f,
-        225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f,
-        225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f,
-        150.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 150.00f,
-        150.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 150.00f,
-        225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f,
-        225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f,
-        225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f,
-        225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f,
-        150.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 150.00f,
-        150.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 150.00f,
-        225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f,
-        225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f,
-        225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f,
-        225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f,
-        150.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 150.00f,
-        150.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 150.00f,
-        225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f,
-        225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f,
-        225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f,
-        225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f,
-        150.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 150.00f,
-        150.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 150.00f,
-        225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f,
-        225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f,
-        225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f,
-        225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f,
-        150.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 150.00f,
-        150.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 150.00f,
-        225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f,
-        225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f,
-        225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f,
-        225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f,
-        150.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 150.00f,
-        150.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 150.00f,
-        225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f,
-        225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f,
-        225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f,
-        225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f,
-        150.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 150.00f,
-        150.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 150.00f,
-        225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f,
-        225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f,
-        225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f,
-        225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f,
-        150.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 150.00f,
-        150.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 150.00f,
-        225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f,
-        225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f,
-        225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f,
-        225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f,
-        150.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 150.00f,
-        150.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 150.00f,
-        225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f,
-        225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f,
-        225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f,
-        225.00f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 337.50f, 225.00f,
-        150.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 225.00f, 150.00f };
-
-    uint16_t expected_im2col[n_conv2d_test] = {
-            0, 0, 0, 0, 15872, 15872, 0, 15872,
-            15872, 0, 0, 0, 0, 15872, 15872, 0,
-            15872, 15872, 0, 0, 0, 0, 15872, 15872,
-            0, 15872, 15872, 0, 0, 0, 0, 15872,
-            15872, 0, 15872, 15872, 0, 0, 0, 0,
-            15872, 15872, 0, 15872, 15872, 0, 0, 0,
-            0, 15872, 15872, 0, 15872, 15872, 0, 0,
-            0, 0, 15872, 15872, 0, 15872, 15872, 0,
-            0, 0, 0, 15872, 15872, 0, 15872, 15872,
-            0, 0, 0, 0, 15872, 15872, 0, 15872,
-            15872, 0, 0, 0, 0, 15872, 15872, 0,
-            15872, 15872, 0, 0, 0, 15872, 15872, 15872,
-            15872, 15872, 15872, 0, 0, 0, 15872, 15872,
-            15872, 15872, 15872, 15872, 0, 0, 0, 15872,
-            15872, 15872, 15872, 15872, 15872, 0, 0, 0,
-            15872, 15872, 15872, 15872, 15872, 15872, 0, 0,
-            0, 15872, 15872, 15872, 15872, 15872, 15872, 0,
-            0, 0, 15872, 15872, 15872, 15872, 15872, 15872,
-            0, 0, 0, 15872, 15872, 15872, 15872, 15872,
-            15872, 0, 0, 0, 15872, 15872, 15872, 15872,
-            15872, 15872, 0, 0, 0, 15872, 15872, 15872,
-            15872, 15872, 15872, 0, 0, 0, 15872, 15872,
-            15872, 15872, 15872, 15872, 0, 0, 0, 15872,
-            15872, 15872, 15872, 15872, 15872, 0, 0, 0,
-            15872, 15872, 15872, 15872, 15872, 15872, 0, 0,
-            0, 15872, 15872, 15872, 15872, 15872, 15872, 0,
-            0, 0, 15872, 15872, 15872, 15872, 15872, 15872,
-            0, 0, 0, 15872, 15872, 15872, 15872, 15872,
-            15872, 0, 0, 0, 15872, 15872, 15872, 15872,
-            15872, 15872, 0, 0, 0, 15872, 15872, 15872,
-            15872, 15872, 15872, 0, 0, 0, 15872, 15872,
-            15872, 15872, 15872, 15872, 0, 0, 0, 15872,
-            15872, 15872, 15872, 15872, 15872, 0, 0, 0,
-            15872, 15872, 15872, 15872, 15872, 15872, 0, 0,
-            0, 15872, 15872, 15872, 15872, 15872, 15872, 0,
-            0, 0, 15872, 15872, 15872, 15872, 15872, 15872,
-            0, 0, 0, 15872, 15872, 15872, 15872, 15872,
-            15872, 0, 0, 0, 15872, 15872, 15872, 15872,
-            15872, 15872, 0, 0, 0, 15872, 15872, 15872,
-            15872, 15872, 15872, 0, 0, 0, 15872, 15872,
-            15872, 15872, 15872, 15872, 0, 0, 0, 15872,
-            15872, 15872, 15872, 15872, 15872, 0, 0, 0,
-            15872, 15872, 15872, 15872, 15872, 15872, 0, 0,
-            0, 15872, 15872, 15872, 15872, 15872, 15872, 0,
-            0, 0, 15872, 15872, 15872, 15872, 15872, 15872,
-            0, 0, 0, 15872, 15872, 15872, 15872, 15872,
-            15872, 0, 0, 0, 15872, 15872, 15872, 15872,
-            15872, 15872, 0, 0, 0, 15872, 15872, 15872,
-            15872, 15872, 15872, 0, 0, 0, 15872, 15872,
-            15872, 15872, 15872, 15872, 0, 0, 0, 15872,
-            15872, 15872, 15872, 15872, 15872, 0, 0, 0,
-            15872, 15872, 15872, 15872, 15872, 15872, 0, 0,
-            0, 15872, 15872, 15872, 15872, 15872, 15872, 0,
-            0, 0, 15872, 15872, 15872, 15872, 15872, 15872,
-            0, 0, 0, 15872, 15872, 15872, 15872, 15872,
-            15872, 0, 0, 0, 15872, 15872, 15872, 15872,
-            15872, 15872, 0, 0, 0, 15872, 15872, 15872,
-            15872, 15872, 15872, 0, 0, 0, 15872, 15872,
-            15872, 15872, 15872, 15872, 0, 0, 0, 15872,
-            15872, 15872, 15872, 15872, 15872, 0, 0, 0
-    };
-
-    printf("\nPerforming test:\n");
-
-    bool passed = true;
-    for(int i = 0; i < n_conv2d_test; i++) {
-        if(
-            im2col_data[i] != expected_im2col[i]) {
-            passed = false;
-            break;
-        }
-    }
-
-    printf("ggml_im2col (%d): %s\n", (int) ggml_nelements(im2col_res), passed && (ggml_nelements(im2col_res) == n_im2col_test) ? "\033[32mPASSED\033[0m" : "\033[31mFAILED\033[0m");
-
-    passed = true;
-    for(int i = 0; i < n_conv2d_test; i++) {
-        if(conv2d_data[i] != expected_conv2d[i]) {
-            passed = false;
-            break;
-        }
-    }
-
-    printf("ggml_conv2d (%d): %s\n", (int) ggml_nelements(conv2d_res), passed && (ggml_nelements(conv2d_res) == n_conv2d_test) ? "\033[32mPASSED\033[0m" : "\033[31mFAILED\033[0m");
-
-    ggml_free(model.ctx);
-
-    ggml_backend_buffer_free(model.buffer);
-    ggml_backend_buffer_free(buf_compute);
-    ggml_backend_free(model.backend);
-    return 0;
-}
diff --git a/ggml/tests/test-customop.c b/ggml/tests/test-customop.c
deleted file mode 100644
index e96aa67..0000000
--- a/ggml/tests/test-customop.c
+++ /dev/null
@@ -1,226 +0,0 @@
-#include "ggml/ggml.h"
-#include <string.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <assert.h>
-
-#if defined(_WIN32)
-#include <windows.h>
-typedef volatile LONG atomic_int;
-static LONG atomic_fetch_add(atomic_int * ptr, LONG inc) {
-    return InterlockedExchangeAdd(ptr, inc);
-}
-#else
-#include <stdatomic.h>
-#endif
-
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
-#define MAX(a, b) ((a) > (b) ? (a) : (b))
-
-struct ggml_context * make_ctx(void) {
-    struct ggml_init_params params = {
-        /*.mem_size   =*/ 1 * 1024 * 1024,
-        /*.mem_buffer =*/ NULL,
-        /*.no_alloc   =*/ false,
-    };
-
-    return ggml_init(params);
-}
-
-char g_userdata[] = "ggml";
-atomic_int g_custom1_count = 0;
-atomic_int g_custom2_count = 0;
-atomic_int g_custom3_count = 0;
-
-void custom1(struct ggml_tensor * dst , const struct ggml_tensor * a, int ith, int nth, void * userdata) {
-    // check that the userdata is correct
-    assert(userdata == NULL);
-    assert(ggml_are_same_shape(dst, a));
-
-    atomic_fetch_add(&g_custom1_count, 1);
-
-    const float * a_data = ggml_get_data_f32(a);
-    float * dst_data = ggml_get_data_f32(dst);
-
-    // this assumes that the tensors are contiguous
-    assert(ggml_is_contiguous(dst));
-    assert(ggml_is_contiguous(a));
-
-    // parallelize by elements
-    const int ne = (int)ggml_nelements(dst);
-    const int dr = (ne + nth - 1) / nth;
-    const int ie0 = dr * ith;
-    const int ie1 = MIN(ie0 + dr, ne);
-
-    for (int i = ie0; i < ie1; ++i) {
-        dst_data[i] = a_data[i] * 2;
-    }
-}
-
-void custom2(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata) {
-    // check that the userdata is correct
-    assert(userdata == g_userdata);
-    assert(strcmp(userdata, "ggml") == 0);
-    assert(ggml_are_same_shape(dst, a));
-    assert(ggml_are_same_shape(dst, b));
-
-    atomic_fetch_add(&g_custom2_count, 1);
-
-    const float * a_data = ggml_get_data_f32(a);
-    const float * b_data = ggml_get_data_f32(b);
-    float * dst_data = ggml_get_data_f32(dst);
-
-    // parallelize by rows
-    const int nr = (int)ggml_nrows(dst);
-    // number of rows per thread
-    const int dr = (nr + nth - 1) / nth;
-    // row range for this thread
-    const int ir0 = dr * ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    // number of columns
-    const int nc = (int)dst->ne[0];
-
-    // this assumes that the tensors are contiguous
-    assert(ggml_is_contiguous(dst));
-    assert(ggml_is_contiguous(a));
-    assert(ggml_is_contiguous(b));
-
-    for (int ir = ir0; ir < ir1; ++ir) {
-        for (int ic = 0; ic < nc; ++ic) {
-            const int i = ir * nc + ic;
-            dst_data[i] = a_data[i] + b_data[i];
-        }
-    }
-}
-
-void custom3(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata) {
-    // check that the userdata is correct
-    assert(userdata == g_userdata);
-    assert(strcmp(userdata, "ggml") == 0);
-    assert(ggml_are_same_shape(dst, a));
-    assert(ggml_are_same_shape(dst, b));
-    assert(ggml_are_same_shape(dst, c));
-
-    atomic_fetch_add(&g_custom3_count, 1);
-
-    const float * a_data = ggml_get_data_f32(a);
-    const float * b_data = ggml_get_data_f32(b);
-    const float * c_data = ggml_get_data_f32(c);
-    float * dst_data = ggml_get_data_f32(dst);
-
-    // dont parallelize
-    assert(ith == 0);
-
-    // number of elements
-    const int ne = (int)ggml_nelements(dst);
-
-    // this assumes that the tensors are contiguous
-    assert(ggml_is_contiguous(dst));
-    assert(ggml_is_contiguous(a));
-    assert(ggml_is_contiguous(b));
-    assert(ggml_is_contiguous(c));
-
-    for (int i = 0; i < ne; ++i) {
-        dst_data[i] = a_data[i] + b_data[i] + c_data[i];
-    }
-}
-
-int main(int argc, const char** argv) {
-
-    float buf1_f32[1024];
-    for (int i = 0; i < 1024; ++i) {
-        buf1_f32[i] = (float)(i + 1);
-    }
-    float buf2_f32[1024];
-    for (int i = 0; i < 1024; ++i) {
-        buf2_f32[i] = (float)(i + 1) * 2;
-    }
-    float buf3_f32[1024];
-    for (int i = 0; i < 1024; ++i) {
-        buf3_f32[i] = (float)(i + 1) * 3;
-    }
-
-    // map_custom1
-    // 2 tasks, no userdata, parallelized by elements
-    {
-        struct ggml_context * ctx = make_ctx();
-        struct ggml_tensor * t = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 10, 2);
-        memcpy(t->data, buf1_f32, ggml_nbytes(t));
-
-        struct ggml_tensor * m1 = ggml_map_custom1(ctx, t, custom1, 2, NULL);
-
-        struct ggml_cgraph * graph = ggml_new_graph(ctx);
-        ggml_build_forward_expand(graph, m1);
-
-        ggml_graph_compute_with_ctx(ctx, graph, 4);
-
-        const float * output = ggml_get_data_f32(m1);
-
-        for (int i = 0; i < ggml_nelements(m1); ++i) {
-            assert(output[i] == buf1_f32[i] * 2);
-        }
-        assert(g_custom1_count == 2);
-
-        ggml_free(ctx);
-    }
-
-    // map_custom2
-    // max tasks (4), userdata, parallelized by rows
-    {
-        struct ggml_context * ctx = make_ctx();
-        struct ggml_tensor * t1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 10, 2);
-        memcpy(t1->data, buf1_f32, ggml_nbytes(t1));
-        struct ggml_tensor * t2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 10, 2);
-        memcpy(t2->data, buf2_f32, ggml_nbytes(t2));
-
-        struct ggml_tensor * m2 = ggml_map_custom2(ctx, t1, t2, custom2, GGML_N_TASKS_MAX, g_userdata);
-
-        struct ggml_cgraph * graph = ggml_new_graph(ctx);
-        ggml_build_forward_expand(graph, m2);
-
-        ggml_graph_compute_with_ctx(ctx, graph, 4);
-
-        const float * output = ggml_get_data_f32(m2);
-
-        for (int i = 0; i < ggml_nelements(m2); ++i) {
-            assert(output[i] == buf1_f32[i] + buf2_f32[i]);
-        }
-
-        assert(g_custom2_count == 4);
-
-        ggml_free(ctx);
-    }
-
-    // map_custom3
-    // 1 task, userdata, not parallelized
-    {
-        struct ggml_context * ctx = make_ctx();
-        struct ggml_tensor * t1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 10, 2);
-        memcpy(t1->data, buf1_f32, ggml_nbytes(t1));
-        struct ggml_tensor * t2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 10, 2);
-        memcpy(t2->data, buf2_f32, ggml_nbytes(t2));
-        struct ggml_tensor * t3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 10, 2);
-        memcpy(t3->data, buf3_f32, ggml_nbytes(t3));
-
-        struct ggml_tensor * m3 = ggml_map_custom3(ctx, t1, t2, t3, custom3, 1, g_userdata);
-
-        struct ggml_cgraph * graph = ggml_new_graph(ctx);
-        ggml_build_forward_expand(graph, m3);
-
-        ggml_graph_compute_with_ctx(ctx, graph, 4);
-
-        const float * output = ggml_get_data_f32(m3);
-
-        for (int i = 0; i < ggml_nelements(m3); ++i) {
-            assert(output[i] == buf1_f32[i] + buf2_f32[i] + buf3_f32[i]);
-        }
-
-        assert(g_custom3_count == 1);
-
-        ggml_free(ctx);
-    }
-
-
-    return 0;
-}
diff --git a/ggml/tests/test-dup.c b/ggml/tests/test-dup.c
deleted file mode 100644
index afc887b..0000000
--- a/ggml/tests/test-dup.c
+++ /dev/null
@@ -1,110 +0,0 @@
-#include "ggml/ggml.h"
-
-#include <stdio.h>
-#include <stdlib.h>
-
-void arange(struct ggml_tensor* tensor) {
-    GGML_ASSERT(ggml_is_contiguous(tensor));
-    for (int i = 0; i < ggml_nelements(tensor); ++i) {
-        ggml_set_i32_1d(tensor, i, i);
-    }
-}
-
-void dup_to(struct ggml_tensor* src, struct ggml_tensor* dst) {
-    GGML_ASSERT(dst->op == GGML_OP_VIEW);
-    GGML_ASSERT(ggml_nelements(src) == ggml_nelements(dst));
-    dst->op = GGML_OP_DUP;
-    dst->src[0] = src;
-}
-
-bool can_dup(enum ggml_type src_type, enum ggml_type dst_type) {
-    if (src_type == dst_type) return true;
-    if (src_type == GGML_TYPE_F32 && ggml_internal_get_type_traits(dst_type).from_float) return true;
-    if (dst_type == GGML_TYPE_F32 && ggml_internal_get_type_traits(src_type).to_float) return true;
-
-    return false;
-}
-
-int main(int argc, const char ** argv) {
-    struct ggml_init_params params = {
-        .mem_size   = 128*1024*1024,
-        .mem_buffer = NULL,
-        .no_alloc   = false,
-    };
-
-    enum ggml_type type[4] = {GGML_TYPE_I16, GGML_TYPE_I32, GGML_TYPE_F16, GGML_TYPE_F32};
-    for (int i = 0; i < 4; ++i) {
-        enum ggml_type src_type = type[i];
-        for (int j = 0; j < 4; ++j) {
-            enum ggml_type dst_type = type[j];
-            if (!can_dup(src_type, dst_type)) continue;
-            printf("Testing dup on %s -> %s copy\n", ggml_type_name(src_type), ggml_type_name(dst_type));
-
-            struct ggml_context * ctx = ggml_init(params);
-
-            struct ggml_tensor * src = ggml_new_tensor_2d(ctx, src_type, 10, 11);
-            arange(src);
-            struct ggml_tensor * dst = ggml_new_tensor_2d(ctx, dst_type, 10, 11);
-            ggml_set_i32(dst, 0);
-
-            // 2nd-row: [20, 21, ..., 29]
-            struct ggml_tensor * src_cont = ggml_view_1d(ctx, src, 10, src->nb[1] * 2);
-
-            // 3rd-col: [03, 13, ..., 93]
-            struct ggml_tensor * src_stride = ggml_view_2d(ctx, src, 1, 10, src->nb[1], src->nb[0] * 3);
-
-            struct ggml_tensor * dst_cont_1 = ggml_view_1d(ctx, dst, 10, dst->nb[1] * 5); // 5nd-row
-            struct ggml_tensor * dst_cont_2 = ggml_view_1d(ctx, dst, 10, dst->nb[1] * 6); // 6rd-row
-
-            struct ggml_tensor * dst_stride_1 = ggml_view_2d(ctx, dst, 1, 10, dst->nb[1], dst->nb[0] * 7); // 7th-col
-            struct ggml_tensor * dst_stride_2 = ggml_view_2d(ctx, dst, 1, 10, dst->nb[1], dst->nb[0] * 8); // 8th-col
-
-            struct ggml_cgraph * gf = ggml_new_graph(ctx);
-
-            dup_to(src_cont,   dst_cont_1);
-            dup_to(src_stride, dst_cont_2);
-            dup_to(src_cont,   dst_stride_1);
-            dup_to(src_stride, dst_stride_2);
-
-            ggml_build_forward_expand(gf, dst_cont_1);
-            ggml_build_forward_expand(gf, dst_cont_2);
-            ggml_build_forward_expand(gf, dst_stride_1);
-            ggml_build_forward_expand(gf, dst_stride_2);
-
-            ggml_graph_compute_with_ctx(ctx, gf, 1);
-
-            // src_cont -> dst_cont_1
-            GGML_ASSERT(ggml_get_i32_1d(dst, 49) == 0);
-            GGML_ASSERT(ggml_get_i32_1d(dst, 50) == 20);
-            GGML_ASSERT(ggml_get_i32_1d(dst, 51) == 21);
-            GGML_ASSERT(ggml_get_i32_1d(dst, 52) == 22);
-            GGML_ASSERT(ggml_get_i32_1d(dst, 59) == 29);
-
-            // src_stride -> dst_cont_2
-            GGML_ASSERT(ggml_get_i32_1d(dst, 60) == 3);
-            GGML_ASSERT(ggml_get_i32_1d(dst, 61) == 13);
-            GGML_ASSERT(ggml_get_i32_1d(dst, 62) == 23);
-            GGML_ASSERT(ggml_get_i32_1d(dst, 69) == 93);
-            GGML_ASSERT(ggml_get_i32_1d(dst, 70) == 0);
-
-            // src_cont -> dst_stride_1
-            GGML_ASSERT(ggml_get_i32_1d(dst, 6)   == 0);
-            GGML_ASSERT(ggml_get_i32_1d(dst, 7)   == 20);
-            GGML_ASSERT(ggml_get_i32_1d(dst, 17)  == 21);
-            GGML_ASSERT(ggml_get_i32_1d(dst, 27)  == 22);
-            GGML_ASSERT(ggml_get_i32_1d(dst, 97)  == 29);
-            GGML_ASSERT(ggml_get_i32_1d(dst, 107) == 0);
-
-            // src_stride -> dst_stride_2
-            GGML_ASSERT(ggml_get_i32_1d(dst, 8)   == 03);
-            GGML_ASSERT(ggml_get_i32_1d(dst, 18)  == 13);
-            GGML_ASSERT(ggml_get_i32_1d(dst, 28)  == 23);
-            GGML_ASSERT(ggml_get_i32_1d(dst, 98)  == 93);
-            GGML_ASSERT(ggml_get_i32_1d(dst, 108) == 0);
-
-            ggml_free(ctx);
-        }
-    }
-
-    return 0;
-}
diff --git a/ggml/tests/test-grad0.cpp b/ggml/tests/test-grad0.cpp
deleted file mode 100644
index 8ff76c8..0000000
--- a/ggml/tests/test-grad0.cpp
+++ /dev/null
@@ -1,1606 +0,0 @@
-#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnings on Windows
-#include "ggml.h"
-
-#include <cmath>
-#include <cstdio>
-#include <cstdlib>
-#include <cassert>
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-#if defined(__GNUC__)
-#pragma GCC diagnostic ignored "-Wdouble-promotion"
-#endif
-
-#define MAX_NARGS 3
-
-#undef MIN
-#undef MAX
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
-#define MAX(a, b) ((a) > (b) ? (a) : (b))
-
-#define GGML_SILU_FP16
-
-//
-// logging
-//
-
-#if (GGML_DEBUG >= 1)
-#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG(...)
-#endif
-
-#if (GGML_DEBUG >= 5)
-#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG_5(...)
-#endif
-
-#if (GGML_DEBUG >= 10)
-#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG_10(...)
-#endif
-
-#define GGML_PRINT(...) printf(__VA_ARGS__)
-
-static float frand(void) {
-    return (float)rand()/(float)RAND_MAX;
-}
-
-static int irand(int n) {
-    if (n == 0) return 0;
-    return rand()%n;
-}
-
-static void get_random_dims(int64_t * dims, int ndims) {
-    dims[0] = dims[1] = dims[2] = dims[3] = 1;
-
-    for (int i = 0; i < ndims; i++) {
-        dims[i] = 1 + irand(4);
-    }
-}
-
-static struct ggml_tensor * get_random_tensor_f32(
-        struct ggml_context * ctx0,
-        int ndims,
-        int64_t ne[],
-        float fmin,
-        float fmax) {
-    struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F32, ndims, ne);
-
-    switch (ndims) {
-        case 1:
-            for (int i0 = 0; i0 < ne[0]; i0++) {
-                ((float *)result->data)[i0] = frand()*(fmax - fmin) + fmin;
-            }
-            break;
-        case 2:
-            for (int i1 = 0; i1 < ne[1]; i1++) {
-                for (int i0 = 0; i0 < ne[0]; i0++) {
-                    ((float *)result->data)[i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
-                }
-            }
-            break;
-        case 3:
-            for (int i2 = 0; i2 < ne[2]; i2++) {
-                for (int i1 = 0; i1 < ne[1]; i1++) {
-                    for (int i0 = 0; i0 < ne[0]; i0++) {
-                        ((float *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
-                    }
-                }
-            }
-            break;
-        case 4:
-            for (int i3 = 0; i3 < ne[3]; i3++) {
-                for (int i2 = 0; i2 < ne[2]; i2++) {
-                    for (int i1 = 0; i1 < ne[1]; i1++) {
-                        for (int i0 = 0; i0 < ne[0]; i0++) {
-                            ((float *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
-                        }
-                    }
-                }
-            }
-            break;
-        default:
-            assert(false);
-    }
-
-    return result;
-}
-
-static struct ggml_tensor * get_random_tensor_f16(
-        struct ggml_context * ctx0,
-        int ndims,
-        int64_t ne[],
-        float fmin,
-        float fmax) {
-    struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F16, ndims, ne);
-
-    switch (ndims) {
-        case 1:
-            for (int i0 = 0; i0 < ne[0]; i0++) {
-                ((ggml_fp16_t *)result->data)[i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
-            }
-            break;
-        case 2:
-            for (int i1 = 0; i1 < ne[1]; i1++) {
-                for (int i0 = 0; i0 < ne[0]; i0++) {
-                    ((ggml_fp16_t *)result->data)[i1*ne[0] + i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
-                }
-            }
-            break;
-        case 3:
-            for (int i2 = 0; i2 < ne[2]; i2++) {
-                for (int i1 = 0; i1 < ne[1]; i1++) {
-                    for (int i0 = 0; i0 < ne[0]; i0++) {
-                        ((ggml_fp16_t *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
-                    }
-                }
-            }
-            break;
-        case 4:
-            for (int i3 = 0; i3 < ne[3]; i3++) {
-                for (int i2 = 0; i2 < ne[2]; i2++) {
-                    for (int i1 = 0; i1 < ne[1]; i1++) {
-                        for (int i0 = 0; i0 < ne[0]; i0++) {
-                            ((ggml_fp16_t *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
-                        }
-                    }
-                }
-            }
-            break;
-        default:
-            assert(false);
-    }
-
-    return result;
-}
-
-static struct ggml_tensor * get_random_tensor_i32(
-        struct ggml_context * ctx0,
-        int ndims,
-        int64_t ne[],
-        int32_t imin,
-        int32_t imax) {
-    struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_I32, ndims, ne);
-
-    switch (ndims) {
-        case 1:
-            for (int i0 = 0; i0 < ne[0]; i0++) {
-                ((int32_t *)result->data)[i0] = irand(imax - imin) + imin;
-            }
-            break;
-        case 2:
-            for (int i1 = 0; i1 < ne[1]; i1++) {
-                for (int i0 = 0; i0 < ne[0]; i0++) {
-                    ((int32_t *)result->data)[i1*ne[0] + i0] = irand(imax - imin) + imin;
-                }
-            }
-            break;
-        case 3:
-            for (int i2 = 0; i2 < ne[2]; i2++) {
-                for (int i1 = 0; i1 < ne[1]; i1++) {
-                    for (int i0 = 0; i0 < ne[0]; i0++) {
-                        ((int32_t *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = irand(imax - imin) + imin;
-                    }
-                }
-            }
-            break;
-        case 4:
-            for (int i3 = 0; i3 < ne[3]; i3++) {
-                for (int i2 = 0; i2 < ne[2]; i2++) {
-                    for (int i1 = 0; i1 < ne[1]; i1++) {
-                        for (int i0 = 0; i0 < ne[0]; i0++) {
-                            ((int32_t *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = irand(imax - imin) + imin;
-                        }
-                    }
-                }
-            }
-            break;
-        default:
-            assert(false);
-    }
-
-    return result;
-}
-
-static bool check_gradient(
-        const char * op_name,
-        struct ggml_context * ctx0,
-        struct ggml_tensor * x[],
-        struct ggml_tensor * f,
-        int ndims,
-        int nargs,
-        float eps,
-        float max_error_abs,
-        float max_error_rel) {
-
-    static int n_threads = -1;
-    if (n_threads < 0) {
-        n_threads = GGML_DEFAULT_N_THREADS;
-
-        const char *env = getenv("GGML_N_THREADS");
-        if (env) {
-            n_threads = atoi(env);
-        }
-
-        printf("GGML_N_THREADS = %d\n", n_threads);
-    }
-
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, GGML_DEFAULT_GRAPH_SIZE, true);
-    struct ggml_cgraph * gb = ggml_new_graph_custom(ctx0, GGML_DEFAULT_GRAPH_SIZE, true);
-    ggml_build_forward_expand(gf, f);
-    ggml_graph_cpy(gf, gb);
-    ggml_build_backward_expand(ctx0, gf, gb, false);
-
-    ggml_graph_compute_with_ctx(ctx0, gf, n_threads);
-
-    ggml_graph_reset  (gf);
-    ggml_set_f32      (f->grad, 1.0f);
-
-    ggml_graph_compute_with_ctx(ctx0, gb, n_threads);
-
-    // ggml_graph_dump_dot(gf, NULL, "test-grad0-forward.dot");
-    // ggml_graph_dump_dot(gb, gf,  "test-grad0-backward.dot");
-
-    for (int i = 0; i < nargs; ++i) {
-        const int nelements = ggml_nelements(x[i]);
-        for (int k = 0; k < nelements; ++k) {
-            // compute gradient using finite differences
-            const float x0 = ggml_get_f32_1d(x[i], k);
-            const float xm = x0 - eps;
-            const float xp = x0 + eps;
-            ggml_set_f32_1d(x[i], k, xp);
-
-            ggml_graph_compute_with_ctx(ctx0, gf, n_threads);
-
-            const double f0 = ggml_get_f32_1d(f, 0);
-
-            ggml_set_f32_1d(x[i], k, xm);
-
-            ggml_graph_compute_with_ctx(ctx0, gf, n_threads);
-
-            const double f1 = ggml_get_f32_1d(f, 0);
-            const double g0 = (f0 - f1)/(2.0*(double) eps);
-
-            ggml_set_f32_1d(x[i], k, x0);
-
-            // compute gradient using backward graph
-            ggml_graph_reset  (gf);
-            ggml_set_f32      (f->grad, 1.0f);
-
-            ggml_graph_compute_with_ctx(ctx0, gb, n_threads);
-
-            const double g1 = ggml_get_f32_1d(x[i]->grad, k);
-
-            const double error_abs = fabs(g0 - g1);
-            const double error_rel = g0 != 0 ? fabs(g0 - g1)/fabs(g0) : 0;
-
-            if (error_abs > max_error_abs || error_rel > max_error_rel) {
-                printf("%s: ndims=%d, i=%d, k=%d, x0=%f, xm=%f, xp=%f, f0=%f, f1=%f, g0=%f, g1=%f, eps=%f, error_abs=%f, error_rel=%f\n",
-                            op_name, ndims, i, k, x0, xm, xp, f0, f1, g0, g1, eps, error_abs, error_rel);
-                //assert(false);
-                return false;
-            }
-        }
-    }
-
-    return true;
-}
-
-// TODO: clean-up this ..
-static bool check_mat_mul(
-        const struct ggml_tensor * y,
-        const struct ggml_tensor * x0,
-        const struct ggml_tensor * x1) {
-    float * dst  = (float *) y->data;
-    float * src0 = (float *) x0->data;
-    float * src1 = (float *) x1->data;
-
-    const int nc = x0->ne[1];
-    const int nr = x1->ne[1];
-    const int nk = x0->ne[0];
-
-    GGML_PRINT_DEBUG("check_mat_mul: nc=%d, nr=%d, nk=%d\n", nc, nr, nk);
-
-    GGML_PRINT_DEBUG("x0:\n");
-    for (int j = 0; j < x0->ne[1]; ++j) {
-        for (int i = 0; i < x0->ne[0]; ++i) {
-            GGML_PRINT_DEBUG("%6.3f ", src0[j*nk + i]);
-        }
-        GGML_PRINT_DEBUG("\n");
-    }
-    GGML_PRINT_DEBUG("\n");
-
-    GGML_PRINT_DEBUG("x1:\n");
-    for (int j = 0; j < x1->ne[1]; ++j) {
-        for (int i = 0; i < x1->ne[0]; ++i) {
-            GGML_PRINT_DEBUG("%6.3f ", src1[j*nk + i]);
-        }
-        GGML_PRINT_DEBUG("\n");
-    }
-    GGML_PRINT_DEBUG("\n");
-
-    GGML_PRINT_DEBUG("y: n_dims = %d, (%lld, %lld)\n", y->n_dims, y->ne[0], y->ne[1]);
-    for (int j = 0; j < y->ne[1]; ++j) {
-        for (int i = 0; i < y->ne[0]; ++i) {
-            GGML_PRINT_DEBUG("%6.3f ", dst[j*nr + i]);
-        }
-        GGML_PRINT_DEBUG("\n");
-    }
-
-    for (int i = 0; i < nr; ++i) {
-        for (int j = 0; j < nc; ++j) {
-            float sum = 0.0f;
-
-            for (int k = 0; k < nk; ++k) {
-                sum += src0[j*nk + k]*src1[i*nk + k];
-            }
-
-            if (fabsf(dst[i*nc + j] - sum) > 1e-5f) {
-                fprintf(stderr, "check_mat_mul: dst[%d] = %f, sum = %f\n", i*nc + j, dst[i*nc + j], sum);
-                assert(false);
-                return false;
-            }
-        }
-    }
-
-    return true;
-}
-
-#define NUM_PERMUTATIONS (4*3*2*1)
-
-int main(int argc, const char ** argv) {
-    struct ggml_init_params params = {
-        /* .mem_size   = */ 256*1024*1024,
-        /* .mem_buffer = */ NULL,
-        /* .no_alloc   = */ false,
-    };
-
-    int64_t ne[4];
-
-    int all_permutations[4 * NUM_PERMUTATIONS];
-    {
-        int count = 0;
-        for (int ax0=0; ax0<4; ++ax0) {
-            for (int ax1=0; ax1<4; ++ax1) {
-                if (ax1 == ax0) continue;
-                for (int ax2=0; ax2<4; ++ax2) {
-                    if (ax2 == ax0) continue;
-                    if (ax2 == ax1) continue;
-                    for (int ax3=0; ax3<4; ++ax3) {
-                        if (ax3 == ax0) continue;
-                        if (ax3 == ax1) continue;
-                        if (ax3 == ax2) continue;
-                        assert(count < NUM_PERMUTATIONS);
-                        all_permutations[count*4+0] = ax0;
-                        all_permutations[count*4+1] = ax1;
-                        all_permutations[count*4+2] = ax2;
-                        all_permutations[count*4+3] = ax3;
-                        ++count;
-                    }
-                }
-            }
-        }
-    }
-
-    unsigned seed_iter = 1;
-
-    // original loop: 1000
-    int niter = 4;
-    const char *env = getenv("GGML_NLOOP");
-    if (env != NULL) {
-        niter = atoi(env);
-    }
-    if (argc > 1) {
-        niter = atoi(argv[1]);
-    }
-    for (int iter = 0; iter < niter; ++iter) {
-        srand(seed_iter);
-        seed_iter = rand();
-        unsigned seed = rand();
-
-        printf("test-grad0: iter:%d/%d\n", iter, niter);
-        struct ggml_context * ctx0 = ggml_init(params);
-
-        get_random_dims(ne, 4);
-
-        struct ggml_tensor * x[MAX_NARGS];
-
-        // add f32
-        {
-            srand(seed);
-            const int nargs = 2;
-
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_add(ctx0, x[0], x[1]));
-
-                check_gradient("add f32", ctx0, x, f, ndims, nargs, 1e-3f, 2e-3f, 2e-3f);
-            }
-        }
-
-        // add f16
-        {
-            srand(seed);
-            const int nargs = 2;
-
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f16(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_add(ctx0, x[0], x[1]));
-
-                check_gradient("add f16", ctx0, x, f, ndims, nargs, 1e-1f, 2e-1f, 2e-1f);
-            }
-        }
-
-        // sub
-        {
-            srand(seed);
-            const int nargs = 2;
-
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sub(ctx0, x[0], x[1]));
-
-                check_gradient("sub", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
-            }
-        }
-
-        // mul
-        {
-            srand(seed);
-            const int nargs = 2;
-
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_mul(ctx0, x[0], x[1]));
-
-                check_gradient("mul", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-            }
-        }
-
-        // div
-        {
-            srand(seed);
-            const int nargs = 2;
-
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, 0.5f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_div(ctx0, x[0], x[1]));
-
-                check_gradient("div", ctx0, x, f, ndims, nargs, 1e-3f, 1e-1f, 1e-1f);
-            }
-        }
-
-        // sqr
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 2; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, x[0]));
-
-                check_gradient("sqr", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-            }
-        }
-
-        // sqrt
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 2; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, 2.0f*1e-3f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqrt(ctx0, x[0]));
-
-                check_gradient("sqrt", ctx0, x, f, ndims, nargs, 1e-3f, 2e-2f, 1e-1f);
-            }
-        }
-
-        // log
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 2; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, 2.0f*1e-3f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_log(ctx0, x[0]));
-
-                check_gradient("log", ctx0, x, f, ndims, nargs, 1e-3f, INFINITY, 1e-1f);
-            }
-        }
-
-        // sum
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 2; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor * f = ggml_sum(ctx0, x[0]);
-
-                check_gradient("sum", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
-            }
-        }
-
-
-        // sum_rows
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sum_rows(ctx0, x[0])));
-
-                check_gradient("sum_rows", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY);
-            }
-        }
-
-        // mean, not yet fully implemented
-        if(0)
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_mean(ctx0, x[0]));
-
-                check_gradient("mean", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
-            }
-        }
-
-        // argmax
-        if (0)
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_argmax(ctx0, x[0]));
-
-                check_gradient("argmax", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
-            }
-        }
-
-        // repeat
-        {
-            srand(seed);
-            int64_t ne2[4];
-            get_random_dims(ne2, 4);
-
-            ne2[0] = ne[0] * ne2[0];
-            ne2[1] = ne[1] * ne2[1];
-            ne2[2] = 1;
-            ne2[3] = 1;
-
-            const int nargs = 1;
-            for (int ndims = 1; ndims <= 2; ++ndims) {
-                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[0]);
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sub(ctx0, x[1], ggml_repeat(ctx0, x[0], x[1]))));
-
-                check_gradient("repeat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY);
-            }
-        }
-
-        // repeat back
-        {
-            srand(seed);
-            int64_t ne2[4];
-            get_random_dims(ne2, 4);
-
-            ne2[0] = ne[0] * ne2[0];
-            ne2[1] = ne[1] * ne2[1];
-            ne2[2] = 1;
-            ne2[3] = 1;
-
-            const int nargs = 1;
-            for (int ndims = 1; ndims <= 2; ++ndims) {
-                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[0]);
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sub(ctx0, x[0], ggml_repeat_back(ctx0, x[1], x[0]))));
-
-                check_gradient("repeat back", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY);
-            }
-        }
-
-        // abs (finite differences do not work)
-        //{
-        //    const int nargs = 1;
-
-        //    for (int ndims = 1; ndims <= 2; ++ndims) {
-        //        for (int i = 0; i < nargs; ++i) {
-        //            x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-        //            ggml_set_param(ctx0, x[i]);
-        //        }
-
-        //        struct ggml_tensor * f = ggml_sum(ctx0, ggml_abs(ctx0, x[0]));
-
-        //        check_gradient("abs", ctx0, x, f, ndims, nargs, 1e-3f, INFINITY, 1e-3f);
-        //    }
-        //}
-
-        // sgn
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor* f = ggml_sum(ctx0, ggml_sgn(ctx0, x[0]));
-
-                check_gradient("sgn", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
-            }
-        }
-
-        // neg
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor* f = ggml_sum(ctx0, ggml_neg(ctx0, x[0]));
-
-                check_gradient("neg", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
-            }
-        }
-
-        // step
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor* f = ggml_sum(ctx0, ggml_step(ctx0, x[0]));
-
-                check_gradient("step", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
-            }
-        }
-
-        // tanh, not yet fully implemented
-        if(0)
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor* f = ggml_sum(ctx0, ggml_tanh(ctx0, x[0]));
-
-                check_gradient("tanh", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
-            }
-        }
-
-        // mul_mat
-        {
-            srand(seed);
-            const int nargs = 2;
-
-            for (int ndims = 2; ndims <= 4; ++ndims) {
-                int max_nrep = (ndims >= 3) ? 2 : 1;
-                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                for (int nrep2 = 1; nrep2 < max_nrep; ++nrep2) {
-                    for (int nrep3 = 1; nrep3 < max_nrep; ++nrep3) {
-                        {
-                            int64_t ne2[4];
-                            get_random_dims(ne2, 4);
-                            ne2[0] = ne[0];
-                            ne2[2] = nrep2 * ne[2];
-                            ne2[3] = nrep3 * ne[3];
-                            x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
-                        }
-
-                        ggml_set_param(ctx0, x[0]);
-                        ggml_set_param(ctx0, x[1]);
-
-                        struct ggml_tensor * m = ggml_mul_mat(ctx0, x[1], x[0]);
-                        struct ggml_tensor * f = ggml_sum(ctx0, m);
-
-                        GGML_PRINT_DEBUG("testing: mul_mat, [%lld, %lld] (%d) * [%lld, %lld] (%d)\n", x[1]->ne[0], x[1]->ne[1], x[1]->n_dims, x[0]->ne[0], x[0]->ne[1], x[0]->n_dims);
-
-                        check_gradient("mul_mat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-                        if (ndims == 2) {
-                            // check_mat_mul does not support ndims > 2
-                            check_mat_mul(m, x[1], x[0]);
-                        }
-                    }
-                }
-            }
-        }
-
-        // elu, not yet fully implemented
-        if(0)
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor* f = ggml_sum(ctx0, ggml_elu(ctx0, x[0]));
-
-                check_gradient("elu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
-            }
-        }
-
-        // relu
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor* f = ggml_sum(ctx0, ggml_relu(ctx0, x[0]));
-
-                check_gradient("relu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-            }
-        }
-
-        // gelu, not yet fully implemented
-        if(0)
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor* f = ggml_sum(ctx0, ggml_gelu(ctx0, x[0]));
-
-                check_gradient("gelu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
-            }
-        }
-
-        // silu
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 2; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_silu(ctx0, x[0]));
-
-#ifdef GGML_SILU_FP16
-                // due to GGML_SILU_FP16 the finite difference method will be slightly wrong -> increase error bounds.
-                check_gradient("silu", ctx0, x, f, ndims, nargs, 1e-3f, 0.5, INFINITY);
-#else
-                check_gradient("silu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-#endif
-            }
-        }
-
-        // rms_norm
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 2; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_rms_norm(ctx0, x[0], 1e-6f));
-
-                check_gradient("rms_norm", ctx0, x, f, ndims, nargs, 1e-4f, 1.0f, INFINITY);
-            }
-        }
-
-        // scale
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 2; ++ndims) {
-                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-
-                const float s = -1.0f + 2.0f*frand();
-
-                ggml_set_param(ctx0, x[0]);
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_scale(ctx0, x[0], s));
-
-                check_gradient("scale", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-            }
-        }
-
-        // cpy f32
-        {
-            srand(seed);
-            const int nargs = 2;
-
-            for (int ndims = 1; ndims <= 2; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-                // x[1] is overwritten by x[0], so the gradients don't propagate to x[1]
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_cpy(ctx0, x[0], x[1]));
-
-                check_gradient("cpy f32", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-            }
-        }
-
-        // cpy f16
-        {
-            srand(seed);
-            const int nargs = 2;
-
-            for (int ndims = 1; ndims <= 2; ++ndims) {
-                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor_f16(ctx0, ndims, ne, -1.0f, 1.0f);
-                    ggml_set_param(ctx0, x[i]);
-                }
-                // x[1] is overwritten by x[0], so the gradients don't propagate to x[1]
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_cpy(ctx0, x[0], x[1]));
-
-                check_gradient("cpy f16", ctx0, x, f, ndims, nargs, 1e-1f, 1e-1f, INFINITY);
-            }
-        }
-
-        // reshape (1d->nd)
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 2; ++ndims) {
-                int64_t ne2[4];
-                ne2[0] = 1;
-                ne2[1] = 1;
-                ne2[2] = 1;
-                ne2[3] = 1;
-                for (int i = 0; i < ndims; ++i) {
-                    ne2[0] *= ne[i];
-                }
-                x[0] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
-                x[1] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[0]);
-
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_reshape(ctx0, x[0], x[1]));
-                check_gradient("reshape", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-            }
-        }
-
-        // reshape (nd->1d)
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            for (int ndims = 1; ndims <= 2; ++ndims) {
-                int64_t ne2[4];
-                ne2[0] = 1;
-                ne2[1] = 1;
-                ne2[2] = 1;
-                ne2[3] = 1;
-                for (int i = 0; i < ndims; ++i) {
-                    ne2[0] *= ne[i];
-                }
-                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[0]);
-
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_reshape(ctx0, x[0], x[1]));
-                check_gradient("reshape", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-            }
-        }
-
-        // acc 1d
-        {
-            srand(seed);
-            int64_t ne2[4] = { 1, 1, 1, 1 };
-
-            const int nargs = 2;
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-
-                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[0]);
-
-                get_random_dims(ne2, 1);
-                while ((ne2[0] > ne[0]) || (ne2[0] > ggml_nelements(x[0]))) {
-                    get_random_dims(ne2, 1);
-                }
-
-                x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[1]);
-
-                const int max_offset = MAX(0, ggml_nelements(x[0]) - ggml_nelements(x[1]));
-                const int offset = irand(max_offset) * ggml_element_size(x[0]);
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset));
-
-                check_gradient("acc 1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-            }
-        }
-
-        // acc 2d
-        {
-            srand(seed);
-            int64_t ne2[4]         = { 1, 1, 1, 1 };
-            int64_t max_offsets[4] = { 0, 0, 0, 0 };
-            int64_t offsets[4]     = { 0, 0, 0, 0 };
-
-            const int nargs = 2;
-            for (int ndims = 2; ndims <= 4; ++ndims) {
-
-                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[0]);
-
-                get_random_dims(ne2, 2);
-                while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[0]*ne2[1] > ggml_nelements(x[0]))) {
-                    get_random_dims(ne2, 2);
-                }
-
-                x[1] = get_random_tensor_f32(ctx0, 2, ne2, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[1]);
-
-                max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
-                max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]);
-                offsets[0] = irand(max_offsets[0]) * x[0]->nb[0];
-                offsets[1] = irand(max_offsets[1]) * x[0]->nb[1];
-                const int offset = offsets[0] + offsets[1];
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset));
-
-                check_gradient("acc 2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-            }
-        }
-
-        // acc 3d
-        {
-            srand(seed);
-            int64_t ne2[4]         = { 1, 1, 1, 1 };
-            int64_t max_offsets[4] = { 0, 0, 0, 0 };
-            int64_t offsets[4]     = { 0, 0, 0, 0 };
-
-            const int nargs = 2;
-            for (int ndims = 3; ndims <= 4; ++ndims) {
-
-                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[0]);
-
-                get_random_dims(ne2, 3);
-                while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[2] > ne[2]) || (ne2[0]*ne2[1]*ne2[2] > ggml_nelements(x[0]))) {
-                    get_random_dims(ne2, 3);
-                }
-
-                x[1] = get_random_tensor_f32(ctx0, 3, ne2, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[1]);
-
-                max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
-                max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]);
-                max_offsets[2] = MAX(0, x[0]->ne[2] - x[1]->ne[2]);
-                offsets[0] = irand(max_offsets[0]) * x[0]->nb[0];
-                offsets[1] = irand(max_offsets[1]) * x[0]->nb[1];
-                offsets[2] = irand(max_offsets[2]) * x[0]->nb[2];
-                const int offset = offsets[0] + offsets[1] + offsets[2];
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset));
-
-                check_gradient("acc 3d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-            }
-        }
-
-        // acc 4d
-        {
-            srand(seed);
-            int64_t ne2[4]         = { 1, 1, 1, 1 };
-            int64_t max_offsets[4] = { 0, 0, 0, 0 };
-            int64_t offsets[4]     = { 0, 0, 0, 0 };
-
-            const int nargs = 2;
-            for (int ndims = 4; ndims <= 4; ++ndims) {
-
-                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[0]);
-
-                get_random_dims(ne2, 4);
-                while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[2] > ne[2]) || (ne2[3] > ne[3]) || (ne2[0]*ne2[1]*ne2[2]*ne2[3] > ggml_nelements(x[0]))) {
-                    get_random_dims(ne2, 4);
-                }
-
-                x[1] = get_random_tensor_f32(ctx0, 4, ne2, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[1]);
-
-                max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
-                max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]);
-                max_offsets[2] = MAX(0, x[0]->ne[2] - x[1]->ne[2]);
-                max_offsets[3] = MAX(0, x[0]->ne[3] - x[1]->ne[3]);
-                offsets[0] = irand(max_offsets[0]) * x[0]->nb[0];
-                offsets[1] = irand(max_offsets[1]) * x[0]->nb[1];
-                offsets[2] = irand(max_offsets[2]) * x[0]->nb[2];
-                offsets[3] = irand(max_offsets[3]) * x[0]->nb[3];
-                const int offset = offsets[0] + offsets[1] + offsets[2] + offsets[3];
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset));
-
-                check_gradient("acc 4d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-            }
-        }
-
-        // set_1d
-        {
-            srand(seed);
-            int64_t ne2[4];
-
-            const int nargs = 2;
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-
-                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[0]);
-
-                get_random_dims(ne2, 1);
-                while ((ne2[0] > ne[0]) || (ne2[0] > ggml_nelements(x[0]))) {
-                    get_random_dims(ne2, 1);
-                }
-
-                x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[1]);
-
-                const int max_offset = MAX(0, ggml_nelements(x[0]) - ggml_nelements(x[1]));
-                const int offset = irand(max_offset) * ggml_element_size(x[0]);
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_set_1d(ctx0, x[0], x[1], offset));
-
-                check_gradient("set_1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-            }
-        }
-
-        // set_2d
-        {
-            srand(seed);
-            int64_t ne2[4];
-            int64_t max_offsets[4] = { 0, 0, 0, 0 };
-            int64_t offsets[4]     = { 0, 0, 0, 0 };
-
-            const int nargs = 1;
-            for (int ndims = 2; ndims <= 4; ++ndims) {
-
-                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[0]);
-
-                get_random_dims(ne2, 2);
-                while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[0]*ne2[1] > ggml_nelements(x[0]))) {
-                    get_random_dims(ne2, 2);
-                }
-
-                x[1] = get_random_tensor_f32(ctx0, 2, ne2, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[1]);
-
-                max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
-                max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]);
-                offsets[0] = irand(max_offsets[0]) * x[0]->nb[0];
-                offsets[1] = irand(max_offsets[1]) * x[0]->nb[1];
-                const int offset = offsets[0] + offsets[1];
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_set_2d(ctx0, x[0], x[1], x[1]->nb[1], offset));
-
-                check_gradient("set_2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-            }
-        }
-
-        // view_1d
-        {
-            srand(seed);
-            const int nargs = 1;
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-
-                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-
-                ggml_set_param(ctx0, x[0]);
-
-                const int k0 = irand(ggml_nelements(x[0]));
-                const int k1 = irand(ggml_nelements(x[0]));
-                const int i0 = MIN(k0, k1);
-                const int i1 = MAX(k0, k1);
-
-                const int offset = i0 * sizeof(float);
-                const int nelem  = i1 - i0;
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_view_1d(ctx0, x[0], nelem, offset));
-
-                check_gradient("view_1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-            }
-        }
-
-        // view_2d
-        {
-            srand(seed);
-            int64_t ne2[4];
-            int64_t nb2[4];
-
-            const int nargs = 1;
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-
-                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-
-                get_random_dims(ne2, 2);
-                while (ne2[0]*ne2[1] > ggml_nelements(x[0])) {
-                    get_random_dims(ne2, 2);
-                }
-                const int count = ne2[0]*ne2[1];
-
-                nb2[0] = sizeof(float);
-                nb2[1] = nb2[0]*ne2[0];
-
-                ggml_set_param(ctx0, x[0]);
-
-                const int max_offset = ggml_nelements(x[0]) - count;
-                const int offset = irand(max_offset+1) * sizeof(float);
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_view_2d(ctx0, x[0], ne2[0], ne2[1], nb2[1], offset));
-
-                check_gradient("view_2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-            }
-        }
-
-        // view_3d
-        {
-            srand(seed);
-            int64_t ne2[4] = {1,1,1,1};
-            int64_t nb2[4] = {0,0,0,0};
-
-            const int nargs = 1;
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-
-                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-
-                get_random_dims(ne2, 3);
-                while (ne2[0]*ne2[1]*ne2[2] > ggml_nelements(x[0])) {
-                    get_random_dims(ne2, 3);
-                }
-                const int count = ne2[0]*ne2[1]*ne2[2];
-
-                nb2[0] = sizeof(float);
-                nb2[1] = nb2[0]*ne2[0];
-                nb2[2] = nb2[1]*ne2[1];
-
-                ggml_set_param(ctx0, x[0]);
-
-                const int max_offset = ggml_nelements(x[0]) - count;
-                const int offset = irand(max_offset+1) * sizeof(float);
-
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_view_3d(ctx0, x[0], ne2[0], ne2[1], ne2[2], nb2[1], nb2[2], offset));
-
-                check_gradient("view_3d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-            }
-        }
-
-        // permute
-        {
-            srand(seed);
-            int64_t ne2[4];
-
-            const int nargs = 1;
-            for (int ndims = 1; ndims <= 4; ++ndims)
-            {
-                // ggml_permute will set axes of dimensions below n_dims to 1.
-                // to make ggml_permute work correctly on all axes,
-                // the input tensor needs maximal n_dim of 4.
-                for (int i=0; i<ndims; ++i) {
-                    ne2[i] = ne[i];
-                }
-                for (int i=ndims; i<4; ++i) {
-                    ne2[i] = 1;
-                }
-                x[0] = get_random_tensor_f32(ctx0, 4, ne2, -1.0f, 1.0f);
-
-                ggml_set_param(ctx0, x[0]);
-
-                const int p = irand(NUM_PERMUTATIONS);
-                const int ax0 = all_permutations[p*4+0];
-                const int ax1 = all_permutations[p*4+1];
-                const int ax2 = all_permutations[p*4+2];
-                const int ax3 = all_permutations[p*4+3];
-
-                // sum requires contiguous tensor rows
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_cont(ctx0, ggml_permute(ctx0, x[0], ax0, ax1, ax2, ax3)));
-
-                check_gradient("permute", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-            }
-        }
-
-        // transpose
-        {
-            srand(seed);
-            int64_t ne2[4];
-
-            const int nargs = 1;
-            for (int ndims = 1; ndims <= 4; ++ndims)
-            {
-                // ggml_transpose will set axes of dimensions below n_dims to 1.
-                // to make ggml_transpose work correctly on all axes,
-                // the input tensor needs maximal n_dim of 4.
-                for (int i=0; i<ndims; ++i) {
-                    ne2[i] = ne[i];
-                }
-                for (int i=ndims; i<4; ++i) {
-                    ne2[i] = 1;
-                }
-                x[0] = get_random_tensor_f32(ctx0, 4, ne2, -1.0f, 1.0f);
-
-                ggml_set_param(ctx0, x[0]);
-
-                // sum requires contiguous tensor rows
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, x[0])));
-
-                check_gradient("transpose", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-            }
-        }
-
-        // get_rows
-        {
-            srand(seed);
-            int64_t ne2[4] = {ne[0], ne[1], 1, 1};
-            int64_t ne3[4] = {1+irand(ne[1]), 1, 1, 1};
-            const int nargs = 1;
-            const int ndims = 2;
-            x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
-            x[1] = get_random_tensor_i32(ctx0, 1, ne3, 0, ne2[1]);
-
-            ggml_set_param(ctx0, x[0]);
-
-            struct ggml_tensor * f = ggml_sum(ctx0, ggml_get_rows(ctx0, x[0], x[1]));
-
-            check_gradient("get_rows", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-        }
-
-        // diag_mask_inf
-        {
-            srand(seed);
-            const int nargs = 1;
-            const int ndims = 2;
-
-            x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-            ggml_set_param(ctx0, x[0]);
-
-            int n_past = irand(ne[0]);
-
-            struct ggml_tensor * f = ggml_sum(ctx0, ggml_diag_mask_inf(ctx0, x[0], n_past));
-
-            check_gradient("diag_mask_inf", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-        }
-
-        // diag_mask_zero
-        {
-            srand(seed);
-            const int nargs = 1;
-            const int ndims = 2;
-
-            x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-            ggml_set_param(ctx0, x[0]);
-
-            int n_past = irand(ne[0]);
-
-            struct ggml_tensor * f = ggml_sum(ctx0, ggml_diag_mask_zero(ctx0, x[0], n_past));
-
-            check_gradient("diag_mask_zero", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-        }
-
-        // softmax
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            int64_t ne2[4];
-            get_random_dims(ne2, 4);
-
-            for (int ndims = 1; ndims <= 3; ++ndims) {
-                x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
-                ggml_set_param(ctx0, x[0]);
-
-                float eps = 1e-6f;
-                // dont use only sum as aggregation, because sum of softmax is always 1 -> finite differences should not work
-                // instead use sum(log(soft_max()*(1-eps)+eps)); use eps to avoid log(0)
-                struct ggml_tensor * f = ggml_sum(ctx0,
-                                            ggml_log(ctx0,
-                                                ggml_add1(ctx0,
-                                                    ggml_scale(ctx0,
-                                                        ggml_soft_max(ctx0, x[0]),
-                                                        1.0f - eps),
-                                                    ggml_new_f32(ctx0, eps))));
-
-                check_gradient("softmax", ctx0, x, f, ndims, nargs, 1e-3f, 2e-1f, INFINITY);
-                // NOTE: softmax forward is computed using f16 table lookup instead of using actual expf, but backward assumes actual expf.
-                // this may result in different gradients too finite differences.
-                // when this test reports errors, first try to replace the table lookup with actual expf and test again to see if just that was the cause.
-                // if only the table lookup causes gradients to differ this is acceptable.
-            }
-        }
-
-        // cross_entropy_loss
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            int64_t ne2[4];
-            get_random_dims(ne2, 4);
-
-            for (int ndims = 1; ndims <= 4; ++ndims) {
-                x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -0.1f, 0.1f);
-                x[1] = get_random_tensor_f32(ctx0, ndims, ne2, 0.0f, 1.0f);
-                // the second argument to cross_entropy_loss must sum up to 1 for each row
-                int nr = ggml_nrows(x[1]);
-                int nc = ggml_nelements(x[1]) / nr;
-                for (int ir = 0; ir < nr; ++ir) {
-                    float sum = 0;
-                    for (int ic = 0; ic < nc; ++ic) {
-                        sum += ((float *) x[1]->data)[ic + ir*nc];
-                    }
-                    for (int ic = 0; ic < nc; ++ic) {
-                        ((float *) x[1]->data)[ic + ir*nc] /= sum;
-                    }
-                }
-                ggml_set_param(ctx0, x[0]);
-
-                struct ggml_tensor * f = ggml_cross_entropy_loss(ctx0, x[0], x[1]);
-
-                check_gradient("cross_entropy_loss", ctx0, x, f, ndims, nargs, 1e-4f, 1e-3f, INFINITY);
-            }
-        }
-
-        // rope f32
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            int64_t ne2[4];
-            get_random_dims(ne2, 4);
-            ne2[0] += ne2[0] % 2;
-            int n_rot = ne2[0];
-
-            for (int ndims = 3; ndims <= 4; ++ndims) {
-                for (int mode = 0; mode < 4; ++mode) {
-                    for (int n_past = 1; n_past < ne2[2]; ++n_past) {
-                        x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
-
-                        struct ggml_tensor * p = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne2[2]);
-                        for (int i = 0; i < ne2[2]; ++i) {
-                            ((int32_t *) p->data)[i] = n_past + i;
-                        }
-
-                        ggml_set_param(ctx0, x[0]);
-
-                        const bool skip_past = (mode & 1);
-                        if (skip_past) {
-                            // we have no past, so this would have to work on uninitialized memory.
-                            // we only test the gradients here;
-                            // skip_past should have no influence on gradient computation.
-                            // so when other modes work, we assume that this does as well.
-                            continue;
-                        }
-
-                        struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], p, n_rot, mode, 0));
-
-                        GGML_PRINT_DEBUG("rope f32: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode);
-                        check_gradient("rope f32", ctx0, x, f, ndims, nargs, 1e-2f, 1e-3f, INFINITY);
-                    }
-                }
-            }
-        }
-
-        // rope f16
-        {
-            srand(seed);
-            const int nargs = 1;
-
-            int64_t ne2[4];
-            get_random_dims(ne2, 4);
-            ne2[0] += ne2[0] % 2;
-            int n_rot = ne2[0];
-
-            for (int ndims = 3; ndims <= 4; ++ndims) {
-                for (int mode = 0; mode < 4; ++mode) {
-                    for (int n_past = 1; n_past < ne2[2]; ++n_past) {
-                        x[0] = get_random_tensor_f16(ctx0, ndims, ne2, -1.0f, 1.0f);
-
-                        struct ggml_tensor * p = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne2[2]);
-                        for (int i = 0; i < ne2[2]; ++i) {
-                            ((int32_t *) p->data)[i] = n_past + i;
-                        }
-
-                        ggml_set_param(ctx0, x[0]);
-
-                        const bool skip_past = (mode & 1);
-                        if (skip_past) {
-                            // we have no past, so this would have to work on uninitialized memory.
-                            // we only test the gradients here;
-                            // skip_past should have no influence on gradient computation.
-                            // so when other modes work, we assume that this does as well.
-                            continue;
-                        }
-
-                        struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], p, n_rot, mode, 0));
-
-                        GGML_PRINT_DEBUG("rope f16: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode);
-                        check_gradient("rope f16", ctx0, x, f, ndims, nargs, 1e-1f, 1e-1f, INFINITY);
-                    }
-                }
-            }
-        }
-
-        // flash_attn f32
-        {
-            srand(seed);
-            const int nargs = 3;
-
-            int64_t ne2[4];
-
-            get_random_dims(ne2, 4);
-            int64_t D = ne2[0];
-            int64_t N = ne2[1];
-            int64_t M = ne2[2] + N;
-            int64_t B = ne2[3];
-
-            for (int masked = 0; masked <= 1; ++masked) {
-                for (int ndims = 2; ndims <= 4; ++ndims) {
-                    int max_nrep = (ndims >= 3) ? 2 : 1;
-                    for (int nrep = 1; nrep < max_nrep; ++nrep) {
-                        int64_t neq[4] = { D, N, B*nrep, ne[3] };
-                        int64_t nek[4] = { D, M, B, ne[3] };
-                        int64_t nev[4] = { M, D, B, ne[3] };
-                        if (ndims == 2) {
-                            neq[2] = 1; neq[3] = 1;
-                            nek[2] = 1; nek[3] = 1;
-                            nev[2] = 1; nev[3] = 1;
-                        } else if (ndims == 3) {
-                            neq[3] = 1;
-                            nek[3] = 1;
-                            nev[3] = 1;
-                        }
-                        x[0] = get_random_tensor_f32(ctx0, ndims, neq, -0.1250f, 0.1250f);
-                        x[1] = get_random_tensor_f32(ctx0, ndims, nek, -0.1250f, 0.1250f);
-                        x[2] = get_random_tensor_f32(ctx0, ndims, nev, -0.1250f, 0.1250f);
-                        ggml_set_param(ctx0, x[0]);
-                        ggml_set_param(ctx0, x[1]);
-                        ggml_set_param(ctx0, x[2]);
-
-                        struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0)));
-
-                        check_gradient("flash_attn f32", ctx0, x, f, ndims, nargs, 1.5e-4f, 1e-3f, INFINITY);
-                    }
-                }
-            }
-        }
-
-        // flash_attn f16, not yet fully implemented
-        if(0)
-        {
-            srand(seed);
-            const int nargs = 3;
-
-            int64_t ne2[4];
-
-            get_random_dims(ne2, 4);
-            int64_t D = ne2[0];
-            int64_t N = ne2[1];
-            int64_t M = ne2[2] + N;
-            int64_t B = ne2[3];
-
-            for (int masked = 0; masked <= 1; ++masked) {
-                for (int ndims = 2; ndims <= 4; ++ndims) {
-                    int64_t neq[4] = { D, N, B, ne[3] };
-                    int64_t nek[4] = { D, M, B, ne[3] };
-                    int64_t nev[4] = { M, D, B, ne[3] };
-                    if (ndims == 2) {
-                        neq[2] = 1; neq[3] = 1;
-                        nek[2] = 1; nek[3] = 1;
-                        nev[2] = 1; nev[3] = 1;
-                    } else if (ndims == 3) {
-                        neq[3] = 1;
-                        nek[3] = 1;
-                        nev[3] = 1;
-                    }
-                    x[0] = get_random_tensor_f16(ctx0, ndims, neq, -0.1250f, 0.1250f);
-                    x[1] = get_random_tensor_f16(ctx0, ndims, nek, -0.1250f, 0.1250f);
-                    x[2] = get_random_tensor_f16(ctx0, ndims, nev, -0.1250f, 0.1250f);
-                    ggml_set_param(ctx0, x[0]);
-                    ggml_set_param(ctx0, x[1]);
-                    ggml_set_param(ctx0, x[2]);
-
-                    struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0)));
-
-                    check_gradient("flash_attn f16", ctx0, x, f, ndims, nargs, 1.5e-4f, 1e-3f, INFINITY);
-                }
-            }
-        }
-        ggml_free(ctx0);
-    }
-
-    return 0;
-}
diff --git a/ggml/tests/test-mul-mat.cpp b/ggml/tests/test-mul-mat.cpp
deleted file mode 100644
index 7380c97..0000000
--- a/ggml/tests/test-mul-mat.cpp
+++ /dev/null
@@ -1,369 +0,0 @@
-#include "ggml.h"
-#include "ggml/ggml-alloc.h"
-#include "ggml/ggml-backend.h"
-
-//#define GGML_USE_CUBLAS // uncomment this to use cuda backend, make sure build ggml lib with GGML_CUBLAS=ON
-
-#ifdef GGML_USE_CUBLAS
-#include "ggml-cuda.h"
-#endif
-
-#ifdef GGML_USE_METAL
-#include "ggml-metal.h"
-#endif
-
-#include <cassert>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <fstream>
-#include <map>
-#include <string>
-#include <vector>
-
-static void ggml_log_callback_default(ggml_log_level level, const char * text, void * user_data) {
-    (void) level;
-    (void) user_data;
-    fputs(text, stderr);
-    fflush(stderr);
-}
-
-struct test_model {
-    struct ggml_tensor * a;
-    struct ggml_tensor * b;
-    ggml_backend_t backend = NULL;
-    ggml_backend_buffer_t buffer;
-    struct ggml_context * ctx;
-};
-
-void load_model(test_model & model, float* a, float* b, int M, int N, int K, bool use_gpu = false) {
-    size_t buffer_size = 0;
-    {
-        buffer_size += (M * N) * ggml_type_size(GGML_TYPE_F32); // tensor a
-        buffer_size += (N * K) * ggml_type_size(GGML_TYPE_F32); // tensor b
-        buffer_size += 1024; // overhead
-    }
-
-    printf("%s: ggml tensor size    = %d bytes\n", __func__, (int) sizeof(ggml_tensor));
-    printf("%s: backend buffer size = %d bytes\n", __func__, (int) buffer_size);
-
-    int num_tensors = 2;
-    struct ggml_init_params params {
-            /*.mem_size   =*/ ggml_tensor_overhead() * num_tensors,
-            /*.mem_buffer =*/ NULL,
-            /*.no_alloc   =*/ true,
-    };
-
-    // initialize the backend
-#ifdef GGML_USE_CUBLAS
-    if (use_gpu) {
-        fprintf(stderr, "%s: using CUDA backend\n", __func__);
-        model.backend = ggml_backend_cuda_init(0);
-        if (!model.backend) {
-            fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
-        }
-    }
-#endif
-
-#ifdef GGML_USE_METAL
-    if (use_gpu) {
-        fprintf(stderr, "%s: using Metal backend\n", __func__);
-        ggml_backend_metal_log_set_callback(ggml_log_callback_default, nullptr);
-        model.backend = ggml_backend_metal_init();
-        if (!model.backend) {
-            fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
-        }
-    }
-#endif
-
-    if(!model.backend) {
-        // fallback to CPU backend
-        model.backend = ggml_backend_cpu_init();
-    }
-
-    model.buffer = ggml_backend_alloc_buffer(model.backend, buffer_size);
-
-    // create context
-    model.ctx = ggml_init(params);
-
-    // create tensors
-    model.a = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, K, M);
-    printf("Matrix A: [%i, %i]\n", K, M);
-    model.b = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, K, N);
-    printf("Matrix B: [%i, %i]\n", K, N);
-
-    // create a allocator
-    ggml_allocr * alloc = ggml_allocr_new_from_buffer(model.buffer);
-
-    // alloc memory
-    ggml_allocr_alloc(alloc, model.a);
-
-    // load data to buffer
-    if(ggml_backend_is_cpu(model.backend)
-#ifdef GGML_USE_METAL
-                || ggml_backend_is_metal(model.backend)
-#endif
-    ) {
-        memcpy(model.a->data, a, ggml_nbytes(model.a));
-    } else {
-        ggml_backend_tensor_set(model.a, a, 0, ggml_nbytes(model.a)); // cuda requires copy the data directly to device
-    }
-
-    // alloc memory
-    ggml_allocr_alloc(alloc, model.b);
-
-    if(ggml_backend_is_cpu(model.backend)
-#ifdef GGML_USE_METAL
-                || ggml_backend_is_metal(model.backend)
-#endif
-    ) {
-        memcpy(model.b->data, b, ggml_nbytes(model.b));
-    } else {
-        ggml_backend_tensor_set(model.b, b, 0, ggml_nbytes(model.b));  // cuda requires copy the data directly to device
-    }
-
-    ggml_allocr_free(alloc);
-}
-
-struct ggml_cgraph * build_graph(const test_model& model, struct ggml_allocr * allocr) {
-    static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
-    static std::vector<uint8_t> buf(buf_size);
-
-    struct ggml_init_params params0 = {
-        /*.mem_size   =*/ buf_size,
-        /*.mem_buffer =*/ buf.data(),
-        /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph()
-    };
-
-    // create a temporally context to build the graph
-    struct ggml_context * ctx0 = ggml_init(params0);
-
-    struct ggml_cgraph * gf = ggml_new_graph(ctx0);
-
-    // zT = x @ yT
-    struct ggml_tensor * result = ggml_mul_mat(ctx0, model.a, ggml_cont(ctx0, model.b));
-
-    // z = (zT)T
-    ggml_build_forward_expand(gf, ggml_cont(ctx0, ggml_transpose(ctx0, result)));
-
-    // delete the temporally context used to build the graph
-    ggml_free(ctx0);
-    return gf;
-}
-
-struct ggml_tensor* compute(const test_model & model, struct ggml_allocr * allocr) {
-    // reset the allocator to free all the memory allocated during the previous inference
-    ggml_allocr_reset(allocr);
-
-    struct ggml_cgraph * gf = build_graph(model, allocr);
-
-    // allocate tensors
-    ggml_allocr_alloc_graph(allocr, gf);
-    int n_threads = 1;
-
-    if (ggml_backend_is_cpu(model.backend)) {
-        ggml_backend_cpu_set_n_threads(model.backend, n_threads);
-    }
-
-#ifdef GGML_USE_METAL
-    if (ggml_backend_is_metal(model.backend)) {
-        ggml_backend_metal_set_n_cb(model.backend, n_threads);
-    }
-#endif
-
-    ggml_backend_graph_compute(model.backend, gf);
-
-    //ggml_graph_print(gf);
-
-    // in this case, the output tensor is the last one in the graph
-    return gf->nodes[gf->n_nodes - 1];
-}
-
-
-static void ggml_vec_dot_f16(const int n, float * s, float * x, float * y) {
-    float sumf = 0.0;
-    for (int i = 0; i < n; ++i) {
-        sumf += x[i] * y[i];
-    }
-    *s = sumf;
-}
-
-static void gemm_f16_out_f32(int m, int n, int k,
-                             float * A,
-                             float * B,
-                             float * C,
-                             const int ith, const int nth) {
-    // does not seem to make a difference
-    int m0, m1, n0, n1;
-    // patches per thread
-    if (m > n) {
-        n0 = 0;
-        n1 = n;
-
-        // total patches in dst
-        const int np = m;
-
-        // patches per thread
-        const int dp = (np + nth - 1)/nth;
-
-        // patch range for this thread
-        m0 = dp*ith;
-        m1 = std::min(m0 + dp, np);
-    } else {
-        m0 = 0;
-        m1 = m;
-
-        // total patches in dst
-        const int np = n;
-
-        // patches per thread
-        const int dp = (np + nth - 1)/nth;
-
-        // patch range for this thread
-        n0 = dp*ith;
-        n1 = std::min(n0 + dp, np);
-    }
-
-    // block-tiling attempt
-    int64_t blck_n = 16;
-    int64_t blck_m = 16;
-
-    for (int j = n0; j < n1; j+=blck_n) {
-        for (int i = m0; i < m1; i+=blck_m) {
-            // printf("i j k => %d %d %d\n", i, j, K);
-            for (int ii = i; ii < i + blck_m && ii < m1; ii++) {
-                for (int jj = j; jj < j + blck_n && jj < n1; jj++) {
-                    ggml_vec_dot_f16(k,
-                                    C + ii*n + jj,
-                                    A + ii * k,
-                                    B + jj * k);
-                }
-            }
-        }
-    }
-}
-
-
-void perform_gemm_test(float* a, float* b, float* expected, int M, int N, int K) {
-    printf("\nPerforming gemm_f16_out_f32 test:\n");
-
-    float* gemm_out = new float[M * N];
-    gemm_f16_out_f32(M, N, K, a, b, gemm_out, 0, 1);
-
-    for (int i = 0; i < M; i++) {
-        for (int j = 0; j < N; j++) {
-            printf("%.1ff,", gemm_out[i * N + j]);
-        }
-        printf("\n");
-    }
-
-    bool passed = true;
-
-    for(int i = 0; i < M * N; i++) {
-        if(gemm_out[i] != expected[i]) {
-            passed = false;
-            break;
-        }
-    }
-
-    printf("gemm_mult (%i): %s\n", (M * N), passed ? "\033[32mPASSED\033[0m" : "\033[31mFAILED\033[0m");
-}
-
-int main(void)
-{
-    ggml_time_init();
-    const int M = 4, N = 16, K = 36;  // a conv2d expected matrix multiplication
-
-    // matrix A (4 X 36)
-    float matrixA[M * K] = {
-       2.0f, 9.0f, 2.0f, 10.0f, 6.0f, 4.0f, 3.0f, 6.0f, 3.0f, 6.0f, 9.0f, 7.0f, 8.0f, 8.0f, 3.0f, 3.0f, 10.0f, 5.0f, 2.0f, 10.0f, 7.0f, 10.0f, 9.0f, 3.0f, 6.0f, 6.0f, 5.0f, 10.0f, 2.0f, 3.0f, 6.0f, 1.0f, 9.0f, 4.0f, 10.0f, 4.0f,
-        10.0f, 7.0f, 8.0f, 10.0f, 10.0f, 8.0f, 7.0f, 10.0f, 4.0f, 6.0f, 8.0f, 7.0f, 7.0f, 6.0f, 9.0f, 3.0f, 6.0f, 5.0f, 5.0f, 2.0f, 7.0f, 2.0f, 7.0f, 4.0f, 4.0f, 6.0f, 6.0f, 4.0f, 3.0f, 9.0f, 3.0f, 6.0f, 4.0f, 7.0f, 2.0f, 9.0f,
-        7.0f, 3.0f, 2.0f, 5.0f, 7.0f, 3.0f, 10.0f, 2.0f, 6.0f, 1.0f, 4.0f, 7.0f, 5.0f, 10.0f, 3.0f, 10.0f, 4.0f, 5.0f, 5.0f, 1.0f, 6.0f, 10.0f, 7.0f, 4.0f, 5.0f, 3.0f, 9.0f, 9.0f, 8.0f, 6.0f, 9.0f, 2.0f, 3.0f, 6.0f, 8.0f, 5.0f,
-        5.0f, 5.0f, 5.0f, 5.0f, 3.0f, 10.0f, 4.0f, 1.0f, 8.0f, 8.0f, 9.0f, 8.0f, 4.0f, 1.0f, 4.0f, 9.0f, 3.0f, 6.0f, 3.0f, 1.0f, 4.0f, 8.0f, 3.0f, 10.0f, 8.0f, 6.0f, 4.0f, 5.0f, 4.0f, 3.0f, 2.0f, 2.0f, 4.0f, 3.0f, 6.0f, 4.0f,
-    };
-
-    // matrix B (16 X 36)
-    float matrixB[N * K] = {
-        9.0f, 7.0f, 1.0f, 3.0f, 5.0f, 9.0f, 7.0f, 6.0f, 1.0f, 10.0f, 1.0f, 1.0f, 7.0f, 2.0f, 4.0f, 9.0f, 10.0f, 4.0f, 5.0f, 5.0f, 7.0f, 1.0f, 7.0f, 7.0f, 2.0f, 9.0f, 5.0f, 10.0f, 7.0f, 4.0f, 8.0f, 9.0f, 9.0f, 3.0f, 10.0f, 2.0f,
-        4.0f, 6.0f, 10.0f, 9.0f, 5.0f, 1.0f, 8.0f, 7.0f, 4.0f, 7.0f, 2.0f, 6.0f, 5.0f, 3.0f, 1.0f, 10.0f, 8.0f, 4.0f, 8.0f, 3.0f, 7.0f, 1.0f, 2.0f, 7.0f, 6.0f, 8.0f, 6.0f, 5.0f, 2.0f, 3.0f, 1.0f, 1.0f, 2.0f, 5.0f, 7.0f, 1.0f,
-        8.0f, 2.0f, 8.0f, 8.0f, 8.0f, 8.0f, 4.0f, 4.0f, 6.0f, 10.0f, 10.0f, 9.0f, 2.0f, 9.0f, 3.0f, 7.0f, 7.0f, 1.0f, 4.0f, 9.0f, 1.0f, 2.0f, 3.0f, 6.0f, 1.0f, 10.0f, 5.0f, 8.0f, 9.0f, 4.0f, 6.0f, 2.0f, 3.0f, 1.0f, 2.0f, 7.0f,
-        5.0f, 1.0f, 7.0f, 2.0f, 9.0f, 10.0f, 9.0f, 5.0f, 2.0f, 5.0f, 4.0f, 10.0f, 9.0f, 9.0f, 1.0f, 9.0f, 8.0f, 8.0f, 9.0f, 4.0f, 9.0f, 4.0f, 8.0f, 2.0f, 1.0f, 8.0f, 4.0f, 5.0f, 10.0f, 7.0f, 6.0f, 2.0f, 1.0f, 10.0f, 10.0f, 7.0f,
-        9.0f, 4.0f, 5.0f, 9.0f, 5.0f, 10.0f, 10.0f, 3.0f, 6.0f, 6.0f, 4.0f, 4.0f, 4.0f, 8.0f, 5.0f, 4.0f, 9.0f, 1.0f, 9.0f, 9.0f, 1.0f, 7.0f, 9.0f, 2.0f, 10.0f, 9.0f, 10.0f, 8.0f, 3.0f, 3.0f, 9.0f, 3.0f, 9.0f, 10.0f, 1.0f, 8.0f,
-        9.0f, 2.0f, 6.0f, 9.0f, 7.0f, 2.0f, 3.0f, 5.0f, 3.0f, 6.0f, 9.0f, 7.0f, 3.0f, 7.0f, 6.0f, 4.0f, 10.0f, 3.0f, 5.0f, 7.0f, 2.0f, 9.0f, 3.0f, 2.0f, 2.0f, 10.0f, 8.0f, 7.0f, 3.0f, 10.0f, 6.0f, 3.0f, 1.0f, 1.0f, 4.0f, 10.0f,
-        2.0f, 9.0f, 2.0f, 10.0f, 6.0f, 4.0f, 3.0f, 6.0f, 3.0f, 6.0f, 9.0f, 7.0f, 8.0f, 8.0f, 3.0f, 3.0f, 10.0f, 5.0f, 2.0f, 10.0f, 7.0f, 10.0f, 9.0f, 3.0f, 6.0f, 6.0f, 5.0f, 10.0f, 2.0f, 3.0f, 6.0f, 1.0f, 9.0f, 4.0f, 10.0f, 4.0f,
-        10.0f, 7.0f, 8.0f, 10.0f, 10.0f, 8.0f, 7.0f, 10.0f, 4.0f, 6.0f, 8.0f, 7.0f, 7.0f, 6.0f, 9.0f, 3.0f, 6.0f, 5.0f, 5.0f, 2.0f, 7.0f, 2.0f, 7.0f, 4.0f, 4.0f, 6.0f, 6.0f, 4.0f, 3.0f, 9.0f, 3.0f, 6.0f, 4.0f, 7.0f, 2.0f, 9.0f,
-        7.0f, 3.0f, 2.0f, 5.0f, 7.0f, 3.0f, 10.0f, 2.0f, 6.0f, 1.0f, 4.0f, 7.0f, 5.0f, 10.0f, 3.0f, 10.0f, 4.0f, 5.0f, 5.0f, 1.0f, 6.0f, 10.0f, 7.0f, 4.0f, 5.0f, 3.0f, 9.0f, 9.0f, 8.0f, 6.0f, 9.0f, 2.0f, 3.0f, 6.0f, 8.0f, 5.0f,
-        5.0f, 5.0f, 5.0f, 5.0f, 3.0f, 10.0f, 4.0f, 1.0f, 8.0f, 8.0f, 9.0f, 8.0f, 4.0f, 1.0f, 4.0f, 9.0f, 3.0f, 6.0f, 3.0f, 1.0f, 4.0f, 8.0f, 3.0f, 10.0f, 8.0f, 6.0f, 4.0f, 5.0f, 4.0f, 3.0f, 2.0f, 2.0f, 4.0f, 3.0f, 6.0f, 4.0f,
-        6.0f, 2.0f, 3.0f, 3.0f, 3.0f, 7.0f, 5.0f, 1.0f, 8.0f, 1.0f, 4.0f, 5.0f, 1.0f, 1.0f, 6.0f, 4.0f, 2.0f, 1.0f, 7.0f, 8.0f, 6.0f, 1.0f, 1.0f, 5.0f, 6.0f, 5.0f, 10.0f, 6.0f, 7.0f, 5.0f, 9.0f, 3.0f, 2.0f, 7.0f, 9.0f, 4.0f,
-        2.0f, 5.0f, 9.0f, 5.0f, 10.0f, 3.0f, 1.0f, 8.0f, 1.0f, 7.0f, 1.0f, 8.0f, 1.0f, 6.0f, 7.0f, 8.0f, 4.0f, 9.0f, 5.0f, 10.0f, 3.0f, 7.0f, 6.0f, 8.0f, 8.0f, 5.0f, 6.0f, 8.0f, 10.0f, 9.0f, 4.0f, 1.0f, 3.0f, 3.0f, 4.0f, 7.0f,
-        8.0f, 2.0f, 6.0f, 6.0f, 5.0f, 1.0f, 3.0f, 7.0f, 1.0f, 7.0f, 2.0f, 2.0f, 2.0f, 8.0f, 4.0f, 1.0f, 1.0f, 5.0f, 9.0f, 4.0f, 1.0f, 2.0f, 3.0f, 10.0f, 1.0f, 4.0f, 9.0f, 9.0f, 6.0f, 8.0f, 8.0f, 1.0f, 9.0f, 10.0f, 4.0f, 1.0f,
-        8.0f, 5.0f, 8.0f, 9.0f, 4.0f, 8.0f, 2.0f, 1.0f, 1.0f, 9.0f, 4.0f, 5.0f, 6.0f, 1.0f, 2.0f, 5.0f, 6.0f, 7.0f, 3.0f, 1.0f, 4.0f, 6.0f, 7.0f, 7.0f, 7.0f, 8.0f, 7.0f, 8.0f, 8.0f, 2.0f, 10.0f, 2.0f, 7.0f, 3.0f, 8.0f, 3.0f,
-        8.0f, 7.0f, 6.0f, 2.0f, 4.0f, 10.0f, 10.0f, 6.0f, 10.0f, 3.0f, 7.0f, 6.0f, 4.0f, 3.0f, 5.0f, 5.0f, 5.0f, 3.0f, 8.0f, 10.0f, 3.0f, 4.0f, 8.0f, 4.0f, 2.0f, 6.0f, 8.0f, 9.0f, 6.0f, 9.0f, 4.0f, 3.0f, 5.0f, 2.0f, 2.0f, 6.0f,
-        10.0f, 6.0f, 2.0f, 1.0f, 7.0f, 5.0f, 6.0f, 4.0f, 1.0f, 9.0f, 10.0f, 2.0f, 4.0f, 5.0f, 8.0f, 5.0f, 7.0f, 4.0f, 7.0f, 6.0f, 3.0f, 9.0f, 2.0f, 1.0f, 4.0f, 2.0f, 6.0f, 6.0f, 3.0f, 3.0f, 2.0f, 8.0f, 5.0f, 9.0f, 3.0f, 4.0f,
-    };
-
-    // matrix C (4 x 16)
-    float expected_result[M * N] = {
-        1224.0f, 1023.0f, 1158.0f,1259.0f,1359.0f,1194.0f,1535.0f,1247.0f,1185.0f,1029.0f,889.0f,1182.0f,955.0f,1179.0f,1147.0f,1048.0f,
-        1216.0f, 1087.0f, 1239.0f,1361.0f,1392.0f,1260.0f,1247.0f,1563.0f,1167.0f,1052.0f,942.0f,1214.0f,1045.0f,1134.0f,1264.0f,1126.0f,
-        1125.0f, 966.0f, 1079.0f,1333.0f,1287.0f,1101.0f,1185.0f,1167.0f,1368.0f,990.0f,967.0f,1121.0f,971.0f,1086.0f,1130.0f,980.0f,
-        999.0f, 902.0f, 1020.0f,1056.0f,1076.0f,929.0f,1029.0f,1052.0f,990.0f,1108.0f,823.0f,989.0f,759.0f,1041.0f,1003.0f,870.0f
-    };
-
-    bool passed = true;
-
-    perform_gemm_test(matrixA, matrixB, expected_result, M, N, K);
-
-    test_model model;
-    load_model(model, matrixA, matrixB, M, N, K, true);
-
-    ggml_backend_buffer_t buf_compute; // for compute
-    struct ggml_allocr * allocr = NULL;
-
-    {
-        allocr = ggml_allocr_new_measure_from_backend(model.backend);
-
-        //create the worst case graph for memory usage estimation
-        struct ggml_cgraph * gf = build_graph(model, allocr);
-        size_t mem_size = ggml_allocr_alloc_graph(allocr, gf);
-        ggml_allocr_free(allocr);
-
-        // compute the required memory
-        buf_compute = ggml_backend_alloc_buffer(model.backend, mem_size);
-        allocr = ggml_allocr_new_from_buffer(buf_compute);
-        fprintf(stderr, "%s: compute buffer size: %.4f KB\n", __func__, mem_size/1024.0);
-    }
-
-    struct ggml_tensor * result = compute(model, allocr);
-
-    float* out_data = new float[ggml_nelements(result)];
-
-    ggml_backend_tensor_get(result, out_data, 0, ggml_nbytes(result));
-
-    printf("\nPerforming ggml_mul_mat test:\n");
-
-    passed = true;
-    for(int i = 0; i < M * N; i++) {
-        if(out_data[i] != expected_result[i]) {
-            passed = false;
-            break;
-        }
-    }
-
-    for (int i = 0; i < M; i++) {
-        for (int j = 0; j < N; j++) {
-            printf("%.1f ", out_data[i * N + j]);
-        }
-        printf("\n");
-    }
-
-    printf("ggml_mul_mat (%d): %s\n", (int) ggml_nelements(result), passed && (ggml_nelements(result) == M * N) ? "\033[32mPASSED\033[0m" : "\033[31mFAILED\033[0m");
-
-   // free memory
-    ggml_free(model.ctx);
-
-    ggml_backend_buffer_free(model.buffer);
-    ggml_backend_buffer_free(buf_compute);
-    ggml_backend_free(model.backend);
-    return 0;
-}
diff --git a/ggml/tests/test-mul-mat0.c b/ggml/tests/test-mul-mat0.c
deleted file mode 100644
index ee52b7a..0000000
--- a/ggml/tests/test-mul-mat0.c
+++ /dev/null
@@ -1,336 +0,0 @@
-#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows
-#include "ggml/ggml.h"
-
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <assert.h>
-#include <inttypes.h>
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-#define MAX_NARGS 2
-
-float frand(void) {
-    return (float)rand()/(float)RAND_MAX;
-}
-
-int irand(int n) {
-    return rand()%n;
-}
-
-void get_random_dims(int64_t * dims, int ndims) {
-    dims[0] = dims[1] = dims[2] = dims[3] = 1;
-
-    for (int i = 0; i < ndims; i++) {
-        dims[i] = 1 + irand(4);
-    }
-}
-
-struct ggml_tensor * get_random_tensor(
-        struct ggml_context * ctx0,
-        int ndims,
-        int64_t ne[],
-        float fmin,
-        float fmax) {
-    struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F32, ndims, ne);
-
-    switch (ndims) {
-        case 1:
-            for (int i0 = 0; i0 < ne[0]; i0++) {
-                ((float *)result->data)[i0] = frand()*(fmax - fmin) + fmin;
-            }
-            break;
-        case 2:
-            for (int i1 = 0; i1 < ne[1]; i1++) {
-                for (int i0 = 0; i0 < ne[0]; i0++) {
-                    ((float *)result->data)[i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
-                }
-            }
-            break;
-        case 3:
-            for (int i2 = 0; i2 < ne[2]; i2++) {
-                for (int i1 = 0; i1 < ne[1]; i1++) {
-                    for (int i0 = 0; i0 < ne[0]; i0++) {
-                        ((float *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
-                    }
-                }
-            }
-            break;
-        case 4:
-            for (int i3 = 0; i3 < ne[3]; i3++) {
-                for (int i2 = 0; i2 < ne[2]; i2++) {
-                    for (int i1 = 0; i1 < ne[1]; i1++) {
-                        for (int i0 = 0; i0 < ne[0]; i0++) {
-                            ((float *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
-                        }
-                    }
-                }
-            }
-            break;
-        default:
-            assert(false);
-    };
-
-    return result;
-}
-
-float get_element(const struct ggml_tensor * t, int idx) {
-    return ((float *)t->data)[idx];
-}
-
-void set_element(struct ggml_tensor * t, int idx, float value) {
-    ((float *)t->data)[idx] = value;
-}
-
-bool check_gradient(
-        const char * op_name,
-        struct ggml_context * ctx0,
-        struct ggml_tensor * x[],
-        struct ggml_tensor * f,
-        int ndims,
-        int nargs,
-        float eps,
-        float max_error_abs,
-        float max_error_rel) {
-    const int n_threads = 1;
-
-    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, GGML_DEFAULT_GRAPH_SIZE, true);
-    ggml_build_forward_expand(gf, f);
-    struct ggml_cgraph * gb = ggml_graph_dup(ctx0, gf);
-    ggml_build_backward_expand(ctx0, gf, gb, false);
-
-    ggml_graph_compute_with_ctx(ctx0, gf, n_threads);
-    ggml_graph_reset  (gf);
-    ggml_set_f32      (f->grad, 1.0f);
-    ggml_graph_compute_with_ctx(ctx0, gb, n_threads);
-
-    ggml_graph_dump_dot(gf, NULL, "test-grad0-forward.dot");
-    ggml_graph_dump_dot(gb, gf,   "test-grad0-backward.dot");
-
-    for (int i = 0; i < nargs; ++i) {
-        const int64_t nelements = ggml_nelements(x[i]);
-        for (int64_t k = 0; k < nelements; ++k) {
-            // compute gradient using finite differences
-            const float x0 = get_element(x[i], k);
-
-            set_element(x[i], k, x0 + eps);
-            ggml_graph_compute_with_ctx(ctx0, gf, n_threads);
-
-            const float f0 = ggml_get_f32_1d(f, 0);
-
-            set_element(x[i], k, x0 - eps);
-            ggml_graph_compute_with_ctx(ctx0, gf, n_threads);
-
-            const float f1 = ggml_get_f32_1d(f, 0);
-
-            const float g0 = (f0 - f1)/(2.0f*eps);
-
-            set_element(x[i], k, x0);
-
-            // compute gradient using backward graph
-            ggml_graph_reset  (gf);
-            ggml_set_f32      (f->grad, 1.0f);
-            ggml_graph_compute_with_ctx(ctx0, gb, n_threads);
-
-            const float g1 = get_element(x[i]->grad, k);
-
-            const float error_abs = fabsf(g0 - g1);
-            const float error_rel = g0 != 0 ? fabsf(g0 - g1)/fabs(g0) : 0;
-
-            if (error_abs > max_error_abs || error_rel > max_error_rel) {
-                printf("%s: ndims=%d, i=%d, k=%" PRId64 ", g0=%f, g1=%f, error_abs=%f, error_rel=%f\n", op_name, ndims, i, k, g0, g1, error_abs, error_rel);
-                assert(false);
-            }
-        }
-    }
-
-    return true;
-}
-
-
-float mat_get(const struct ggml_tensor * t, int i0, int i1, int i2, int i3) {
-    const size_t nb0 = t->nb[0];
-    const size_t nb1 = t->nb[1];
-    const size_t nb2 = t->nb[2];
-    const size_t nb3 = t->nb[3];
-
-    return
-        *((float*) ((char*)t->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3));
-}
-
-bool check_mat_mul(
-        const struct ggml_tensor * y,
-        const struct ggml_tensor * x0,
-        const struct ggml_tensor * x1) {
-    const int64_t n00 = x0->ne[0];
-    const int64_t n10 = x0->ne[1];
-    const int64_t n20 = x0->ne[2];
-    const int64_t n30 = x0->ne[3];
-
-    const int64_t n01 = x1->ne[0];
-    const int64_t n11 = x1->ne[1];
-    const int64_t n21 = x1->ne[2];
-    const int64_t n31 = x1->ne[3];
-
-    const int64_t n02 = y->ne[0];
-    const int64_t n12 = y->ne[1];
-    const int64_t n22 = y->ne[2];
-    const int64_t n32 = y->ne[3];
-
-    printf("x0: [%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "]\n", n00, n10, n20, n30);
-    for (int j = 0; j < n10; ++j) {
-        for (int i = 0; i < n00; ++i) {
-            printf("%6.3f ", mat_get(x0, i, j, 0, 0));
-        }
-        printf("\n");
-    }
-    printf("\n");
-
-    printf("x1: [%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "]\n", n01, n11, n21, n31);
-    for (int j = 0; j < n11; ++j) {
-        for (int i = 0; i < n01; ++i) {
-            printf("%6.3f ", mat_get(x1, i, j, 0, 0));
-        }
-        printf("\n");
-    }
-    printf("\n");
-
-    printf("y: [%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "]\n", n02, n12, n22, n32);
-    for (int j = 0; j < n12; ++j) {
-        for (int i = 0; i < n02; ++i) {
-            printf("%6.3f ", mat_get(y, i, j, 0, 0));
-        }
-        printf("\n");
-    }
-
-    for (int i3 = 0; i3 < n32; ++i3) {
-        for (int i2 = 0; i2 < n22; ++i2) {
-            for (int i1 = 0; i1 < n12; ++i1) {
-                for (int i0 = 0; i0 < n02; ++i0) {
-                    float sum = 0.0f;
-                    for (int k = 0; k < n00; ++k) {
-                        sum += mat_get(x0, k, i0, i2, i3) * mat_get(x1, k, i1, i2, i3);
-                    }
-                    if (fabsf(sum - mat_get(y, i0, i1, i2, i3)) > 1e-5) {
-                        printf("error: i0=%d, i1=%d, i2=%d, i3=%d, sum=%f, y=%f\n",
-                                i0, i1, i2, i3, sum, mat_get(y, i0, i1, i2, i3));
-                        assert(false);
-                        return false;
-                    }
-                }
-            }
-        }
-    }
-
-    return true;
-}
-
-int main(int argc, const char ** argv) {
-    struct ggml_init_params params = {
-        .mem_size   = 128*1024*1024,
-        .mem_buffer = NULL,
-        .no_alloc   = false,
-    };
-
-    int64_t ne[4];
-
-    // original loop: 500
-    int niter = 500;
-    const char *env = getenv("GGML_NLOOP");
-    if (env != NULL) {
-        niter = atoi(env);
-    }
-    if (argc > 1) {
-        niter = atoi(argv[1]);
-    }
-
-    int n_threads = 1;
-
-    for (int iter = 0; iter < niter; ++iter) {
-        printf("test-mul-mat0: iter:%d/%d\n", iter, niter);
-        struct ggml_context * ctx0 = ggml_init(params);
-
-        get_random_dims(ne, 4);
-
-        struct ggml_tensor * x[MAX_NARGS];
-
-        // mul_mat
-        {
-            const int nargs = 1;
-
-            for (int ndims = 2; ndims <= 4; ++ndims) {
-                x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
-                ne[1] = rand()%4 + 1;
-                x[1] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
-
-                ggml_set_param(ctx0, x[0]);
-
-                struct ggml_tensor * m = ggml_mul_mat(ctx0, x[1], x[0]);
-                struct ggml_tensor * f = ggml_sum(ctx0, m);
-
-                printf("testing: mul_mat, [%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "] = [%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "] * [%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "]\n",
-                           m->ne[0],    m->ne[1],    m->ne[2],    m->ne[3],
-                        x[1]->ne[0], x[1]->ne[1], x[1]->ne[2], x[1]->ne[3],
-                        x[0]->ne[0], x[0]->ne[1], x[0]->ne[2], x[0]->ne[3]);
-
-                assert(m->ne[0] == x[1]->ne[1]);
-                assert(m->ne[1] == x[0]->ne[1]);
-                assert(m->ne[2] == x[0]->ne[2]);
-                assert(m->ne[3] == x[0]->ne[3]);
-
-                if (ndims <= 2) {
-                    check_gradient("mul_mat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-                } else {
-                    struct ggml_cgraph * gf = ggml_new_graph(ctx0);
-                    ggml_build_forward_expand(gf, m);
-                    ggml_graph_compute_with_ctx(ctx0, gf, n_threads);
-                }
-
-                check_mat_mul(m, x[1], x[0]);
-            }
-        }
-
-        // mul_mat (transposed)
-        {
-            const int nargs = 1;
-
-            for (int ndims = 2; ndims <= 4; ++ndims) {
-                x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
-                ne[1] = ne[0];
-                ne[0] = rand()%4 + 1;
-                x[1] = ggml_cont(ctx0, ggml_transpose(ctx0, get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f)));
-
-                ggml_set_param(ctx0, x[0]);
-
-                struct ggml_tensor * m = ggml_mul_mat(ctx0, x[1], x[0]);
-                struct ggml_tensor * f = ggml_sum(ctx0, m);
-
-                printf("testing: mul_mat, [%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "] = [%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "] * [%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "]\n",
-                           m->ne[0],    m->ne[1],    m->ne[2],    m->ne[3],
-                        x[1]->ne[0], x[1]->ne[1], x[1]->ne[2], x[1]->ne[3],
-                        x[0]->ne[0], x[0]->ne[1], x[0]->ne[2], x[0]->ne[3]);
-
-                assert(m->ne[0] == x[1]->ne[1]);
-                assert(m->ne[1] == x[0]->ne[1]);
-                assert(m->ne[2] == x[0]->ne[2]);
-                assert(m->ne[3] == x[0]->ne[3]);
-
-                if (ndims <= 2) {
-                    check_gradient("mul_mat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-                } else {
-                    struct ggml_cgraph * gf = ggml_new_graph(ctx0);
-                    ggml_build_forward_expand(gf, m);
-                    ggml_graph_compute_with_ctx(ctx0, gf, n_threads);
-                }
-
-                check_mat_mul(m, x[1], x[0]);
-            }
-        }
-        ggml_free(ctx0);
-    }
-
-    return 0;
-}
diff --git a/ggml/tests/test-mul-mat1.c b/ggml/tests/test-mul-mat1.c
deleted file mode 100644
index b725a58..0000000
--- a/ggml/tests/test-mul-mat1.c
+++ /dev/null
@@ -1,312 +0,0 @@
-#include <stdint.h>
-#include <stdio.h>
-#include <assert.h>
-#include <stdlib.h>
-#include <string.h>
-#include <time.h>
-#include <math.h>
-
-#include <sys/time.h>
-
-#include <arm_neon.h>
-
-#include <Accelerate/Accelerate.h>
-
-const int M = 1280;
-const int N = 1536;
-const int K = 1280;
-
-uint64_t get_time_us(void) {
-    struct timeval tv;
-    gettimeofday(&tv, NULL);
-    return tv.tv_sec * 1000000 + tv.tv_usec;
-}
-
-//
-// naive implementation
-//
-
-void mul_mat_f32_0(
-    const float * restrict src0, // M x K
-    const float * restrict src1, // N x K (transposed)
-    float * dst,
-    int m, int n, int k) {
-    for (int i = 0; i < m; i++) {
-        for (int j = 0; j < n; j++) {
-            float sum = 0;
-            for (int l = 0; l < k; l++) {
-                sum += src0[i*k + l] * src1[j*k + l];
-            }
-            dst[i*n + j] = sum;
-        }
-    }
-}
-
-void mul_mat_f16_0(
-    const __fp16 * src0,
-    const __fp16 * src1,
-           float * dst,
-    int m, int n, int k) {
-    const int k32 = k & ~31;
-
-    for (int i = 0; i < m; i++) {
-        for (int j = 0; j < n; j++) {
-            float sumf = 0.0;
-
-            float16x8_t sum0 = vdupq_n_f16(0.0f);
-            float16x8_t sum1 = vdupq_n_f16(0.0f);
-            float16x8_t sum2 = vdupq_n_f16(0.0f);
-            float16x8_t sum3 = vdupq_n_f16(0.0f);
-
-            float16x8_t x0, x1, x2, x3;
-            float16x8_t y0, y1, y2, y3;
-
-            const __fp16 * restrict p0 = src0 + i*k;
-            const __fp16 * restrict p1 = src1 + j*k;
-
-            for (int l = 0; l < k32; l += 32) {
-                x0 = vld1q_f16(p0 + l + 0 );
-                x1 = vld1q_f16(p0 + l + 8 );
-                x2 = vld1q_f16(p0 + l + 16);
-                x3 = vld1q_f16(p0 + l + 24);
-
-                y0 = vld1q_f16(p1 + l + 0 );
-                y1 = vld1q_f16(p1 + l + 8 );
-                y2 = vld1q_f16(p1 + l + 16);
-                y3 = vld1q_f16(p1 + l + 24);
-
-                sum0 = vfmaq_f16(sum0, x0, y0);
-                sum1 = vfmaq_f16(sum1, x1, y1);
-                sum2 = vfmaq_f16(sum2, x2, y2);
-                sum3 = vfmaq_f16(sum3, x3, y3);
-            }
-
-            // reduce sum0..sum3 to sum0
-            sum0 = vaddq_f16(sum0, sum1);
-            sum2 = vaddq_f16(sum2, sum3);
-            sum0 = vaddq_f16(sum0, sum2);
-
-            // load sum0 into 2 float32x4_t
-            float32x4_t sum0f32 = vcvt_f32_f16(vget_low_f16(sum0));
-            float32x4_t sum1f32 = vcvt_f32_f16(vget_high_f16(sum0));
-
-            // reduce sum0f32 and sum1f32 to sumf
-            sum0f32 = vaddq_f32(sum0f32, sum1f32);
-
-            float32x2_t sumf32 = vadd_f32(vget_low_f32(sum0f32), vget_high_f32(sum0f32));
-            sumf = vget_lane_f32(sumf32, 0) + vget_lane_f32(sumf32, 1);
-
-            //sumf = sum0[0] + sum0[1] + sum0[2] + sum0[3] + sum0[4] + sum0[5] + sum0[6] + sum0[7];
-
-            for (int l = k32; l < k32; l++) {
-                sumf += p0[l]*p1[l];
-            }
-
-            dst[i*n + j] = sumf;
-        }
-    }
-}
-
-// blocking with block size 32
-void mul_mat_f16_1(
-    const __fp16 * src0,
-    const __fp16 * src1,
-           float * dst,
-    int m, int n, int k) {
-
-    const int k32 = k & ~31;
-    const int bs  = 32;
-
-    memset(dst, 0, m*n*sizeof(float));
-
-    for (int i = 0; i < m; i += bs) {
-        for (int j = 0; j < n; j += bs) {
-            for (int l = 0; l < k; l += bs) {
-                for (int ii = i; ii < i + bs; ii++) {
-                    const __fp16 * restrict p0 = src0 + ii*k;
-
-                    float16x8_t x0, x1, x2, x3;
-
-                    x0 = vld1q_f16(p0 + l + 0 );
-                    x1 = vld1q_f16(p0 + l + 8 );
-                    x2 = vld1q_f16(p0 + l + 16);
-                    x3 = vld1q_f16(p0 + l + 24);
-
-                    for (int jj = j; jj < j + bs; jj++) {
-                        float sumf = 0.0;
-
-                        float16x8_t sum0 = vdupq_n_f16(0.0f);
-                        float16x8_t sum1 = vdupq_n_f16(0.0f);
-                        float16x8_t sum2 = vdupq_n_f16(0.0f);
-                        float16x8_t sum3 = vdupq_n_f16(0.0f);
-
-                        float16x8_t y0, y1, y2, y3;
-
-                        const __fp16 * restrict p1 = src1 + jj*k;
-
-                        y0 = vld1q_f16(p1 + l + 0 );
-                        y1 = vld1q_f16(p1 + l + 8 );
-                        y2 = vld1q_f16(p1 + l + 16);
-                        y3 = vld1q_f16(p1 + l + 24);
-
-                        sum0 = vfmaq_f16(sum0, x0, y0);
-                        sum1 = vfmaq_f16(sum1, x1, y1);
-                        sum2 = vfmaq_f16(sum2, x2, y2);
-                        sum3 = vfmaq_f16(sum3, x3, y3);
-
-                        // reduce sum0..sum3 to sum0
-                        sum0 = vaddq_f16(sum0, sum1);
-                        sum2 = vaddq_f16(sum2, sum3);
-                        sum0 = vaddq_f16(sum0, sum2);
-
-                        // load sum0 into 2 float32x4_t
-                        float32x4_t sum0f32 = vcvt_f32_f16(vget_low_f16(sum0));
-                        float32x4_t sum1f32 = vcvt_f32_f16(vget_high_f16(sum0));
-
-                        // reduce sum0f32 and sum1f32 to sumf
-                        sum0f32 = vaddq_f32(sum0f32, sum1f32);
-
-                        float32x2_t sumf32 = vadd_f32(vget_low_f32(sum0f32), vget_high_f32(sum0f32));
-                        sumf = vget_lane_f32(sumf32, 0) + vget_lane_f32(sumf32, 1);
-
-                        //sumf = sum0[0] + sum0[1] + sum0[2] + sum0[3] + sum0[4] + sum0[5] + sum0[6] + sum0[7];
-
-                        dst[ii*n + jj] += sumf;
-                    }
-                }
-            }
-        }
-    }
-
-}
-
-void mul_mat_f8_0(
-    const uint8_t * src0,
-    const uint8_t * src1,
-           float * dst,
-    int m, int n, int k) {
-    const int k32 = k & ~31;
-
-    for (int i = 0; i < m; i++) {
-        for (int j = 0; j < n; j++) {
-            float sumf = 0.0;
-
-            const uint8_t * restrict p0 = src0 + i*k;
-            const uint8_t * restrict p1 = src1 + j*k;
-
-            for (int l = 0; l < k32; l += 32) {
-                uint8x16_t x0 = vld1q_u8(p0 + l + 0 );
-                uint8x16_t x1 = vld1q_u8(p0 + l + 16);
-
-                uint8x16_t y0 = vld1q_u8(p1 + l + 0 );
-                uint8x16_t y1 = vld1q_u8(p1 + l + 16);
-
-                x0 = vmulq_u8(x0, y0);
-                x1 = vmulq_u8(x1, y1);
-
-                sumf += vaddvq_u8(x0) + vaddvq_u8(x1);
-            }
-
-            dst[i*n + j] = sumf;
-        }
-    }
-}
-
-int main(int argc, const char ** argv) {
-    float * src0 = malloc(sizeof(float)*M*K);
-    float * src1 = malloc(sizeof(float)*N*K);
-    float * dst  = malloc(sizeof(float)*M*N);
-
-    for (int i = 0; i < M*K; i++) {
-        src0[i] = rand() / (float)RAND_MAX;
-    }
-
-    for (int i = 0; i < N*K; i++) {
-        src1[i] = rand() / (float)RAND_MAX;
-    }
-
-    // convert src0 and src1 to __fp16
-    __fp16 * src0_fp16 = (__fp16 *)(malloc(sizeof(__fp16)*M*K));
-    __fp16 * src1_fp16 = (__fp16 *)(malloc(sizeof(__fp16)*N*K));
-
-    uint8_t * src0_fp8 = (uint8_t *)(malloc(sizeof(__fp16)*M*K));
-    uint8_t * src1_fp8 = (uint8_t *)(malloc(sizeof(__fp16)*N*K));
-
-    {
-        const uint64_t t_start = get_time_us();
-
-        for (int i = 0; i < M*K; i++) {
-            src0_fp16[i] = src0[i];
-            //printf("%f %f\n", src0[i], src0_fp16[i]);
-            //assert(!isnan(src0_fp16[i]));
-        }
-
-        for (int i = 0; i < N*K; i++) {
-            src1_fp16[i] = src1[i];
-        }
-
-        const uint64_t t_end = get_time_us();
-        printf("convert time: %f ms\n", (t_end - t_start) / 1000.0);
-    }
-
-    for (int i = 0; i < 16; ++i) {
-        printf("%f %f\n", src0[i], src0_fp16[i]);
-    }
-
-    int method = 0;
-    if (argc > 1) {
-        method = atoi(argv[1]);
-    }
-
-    const int nIter = 1;
-
-    const clock_t start = clock();
-    const uint64_t start_us = get_time_us();
-
-    double iM = 1.0/M;
-    double sum = 0.0f;
-    for (int i = 0; i < nIter; i++) {
-        if (method == 0) {
-            mul_mat_f32_0(src0, src1, dst, M, N, K);
-        }
-
-        if (method == 1) {
-            mul_mat_f16_0(src0_fp16, src1_fp16, dst, M, N, K);
-        }
-
-        if (method == 2) {
-            mul_mat_f16_1(src0_fp16, src1_fp16, dst, M, N, K);
-        }
-
-        if (method == 3) {
-            mul_mat_f8_0(src0_fp8, src1_fp8, dst, M, N, K);
-        }
-
-        if (method == 4) {
-            // Use BLAS sgemm from Accelerate framework
-            cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, M, N, K, 1.0f, src0, K, src1, K, 0.0f, dst, N);
-        }
-    }
-
-    for (int i = 0; i < N; i++) {
-        sum += dst[i]*iM;
-    }
-
-    {
-        const clock_t end = clock();
-        const uint64_t end_us = get_time_us();
-        printf("%s: elapsed ticks: %ld\n",  __func__, end - start);
-        printf("%s: elapsed us:    %llu / %f ms\n",  __func__, end_us - start_us, (end_us - start_us) / 1000.0 / nIter);
-    }
-
-    printf("%f\n", sum);
-
-    free(src0);
-    free(src1);
-    free(dst);
-
-    free(src0_fp16);
-    free(src1_fp16);
-
-    return 0;
-}
diff --git a/ggml/tests/test-mul-mat2.c b/ggml/tests/test-mul-mat2.c
deleted file mode 100644
index 89af286..0000000
--- a/ggml/tests/test-mul-mat2.c
+++ /dev/null
@@ -1,2585 +0,0 @@
-// quantized matrix multiplication
-
-#include "ggml.h"
-
-#include <float.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <inttypes.h>
-#include <assert.h>
-#include <stdlib.h>
-#include <string.h>
-#include <math.h>
-
-#if defined(__ARM_NEON)
-#include "arm_neon.h"
-#elif defined(__AVX__) || defined(__AVX2__)
-#include "immintrin.h"
-#endif
-
-#ifndef MIN
-#define MAX(a, b) ((a) > (b) ? (a) : (b))
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
-#endif
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#include <intrin.h>
-#define __builtin_popcountll __popcnt64
-#endif
-
-const int M = 1280;
-const int N = 1536;
-const int K = 1280;
-
-//const int M = 64;
-//const int N = 64;
-//const int K = 64;
-
-#define QK 64
-#define QB 4
-
-//#define GGML_GQ_USE_FP16_SCALE
-
-#if defined(GGML_GQ_USE_FP16_SCALE)
-#define gq_scale_t ggml_fp16_t
-#define GGML_FP32_TO_GQ(x) ggml_fp32_to_fp16(x)
-#define GGML_GQ_TO_FP32(x) ggml_fp16_to_fp32(x)
-#else
-#define gq_scale_t float
-#define GGML_FP32_TO_GQ(x) (x)
-#define GGML_GQ_TO_FP32(x) (x)
-#endif
-
-#define gq_t_bits 64
-#define gq_quant_t uint64_t
-
-float frand(void) {
-    return (float) rand() / (float) RAND_MAX;
-}
-
-#if defined(__AVX2__)
-// horizontally reduce 8 32-bit integers
-static inline uint32_t _mm256_hadd_epi32_gg(__m256i v) {
-    __m128i v0 = _mm256_extractf128_si256(v, 0);
-    __m128i v1 = _mm256_extractf128_si256(v, 1);
-
-    v0 = _mm_add_epi32(v0, v1);
-
-    v1 = _mm_shuffle_epi32(v0, 0x0e);
-    v0 = _mm_add_epi32(v0, v1);
-
-    v1 = _mm_shuffle_epi32(v0, 0x01);
-    v0 = _mm_add_epi32(v0, v1);
-
-    return _mm_cvtsi128_si32(v0);
-}
-
-//static inline float _mm256_hadd_epi32_gg(__m256i v) {
-//    const __m256 v0 = _mm256_cvtepi32_ps(v);
-//    const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(v0), _mm256_extractf128_ps(v0, 1));
-//    const __m128 t1 = _mm_hadd_ps(t0, t0);
-//
-//    return _mm_cvtss_f32(_mm_hadd_ps(t1, t1));
-//}
-
-// horizontally reduce 32 8-bit integers
-static inline int32_t _mm256_hadd_epi8_gg(__m256i v0) {
-    __m256i v1 = _mm256_maddubs_epi16(v0, _mm256_set1_epi8(1));
-    __m256i v2 = _mm256_madd_epi16   (v1, _mm256_set1_epi16(1));
-
-    return _mm256_hadd_epi32_gg(v2);
-}
-
-static inline float _mm256_hadd_ps_gg(__m256 v) {
-    const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(v), _mm256_extractf128_ps(v, 1));
-    const __m128 t1 = _mm_hadd_ps(t0, t0);
-
-    return _mm_cvtss_f32(_mm_hadd_ps(t1, t1));
-}
-#endif
-
-//
-// naive implementation
-//
-
-void mul_mat_f32_naive(
-    const float * restrict src0, // M x K
-    const float * restrict src1, // N x K (transposed)
-    float * dst,
-    int m, int n, int k) {
-    for (int i = 0; i < m; i++) {
-        for (int j = 0; j < n; j++) {
-            float sum = 0;
-            for (int l = 0; l < k; l++) {
-                sum += src0[i*k + l] * src1[j*k + l];
-            }
-            dst[i*n + j] = sum;
-        }
-    }
-}
-
-//
-// method 1
-//
-
-static inline int quantize_1_blocks_per_row(int k) {
-    return k/QK;
-}
-
-static inline int quantize_1_quants_per_block(void) {
-    return QK/gq_t_bits;
-}
-
-static inline int quantize_1_row_size(int k) {
-    const int nb = quantize_1_blocks_per_row(k);
-    const int nq = quantize_1_quants_per_block();
-
-    return nb*(2*sizeof(gq_scale_t) + nq*QB*sizeof(gq_quant_t));
-}
-
-void quantize_1(const float * src, void * dst, int n, int k) {
-    char * p0 = dst;
-
-    gq_quant_t pp[QB];
-
-    for (int j = 0; j < n; j++) {
-        for (int i = 0; i < k/QK; i++) {
-            float min = FLT_MAX;
-            float max = -FLT_MAX;
-
-            // find min/max
-#ifdef __ARM_NEON
-            {
-                float32x4_t minv = vdupq_n_f32(FLT_MAX);
-                float32x4_t maxv = vdupq_n_f32(-FLT_MAX);
-
-                for (int l = 0; l < QK; l += 4) {
-                    float32x4_t v = vld1q_f32(src + j*k + i*QK + l);
-                    minv = vminq_f32(minv, v);
-                    maxv = vmaxq_f32(maxv, v);
-                }
-
-                float32x2_t minv32 = vpmin_f32(vget_low_f32(minv), vget_high_f32(minv));
-                float32x2_t maxv32 = vpmax_f32(vget_low_f32(maxv), vget_high_f32(maxv));
-
-                min = MIN(vget_lane_f32(minv32, 0), vget_lane_f32(minv32, 1));
-                max = MAX(vget_lane_f32(maxv32, 0), vget_lane_f32(maxv32, 1));
-
-                //printf("SIMD min/max: %f %f\n", min, max);
-            }
-#else
-            {
-                for (int l = 0; l < QK; l++) {
-                    const float v = src[j*k + i*QK + l];
-                    if (v < min) min = v;
-                    if (v > max) max = v;
-                }
-
-                //printf("NORM min/max: %f %f\n", min, max);
-            }
-#endif
-
-            const float d = (max - min) / ((1 << QB) - 1);
-            const float id = d ? 1.0/d : 0.0;
-
-            memcpy(p0, &min, sizeof(float)); p0 += sizeof(float);
-            memcpy(p0, &d,   sizeof(float)); p0 += sizeof(float);
-
-            //printf("min/max/d/id: %f %f %f %f\n", min, max, d, id);
-
-            for (int s = 0; s < QK/gq_t_bits; ++s) {
-                memset(pp, 0, sizeof(pp));
-
-                for (int l = 0; l < gq_t_bits; l++) {
-                    const   float v = src[j*k + i*QK + s*gq_t_bits + l];
-                    const uint8_t q = (v - min)*id;
-
-                    for (int b = 0; b < QB; b++) {
-                        pp[b] |= q & (1 << b) ? (1ULL << l) : 0;
-                    }
-                }
-
-                for (int b = 0; b < QB; b++) {
-                    memcpy(p0, &pp[b], sizeof(gq_quant_t)); p0 += sizeof(gq_quant_t);
-                }
-            }
-        }
-    }
-}
-
-void mul_mat_gq_1(
-    const void * src0,
-    const void * src1,
-         float * dst,
-    int m, int n, int k) {
-    const int kp = k & ~(gq_t_bits - 1);
-
-    const char * restrict p0 = src0;
-    const char * restrict p1 = src1;
-
-    float s0[QB + 1];
-    float s1[QB + 1];
-
-    gq_quant_t m0[QB + 1];
-    gq_quant_t m1[QB + 1];
-
-    for (int ir0 = 0; ir0 < m; ir0++) {
-        for (int ir1 = 0; ir1 < n; ir1++) {
-            float sumf = 0.0;
-
-            const char * restrict pp0 = p0 + ir0*((2*sizeof(float) + (QK/gq_t_bits)*QB*sizeof(gq_quant_t))*(k/QK));
-            const char * restrict pp1 = p1 + ir1*((2*sizeof(float) + (QK/gq_t_bits)*QB*sizeof(gq_quant_t))*(k/QK));
-
-            for (int i = 0; i < kp/QK; i++) {
-                float min0, d0;
-                memcpy(&min0, pp0, sizeof(float)); pp0 += sizeof(float);
-                memcpy(&d0,   pp0, sizeof(float)); pp0 += sizeof(float);
-
-                float min1, d1;
-                memcpy(&min1, pp1, sizeof(float)); pp1 += sizeof(float);
-                memcpy(&d1,   pp1, sizeof(float)); pp1 += sizeof(float);
-
-                //printf("min0/d0 = %f %f | min1/d1 = %f %f\n", min0, d0, min1, d1);
-
-#if 1
-                // >>> General case for any QB
-
-                s0[0] = min0;
-                s1[0] = min1;
-
-                for (int b = 0; b < QB; b++) {
-                    s0[b + 1] = d0*(1 << b);
-                    s1[b + 1] = d1*(1 << b);
-                }
-
-                m0[0] = 0-1ULL;
-                m1[0] = 0-1ULL;
-
-                for (int s = 0; s < QK/gq_t_bits; ++s) {
-                    for (int b = 0; b < QB; b++) {
-                        memcpy(&m0[b + 1], pp0, sizeof(gq_quant_t)); pp0 += sizeof(gq_quant_t);
-                        memcpy(&m1[b + 1], pp1, sizeof(gq_quant_t)); pp1 += sizeof(gq_quant_t);
-                    }
-
-                    for (int q0 = 0; q0 < QB + 1; q0++) {
-                        for (int q1 = 0; q1 < QB + 1; q1++) {
-                            sumf += s0[q0]*s1[q1]*__builtin_popcountll(m0[q0] & m1[q1]);
-                        }
-                    }
-                }
-#else
-#endif
-            }
-
-            dst[ir0*n + ir1] = sumf;
-        }
-    }
-}
-
-//
-// method 2
-// n-bit quantization (2nd attempt)
-//
-
-static inline int quantize_2_blocks_per_row(int k) {
-    return k/QK;
-}
-
-static inline int quantize_2_quants_per_block(void) {
-    return QK/gq_t_bits;
-}
-
-static inline int quantize_2_row_size(int k) {
-    const int nb = quantize_2_blocks_per_row(k);
-    const int nq = quantize_2_quants_per_block();
-
-    return nb*(2*sizeof(gq_scale_t) + nq*QB*sizeof(gq_quant_t));
-}
-
-void quantize_2_row(const float * restrict src, void * restrict dst, int k) {
-    assert(k % QK == 0);
-
-    const int nb = quantize_2_blocks_per_row(k);
-    const int nq = quantize_2_quants_per_block();
-
-    gq_scale_t * restrict pm = (gq_scale_t *) (dst);
-    gq_scale_t * restrict pd = (gq_scale_t *) (pm + nb);
-    gq_quant_t * restrict pb = (gq_quant_t *) (pd + nb);
-
-    gq_quant_t pp[QB];
-
-    static const int32_t sh[32] = {
-        0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
-        16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-    };
-
-    for (int i = 0; i < nb; i++) {
-        float min = FLT_MAX;
-        float max = -FLT_MAX;
-
-#ifdef __ARM_NEON
-        {
-            float32x4_t minv = vdupq_n_f32(FLT_MAX);
-            float32x4_t maxv = vdupq_n_f32(-FLT_MAX);
-
-            for (int l = 0; l < QK; l += 4) {
-                float32x4_t v = vld1q_f32(src + i*QK + l);
-                minv = vminq_f32(minv, v);
-                maxv = vmaxq_f32(maxv, v);
-            }
-
-            float32x2_t minv32 = vpmin_f32(vget_low_f32(minv), vget_high_f32(minv));
-            float32x2_t maxv32 = vpmax_f32(vget_low_f32(maxv), vget_high_f32(maxv));
-
-            min = MIN(vget_lane_f32(minv32, 0), vget_lane_f32(minv32, 1));
-            max = MAX(vget_lane_f32(maxv32, 0), vget_lane_f32(maxv32, 1));
-        }
-#else
-        {
-            for (int l = 0; l < QK; l++) {
-                const float v = src[i*QK + l];
-                if (v < min) min = v;
-                if (v > max) max = v;
-            }
-        }
-#endif
-
-        const float d = (max - min) / ((1 << QB) - 1);
-        const float id = d ? 1.0/d : 0.0;
-
-        pm[i] = GGML_FP32_TO_GQ(min);
-        pd[i] = GGML_FP32_TO_GQ(d);
-
-        for (int s = 0; s < nq; ++s) {
-            memset(pp, 0, sizeof(pp));
-
-#if 1
-            for (int l = 0; l < gq_t_bits; l++) {
-                const   float v = src[i*QK + s*gq_t_bits + l];
-                const uint8_t q = (v - min)*id + frand();
-
-                for (int b = 0; b < QB; b++) {
-                    pp[b] |= q & (1 << b) ? (1ULL << l) : 0;
-                }
-            }
-#elif defined(__ARM_NEON)
-#if 1
-            {
-                uint32_t ppt[2*4*QB];
-
-                float32x4_t minv = vdupq_n_f32(min);
-                float32x4_t idv  = vdupq_n_f32(id);
-
-                assert(gq_t_bits % 16 == 0);
-
-                uint32x4_t p0[QB] = { vdupq_n_u32(0) };
-                uint32x4_t p1[QB] = { vdupq_n_u32(0) };
-
-                for (int l = 0; l < gq_t_bits; l += 16) {
-                    float32x4_t v0 = vld1q_f32(src + i*QK + s*gq_t_bits + l + 0);
-                    float32x4_t v1 = vld1q_f32(src + i*QK + s*gq_t_bits + l + 4);
-                    float32x4_t v2 = vld1q_f32(src + i*QK + s*gq_t_bits + l + 8);
-                    float32x4_t v3 = vld1q_f32(src + i*QK + s*gq_t_bits + l + 12);
-
-                    v0 = vsubq_f32(v0, minv);
-                    v1 = vsubq_f32(v1, minv);
-                    v2 = vsubq_f32(v2, minv);
-                    v3 = vsubq_f32(v3, minv);
-
-                    v0 = vmulq_f32(v0, idv);
-                    v1 = vmulq_f32(v1, idv);
-                    v2 = vmulq_f32(v2, idv);
-                    v3 = vmulq_f32(v3, idv);
-
-#if 1
-                    v0[0] += frand(); v0[1] += frand(); v0[2] += frand(); v0[3] += frand();
-                    v1[0] += frand(); v1[1] += frand(); v1[2] += frand(); v1[3] += frand();
-                    v2[0] += frand(); v2[1] += frand(); v2[2] += frand(); v2[3] += frand();
-                    v3[0] += frand(); v3[1] += frand(); v3[2] += frand(); v3[3] += frand();
-#endif
-
-                    uint32x4_t q0 = vcvtq_u32_f32(v0);
-                    uint32x4_t q1 = vcvtq_u32_f32(v1);
-                    uint32x4_t q2 = vcvtq_u32_f32(v2);
-                    uint32x4_t q3 = vcvtq_u32_f32(v3);
-
-                    for (int b = 0; b < QB; ++b) {
-                        uint32x4_t m = vdupq_n_u32(1 << b);
-                        uint32x4_t r = vdupq_n_u32(-b);
-
-                        if (l < 32) {
-                            p0[b] = vorrq_u32(p0[b], vshlq_u32(vshlq_u32(vandq_u32(q0, m), r), vld1q_s32(sh + l + 0)));
-                            p0[b] = vorrq_u32(p0[b], vshlq_u32(vshlq_u32(vandq_u32(q1, m), r), vld1q_s32(sh + l + 4)));
-                            p0[b] = vorrq_u32(p0[b], vshlq_u32(vshlq_u32(vandq_u32(q2, m), r), vld1q_s32(sh + l + 8)));
-                            p0[b] = vorrq_u32(p0[b], vshlq_u32(vshlq_u32(vandq_u32(q3, m), r), vld1q_s32(sh + l + 12)));
-                        } else {
-                            p1[b] = vorrq_u32(p1[b], vshlq_u32(vshlq_u32(vandq_u32(q0, m), r), vld1q_s32(sh + l - 32)));
-                            p1[b] = vorrq_u32(p1[b], vshlq_u32(vshlq_u32(vandq_u32(q1, m), r), vld1q_s32(sh + l - 28)));
-                            p1[b] = vorrq_u32(p1[b], vshlq_u32(vshlq_u32(vandq_u32(q2, m), r), vld1q_s32(sh + l - 24)));
-                            p1[b] = vorrq_u32(p1[b], vshlq_u32(vshlq_u32(vandq_u32(q3, m), r), vld1q_s32(sh + l - 20)));
-                        }
-                    }
-                }
-
-#if QB == 4
-                vst1q_u32((uint32_t *) ppt + 0,  p0[0]);
-                vst1q_u32((uint32_t *) ppt + 4,  p1[0]);
-                vst1q_u32((uint32_t *) ppt + 8,  p0[1]);
-                vst1q_u32((uint32_t *) ppt + 12, p1[1]);
-                vst1q_u32((uint32_t *) ppt + 16, p0[2]);
-                vst1q_u32((uint32_t *) ppt + 20, p1[2]);
-                vst1q_u32((uint32_t *) ppt + 24, p0[3]);
-                vst1q_u32((uint32_t *) ppt + 28, p1[3]);
-
-                pp[0] = (ppt[0]  | ppt[1]  | ppt[2]  | ppt[3] ) | ((uint64_t) (ppt[4]  | ppt[5]  | ppt[6]  | ppt[7]) ) << 32;
-                pp[1] = (ppt[8]  | ppt[9]  | ppt[10] | ppt[11]) | ((uint64_t) (ppt[12] | ppt[13] | ppt[14] | ppt[15])) << 32;
-                pp[2] = (ppt[16] | ppt[17] | ppt[18] | ppt[19]) | ((uint64_t) (ppt[20] | ppt[21] | ppt[22] | ppt[23])) << 32;
-                pp[3] = (ppt[24] | ppt[25] | ppt[26] | ppt[27]) | ((uint64_t) (ppt[28] | ppt[29] | ppt[30] | ppt[31])) << 32;
-#else
-                for (int b = 0; b < QB; ++b) {
-                    vst1q_u32((uint32_t *) ppt + 0,  p0[b]);
-                    vst1q_u32((uint32_t *) ppt + 4,  p1[b]);
-
-                    pp[b] = (ppt[0] | ppt[1] | ppt[2] | ppt[3]) | ((uint64_t) (ppt[4] | ppt[5] | ppt[6] | ppt[7])) << 32;
-                }
-#endif
-            }
-#else
-            // less optimal SIMD
-            {
-                float32x4_t minv = vdupq_n_f32(min);
-                float32x4_t idv  = vdupq_n_f32(id);
-
-                assert(gq_t_bits == 64);
-                uint8_t qq[gq_t_bits];
-
-                for (int l = 0; l < gq_t_bits; l += 16) {
-                    float32x4_t v0 = vld1q_f32(src + i*QK + s*gq_t_bits + l + 0);
-                    float32x4_t v1 = vld1q_f32(src + i*QK + s*gq_t_bits + l + 4);
-                    float32x4_t v2 = vld1q_f32(src + i*QK + s*gq_t_bits + l + 8);
-                    float32x4_t v3 = vld1q_f32(src + i*QK + s*gq_t_bits + l + 12);
-
-                    v0 = vsubq_f32(v0, minv);
-                    v1 = vsubq_f32(v1, minv);
-                    v2 = vsubq_f32(v2, minv);
-                    v3 = vsubq_f32(v3, minv);
-
-                    v0 = vmulq_f32(v0, idv);
-                    v1 = vmulq_f32(v1, idv);
-                    v2 = vmulq_f32(v2, idv);
-                    v3 = vmulq_f32(v3, idv);
-
-#if 0
-                    v0[0] += frand(); v0[1] += frand(); v0[2] += frand(); v0[3] += frand();
-                    v1[0] += frand(); v1[1] += frand(); v1[2] += frand(); v1[3] += frand();
-                    v2[0] += frand(); v2[1] += frand(); v2[2] += frand(); v2[3] += frand();
-                    v3[0] += frand(); v3[1] += frand(); v3[2] += frand(); v3[3] += frand();
-#endif
-
-                    uint32x4_t q0 = vcvtq_u32_f32(v0);
-                    uint32x4_t q1 = vcvtq_u32_f32(v1);
-                    uint32x4_t q2 = vcvtq_u32_f32(v2);
-                    uint32x4_t q3 = vcvtq_u32_f32(v3);
-
-                    // store in qq as uint8_t
-                    vst1_u8(qq + l + 0, vmovn_u16(vcombine_u16(vmovn_u32(q0), vmovn_u32(q1))));
-                    vst1_u8(qq + l + 8, vmovn_u16(vcombine_u16(vmovn_u32(q2), vmovn_u32(q3))));
-                }
-
-                for (int l = 0; l < gq_t_bits; l++) {
-                    for (int b = 0; b < QB; b++) {
-                        const uint64_t ql = qq[l];
-                        /*pp[b] |= qq[l] & (1 << b) ? (1ULL << l) : 0;*/
-                        pp[b] |= ((ql & (1 << b)) >> b) << l;
-                    }
-                }
-            }
-#endif
-#endif
-            memcpy(pb + i*nq*QB + s*QB, pp, sizeof(pp));
-        }
-    }
-}
-
-// reimplementation of quantize_2 using quantize_2_row
-void quantize_2(const float * restrict src, char * restrict dst, int n, int k) {
-    assert(k % QK == 0);
-
-    for (int j = 0; j < n; j++) {
-        quantize_2_row(src + j*k, dst, k);
-        dst = (char *) dst + quantize_2_row_size(k);
-    }
-}
-
-void vec_dot_gq_2(const int n, float * restrict s, const void * restrict x, const void * restrict y) {
-    const int nb = quantize_2_blocks_per_row(n);
-    const int nq = quantize_2_quants_per_block();
-
-    const gq_scale_t * restrict pm0 = (const gq_scale_t *) x;
-    const gq_scale_t * restrict pm1 = (const gq_scale_t *) y;
-
-    const gq_scale_t * restrict pd0 = pm0 + nb;
-    const gq_scale_t * restrict pd1 = pm1 + nb;
-
-    const gq_quant_t * restrict pb0 = (const gq_quant_t *) (pd0 + nb);
-    const gq_quant_t * restrict pb1 = (const gq_quant_t *) (pd1 + nb);
-
-    float sumf = 0.0;
-
-#if 1
-    for (int i = 0; i < nb; i++) {
-        const float m0 = GGML_GQ_TO_FP32(pm0[i]);
-        const float d0 = GGML_GQ_TO_FP32(pd0[i]);
-
-        const float m1 = GGML_GQ_TO_FP32(pm1[i]);
-        const float d1 = GGML_GQ_TO_FP32(pd1[i]);
-
-#if QB == 4
-        int isum01 = 0;
-        int isum10 = 0;
-        int isum11 = 0;
-
-        for (int s = 0; s < nq; ++s) {
-            const gq_quant_t * restrict mm0 = pb0 + i*nq*QB + s*QB;
-            const gq_quant_t * restrict mm1 = pb1 + i*nq*QB + s*QB;
-
-#define bpcnt(x) __builtin_popcountll(x)
-            isum01 += (1 << 0)*(bpcnt(mm1[0]));
-            isum01 += (1 << 1)*(bpcnt(mm1[1]));
-            isum01 += (1 << 2)*(bpcnt(mm1[2]));
-            isum01 += (1 << 3)*(bpcnt(mm1[3]));
-
-            isum10 += (1 << 0)*(bpcnt(mm0[0]));
-            isum10 += (1 << 1)*(bpcnt(mm0[1]));
-            isum10 += (1 << 2)*(bpcnt(mm0[2]));
-            isum10 += (1 << 3)*(bpcnt(mm0[3]));
-
-            isum11 += (1 << 0)*(bpcnt(mm0[0] & mm1[0]));
-            isum11 += (1 << 1)*(bpcnt(mm0[0] & mm1[1]) + bpcnt(mm0[1] & mm1[0]));
-            isum11 += (1 << 2)*(bpcnt(mm0[0] & mm1[2]) + bpcnt(mm0[1] & mm1[1]) + bpcnt(mm0[2] & mm1[0]));
-            isum11 += (1 << 3)*(bpcnt(mm0[0] & mm1[3]) + bpcnt(mm0[1] & mm1[2]) + bpcnt(mm0[2] & mm1[1]) + bpcnt(mm0[3] & mm1[0]));
-            isum11 += (1 << 4)*(bpcnt(mm0[1] & mm1[3]) + bpcnt(mm0[2] & mm1[2]) + bpcnt(mm0[3] & mm1[1]));
-            isum11 += (1 << 5)*(bpcnt(mm0[2] & mm1[3]) + bpcnt(mm0[3] & mm1[2]));
-            isum11 += (1 << 6)*(bpcnt(mm0[3] & mm1[3]));
-#undef bpcnt
-        }
-
-        sumf += nq*gq_t_bits*(m0*m1) + isum01*(m0*d1) + isum10*(m1*d0) + isum11*(d0*d1);
-#elif QB == 3
-        int isum01 = 0;
-        int isum10 = 0;
-        int isum11 = 0;
-
-        for (int s = 0; s < nq; ++s) {
-            const gq_quant_t * restrict mm0 = pb0 + i*nq*QB + s*QB;
-            const gq_quant_t * restrict mm1 = pb1 + i*nq*QB + s*QB;
-
-#if gq_t_bits == 32
-#define bpcnt(x) __builtin_popcount(x)
-#else
-#define bpcnt(x) __builtin_popcountll(x)
-#endif
-            isum01 += (1 << 0)*(bpcnt(mm1[0]));
-            isum01 += (1 << 1)*(bpcnt(mm1[1]));
-            isum01 += (1 << 2)*(bpcnt(mm1[2]));
-
-            isum10 += (1 << 0)*(bpcnt(mm0[0]));
-            isum10 += (1 << 1)*(bpcnt(mm0[1]));
-            isum10 += (1 << 2)*(bpcnt(mm0[2]));
-
-            isum11 += (1 << 0)*(bpcnt(mm0[0] & mm1[0]));
-            isum11 += (1 << 1)*(bpcnt(mm0[0] & mm1[1]) + bpcnt(mm0[1] & mm1[0]));
-            isum11 += (1 << 2)*(bpcnt(mm0[0] & mm1[2]) + bpcnt(mm0[1] & mm1[1]) + bpcnt(mm0[2] & mm1[0]));
-            isum11 += (1 << 3)*(bpcnt(mm0[1] & mm1[2]) + bpcnt(mm0[2] & mm1[1]));
-            isum11 += (1 << 4)*(bpcnt(mm0[2] & mm1[2]));
-#undef bpcnt
-        }
-
-        sumf += nq*gq_t_bits*(m0*m1) + isum01*(m0*d1) + isum10*(m1*d0) + isum11*(d0*d1);
-#elif QB == 2
-        int isum01 = 0;
-        int isum10 = 0;
-        int isum11 = 0;
-
-        for (int s = 0; s < nq; ++s) {
-            const gq_quant_t * restrict mm0 = pb0 + i*nq*QB + s*QB;
-            const gq_quant_t * restrict mm1 = pb1 + i*nq*QB + s*QB;
-
-#if gq_t_bits == 32
-#define bpcnt(x) __builtin_popcount(x)
-#else
-#define bpcnt(x) __builtin_popcountll(x)
-#endif
-            isum01 += (1 << 0)*(bpcnt(mm1[0]));
-            isum01 += (1 << 1)*(bpcnt(mm1[1]));
-
-            isum10 += (1 << 0)*(bpcnt(mm0[0]));
-            isum10 += (1 << 1)*(bpcnt(mm0[1]));
-
-            isum11 += (1 << 0)*(bpcnt(mm0[0] & mm1[0]));
-            isum11 += (1 << 1)*(bpcnt(mm0[0] & mm1[1]) + bpcnt(mm0[1] & mm1[0]));
-            isum11 += (1 << 2)*(bpcnt(mm0[1] & mm1[1]));
-#undef bpcnt
-        }
-
-        sumf += nq*gq_t_bits*(m0*m1) + isum01*(m0*d1) + isum10*(m1*d0) + isum11*(d0*d1);
-#else
-        float s0[QB + 1];
-        float s1[QB + 1];
-
-        s0[0] = m0;
-        s1[0] = m1;
-
-        for (int b = 0; b < QB; b++) {
-            s0[b + 1] = d0*(1 << b);
-            s1[b + 1] = d1*(1 << b);
-        }
-
-        for (int s = 0; s < nq; ++s) {
-            for (int q0 = 0; q0 < QB + 1; q0++) {
-                const gq_quant_t mm0 = q0 ? pb0[i*nq*QB + s*QB + q0 - 1] : -1ULL;
-                for (int q1 = 0; q1 < QB + 1; q1++) {
-                    const gq_quant_t mm1 = q1 ? pb1[i*nq*QB + s*QB + q1 - 1] : -1ULL;
-                    sumf += s0[q0]*s1[q1]*__builtin_popcountll(mm0 & mm1);
-                }
-            }
-        }
-#endif
-    }
-#else
-#error "not implemented"
-#endif
-
-    *s = sumf;
-}
-
-// use vec_dot_gq_2 to compute the dot product of two rows
-void mul_mat_gq_2(
-    const void * src0,
-    const void * src1, // transposed
-         float * dst,
-    int m, int n, int k) {
-    assert(k % QK == 0);
-
-    for (int ir0 = 0; ir0 < m; ir0++) {
-        for (int ir1 = 0; ir1 < n; ir1++) {
-            vec_dot_gq_2(k, dst + ir1, src0, src1);
-            src1 = (const char *) src1 + quantize_2_row_size(k);
-        }
-        src0 = (const char *) src0 +   quantize_2_row_size(k);
-        src1 = (const char *) src1 - n*quantize_2_row_size(k);
-
-        dst = (float *) dst + n;
-    }
-}
-
-//
-// method 3
-// (does not work)
-//
-
-static inline int quantize_3_blocks_per_row(int k) {
-    return k/QK;
-}
-
-static inline int quantize_3_quants_per_block(void) {
-    return QK/gq_t_bits;
-}
-
-static inline int quantize_3_row_size(int k) {
-    const int nb = quantize_3_blocks_per_row(k);
-    const int nq = quantize_3_quants_per_block();
-
-    return nb*(sizeof(gq_scale_t) + nq*QB*sizeof(gq_quant_t));
-}
-
-void quantize_3_row(const float * restrict src, void * restrict dst, int k) {
-    assert(k % QK == 0);
-
-    const int nb = quantize_3_blocks_per_row(k);
-    const int nq = quantize_3_quants_per_block();
-
-    gq_scale_t * restrict pd = (gq_scale_t *) (dst);
-    gq_quant_t * restrict pb = (gq_quant_t *) (pd + nb);
-
-    gq_quant_t pp[QB];
-
-    static const int32_t sh[32] = {
-        0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
-        16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-    };
-
-    for (int i = 0; i < nb; i++) {
-        float amax = 0.0f; // abs max
-
-#ifdef __ARM_NEON
-        {
-            // min / max
-            //float32x4_t minv = vdupq_n_f32(FLT_MAX);
-            //float32x4_t maxv = vdupq_n_f32(-FLT_MAX);
-
-            //for (int l = 0; l < QK; l += 4) {
-            //    float32x4_t v = vld1q_f32(src + i*QK + l);
-            //    minv = vminq_f32(minv, v);
-            //    maxv = vmaxq_f32(maxv, v);
-            //}
-
-            //float32x2_t minv32 = vpmin_f32(vget_low_f32(minv), vget_high_f32(minv));
-            //float32x2_t maxv32 = vpmax_f32(vget_low_f32(maxv), vget_high_f32(maxv));
-
-            //min = MIN(vget_lane_f32(minv32, 0), vget_lane_f32(minv32, 1));
-            //max = MAX(vget_lane_f32(maxv32, 0), vget_lane_f32(maxv32, 1));
-
-            // abs max
-            float32x4_t amaxv = vdupq_n_f32(0.0f);
-
-            for (int l = 0; l < QK; l += 4) {
-                float32x4_t v = vld1q_f32(src + i*QK + l);
-                amaxv = vmaxq_f32(amaxv, vabsq_f32(v));
-            }
-
-            float32x2_t amaxv32 = vpmax_f32(vget_low_f32(amaxv), vget_high_f32(amaxv));
-
-            amax = MAX(vget_lane_f32(amaxv32, 0), vget_lane_f32(amaxv32, 1));
-        }
-#else
-        {
-            for (int l = 0; l < QK; l++) {
-                const float v = src[i*QK + l];
-                amax = MAX(amax, fabsf(v));
-            }
-        }
-#endif
-
-        const float d = amax / ((1 << (QB - 1)) - 1);
-        const float id = d ? 1.0/d : 0.0;
-
-        pd[i] = GGML_FP32_TO_GQ(d);
-
-        for (int s = 0; s < nq; ++s) {
-            memset(pp, 0, sizeof(pp));
-
-#if 0
-            for (int l = 0; l < gq_t_bits; l++) {
-                const   float v = src[i*QK + s*gq_t_bits + l];
-                const uint8_t q = v*id + frand();
-
-                for (int b = 0; b < QB; b++) {
-                    pp[b] |= q & (1 << b) ? (1ULL << l) : 0;
-                }
-            }
-#elif defined(__ARM_NEON)
-            {
-                uint32_t ppt[2*4*QB];
-
-                float32x4_t idv  = vdupq_n_f32(id);
-
-                assert(gq_t_bits == 64);
-
-                uint32x4_t p0[QB] = { vdupq_n_u32(0) };
-                uint32x4_t p1[QB] = { vdupq_n_u32(0) };
-
-                for (int l = 0; l < gq_t_bits; l += 16) {
-                    float32x4_t v0 = vld1q_f32(src + i*QK + s*gq_t_bits + l + 0);
-                    float32x4_t v1 = vld1q_f32(src + i*QK + s*gq_t_bits + l + 4);
-                    float32x4_t v2 = vld1q_f32(src + i*QK + s*gq_t_bits + l + 8);
-                    float32x4_t v3 = vld1q_f32(src + i*QK + s*gq_t_bits + l + 12);
-
-                    v0 = vmulq_f32(v0, idv);
-                    v1 = vmulq_f32(v1, idv);
-                    v2 = vmulq_f32(v2, idv);
-                    v3 = vmulq_f32(v3, idv);
-
-#if 1
-                    v0[0] += frand(); v0[1] += frand(); v0[2] += frand(); v0[3] += frand();
-                    v1[0] += frand(); v1[1] += frand(); v1[2] += frand(); v1[3] += frand();
-                    v2[0] += frand(); v2[1] += frand(); v2[2] += frand(); v2[3] += frand();
-                    v3[0] += frand(); v3[1] += frand(); v3[2] += frand(); v3[3] += frand();
-#endif
-
-                    uint32x4_t q0 = vcvtq_u32_f32(v0);
-                    uint32x4_t q1 = vcvtq_u32_f32(v1);
-                    uint32x4_t q2 = vcvtq_u32_f32(v2);
-                    uint32x4_t q3 = vcvtq_u32_f32(v3);
-
-                    for (int b = 0; b < QB; ++b) {
-                        uint32x4_t m = vdupq_n_u32(1 << b);
-                        int32x4_t r = vdupq_n_s32(-b);
-
-                        if (l < 32) {
-                            p0[b] = vorrq_u32(p0[b], vshlq_u32(vshlq_u32(vandq_u32(q0, m), r), vld1q_s32(sh + l + 0)));
-                            p0[b] = vorrq_u32(p0[b], vshlq_u32(vshlq_u32(vandq_u32(q1, m), r), vld1q_s32(sh + l + 4)));
-                            p0[b] = vorrq_u32(p0[b], vshlq_u32(vshlq_u32(vandq_u32(q2, m), r), vld1q_s32(sh + l + 8)));
-                            p0[b] = vorrq_u32(p0[b], vshlq_u32(vshlq_u32(vandq_u32(q3, m), r), vld1q_s32(sh + l + 12)));
-                        } else {
-                            p1[b] = vorrq_u32(p1[b], vshlq_u32(vshlq_u32(vandq_u32(q0, m), r), vld1q_s32(sh + l - 32)));
-                            p1[b] = vorrq_u32(p1[b], vshlq_u32(vshlq_u32(vandq_u32(q1, m), r), vld1q_s32(sh + l - 28)));
-                            p1[b] = vorrq_u32(p1[b], vshlq_u32(vshlq_u32(vandq_u32(q2, m), r), vld1q_s32(sh + l - 24)));
-                            p1[b] = vorrq_u32(p1[b], vshlq_u32(vshlq_u32(vandq_u32(q3, m), r), vld1q_s32(sh + l - 20)));
-                        }
-                    }
-                }
-
-#if QB == 4
-                vst1q_u32((uint32_t *) ppt + 0,  p0[0]);
-                vst1q_u32((uint32_t *) ppt + 4,  p1[0]);
-                vst1q_u32((uint32_t *) ppt + 8,  p0[1]);
-                vst1q_u32((uint32_t *) ppt + 12, p1[1]);
-                vst1q_u32((uint32_t *) ppt + 16, p0[2]);
-                vst1q_u32((uint32_t *) ppt + 20, p1[2]);
-                vst1q_u32((uint32_t *) ppt + 24, p0[3]);
-                vst1q_u32((uint32_t *) ppt + 28, p1[3]);
-
-                pp[0] = (ppt[0]  | ppt[1]  | ppt[2]  | ppt[3] ) | ((uint64_t) (ppt[4]  | ppt[5]  | ppt[6]  | ppt[7]) ) << 32;
-                pp[1] = (ppt[8]  | ppt[9]  | ppt[10] | ppt[11]) | ((uint64_t) (ppt[12] | ppt[13] | ppt[14] | ppt[15])) << 32;
-                pp[2] = (ppt[16] | ppt[17] | ppt[18] | ppt[19]) | ((uint64_t) (ppt[20] | ppt[21] | ppt[22] | ppt[23])) << 32;
-                pp[3] = (ppt[24] | ppt[25] | ppt[26] | ppt[27]) | ((uint64_t) (ppt[28] | ppt[29] | ppt[30] | ppt[31])) << 32;
-#else
-                for (int q = 0; q < QB; ++q) {
-                    vst1q_u32((uint32_t *) ppt + 0,  p0[q]);
-                    vst1q_u32((uint32_t *) ppt + 4,  p1[q]);
-
-                    pp[q] = (ppt[0] | ppt[1] | ppt[2] | ppt[3]) | ((uint64_t) (ppt[4] | ppt[5] | ppt[6] | ppt[7])) << 32;
-                }
-#endif
-            }
-#endif
-            memcpy(pb + i*nq*QB + s*QB, pp, sizeof(pp));
-        }
-    }
-}
-
-// reimplementation of quantize_3 using quantize_3_row
-void quantize_3(const float * restrict src, char * restrict dst, int n, int k) {
-    assert(k % QK == 0);
-
-    for (int j = 0; j < n; j++) {
-        quantize_3_row(src + j*k, dst, k);
-        dst = (char *) dst + quantize_3_row_size(k);
-    }
-}
-
-void vec_dot_gq_3(const int n, float * restrict s, const void * restrict x, const void * restrict y) {
-    float sumf = 0.0f;
-
-    const int nb = quantize_3_blocks_per_row(n);
-    const int nq = quantize_3_quants_per_block();
-
-    const gq_scale_t * restrict pd0 = (const gq_scale_t *) x;
-    const gq_scale_t * restrict pd1 = (const gq_scale_t *) y;
-
-    const gq_quant_t * restrict pb0 = (const gq_quant_t *) (pd0 + nb);
-    const gq_quant_t * restrict pb1 = (const gq_quant_t *) (pd1 + nb);
-
-#if 1
-    for (int i = 0; i < nb; i++) {
-        int isum = 0;
-
-#if QB == 4
-        for (int s = 0; s < nq; ++s) {
-            const gq_quant_t * restrict m0 = pb0 + i*nq*QB + s*QB;
-            const gq_quant_t * restrict m1 = pb1 + i*nq*QB + s*QB;
-
-            isum += (1 << 0)*(__builtin_popcountll(m0[0] & m1[0]));
-            isum += (1 << 1)*(__builtin_popcountll(m0[0] & m1[1]) + __builtin_popcountll(m0[1] & m1[0]));
-            isum += (1 << 2)*(__builtin_popcountll(m0[0] & m1[2]) + __builtin_popcountll(m0[1] & m1[1]) + __builtin_popcountll(m0[2] & m1[0]));
-            isum += (1 << 3)*(__builtin_popcountll(m0[0] & m1[3]) + __builtin_popcountll(m0[1] & m1[2]) + __builtin_popcountll(m0[2] & m1[1]) + __builtin_popcountll(m0[3] & m1[0]));
-            isum += (1 << 4)*(__builtin_popcountll(m0[1] & m1[3]) + __builtin_popcountll(m0[2] & m1[2]) + __builtin_popcountll(m0[3] & m1[1]));
-            isum += (1 << 5)*(__builtin_popcountll(m0[2] & m1[3]) + __builtin_popcountll(m0[3] & m1[2]));
-            isum += (1 << 6)*(__builtin_popcountll(m0[3] & m1[3]));
-        }
-#else
-        for (int s = 0; s < nq; ++s) {
-            for (int q0 = 0; q0 < QB; q0++) {
-                const gq_quant_t mm0 = pb0[i*nq*QB + s*QB + q0];
-                for (int q1 = 0; q1 < QB; q1++) {
-                    const gq_quant_t mm1 = pb1[i*nq*QB + s*QB + q1];
-                    isum += (1 << (q0 + q1))*(__builtin_popcountll(mm0 & mm1));
-                }
-            }
-        }
-#endif
-
-        const float d0 = GGML_GQ_TO_FP32(pd0[i]);
-        const float d1 = GGML_GQ_TO_FP32(pd1[i]);
-
-        sumf += d0*d1*isum;
-    }
-#else
-#ifdef __ARM_NEON
-    // gq_quant_t == uint64_t
-    for (int i = 0; i < nb; i += 4) {
-        int isum[4] = {0, 0, 0, 0};
-
-        for (int k = 0; k < 4; ++k) {
-            for (int s = 0; s < nq; ++s) {
-                const gq_quant_t * restrict m0 = pb0 + (i+k)*nq*QB + s*QB;
-                const gq_quant_t * restrict m1 = pb1 + (i+k)*nq*QB + s*QB;
-
-#if QB == 4
-#define bpcnt(x) __builtin_popcountll(x)
-                //isum[k] += (1ULL << 0)*(bpcnt(m0[0] & m1[0])) +
-                //           (1ULL << 1)*(bpcnt(m0[0] & m1[1]) + bpcnt(m0[1] & m1[0])) +
-                //           (1ULL << 2)*(bpcnt(m0[0] & m1[2]) + bpcnt(m0[1] & m1[1]) + bpcnt(m0[2] & m1[0])) +
-                //           (1ULL << 3)*(bpcnt(m0[0] & m1[3]) + bpcnt(m0[1] & m1[2]) + bpcnt(m0[2] & m1[1]) + bpcnt(m0[3] & m1[0])) +
-                //           (1ULL << 4)*(bpcnt(m0[1] & m1[3]) + bpcnt(m0[2] & m1[2]) + bpcnt(m0[3] & m1[1])) +
-                //           (1ULL << 5)*(bpcnt(m0[2] & m1[3]) + bpcnt(m0[3] & m1[2])) +
-                //           (1ULL << 6)*(bpcnt(m0[3] & m1[3]));
-#undef bpcnt
-
-                const uint8x8_t m00 = vld1_u8((const uint8_t *) (m0 + 0));
-                const uint8x8_t m01 = vld1_u8((const uint8_t *) (m0 + 1));
-                const uint8x8_t m02 = vld1_u8((const uint8_t *) (m0 + 2));
-                const uint8x8_t m03 = vld1_u8((const uint8_t *) (m0 + 3));
-
-                const uint8x8_t m10 = vld1_u8((const uint8_t *) (m1 + 0));
-                const uint8x8_t m11 = vld1_u8((const uint8_t *) (m1 + 1));
-                const uint8x8_t m12 = vld1_u8((const uint8_t *) (m1 + 2));
-                const uint8x8_t m13 = vld1_u8((const uint8_t *) (m1 + 3));
-
-                const uint8x8_t m00m10 = vand_u8(m00, m10);
-
-                const uint8x8_t m00m11 = vand_u8(m00, m11);
-                const uint8x8_t m01m10 = vand_u8(m01, m10);
-
-                const uint8x8_t m00m12 = vand_u8(m00, m12);
-                const uint8x8_t m01m11 = vand_u8(m01, m11);
-                const uint8x8_t m02m10 = vand_u8(m02, m10);
-
-                const uint8x8_t m00m13 = vand_u8(m00, m13);
-                const uint8x8_t m01m12 = vand_u8(m01, m12);
-                const uint8x8_t m02m11 = vand_u8(m02, m11);
-                const uint8x8_t m03m10 = vand_u8(m03, m10);
-
-                const uint8x8_t m01m13 = vand_u8(m01, m13);
-                const uint8x8_t m02m12 = vand_u8(m02, m12);
-                const uint8x8_t m03m11 = vand_u8(m03, m11);
-
-                const uint8x8_t m02m13 = vand_u8(m02, m13);
-                const uint8x8_t m03m12 = vand_u8(m03, m12);
-
-                const uint8x8_t m03m13 = vand_u8(m03, m13);
-
-#define bpcnt(x) vaddv_u8(vcnt_u8(x))
-                isum[k] += (1ULL << 0)*(bpcnt(m00m10)) +
-                           (1ULL << 1)*(bpcnt(m00m11) + bpcnt(m01m10)) +
-                           (1ULL << 2)*(bpcnt(m00m12) + bpcnt(m01m11) + bpcnt(m02m10)) +
-                           (1ULL << 3)*(bpcnt(m00m13) + bpcnt(m01m12) + bpcnt(m02m11) + bpcnt(m03m10)) +
-                           (1ULL << 4)*(bpcnt(m01m13) + bpcnt(m02m12) + bpcnt(m03m11)) +
-                           (1ULL << 5)*(bpcnt(m02m13) + bpcnt(m03m12)) +
-                           (1ULL << 6)*(bpcnt(m03m13));
-#undef bpcnt
-#else
-                for (int q0 = 0; q0 < QB; q0++) {
-                    const gq_quant_t mm0 = m0[q0];
-                    for (int q1 = 0; q1 < QB; q1++) {
-                        const gq_quant_t mm1 = m1[q1];
-                        isum[k] += (1ULL << (q0 + q1))*(__builtin_popcountll(mm0 & mm1));
-                    }
-                }
-#endif
-            }
-        }
-
-        int32x4_t isumv = vld1q_s32(isum);
-
-        float32x4_t d0v = vld1q_f32(pd0 + i);
-        float32x4_t d1v = vld1q_f32(pd1 + i);
-
-        float32x4_t sumfv = vmulq_f32(d0v, d1v);
-
-        sumfv = vmulq_f32(sumfv, vcvtq_f32_s32(isumv));
-        sumf += vaddvq_f32(sumfv);
-    }
-#else
-#error "not implemented"
-#endif
-
-#endif
-    *s = sumf;
-}
-
-// use vec_dot_gq_3 to compute the dot product of two rows
-void mul_mat_gq_3(
-    const void * src0,
-    const void * src1, // transposed
-         float * dst,
-    int m, int n, int k) {
-    assert(k % QK == 0);
-
-    const int nb = quantize_3_blocks_per_row(k);
-    const int nq = quantize_3_quants_per_block();
-
-    for (int ir0 = 0; ir0 < m; ir0++) {
-        for (int ir1 = 0; ir1 < n; ir1++) {
-            vec_dot_gq_3(k, dst + ir1, src0, src1);
-            src1 = (const char *) src1 + quantize_3_row_size(k);
-        }
-        src0 = (const char *) src0 +   quantize_3_row_size(k);
-        src1 = (const char *) src1 - n*quantize_3_row_size(k);
-
-        dst = (float *) dst + n;
-    }
-}
-
-//
-// method 4
-// 4-bit quantization
-//
-
-static inline int quantize_4_blocks_per_row(int k) {
-    return k/QK;
-}
-
-static inline int quantize_4_row_size(int k) {
-    const int nb = quantize_4_blocks_per_row(k);
-
-    return nb*(2*sizeof(gq_scale_t) + QK/2);
-}
-
-void quantize_4_row(const float * restrict src, void * restrict dst, int k) {
-    assert(k % QK == 0);
-    assert(QB == 4);
-
-    const int nb = quantize_4_blocks_per_row(k);
-
-    gq_scale_t * restrict pm = (gq_scale_t *) (dst);
-    gq_scale_t * restrict pd = (gq_scale_t *) (pm + nb);
-    uint8_t    * restrict pb = (uint8_t *)    (pd + nb);
-
-    uint8_t pp[QK/2];
-
-    for (int i = 0; i < nb; i++) {
-        memset(pp, 0, sizeof(pp));
-
-        float min = FLT_MAX;
-        float max = -FLT_MAX;
-
-#if defined(__AVX2__)
-        {
-            assert(QK == 64);
-            enum { QK8 = QK/8 };
-
-            __m256 srcv[QK8];
-            __m256 minv[QK8];
-            __m256 maxv[QK8];
-
-            for (int l = 0; l < QK8; l++) {
-                srcv[l] = _mm256_loadu_ps(src + i*QK + 8*l);
-            }
-
-            for (int l = 0; l < QK8/2; l++) {
-                minv[2*l] = _mm256_min_ps(srcv[2*l], srcv[2*l+1]);
-                maxv[2*l] = _mm256_max_ps(srcv[2*l], srcv[2*l+1]);
-            }
-
-            for (int l = 0; l < QK8/4; l++) {
-                minv[4*l] = _mm256_min_ps(minv[4*l], minv[4*l+2]);
-                maxv[4*l] = _mm256_max_ps(maxv[4*l], maxv[4*l+2]);
-            }
-
-            for (int l = 0; l < QK8/8; l++) {
-                minv[8*l] = _mm256_min_ps(minv[8*l], minv[8*l+4]);
-                maxv[8*l] = _mm256_max_ps(maxv[8*l], maxv[8*l+4]);
-            }
-
-            //min = MIN(minv[0][0], MIN(minv[0][1], MIN(minv[0][2], MIN(minv[0][3], MIN(minv[0][4], MIN(minv[0][5], MIN(minv[0][6], minv[0][7])))))));
-            //max = MAX(maxv[0][0], MAX(maxv[0][1], MAX(maxv[0][2], MAX(maxv[0][3], MAX(maxv[0][4], MAX(maxv[0][5], MAX(maxv[0][6], maxv[0][7])))))));
-
-            const __m256 minv0_0 = _mm256_permute2f128_ps(minv[0], minv[0], 3);
-            const __m256 minv0_1 = _mm256_min_ps(minv[0], minv0_0);
-            const __m256 minv0_2 = _mm256_permute_ps(minv0_1, 0x4e);
-            const __m256 minv0_3 = _mm256_min_ps(minv0_1, minv0_2);
-            const __m256 minv0_4 = _mm256_permute_ps(minv0_3, 0xb1);
-            const __m256 minv0_5 = _mm256_min_ps(minv0_3, minv0_4);
-
-            const __m256 maxv0_0 = _mm256_permute2f128_ps(maxv[0], maxv[0], 3);
-            const __m256 maxv0_1 = _mm256_max_ps(maxv[0], maxv0_0);
-            const __m256 maxv0_2 = _mm256_permute_ps(maxv0_1, 0x4e);
-            const __m256 maxv0_3 = _mm256_max_ps(maxv0_1, maxv0_2);
-            const __m256 maxv0_4 = _mm256_permute_ps(maxv0_3, 0xb1);
-            const __m256 maxv0_5 = _mm256_max_ps(maxv0_3, maxv0_4);
-
-            min = _mm256_cvtss_f32(minv0_5);
-            max = _mm256_cvtss_f32(maxv0_5);
-
-            const float d = (max - min) / ((1 << QB) - 2);
-            const float id = d ? 1.0/d : 0.0;
-
-            pm[i] = GGML_FP32_TO_GQ(min);
-            pd[i] = GGML_FP32_TO_GQ(d);
-
-            const __m256 idv = _mm256_set1_ps(id);
-
-            for (int l = 0; l < QK/8; l++) {
-                __m256 v = _mm256_mul_ps(_mm256_sub_ps(srcv[l], _mm256_set1_ps(min)), idv);
-#if 0
-                v[0] += frand(); v[1] += frand(); v[2] += frand(); v[3] += frand();
-                v[4] += frand(); v[5] += frand(); v[6] += frand(); v[7] += frand();
-#endif
-
-                // convert to uint8
-                __m256i vi = _mm256_cvtps_epi32(v);
-
-                uint32_t vi_0 = _mm256_extract_epi32(vi, 0);
-                uint32_t vi_1 = _mm256_extract_epi32(vi, 1);
-                uint32_t vi_2 = _mm256_extract_epi32(vi, 2);
-                uint32_t vi_3 = _mm256_extract_epi32(vi, 3);
-
-                uint32_t vi_4 = _mm256_extract_epi32(vi, 4);
-                uint32_t vi_5 = _mm256_extract_epi32(vi, 5);
-                uint32_t vi_6 = _mm256_extract_epi32(vi, 6);
-                uint32_t vi_7 = _mm256_extract_epi32(vi, 7);
-
-                // convert to 4-bit, 2 consecutive packed into 1 byte
-                pp[4*l + 0] = vi_0 | (vi_1 << 4);
-                pp[4*l + 1] = vi_2 | (vi_3 << 4);
-                pp[4*l + 2] = vi_4 | (vi_5 << 4);
-                pp[4*l + 3] = vi_6 | (vi_7 << 4);
-
-                //printf("vi: %7d %7d %7d %7d %7d %7d %7d %7d\n", vi_0, vi_1, vi_2, vi_3, vi_4, vi_5, vi_6, vi_7);
-                //printf("v : %7.3f %7.3f %7.3f %7.3f %7.3f %7.3f %7.3f %7.3f\n", v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
-            }
-
-            memcpy(pb + i*QK/2, pp, sizeof(pp));
-        }
-#elif defined(__ARM_NEON) && 0
-        {
-            // TODO
-        }
-#else
-        {
-            for (int l = 0; l < QK; l++) {
-                const float v = src[i*QK + l];
-                if (v < min) min = v;
-                if (v > max) max = v;
-            }
-
-            const float d = (max - min) / ((1 << QB) - 1);
-            const float id = d ? 1.0/d : 0.0;
-
-            pm[i] = GGML_FP32_TO_GQ(min);
-            pd[i] = GGML_FP32_TO_GQ(d);
-
-            for (int l = 0; l < QK; l++) {
-                const float v = (src[i*QK + l] - min) * id;
-                const uint8_t vi = (uint8_t) (v + frand());
-                pp[l/2] |= (vi & 0xf) << (4*(l & 1));
-            }
-
-            memcpy(pb + i*QK/2, pp, sizeof(pp));
-        }
-#endif
-        //printf("min %f max %f\n", min, max);
-    }
-}
-
-// reimplementation of quantize_4 using quantize_4_row
-void quantize_4(const float * restrict src, char * restrict dst, int n, int k) {
-    assert(k % QK == 0);
-
-    for (int j = 0; j < n; j++) {
-        quantize_4_row(src + j*k, dst, k);
-        dst = (char *) dst + quantize_4_row_size(k);
-    }
-}
-
-void vec_dot_gq_4(const int n, float * restrict s, const void * restrict x, const void * restrict y) {
-    const int nb = quantize_4_blocks_per_row(n);
-
-    const gq_scale_t * restrict pm0 = (const gq_scale_t *) x;
-    const gq_scale_t * restrict pm1 = (const gq_scale_t *) y;
-
-    const gq_scale_t * restrict pd0 = pm0 + nb;
-    const gq_scale_t * restrict pd1 = pm1 + nb;
-
-    const uint8_t * restrict pb0 = (const uint8_t *) (pd0 + nb);
-    const uint8_t * restrict pb1 = (const uint8_t *) (pd1 + nb);
-
-    float sumf = 0.0;
-
-#if 0
-    // scalar
-    for (int i = 0; i < nb; i++) {
-        const float m0 = GGML_GQ_TO_FP32(pm0[i]);
-        const float d0 = GGML_GQ_TO_FP32(pd0[i]);
-
-        const float m1 = GGML_GQ_TO_FP32(pm1[i]);
-        const float d1 = GGML_GQ_TO_FP32(pd1[i]);
-
-        const uint8_t * restrict p0 = pb0 + i*QK/2;
-        const uint8_t * restrict p1 = pb1 + i*QK/2;
-
-        for (int j = 0; j < QK/2; j++) {
-            const uint8_t v0 = p0[j];
-            const uint8_t v1 = p1[j];
-
-            const float f0 = d0*(v0 & 0xf) + m0;
-            const float f1 = d0*(v0 >> 4)  + m0;
-
-            const float f2 = d1*(v1 & 0xf) + m1;
-            const float f3 = d1*(v1 >> 4)  + m1;
-
-            sumf += f0*f2 + f1*f3;
-        }
-    }
-#else
-#if defined(__AVX2__)
-#if QK == 64 && 0
-    __m256 sumv0 = _mm256_setzero_ps();
-    __m256 sumv1 = _mm256_setzero_ps();
-
-    for (int i = 0; i < nb; i++) {
-        const float m0 = GGML_GQ_TO_FP32(pm0[i]);
-        const float d0 = GGML_GQ_TO_FP32(pd0[i]);
-
-        const float m1 = GGML_GQ_TO_FP32(pm1[i]);
-        const float d1 = GGML_GQ_TO_FP32(pd1[i]);
-
-        const uint8_t * restrict p0 = pb0 + i*QK/2;
-        const uint8_t * restrict p1 = pb1 + i*QK/2;
-
-        const __m256 m0v = _mm256_set1_ps(m0);
-        const __m256 d0v = _mm256_set1_ps(d0);
-
-        const __m256 m1v = _mm256_set1_ps(m1);
-        const __m256 d1v = _mm256_set1_ps(d1);
-
-        const __m256i m4b = _mm256_set1_epi8(0xf);
-
-        __m256i v0 = _mm256_loadu_si256((__m256i *) p0);
-
-        //_mm_prefetch((const char *) (p0 + 32), _MM_HINT_T0);
-        //_mm_prefetch((const char *) (p1 + 32), _MM_HINT_T0);
-        //_mm_prefetch((const char *) (pm0 + i + 1), _MM_HINT_T0);
-        //_mm_prefetch((const char *) (pm1 + i + 1), _MM_HINT_T0);
-        //_mm_prefetch((const char *) (pd0 + i + 1), _MM_HINT_T0);
-        //_mm_prefetch((const char *) (pd1 + i + 1), _MM_HINT_T0);
-
-        __m256i v00 = _mm256_and_si256(v0, _mm256_set1_epi32(0x000000FF));
-        __m256i v01 = _mm256_srli_epi32(_mm256_and_si256(v0, _mm256_set1_epi32(0x0000FFFF)), 8);
-        __m256i v02 = _mm256_srli_epi32(_mm256_and_si256(v0, _mm256_set1_epi32(0x00FFFFFF)), 16);
-        __m256i v03 = _mm256_srli_epi32(v0, 24);
-
-        //////////////////////
-
-        //{
-        //    uint32_t vi_0 = _mm256_extract_epi32(v00, 0);
-        //    uint32_t vi_1 = _mm256_extract_epi32(v00, 1);
-        //    uint32_t vi_2 = _mm256_extract_epi32(v00, 2);
-        //    uint32_t vi_3 = _mm256_extract_epi32(v00, 3);
-        //    uint32_t vi_4 = _mm256_extract_epi32(v00, 4);
-        //    uint32_t vi_5 = _mm256_extract_epi32(v00, 5);
-        //    uint32_t vi_6 = _mm256_extract_epi32(v00, 6);
-        //    uint32_t vi_7 = _mm256_extract_epi32(v00, 7);
-        //    printf("v0: %7d %7d %7d %7d %7d %7d %7d %7d\n", vi_0, vi_1, vi_2, vi_3, vi_4, vi_5, vi_6, vi_7);
-        //    printf("p0: %7d %7d %7d %7d %7d %7d %7d %7d\n", p0[0], p0[4], p0[8], p0[12], p0[16], p0[20], p0[24], p0[28]);
-        //    printf("p1: %7d %7d %7d %7d %7d %7d %7d %7d\n", p0[1], p0[5], p0[9], p0[13], p0[17], p0[21], p0[25], p0[29]);
-        //    printf("p2: %7d %7d %7d %7d %7d %7d %7d %7d\n", p0[2], p0[6], p0[10], p0[14], p0[18], p0[22], p0[26], p0[30]);
-        //    printf("p3: %7d %7d %7d %7d %7d %7d %7d %7d\n", p0[3], p0[7], p0[11], p0[15], p0[19], p0[23], p0[27], p0[31]);
-        //}
-
-        // compute 32 x 4-bit values (low and high)
-        __m256i v00l = _mm256_and_si256(v00, m4b);
-        __m256i v01l = _mm256_and_si256(v01, m4b);
-        __m256i v02l = _mm256_and_si256(v02, m4b);
-        __m256i v03l = _mm256_and_si256(v03, m4b);
-
-        __m256i v00h = _mm256_srli_epi32(v00, 4);
-        __m256i v01h = _mm256_srli_epi32(v01, 4);
-        __m256i v02h = _mm256_srli_epi32(v02, 4);
-        __m256i v03h = _mm256_srli_epi32(v03, 4);
-
-        //{
-        //    uint32_t vi_0 = _mm256_extract_epi32(v00l, 0);
-        //    uint32_t vi_1 = _mm256_extract_epi32(v00l, 1);
-        //    uint32_t vi_2 = _mm256_extract_epi32(v00l, 2);
-        //    uint32_t vi_3 = _mm256_extract_epi32(v00l, 3);
-        //    uint32_t vi_4 = _mm256_extract_epi32(v00l, 4);
-        //    uint32_t vi_5 = _mm256_extract_epi32(v00l, 5);
-        //    uint32_t vi_6 = _mm256_extract_epi32(v00l, 6);
-        //    uint32_t vi_7 = _mm256_extract_epi32(v00l, 7);
-
-        //    printf("v0l: %7d %7d %7d %7d %7d %7d %7d %7d\n", vi_0, vi_1, vi_2, vi_3, vi_4, vi_5, vi_6, vi_7);
-
-        //    vi_0 = _mm256_extract_epi32(v00h, 0);
-        //    vi_1 = _mm256_extract_epi32(v00h, 1);
-        //    vi_2 = _mm256_extract_epi32(v00h, 2);
-        //    vi_3 = _mm256_extract_epi32(v00h, 3);
-        //    vi_4 = _mm256_extract_epi32(v00h, 4);
-        //    vi_5 = _mm256_extract_epi32(v00h, 5);
-        //    vi_6 = _mm256_extract_epi32(v00h, 6);
-        //    vi_7 = _mm256_extract_epi32(v00h, 7);
-
-        //    printf("v0h: %7d %7d %7d %7d %7d %7d %7d %7d\n", vi_0, vi_1, vi_2, vi_3, vi_4, vi_5, vi_6, vi_7);
-        //}
-
-        // convert to float
-        __m256 vf00l = _mm256_cvtepi32_ps(v00l);
-        __m256 vf01l = _mm256_cvtepi32_ps(v01l);
-        __m256 vf02l = _mm256_cvtepi32_ps(v02l);
-        __m256 vf03l = _mm256_cvtepi32_ps(v03l);
-
-        __m256 vf00h = _mm256_cvtepi32_ps(v00h);
-        __m256 vf01h = _mm256_cvtepi32_ps(v01h);
-        __m256 vf02h = _mm256_cvtepi32_ps(v02h);
-        __m256 vf03h = _mm256_cvtepi32_ps(v03h);
-
-        //{
-        //    printf("vf00l: %7.3f %7.3f %7.3f %7.3f %7.3f %7.3f %7.3f %7.3f\n", vf00l[0], vf00l[1], vf00l[2], vf00l[3], vf00l[4], vf00l[5], vf00l[6], vf00l[7]);
-        //    printf("vf01l: %7.3f %7.3f %7.3f %7.3f %7.3f %7.3f %7.3f %7.3f\n", vf01l[0], vf01l[1], vf01l[2], vf01l[3], vf01l[4], vf01l[5], vf01l[6], vf01l[7]);
-        //    printf("vf02l: %7.3f %7.3f %7.3f %7.3f %7.3f %7.3f %7.3f %7.3f\n", vf02l[0], vf02l[1], vf02l[2], vf02l[3], vf02l[4], vf02l[5], vf02l[6], vf02l[7]);
-        //    printf("vf03l: %7.3f %7.3f %7.3f %7.3f %7.3f %7.3f %7.3f %7.3f\n", vf03l[0], vf03l[1], vf03l[2], vf03l[3], vf03l[4], vf03l[5], vf03l[6], vf03l[7]);
-        //}
-
-        // multiply by scale and add offset
-        vf00l = _mm256_fmadd_ps(vf00l, d0v, m0v);
-        vf01l = _mm256_fmadd_ps(vf01l, d0v, m0v);
-        vf02l = _mm256_fmadd_ps(vf02l, d0v, m0v);
-        vf03l = _mm256_fmadd_ps(vf03l, d0v, m0v);
-
-        vf00h = _mm256_fmadd_ps(vf00h, d0v, m0v);
-        vf01h = _mm256_fmadd_ps(vf01h, d0v, m0v);
-        vf02h = _mm256_fmadd_ps(vf02h, d0v, m0v);
-        vf03h = _mm256_fmadd_ps(vf03h, d0v, m0v);
-
-        __m256i v1 = _mm256_loadu_si256((__m256i *) p1);
-
-        __m256i v10 = _mm256_and_si256(v1, _mm256_set1_epi32(0x000000FF));
-        __m256i v11 = _mm256_srli_epi32(_mm256_and_si256(v1, _mm256_set1_epi32(0x0000FFFF)), 8);
-        __m256i v12 = _mm256_srli_epi32(_mm256_and_si256(v1, _mm256_set1_epi32(0x00FFFFFF)), 16);
-        __m256i v13 = _mm256_srli_epi32(v1, 24);
-
-        __m256i v10l = _mm256_and_si256(v10, m4b);
-        __m256i v11l = _mm256_and_si256(v11, m4b);
-        __m256i v12l = _mm256_and_si256(v12, m4b);
-        __m256i v13l = _mm256_and_si256(v13, m4b);
-
-        __m256i v10h = _mm256_srli_epi32(v10, 4);
-        __m256i v11h = _mm256_srli_epi32(v11, 4);
-        __m256i v12h = _mm256_srli_epi32(v12, 4);
-        __m256i v13h = _mm256_srli_epi32(v13, 4);
-
-        __m256 vf10l = _mm256_cvtepi32_ps(v10l);
-        __m256 vf11l = _mm256_cvtepi32_ps(v11l);
-        __m256 vf12l = _mm256_cvtepi32_ps(v12l);
-        __m256 vf13l = _mm256_cvtepi32_ps(v13l);
-
-        __m256 vf10h = _mm256_cvtepi32_ps(v10h);
-        __m256 vf11h = _mm256_cvtepi32_ps(v11h);
-        __m256 vf12h = _mm256_cvtepi32_ps(v12h);
-        __m256 vf13h = _mm256_cvtepi32_ps(v13h);
-
-        vf10l = _mm256_fmadd_ps(vf10l, d1v, m1v);
-        vf11l = _mm256_fmadd_ps(vf11l, d1v, m1v);
-        vf12l = _mm256_fmadd_ps(vf12l, d1v, m1v);
-        vf13l = _mm256_fmadd_ps(vf13l, d1v, m1v);
-
-        vf10h = _mm256_fmadd_ps(vf10h, d1v, m1v);
-        vf11h = _mm256_fmadd_ps(vf11h, d1v, m1v);
-        vf12h = _mm256_fmadd_ps(vf12h, d1v, m1v);
-        vf13h = _mm256_fmadd_ps(vf13h, d1v, m1v);
-
-        // compute dot product
-        sumv0 = _mm256_fmadd_ps(vf00l, vf10l, sumv0);
-        sumv0 = _mm256_fmadd_ps(vf01l, vf11l, sumv0);
-        sumv0 = _mm256_fmadd_ps(vf02l, vf12l, sumv0);
-        sumv0 = _mm256_fmadd_ps(vf03l, vf13l, sumv0);
-
-        sumv1 = _mm256_fmadd_ps(vf00h, vf10h, sumv1);
-        sumv1 = _mm256_fmadd_ps(vf01h, vf11h, sumv1);
-        sumv1 = _mm256_fmadd_ps(vf02h, vf12h, sumv1);
-        sumv1 = _mm256_fmadd_ps(vf03h, vf13h, sumv1);
-    }
-
-    // accumulate (horizontal sum)
-    const __m256 vdot = _mm256_add_ps(sumv0, sumv1);
-    const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(vdot), _mm256_extractf128_ps(vdot, 1));
-    const __m128 t1 = _mm_hadd_ps(t0, t0);
-
-    sumf += _mm_cvtss_f32(_mm_hadd_ps(t1, t1));
-#elif QK == 64 && 0
-    float sum00 = 0.0f;
-    float sum01 = 0.0f;
-    float sum10 = 0.0f;
-    float sum11 = 0.0f;
-
-    const __m256i m4b = _mm256_set1_epi8(0xf);
-
-    for (int i = 0; i < nb; i++) {
-        const float m0 = GGML_GQ_TO_FP32(pm0[i]);
-        const float d0 = GGML_GQ_TO_FP32(pd0[i]);
-
-        const float m1 = GGML_GQ_TO_FP32(pm1[i]);
-        const float d1 = GGML_GQ_TO_FP32(pd1[i]);
-
-        const uint8_t * restrict p0 = pb0 + i*QK/2;
-        const uint8_t * restrict p1 = pb1 + i*QK/2;
-
-        // 64 x 4
-        const __m256i v0 = _mm256_loadu_si256((__m256i *) p0);
-        const __m256i v1 = _mm256_loadu_si256((__m256i *) p1);
-
-        // 32 x 8
-        const __m256i v0l = _mm256_and_si256(v0, m4b);
-        const __m256i v1l = _mm256_and_si256(v1, m4b);
-
-        const __m256i v0h = _mm256_and_si256(_mm256_srli_epi16(v0, 4), m4b);
-        const __m256i v1h = _mm256_and_si256(_mm256_srli_epi16(v1, 4), m4b);
-
-        const __m256i pl = _mm256_maddubs_epi16(v0l, v1l);
-        const __m256i ph = _mm256_maddubs_epi16(v0h, v1h);
-
-        const __m256i p16 = _mm256_add_epi16(ph, pl);
-        const __m256i p = _mm256_madd_epi16(_mm256_set1_epi16(1), p16);
-
-        sum00 += m0*m1;
-        sum01 += m1*d0*(_mm256_hadd_epi8_gg(_mm256_add_epi8(v0l, v0h)));
-        sum10 += m0*d1*(_mm256_hadd_epi8_gg(_mm256_add_epi8(v1l, v1h)));
-        sum11 += d0*d1*(_mm256_hadd_epi32_gg(p));
-    }
-
-    sumf = 64.0*sum00 + sum01 + sum10 + sum11;
-#elif QK == 64 && 1 // this is the best when using min + d
-    float sum00 = 0.0f;
-
-    __m256 sum01 = _mm256_setzero_ps();
-    __m256 sum10 = _mm256_setzero_ps();
-    __m256 sum11 = _mm256_setzero_ps();
-
-    for (int i = 0; i < nb; i++) {
-        const float m0 = GGML_GQ_TO_FP32(pm0[i]);
-        const float d0 = GGML_GQ_TO_FP32(pd0[i]);
-
-        const float m1 = GGML_GQ_TO_FP32(pm1[i]);
-        const float d1 = GGML_GQ_TO_FP32(pd1[i]);
-
-        const uint8_t * restrict p0 = pb0 + i*QK/2;
-        const uint8_t * restrict p1 = pb1 + i*QK/2;
-
-        const __m256 m0v = _mm256_set1_ps(m0);
-        const __m256 d0v = _mm256_set1_ps(d0);
-
-        const __m256 m1v = _mm256_set1_ps(m1);
-        const __m256 d1v = _mm256_set1_ps(d1);
-
-        const __m256 m1d0v = _mm256_mul_ps(m1v, d0v);
-        const __m256 m0d1v = _mm256_mul_ps(m0v, d1v);
-        const __m256 d0d1v = _mm256_mul_ps(d0v, d1v);
-
-        const __m256i m4b = _mm256_set1_epi8(0xf);
-
-        // 64 x 4
-        const __m256i v0 = _mm256_loadu_si256((__m256i *) p0);
-        const __m256i v1 = _mm256_loadu_si256((__m256i *) p1);
-
-        // 32 x 8
-        const __m256i v0l = _mm256_and_si256(v0, m4b);
-        const __m256i v1l = _mm256_and_si256(v1, m4b);
-
-        const __m256i v0h = _mm256_and_si256(_mm256_srli_epi16(v0, 4), m4b);
-        const __m256i v1h = _mm256_and_si256(_mm256_srli_epi16(v1, 4), m4b);
-
-        const __m256i v0a = _mm256_add_epi8(v0l, v0h);
-        const __m256i v1a = _mm256_add_epi8(v1l, v1h);
-
-        const __m128i v0al = _mm256_extracti128_si256(v0a, 0);
-        const __m128i v0ah = _mm256_extracti128_si256(v0a, 1);
-
-        const __m128i v1al = _mm256_extracti128_si256(v1a, 0);
-        const __m128i v1ah = _mm256_extracti128_si256(v1a, 1);
-
-        const __m128i v0as = _mm_add_epi8(v0al, v0ah);
-        const __m128i v1as = _mm_add_epi8(v1al, v1ah);
-
-        const __m256i v0as_0 = _mm256_cvtepu8_epi32(v0as);
-        const __m256i v0as_1 = _mm256_cvtepu8_epi32(_mm_srli_si128(v0as, 8));
-
-        const __m256i v1as_0 = _mm256_cvtepu8_epi32(v1as);
-        const __m256i v1as_1 = _mm256_cvtepu8_epi32(_mm_srli_si128(v1as, 8));
-
-        const __m256i v0ass = _mm256_add_epi32(v0as_0, v0as_1);
-        const __m256i v1ass = _mm256_add_epi32(v1as_0, v1as_1);
-
-        const __m256 v0f = _mm256_cvtepi32_ps(v0ass);
-        const __m256 v1f = _mm256_cvtepi32_ps(v1ass);
-
-        const __m256i pl = _mm256_maddubs_epi16(v0l, v1l);
-        const __m256i ph = _mm256_maddubs_epi16(v0h, v1h);
-
-        const __m256i p16 = _mm256_add_epi16(ph, pl);
-        const __m256i p = _mm256_madd_epi16(_mm256_set1_epi16(1), p16);
-
-        sum00 += m0*m1;
-        sum01 = _mm256_fmadd_ps(m1d0v, v0f, sum01);
-        sum10 = _mm256_fmadd_ps(m0d1v, v1f, sum10);
-        sum11 = _mm256_fmadd_ps(d0d1v, _mm256_cvtepi32_ps(p), sum11);
-    }
-
-    sumf = 64.0*sum00 + _mm256_hadd_ps_gg(sum01) + _mm256_hadd_ps_gg(sum10) + _mm256_hadd_ps_gg(sum11);
-#endif
-#elif defined (__ARM_NEON)
-    float sum00 = 0.0f;
-    float sum01 = 0.0f;
-    float sum10 = 0.0f;
-    float sum11 = 0.0f;
-
-    for (int i = 0; i < nb; i++) {
-        const float m0 = GGML_GQ_TO_FP32(pm0[i]);
-        const float d0 = GGML_GQ_TO_FP32(pd0[i]);
-
-        const float m1 = GGML_GQ_TO_FP32(pm1[i]);
-        const float d1 = GGML_GQ_TO_FP32(pd1[i]);
-
-        const uint8_t * restrict p0 = pb0 + i*QK/2;
-        const uint8_t * restrict p1 = pb1 + i*QK/2;
-
-        const uint8x16_t m4b = vdupq_n_u8(0xf);
-
-        const uint8x16_t v0_0 = vld1q_u8(p0);
-        const uint8x16_t v0_1 = vld1q_u8(p0 + 16);
-        const uint8x16_t v1_0 = vld1q_u8(p1);
-        const uint8x16_t v1_1 = vld1q_u8(p1 + 16);
-
-        // and with 0xf
-        const uint8x16_t v0_0l = vandq_u8(v0_0, m4b);
-        const uint8x16_t v0_1l = vandq_u8(v0_1, m4b);
-        const uint8x16_t v1_0l = vandq_u8(v1_0, m4b);
-        const uint8x16_t v1_1l = vandq_u8(v1_1, m4b);
-
-        const uint8x16_t v0_0h = vshrq_n_u8(v0_0, 4);
-        const uint8x16_t v0_1h = vshrq_n_u8(v0_1, 4);
-        const uint8x16_t v1_0h = vshrq_n_u8(v1_0, 4);
-        const uint8x16_t v1_1h = vshrq_n_u8(v1_1, 4);
-
-        // dot product into uint16x8_t
-        const uint16x8_t pl0l = vmull_u8(vget_low_u8 (v0_0l), vget_low_u8 (v1_0l));
-        const uint16x8_t pl0h = vmull_u8(vget_high_u8(v0_0l), vget_high_u8(v1_0l));
-        const uint16x8_t pl1l = vmull_u8(vget_low_u8 (v0_1l), vget_low_u8 (v1_1l));
-        const uint16x8_t pl1h = vmull_u8(vget_high_u8(v0_1l), vget_high_u8(v1_1l));
-
-        const uint16x8_t ph0l = vmull_u8(vget_low_u8 (v0_0h), vget_low_u8 (v1_0h));
-        const uint16x8_t ph0h = vmull_u8(vget_high_u8(v0_0h), vget_high_u8(v1_0h));
-        const uint16x8_t ph1l = vmull_u8(vget_low_u8 (v0_1h), vget_low_u8 (v1_1h));
-        const uint16x8_t ph1h = vmull_u8(vget_high_u8(v0_1h), vget_high_u8(v1_1h));
-
-        const uint16x8_t pl0 = vaddq_u16(pl0l, pl0h);
-        const uint16x8_t pl1 = vaddq_u16(pl1l, pl1h);
-        const uint16x8_t ph0 = vaddq_u16(ph0l, ph0h);
-        const uint16x8_t ph1 = vaddq_u16(ph1l, ph1h);
-
-        const uint16x8_t pl = vaddq_u16(pl0, pl1);
-        const uint16x8_t ph = vaddq_u16(ph0, ph1);
-
-        sum00 += m0*m1;
-        sum01 += m1*d0*(vaddvq_u8(v0_0l) + vaddvq_u8(v0_0h) + vaddvq_u8(v0_1l) + vaddvq_u8(v0_1h));
-        sum10 += m0*d1*(vaddvq_u8(v1_0l) + vaddvq_u8(v1_0h) + vaddvq_u8(v1_1l) + vaddvq_u8(v1_1h));
-        //sum11 += d0*d1*(
-        //        vaddvq_u16(vaddq_u16(vaddq_u16(pl0l, pl0h), vaddq_u16(pl1l, pl1h))) +
-        //        vaddvq_u16(vaddq_u16(vaddq_u16(ph0l, ph0h), vaddq_u16(ph1l, ph1h))));
-        sum11 += d0*d1*vaddvq_u16(vaddq_u16(pl, ph));
-    }
-
-    sumf = 64.0*sum00 + sum01 + sum10 + sum11;
-#endif
-#endif
-
-    *s = sumf;
-}
-
-// use vec_dot_gq_4 to compute the dot product of two rows
-void mul_mat_gq_4(
-    const void * src0,
-    const void * src1, // transposed
-         float * dst,
-    int m, int n, int k) {
-    assert(k % QK == 0);
-
-    const int nb = quantize_4_blocks_per_row(k);
-
-    for (int ir0 = 0; ir0 < m; ir0++) {
-        for (int ir1 = 0; ir1 < n; ir1++) {
-            vec_dot_gq_4(k, dst + ir1, src0, src1);
-            src1 = (const char *) src1 + quantize_4_row_size(k);
-        }
-        src0 = (const char *) src0 +   quantize_4_row_size(k);
-        src1 = (const char *) src1 - n*quantize_4_row_size(k);
-
-        dst = (float *) dst + n;
-    }
-}
-
-//
-// method 5
-// 4-bit quantization (without min, only delta)
-//
-
-static inline int quantize_5_blocks_per_row(int k) {
-    return k/QK;
-}
-
-static inline int quantize_5_row_size(int k) {
-    const int nb = quantize_5_blocks_per_row(k);
-
-    return nb*(sizeof(gq_scale_t) + QK/2);
-}
-
-void quantize_5_row(const float * restrict src, void * restrict dst, int k) {
-    assert(k % QK == 0);
-    assert(QB == 4);
-
-    const int nb = quantize_5_blocks_per_row(k);
-
-    gq_scale_t * restrict pd = (gq_scale_t *) (dst);
-    uint8_t    * restrict pb = (uint8_t *)    (pd + nb);
-
-    uint8_t pp[QK/2];
-
-    for (int i = 0; i < nb; i++) {
-        memset(pp, 0, sizeof(pp));
-
-        float amax = 0.0f; // absolute max
-
-#if defined(__AVX2__)
-        {
-            assert(QK == 64);
-            enum { QK8 = QK/8 };
-
-            __m256 srcv [QK8];
-            __m256 asrcv[QK8];
-            __m256 amaxv[QK8];
-
-            for (int l = 0; l < QK8; l++) {
-                srcv[l]  = _mm256_loadu_ps(src + i*QK + 8*l);
-            }
-
-            for (int l = 0; l < QK8; l++) {
-                asrcv[l] = _mm256_and_ps(srcv[l], _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff)));
-            }
-
-
-            for (int l = 0; l < QK8/2; l++) {
-                amaxv[2*l] = _mm256_max_ps(asrcv[2*l], asrcv[2*l+1]);
-            }
-
-            for (int l = 0; l < QK8/4; l++) {
-                amaxv[4*l] = _mm256_max_ps(amaxv[4*l], amaxv[4*l+2]);
-            }
-
-            for (int l = 0; l < QK8/8; l++) {
-                amaxv[8*l] = _mm256_max_ps(amaxv[8*l], amaxv[8*l+4]);
-            }
-
-            //amax = MAX(amaxv[0][0], MAX(amaxv[0][1], MAX(amaxv[0][2], MAX(amaxv[0][3], MAX(amaxv[0][4], MAX(amaxv[0][5], MAX(amaxv[0][6], amaxv[0][7])))))));
-
-            const __m256 amaxv0_0 = _mm256_permute2f128_ps(amaxv[0], amaxv[0], 3);
-            const __m256 amaxv0_1 = _mm256_max_ps(amaxv[0], amaxv0_0);
-            const __m256 amaxv0_2 = _mm256_permute_ps(amaxv0_1, 0x4e);
-            const __m256 amaxv0_3 = _mm256_max_ps(amaxv0_1, amaxv0_2);
-            const __m256 amaxv0_4 = _mm256_permute_ps(amaxv0_3, 0xb1);
-            const __m256 amaxv0_5 = _mm256_max_ps(amaxv0_3, amaxv0_4);
-
-            amax = _mm256_cvtss_f32(amaxv0_5);
-
-            //printf("amax = %f\n", amax);
-
-            const float d = amax / ((1 << (QB - 1)) - 1);
-            const float id = d ? 1.0/d : 0.0;
-
-            pd[i] = GGML_FP32_TO_GQ(d);
-
-            const __m256 idv = _mm256_set1_ps(id);
-
-            for (int l = 0; l < QK/8; l++) {
-                __m256 v = _mm256_mul_ps(srcv[l], idv);
-#if 0
-                v[0] += frand(); v[1] += frand(); v[2] += frand(); v[3] += frand();
-                v[4] += frand(); v[5] += frand(); v[6] += frand(); v[7] += frand();
-#endif
-
-                // convert to int8
-                __m256i vi = _mm256_cvtps_epi32(v);
-                vi = _mm256_add_epi32(vi, _mm256_set1_epi32(8));
-
-                int32_t vi_0 = _mm256_extract_epi32(vi, 0);
-                int32_t vi_1 = _mm256_extract_epi32(vi, 1);
-                int32_t vi_2 = _mm256_extract_epi32(vi, 2);
-                int32_t vi_3 = _mm256_extract_epi32(vi, 3);
-
-                int32_t vi_4 = _mm256_extract_epi32(vi, 4);
-                int32_t vi_5 = _mm256_extract_epi32(vi, 5);
-                int32_t vi_6 = _mm256_extract_epi32(vi, 6);
-                int32_t vi_7 = _mm256_extract_epi32(vi, 7);
-
-                // convert to 4-bit, 2 consecutive packed into 1 byte
-                pp[4*l + 0] = vi_0 | (vi_1 << 4);
-                pp[4*l + 1] = vi_2 | (vi_3 << 4);
-                pp[4*l + 2] = vi_4 | (vi_5 << 4);
-                pp[4*l + 3] = vi_6 | (vi_7 << 4);
-
-                //printf("vi: %7d %7d %7d %7d %7d %7d %7d %7d\n", vi_0, vi_1, vi_2, vi_3, vi_4, vi_5, vi_6, vi_7);
-                ////printf("v : %7.3f %7.3f %7.3f %7.3f %7.3f %7.3f %7.3f %7.3f\n", v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
-
-                assert(vi_0 >= 0 && vi_0 < 16);
-                assert(vi_1 >= 0 && vi_1 < 16);
-                assert(vi_2 >= 0 && vi_2 < 16);
-                assert(vi_3 >= 0 && vi_3 < 16);
-
-                assert(vi_4 >= 0 && vi_4 < 16);
-                assert(vi_5 >= 0 && vi_5 < 16);
-                assert(vi_6 >= 0 && vi_6 < 16);
-                assert(vi_7 >= 0 && vi_7 < 16);
-            }
-
-            memcpy(pb + i*QK/2, pp, sizeof(pp));
-        }
-#elif defined(__ARM_NEON) && 0
-        {
-            // TODO
-        }
-#else
-        {
-            for (int l = 0; l < QK; l++) {
-                const float v = src[i*QK + l];
-                amax = MAX(amax, fabsf(v));
-            }
-
-            const float d = amax / ((1 << (QB - 1)) - 1);
-            const float id = d ? 1.0/d : 0.0;
-
-            pd[i] = GGML_FP32_TO_GQ(d);
-
-            for (int l = 0; l < QK; l++) {
-                const float v = src[i*QK + l]*id;
-                const int8_t vi = ((int8_t) (round(v))) + 8;
-                assert(vi >= 0 && vi < 16);
-                pp[l/2] |= (vi & 0xf) << (4*(l & 1));
-            }
-
-            memcpy(pb + i*QK/2, pp, sizeof(pp));
-        }
-#endif
-        //printf("min %f max %f\n", min, max);
-    }
-}
-
-// reimplementation of quantize_5 using quantize_5_row
-void quantize_5(const float * restrict src, char * restrict dst, int n, int k) {
-    assert(k % QK == 0);
-
-    for (int j = 0; j < n; j++) {
-        quantize_5_row(src + j*k, dst, k);
-        dst = (char *) dst + quantize_5_row_size(k);
-    }
-}
-
-void vec_dot_gq_5(const int n, float * restrict s, const void * restrict x, const void * restrict y) {
-    const int nb = quantize_5_blocks_per_row(n);
-
-    const gq_scale_t * restrict pd0 = (const gq_scale_t *) x;
-    const gq_scale_t * restrict pd1 = (const gq_scale_t *) y;
-
-    const uint8_t * restrict pb0 = (const uint8_t *) (pd0 + nb);
-    const uint8_t * restrict pb1 = (const uint8_t *) (pd1 + nb);
-
-    float sumf = 0.0;
-
-#if 0
-    // scalar
-    for (int i = 0; i < nb; i++) {
-        const float d0 = GGML_GQ_TO_FP32(pd0[i]);
-        const float d1 = GGML_GQ_TO_FP32(pd1[i]);
-
-        const uint8_t * restrict p0 = pb0 + i*QK/2;
-        const uint8_t * restrict p1 = pb1 + i*QK/2;
-
-        for (int j = 0; j < QK/2; j++) {
-            const uint8_t v0 = p0[j];
-            const uint8_t v1 = p1[j];
-
-            const float f0 = d0*((int8_t) (v0 & 0xf) - 8);
-            const float f1 = d0*((int8_t) (v0 >> 4)  - 8);
-
-            const float f2 = d1*((int8_t) (v1 & 0xf) - 8);
-            const float f3 = d1*((int8_t) (v1 >> 4)  - 8);
-
-            sumf += f0*f2 + f1*f3;
-        }
-    }
-#else
-#if defined(__AVX2__)
-#if QK == 64 && 1
-    __m256 sum11 = _mm256_setzero_ps();
-
-    for (int i = 0; i < nb; i++) {
-        const float d0 = GGML_GQ_TO_FP32(pd0[i]);
-        const float d1 = GGML_GQ_TO_FP32(pd1[i]);
-
-        const uint8_t * restrict p0 = pb0 + i*QK/2;
-        const uint8_t * restrict p1 = pb1 + i*QK/2;
-
-        const __m256 d0v = _mm256_set1_ps(d0);
-        const __m256 d1v = _mm256_set1_ps(d1);
-
-        const __m256 d0d1v = _mm256_mul_ps(d0v, d1v);
-
-        const __m256i m4b = _mm256_set1_epi8(0xf);
-
-        // 64 x 4
-        const __m256i v0 = _mm256_loadu_si256((__m256i *) p0);
-        const __m256i v1 = _mm256_loadu_si256((__m256i *) p1);
-
-        // 32 x 8
-        __m256i v0l = _mm256_and_si256(v0, m4b);
-        __m256i v1l = _mm256_and_si256(v1, m4b);
-
-        __m256i v0h = _mm256_and_si256(_mm256_srli_epi16(v0, 4), m4b);
-        __m256i v1h = _mm256_and_si256(_mm256_srli_epi16(v1, 4), m4b);
-
-        // sub 8
-        v0l = _mm256_sub_epi8(v0l, _mm256_set1_epi8(8));
-        v0h = _mm256_sub_epi8(v0h, _mm256_set1_epi8(8));
-
-        v1l = _mm256_sub_epi8(v1l, _mm256_set1_epi8(8));
-        v1h = _mm256_sub_epi8(v1h, _mm256_set1_epi8(8));
-
-        // abs
-        const __m256i v0la = _mm256_sign_epi8(v0l, v0l);
-        const __m256i v0ha = _mm256_sign_epi8(v0h, v0h);
-
-        // sign
-        const __m256i v1ls = _mm256_sign_epi8(v1l, v0l);
-        const __m256i v1hs = _mm256_sign_epi8(v1h, v0h);
-
-        const __m256i pl = _mm256_maddubs_epi16(v0la, v1ls);
-        const __m256i ph = _mm256_maddubs_epi16(v0ha, v1hs);
-
-        const __m256i p16 = _mm256_add_epi16(ph, pl);
-        const __m256i p = _mm256_madd_epi16(_mm256_set1_epi16(1), p16);
-
-        sum11 = _mm256_fmadd_ps(d0d1v, _mm256_cvtepi32_ps(p), sum11);
-    }
-
-    sumf = _mm256_hadd_ps_gg(sum11);
-#endif
-#elif defined (__ARM_NEON)
-    float sum11 = 0.0f;
-
-    //float32x4_t sum_0 = vdupq_n_f32(0.0f);
-    //float32x4_t sum_1 = vdupq_n_f32(0.0f);
-
-    //float16x8_t sum_0 = vdupq_n_f16(0.0f);
-    //float16x8_t sum_1 = vdupq_n_f16(0.0f);
-
-    for (int i = 0; i < nb; i++) {
-        const float d0 = GGML_GQ_TO_FP32(pd0[i]);
-        const float d1 = GGML_GQ_TO_FP32(pd1[i]);
-
-        //float32x4_t d0d1v = vdupq_n_f32(d0*d1);
-        //float16x8_t d0d1v = vdupq_n_f16(d0*d1);
-
-        const uint8_t * restrict p0 = pb0 + i*QK/2;
-        const uint8_t * restrict p1 = pb1 + i*QK/2;
-
-        const uint8x16_t m4b = vdupq_n_u8(0xf);
-        const int8x16_t s8b = vdupq_n_s8(0x8);
-
-        const uint8x16_t v0_0 = vld1q_u8(p0);
-        const uint8x16_t v0_1 = vld1q_u8(p0 + 16);
-        const uint8x16_t v1_0 = vld1q_u8(p1);
-        const uint8x16_t v1_1 = vld1q_u8(p1 + 16);
-
-        // 4-bit -> 8-bit
-        const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8(v0_0, m4b));
-        const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8(v0_1, m4b));
-        const int8x16_t v1_0l = vreinterpretq_s8_u8(vandq_u8(v1_0, m4b));
-        const int8x16_t v1_1l = vreinterpretq_s8_u8(vandq_u8(v1_1, m4b));
-
-        const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
-        const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
-        const int8x16_t v1_0h = vreinterpretq_s8_u8(vshrq_n_u8(v1_0, 4));
-        const int8x16_t v1_1h = vreinterpretq_s8_u8(vshrq_n_u8(v1_1, 4));
-
-        // sub 8
-        const int8x16_t v0_0ls = vsubq_s8(v0_0l, s8b);
-        const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b);
-        const int8x16_t v1_0ls = vsubq_s8(v1_0l, s8b);
-        const int8x16_t v1_1ls = vsubq_s8(v1_1l, s8b);
-
-        const int8x16_t v0_0hs = vsubq_s8(v0_0h, s8b);
-        const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b);
-        const int8x16_t v1_0hs = vsubq_s8(v1_0h, s8b);
-        const int8x16_t v1_1hs = vsubq_s8(v1_1h, s8b);
-
-        // dot product into int16x8_t
-        const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0ls));
-        const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0ls));
-        const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1ls), vget_low_s8 (v1_1ls));
-        const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1ls), vget_high_s8(v1_1ls));
-
-        const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0hs));
-        const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0hs));
-        const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hs), vget_low_s8 (v1_1hs));
-        const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hs), vget_high_s8(v1_1hs));
-
-        const int16x8_t pl0 = vaddq_s16(pl0l, pl0h);
-        const int16x8_t pl1 = vaddq_s16(pl1l, pl1h);
-        const int16x8_t ph0 = vaddq_s16(ph0l, ph0h);
-        const int16x8_t ph1 = vaddq_s16(ph1l, ph1h);
-
-        const int16x8_t pl = vaddq_s16(pl0, pl1);
-        const int16x8_t ph = vaddq_s16(ph0, ph1);
-
-        //const int8x16_t pl0 = vmulq_s8(v0_0ls, v1_0ls);
-        //const int8x16_t pl1 = vmulq_s8(v0_1ls, v1_1ls);
-        //const int8x16_t ph0 = vmulq_s8(v0_0hs, v1_0hs);
-        //const int8x16_t ph1 = vmulq_s8(v0_1hs, v1_1hs);
-
-        //const int16x8_t pll = vaddl_s8(vget_low_s8(pl0),  vget_low_s8(pl1));
-        //const int16x8_t plh = vaddl_s8(vget_high_s8(pl0), vget_high_s8(pl1));
-        //const int16x8_t phl = vaddl_s8(vget_low_s8(ph0),  vget_low_s8(ph1));
-        //const int16x8_t phh = vaddl_s8(vget_high_s8(ph0), vget_high_s8(ph1));
-
-        //const int16x8_t pl = vaddq_s16(pll, plh);
-        //const int16x8_t ph = vaddq_s16(phl, phh);
-
-        const int16x8_t p = vaddq_s16(pl, ph);
-
-        // convert to float
-        //const float32x4_t pf0 = vcvtq_f32_s32(vmovl_s16(vget_low_s16 (p)));
-        //const float32x4_t pf1 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(p)));
-
-        // scalar
-        sum11 += d0*d1*vaddvq_s16(p);
-        //sum11 += d0*d1*(vaddvq_s16(pl) + vaddvq_s16(ph));
-        //sum11 += d0*d1*vaddvq_s16(vaddq_s16(pl, ph));
-        //sum11 += d0*d1*(vaddvq_s8(pl0) + vaddvq_s8(pl1) + vaddvq_s8(ph0) + vaddvq_s8(ph1));
-        //sum11 += d0*d1*(vaddvq_s16(pll) + vaddvq_s16(plh) + vaddvq_s16(phl) + vaddvq_s16(phh));
-
-        //sum_0 = vfmaq_f16(sum_0, d0d1v, vcvtq_f16_s16(p));
-        //sum_0 = vfmaq_f16(sum_0, d0d1v, vcvtq_f16_s16(pl));
-        //sum_1 = vfmaq_f16(sum_1, d0d1v, vcvtq_f16_s16(ph));
-
-        // vectorize
-        //sum_0 = vmlaq_f32(sum_0, d0d1v, pf0);
-        //sum_1 = vmlaq_f32(sum_1, d0d1v, pf1);
-    }
-
-    sumf = sum11;
-    //sumf = vaddvq_f32(sum_0) + vaddvq_f32(sum_1);
-    //sumf = sum_0[0] + sum_0[1] + sum_0[2] + sum_0[3] + sum_0[4] + sum_0[5] + sum_0[6] + sum_0[7];
-    //sum_0 = vaddq_f16(sum_0, sum_1);
-    //sumf = sum_0[0] + sum_0[1] + sum_0[2] + sum_0[3] + sum_0[4] + sum_0[5] + sum_0[6] + sum_0[7];
-#endif
-#endif
-
-    *s = sumf;
-}
-
-// use vec_dot_gq_5 to compute the dot product of two rows
-void mul_mat_gq_5(
-    const void * src0,
-    const void * src1, // transposed
-         float * dst,
-    int m, int n, int k) {
-    assert(k % QK == 0);
-
-    const int nb = quantize_5_blocks_per_row(k);
-
-    for (int ir0 = 0; ir0 < m; ir0++) {
-        for (int ir1 = 0; ir1 < n; ir1++) {
-            vec_dot_gq_5(k, dst + ir1, src0, src1);
-            src1 = (const char *) src1 + quantize_5_row_size(k);
-        }
-        src0 = (const char *) src0 +   quantize_5_row_size(k);
-        src1 = (const char *) src1 - n*quantize_5_row_size(k);
-
-        dst = (float *) dst + n;
-    }
-}
-
-//
-// method 6
-// same as 5 but with 32 element blocks
-//
-
-static inline int quantize_6_blocks_per_row(int k) {
-    return k/32;
-}
-
-static inline int quantize_6_row_size(int k) {
-    const int nb = quantize_6_blocks_per_row(k);
-
-    return nb*(sizeof(gq_scale_t) + 16);
-}
-
-void quantize_6_row(const float * restrict src, void * restrict dst, int k) {
-    assert(k % 32 == 0);
-    assert(QB == 4);
-
-    const int nb = quantize_6_blocks_per_row(k);
-
-    gq_scale_t * restrict pd = (gq_scale_t *) (dst);
-    uint8_t    * restrict pb = (uint8_t *)    (pd + nb);
-
-    uint8_t pp[16];
-
-    for (int i = 0; i < nb; i++) {
-        memset(pp, 0, sizeof(pp));
-
-        float amax = 0.0f; // absolute max
-
-#if defined(__AVX2__)
-        {
-            enum { QK8 = 4 };
-
-            __m256 srcv [QK8];
-            __m256 asrcv[QK8];
-            __m256 amaxv[QK8];
-
-            for (int l = 0; l < QK8; l++) {
-                srcv[l]  = _mm256_loadu_ps(src + i*32 + 8*l);
-            }
-
-            for (int l = 0; l < QK8; l++) {
-                asrcv[l] = _mm256_and_ps(srcv[l], _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff)));
-            }
-
-            for (int l = 0; l < QK8/2; l++) {
-                amaxv[2*l] = _mm256_max_ps(asrcv[2*l], asrcv[2*l+1]);
-            }
-
-            for (int l = 0; l < QK8/4; l++) {
-                amaxv[4*l] = _mm256_max_ps(amaxv[4*l], amaxv[4*l+2]);
-            }
-
-            const __m256 amaxv0_0 = _mm256_permute2f128_ps(amaxv[0], amaxv[0], 3);
-            const __m256 amaxv0_1 = _mm256_max_ps(amaxv[0], amaxv0_0);
-            const __m256 amaxv0_2 = _mm256_permute_ps(amaxv0_1, 0x4e);
-            const __m256 amaxv0_3 = _mm256_max_ps(amaxv0_1, amaxv0_2);
-            const __m256 amaxv0_4 = _mm256_permute_ps(amaxv0_3, 0xb1);
-            const __m256 amaxv0_5 = _mm256_max_ps(amaxv0_3, amaxv0_4);
-
-            amax = _mm256_cvtss_f32(amaxv0_5);
-
-            const float d = amax / ((1 << (QB - 1)) - 1);
-            const float id = d ? 1.0/d : 0.0;
-
-            pd[i] = GGML_FP32_TO_GQ(d);
-
-            const __m256 idv = _mm256_set1_ps(id);
-
-            for (int l = 0; l < 4; l++) {
-                __m256 v = _mm256_mul_ps(srcv[l], idv);
-
-                // convert to int8
-                __m256i vi = _mm256_cvtps_epi32(v);
-                vi = _mm256_add_epi32(vi, _mm256_set1_epi32(8));
-
-                int32_t vi_0 = _mm256_extract_epi32(vi, 0);
-                int32_t vi_1 = _mm256_extract_epi32(vi, 1);
-                int32_t vi_2 = _mm256_extract_epi32(vi, 2);
-                int32_t vi_3 = _mm256_extract_epi32(vi, 3);
-
-                int32_t vi_4 = _mm256_extract_epi32(vi, 4);
-                int32_t vi_5 = _mm256_extract_epi32(vi, 5);
-                int32_t vi_6 = _mm256_extract_epi32(vi, 6);
-                int32_t vi_7 = _mm256_extract_epi32(vi, 7);
-
-                // convert to 4-bit, 2 consecutive packed into 1 byte
-                pp[4*l + 0] = vi_0 | (vi_1 << 4);
-                pp[4*l + 1] = vi_2 | (vi_3 << 4);
-                pp[4*l + 2] = vi_4 | (vi_5 << 4);
-                pp[4*l + 3] = vi_6 | (vi_7 << 4);
-
-                assert(vi_0 >= 0 && vi_0 < 16);
-                assert(vi_1 >= 0 && vi_1 < 16);
-                assert(vi_2 >= 0 && vi_2 < 16);
-                assert(vi_3 >= 0 && vi_3 < 16);
-
-                assert(vi_4 >= 0 && vi_4 < 16);
-                assert(vi_5 >= 0 && vi_5 < 16);
-                assert(vi_6 >= 0 && vi_6 < 16);
-                assert(vi_7 >= 0 && vi_7 < 16);
-            }
-
-            memcpy(pb + i*16, pp, sizeof(pp));
-        }
-#elif defined(__ARM_NEON)
-        {
-            float32x4_t srcv [8];
-            float32x4_t asrcv[8];
-            float32x4_t amaxv[8];
-
-            for (int l = 0; l < 8; l++) srcv[l]  = vld1q_f32(src + i*32 + 4*l);
-            for (int l = 0; l < 8; l++) asrcv[l] = vabsq_f32(srcv[l]);
-
-            for (int l = 0; l < 4; l++) amaxv[2*l] = vmaxq_f32(asrcv[2*l], asrcv[2*l+1]);
-            for (int l = 0; l < 2; l++) amaxv[4*l] = vmaxq_f32(amaxv[4*l], amaxv[4*l+2]);
-            for (int l = 0; l < 1; l++) amaxv[8*l] = vmaxq_f32(amaxv[8*l], amaxv[8*l+4]);
-
-            amax = MAX(
-                    MAX(vgetq_lane_f32(amaxv[0], 0), vgetq_lane_f32(amaxv[0], 1)),
-                    MAX(vgetq_lane_f32(amaxv[0], 2), vgetq_lane_f32(amaxv[0], 3)));
-
-            const float d = amax / ((1 << 3) - 1);
-            const float id = d ? 1.0/d : 0.0;
-
-            pd[i] = GGML_FP32_TO_GQ(d);
-
-            for (int l = 0; l < 8; l++) {
-                const float32x4_t v = vmulq_n_f32(srcv[l], id);
-                const float32x4_t vf = vaddq_f32(v, vdupq_n_f32(8.5f));
-                const int32x4_t vi = vcvtq_s32_f32(vf);
-
-                pp[2*l + 0] = vgetq_lane_s32(vi, 0) | (vgetq_lane_s32(vi, 1) << 4);
-                pp[2*l + 1] = vgetq_lane_s32(vi, 2) | (vgetq_lane_s32(vi, 3) << 4);
-            }
-
-            memcpy(pb + i*16, pp, sizeof(pp));
-        }
-#else
-        {
-            for (int l = 0; l < 32; l++) {
-                const float v = src[i*32 + l];
-                amax = MAX(amax, fabsf(v));
-            }
-
-            const float d = amax / ((1 << (QB - 1)) - 1);
-            const float id = d ? 1.0/d : 0.0;
-
-            pd[i] = GGML_FP32_TO_GQ(d);
-
-            for (int l = 0; l < 32; l++) {
-                const float v = src[i*32 + l]*id;
-                const int8_t vi = ((int8_t) (round(v))) + 8;
-                assert(vi >= 0 && vi < 16);
-                pp[l/2] |= (vi & 0xf) << (4*(l & 1));
-            }
-
-            memcpy(pb + i*16, pp, sizeof(pp));
-        }
-#endif
-        //printf("amax = %f\n", amax);
-    }
-}
-
-// reimplementation of quantize__6using quantize_6_row
-void quantize_6(const float * restrict src, char * restrict dst, int n, int k) {
-    assert(k % 32 == 0);
-
-    for (int j = 0; j < n; j++) {
-        quantize_6_row(src + j*k, dst, k);
-        dst = (char *) dst + quantize_6_row_size(k);
-    }
-}
-
-void vec_dot_gq_6(const int n, float * restrict s, const void * restrict x, const void * restrict y) {
-    const int nb = quantize_6_blocks_per_row(n);
-
-    const gq_scale_t * restrict pd0 = (const gq_scale_t *) x;
-    const gq_scale_t * restrict pd1 = (const gq_scale_t *) y;
-
-    const uint8_t * restrict pb0 = (const uint8_t *) (pd0 + nb);
-    const uint8_t * restrict pb1 = (const uint8_t *) (pd1 + nb);
-
-    float sumf = 0.0;
-
-#if 0
-    // scalar
-    for (int i = 0; i < nb; i++) {
-        const float d0 = GGML_GQ_TO_FP32(pd0[i]);
-        const float d1 = GGML_GQ_TO_FP32(pd1[i]);
-
-        const uint8_t * restrict p0 = pb0 + i*16;
-        const uint8_t * restrict p1 = pb1 + i*16;
-
-        for (int j = 0; j < 16; j++) {
-            const uint8_t v0 = p0[j];
-            const uint8_t v1 = p1[j];
-
-            const float f0 = d0*((int8_t) (v0 & 0xf) - 8);
-            const float f1 = d0*((int8_t) (v0 >> 4)  - 8);
-
-            const float f2 = d1*((int8_t) (v1 & 0xf) - 8);
-            const float f3 = d1*((int8_t) (v1 >> 4)  - 8);
-
-            sumf += f0*f2 + f1*f3;
-        }
-    }
-#else
-#if defined(__AVX2__)
-    // TODO
-#elif defined (__ARM_NEON)
-#if 0
-    float sum0 = 0.0f;
-
-    for (int i = 0; i < nb; i++) {
-        const float d0 = GGML_GQ_TO_FP32(pd0[i]);
-        const float d1 = GGML_GQ_TO_FP32(pd1[i]);
-
-        //float32x4_t d0d1v = vdupq_n_f32(d0*d1);
-        //float16x8_t d0d1v = vdupq_n_f16(d0*d1);
-
-        const uint8_t * restrict p0 = pb0 + i*16;
-        const uint8_t * restrict p1 = pb1 + i*16;
-
-        const uint8x16_t m4b = vdupq_n_u8(0xf);
-        const int8x16_t  s8b = vdupq_n_s8(0x8);
-
-        const uint8x16_t v0_0 = vld1q_u8(p0);
-        const uint8x16_t v1_0 = vld1q_u8(p1);
-
-        // 4-bit -> 8-bit
-        const uint8x16_t v0_0l = vandq_u8(v0_0, m4b);
-        const uint8x16_t v1_0l = vandq_u8(v1_0, m4b);
-
-        const uint8x16_t v0_0h = vshrq_n_u8(v0_0, 4);
-        const uint8x16_t v1_0h = vshrq_n_u8(v1_0, 4);
-
-        // sub 8
-        const int8x16_t v0_0ls = vsubq_s8(v0_0l, s8b);
-        const int8x16_t v1_0ls = vsubq_s8(v1_0l, s8b);
-
-        const int8x16_t v0_0hs = vsubq_s8(v0_0h, s8b);
-        const int8x16_t v1_0hs = vsubq_s8(v1_0h, s8b);
-
-        // dot product into int16x8_t
-        const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0ls));
-        const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0ls));
-
-        const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0hs));
-        const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0hs));
-
-        const int16x8_t pl = vaddq_s16(pl0l, pl0h);
-        const int16x8_t ph = vaddq_s16(ph0l, ph0h);
-
-        const int16x8_t p = vaddq_s16(pl, ph);
-
-        // scalar
-        sum0 += d0*d1*vaddvq_s16(p);
-    }
-
-    sumf = sum0;
-#elif 1 // this is a bit faster than the above
-    float sum0 = 0.0f;
-    float sum1 = 0.0f;
-
-    for (int i = 0; i < nb; i += 2) {
-        const float d0_0 = GGML_GQ_TO_FP32(pd0[i + 0]);
-        const float d1_0 = GGML_GQ_TO_FP32(pd1[i + 0]);
-        const float d0_1 = GGML_GQ_TO_FP32(pd0[i + 1]);
-        const float d1_1 = GGML_GQ_TO_FP32(pd1[i + 1]);
-
-        const uint8_t * restrict p0 = pb0 + i*16;
-        const uint8_t * restrict p1 = pb1 + i*16;
-
-        const uint8x16_t m4b = vdupq_n_u8(0xf);
-        const int8x16_t s8b = vdupq_n_s8(0x8);
-
-        const uint8x16_t v0_0 = vld1q_u8(p0);
-        const uint8x16_t v0_1 = vld1q_u8(p0 + 16);
-        const uint8x16_t v1_0 = vld1q_u8(p1);
-        const uint8x16_t v1_1 = vld1q_u8(p1 + 16);
-
-        // 4-bit -> 8-bit
-        const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8(v0_0, m4b));
-        const int8x16_t v1_0l = vreinterpretq_s8_u8(vandq_u8(v1_0, m4b));
-
-        const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
-        const int8x16_t v1_0h = vreinterpretq_s8_u8(vshrq_n_u8(v1_0, 4));
-
-        const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8(v0_1, m4b));
-        const int8x16_t v1_1l = vreinterpretq_s8_u8(vandq_u8(v1_1, m4b));
-
-        const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
-        const int8x16_t v1_1h = vreinterpretq_s8_u8(vshrq_n_u8(v1_1, 4));
-
-        // sub 8
-        const int8x16_t v0_0ls = vsubq_s8(v0_0l, s8b);
-        const int8x16_t v1_0ls = vsubq_s8(v1_0l, s8b);
-
-        const int8x16_t v0_0hs = vsubq_s8(v0_0h, s8b);
-        const int8x16_t v1_0hs = vsubq_s8(v1_0h, s8b);
-
-        const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b);
-        const int8x16_t v1_1ls = vsubq_s8(v1_1l, s8b);
-
-        const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b);
-        const int8x16_t v1_1hs = vsubq_s8(v1_1h, s8b);
-
-        // dot product into int16x8_t
-        const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0ls));
-        const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0ls));
-
-        const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0hs));
-        const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0hs));
-
-        const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1ls), vget_low_s8 (v1_1ls));
-        const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1ls), vget_high_s8(v1_1ls));
-
-        const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hs), vget_low_s8 (v1_1hs));
-        const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hs), vget_high_s8(v1_1hs));
-
-        const int16x8_t pl_0 = vaddq_s16(pl0l, pl0h);
-        const int16x8_t ph_0 = vaddq_s16(ph0l, ph0h);
-
-        const int16x8_t pl_1 = vaddq_s16(pl1l, pl1h);
-        const int16x8_t ph_1 = vaddq_s16(ph1l, ph1h);
-
-        const int16x8_t p_0 = vaddq_s16(pl_0, ph_0);
-        const int16x8_t p_1 = vaddq_s16(pl_1, ph_1);
-
-        // scalar
-        sum0 += d0_0*d1_0*vaddvq_s16(p_0);
-        sum1 += d0_1*d1_1*vaddvq_s16(p_1);
-    }
-
-    sumf = sum0 + sum1;
-#endif
-#endif
-#endif
-
-    *s = sumf;
-}
-
-// use vec_dot_gq_6 to compute the dot product of two rows
-void mul_mat_gq_6(
-    const void * src0,
-    const void * src1, // transposed
-         float * dst,
-    int m, int n, int k) {
-    assert(k % 32 == 0);
-
-    for (int ir0 = 0; ir0 < m; ir0++) {
-        for (int ir1 = 0; ir1 < n; ir1++) {
-            vec_dot_gq_6(k, dst + ir1, src0, src1);
-            src1 = (const char *) src1 + quantize_6_row_size(k);
-        }
-        src0 = (const char *) src0 +   quantize_6_row_size(k);
-        src1 = (const char *) src1 - n*quantize_6_row_size(k);
-
-        dst = (float *) dst + n;
-    }
-}
-
-int main(int argc, const char ** argv) {
-    assert(sizeof(gq_quant_t)*8 == gq_t_bits);
-    ggml_time_init();
-
-    // needed to initialize f16 tables
-    {
-        struct ggml_init_params params = { 0, NULL, false };
-        struct ggml_context * ctx = ggml_init(params);
-        ggml_free(ctx);
-    }
-
-    int method = 0;
-    if (argc > 1) {
-        method = atoi(argv[1]);
-    }
-
-    float * src0 = malloc(sizeof(float)*M*K);
-    float * src1 = malloc(sizeof(float)*N*K);
-    float * dst  = malloc(sizeof(float)*M*N);
-
-    // allocate aligned memory
-    //float * src0 = (float *)aligned_alloc(32, sizeof(float)*M*K);
-    //float * src1 = (float *)aligned_alloc(32, sizeof(float)*N*K);
-    //float * dst  = (float *)aligned_alloc(32, sizeof(float)*M*N);
-
-    for (int i = 0; i < M*K; i++) {
-        src0[i] = 0.8 - rand() / (float)RAND_MAX;
-        /*src0[i] = rand() / (float)RAND_MAX;*/
-        /*src0[i] = i % 2;*/
-    }
-
-    for (int i = 0; i < N*K; i++) {
-        src1[i] = 0.8 - rand() / (float)RAND_MAX;
-        /*src1[i] = rand() / (float)RAND_MAX;*/
-        /*src1[i] = i % 3;*/
-    }
-
-    void * src0_gq = NULL;
-    void * src1_gq = NULL;
-
-    size_t sizegq = 0;
-
-    {
-        if (method == 1) {
-            src0_gq = calloc(1, quantize_1_row_size(K)*M);
-            src1_gq = calloc(1, quantize_1_row_size(K)*N);
-
-            sizegq  = quantize_1_row_size(K)*M + quantize_1_row_size(K)*N;
-        }
-
-        if (method == 2) {
-            src0_gq = calloc(1, quantize_2_row_size(K)*M);
-            src1_gq = calloc(1, quantize_2_row_size(K)*N);
-
-            sizegq  = quantize_2_row_size(K)*M + quantize_2_row_size(K)*N;
-        }
-
-        if (method == 3) {
-            src0_gq = calloc(1, quantize_3_row_size(K)*M);
-            src1_gq = calloc(1, quantize_3_row_size(K)*N);
-
-            sizegq  = quantize_3_row_size(K)*M + quantize_3_row_size(K)*N;
-        }
-
-        if (method == 4) {
-            src0_gq = calloc(1, quantize_4_row_size(K)*M);
-            src1_gq = calloc(1, quantize_4_row_size(K)*N);
-
-            sizegq  = quantize_4_row_size(K)*M + quantize_4_row_size(K)*N;
-        }
-
-        if (method == 5) {
-            src0_gq = calloc(1, quantize_5_row_size(K)*M);
-            src1_gq = calloc(1, quantize_5_row_size(K)*N);
-
-            sizegq  = quantize_5_row_size(K)*M + quantize_5_row_size(K)*N;
-        }
-
-        if (method == 6) {
-            src0_gq = calloc(1, quantize_6_row_size(K)*M);
-            src1_gq = calloc(1, quantize_6_row_size(K)*N);
-
-            sizegq  = quantize_6_row_size(K)*M + quantize_6_row_size(K)*N;
-        }
-    }
-
-    const size_t sizef16 = sizeof(ggml_fp16_t)*M*K + sizeof(ggml_fp16_t)*N*K;
-
-    printf("compression: %f\n", (float)sizegq/sizef16);
-
-    // convert fp32 -> gq
-    {
-        const int64_t t_start = ggml_time_us();
-
-        if (method == 1) {
-            quantize_1(src0, src0_gq, M, K);
-            quantize_1(src1, src1_gq, N, K);
-        }
-
-        if (method == 2) {
-            quantize_2(src0, src0_gq, M, K);
-            quantize_2(src1, src1_gq, N, K);
-        }
-
-        if (method == 3) {
-            quantize_3(src0, src0_gq, M, K);
-            quantize_3(src1, src1_gq, N, K);
-        }
-
-        if (method == 4) {
-            quantize_4(src0, src0_gq, M, K);
-            quantize_4(src1, src1_gq, N, K);
-        }
-
-        if (method == 5) {
-            quantize_5(src0, src0_gq, M, K);
-            quantize_5(src1, src1_gq, N, K);
-        }
-
-        if (method == 6) {
-            quantize_6(src0, src0_gq, M, K);
-            quantize_6(src1, src1_gq, N, K);
-        }
-
-        const int64_t t_end = ggml_time_us();
-        printf("convert time: %f ms / method = %d\n", (t_end - t_start) / 1000.0, method);
-    }
-
-    for (int i = 0; i < 16; ++i) {
-        printf("%f %f\n", src0[i], src1[i]);
-    }
-
-    const int nIter = 1;
-
-    const int64_t start = ggml_cycles();
-    const int64_t start_us = ggml_time_us();
-
-    double iM = 1.0/M;
-    double sum = 0.0f;
-    for (int i = 0; i < nIter; i++) {
-        if (method == 0) {
-            mul_mat_f32_naive(src0, src1, dst, M, N, K);
-        }
-
-        if (method == 1) {
-            mul_mat_gq_1(src0_gq, src1_gq, dst, M, N, K);
-        }
-
-        if (method == 2) {
-            mul_mat_gq_2(src0_gq, src1_gq, dst, M, N, K);
-        }
-
-        if (method == 3) {
-            mul_mat_gq_3(src0_gq, src1_gq, dst, M, N, K);
-        }
-
-        if (method == 4) {
-            mul_mat_gq_4(src0_gq, src1_gq, dst, M, N, K);
-        }
-
-        if (method == 5) {
-            mul_mat_gq_5(src0_gq, src1_gq, dst, M, N, K);
-        }
-
-        if (method == 6) {
-            mul_mat_gq_6(src0_gq, src1_gq, dst, M, N, K);
-        }
-    }
-
-    for (int i = 0; i < N; i++) {
-        sum += dst[i]*iM;
-    }
-
-    {
-        const int64_t end = ggml_cycles();
-        const int64_t end_us = ggml_time_us();
-        printf("%s: elapsed ticks: %" PRIu64 "\n",  __func__, end - start);
-        printf("%s: elapsed us:    %d / %f ms\n",  __func__, (int)(end_us - start_us), (end_us - start_us) / 1000.0 / nIter);
-    }
-
-#if 0
-    // print src0
-    printf("src0:\n");
-    for (int i = 0; i < M; i++) {
-        for (int j = 0; j < K; j++) {
-            printf("%4.1f ", src0[i*K+j]);
-        }
-        printf("\n");
-    }
-
-    // print src1
-    printf("src1:\n");
-    for (int i = 0; i < N; i++) {
-        for (int j = 0; j < K; j++) {
-            printf("%4.1f ", src1[i*K+j]);
-        }
-        printf("\n");
-    }
-
-    printf("dst:\n");
-    for (int i = 0; i < M; i++) {
-        for (int j = 0; j < N; j++) {
-            printf("%4.1f ", dst[i*N+j]);
-        }
-        printf("\n");
-    }
-#endif
-
-    printf("%f\n", sum);
-
-    free(src0);
-    free(src1);
-    free(dst);
-
-    if (src0_gq) free(src0_gq);
-    if (src1_gq) free(src1_gq);
-
-    return 0;
-}
diff --git a/ggml/tests/test-opt.cpp b/ggml/tests/test-opt.cpp
deleted file mode 100644
index 2c9997f..0000000
--- a/ggml/tests/test-opt.cpp
+++ /dev/null
@@ -1,181 +0,0 @@
-#include "ggml.h"
-
-#include <cmath>
-#include <cstdio>
-#include <cstdlib>
-#include <cassert>
-
-#define MAX_NARGS 2
-
-#if defined(__GNUC__)
-#pragma GCC diagnostic ignored "-Wdouble-promotion"
-#endif
-
-//
-// logging
-//
-#define GGML_DEBUG 0
-#if (GGML_DEBUG >= 1)
-#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG(...)
-#endif
-
-#if (GGML_DEBUG >= 5)
-#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG_5(...)
-#endif
-
-#if (GGML_DEBUG >= 10)
-#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG_10(...)
-#endif
-
-#define GGML_PRINT(...) printf(__VA_ARGS__)
-
-
-static float frand(void) {
-    return (float)rand()/(float)RAND_MAX;
-}
-
-static struct ggml_tensor * get_random_tensor(
-    struct ggml_context * ctx0, int ndims, int64_t ne[], float fmin, float fmax
-) {
-    struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F32, ndims, ne);
-
-    switch (ndims) {
-        case 1:
-            for (int i0 = 0; i0 < ne[0]; i0++) {
-                ((float *)result->data)[i0] = frand()*(fmax - fmin) + fmin;
-            }
-            break;
-        case 2:
-            for (int i1 = 0; i1 < ne[1]; i1++) {
-                for (int i0 = 0; i0 < ne[0]; i0++) {
-                    ((float *)result->data)[i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
-                }
-            }
-            break;
-        case 3:
-            for (int i2 = 0; i2 < ne[2]; i2++) {
-                for (int i1 = 0; i1 < ne[1]; i1++) {
-                    for (int i0 = 0; i0 < ne[0]; i0++) {
-                        ((float *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
-                    }
-                }
-            }
-            break;
-        case 4:
-            for (int i3 = 0; i3 < ne[3]; i3++) {
-                for (int i2 = 0; i2 < ne[2]; i2++) {
-                    for (int i1 = 0; i1 < ne[1]; i1++) {
-                        for (int i0 = 0; i0 < ne[0]; i0++) {
-                            ((float *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
-                        }
-                    }
-                }
-            }
-            break;
-        default:
-            assert(false);
-    }
-
-    return result;
-}
-
-int main(void) {
-    struct ggml_init_params params = {
-        /* .mem_size   = */ 1024*1024*1024,
-        /* .mem_buffer = */ NULL,
-        /* .no_alloc   = */ false,
-    };
-
-    struct ggml_context * ctx = ggml_init(params);
-
-    int64_t ne1[4] = {4, 128, 1, 1};
-    int64_t ne2[4] = {4, 256, 1, 1};
-    int64_t ne3[4] = {128, 256, 1, 1};
-
-    struct ggml_tensor * a = get_random_tensor(ctx, 2, ne1, -1, +1);
-    struct ggml_tensor * b = get_random_tensor(ctx, 2, ne2, -1, +1);
-    ggml_set_param(ctx, a);
-    ggml_set_param(ctx, b);
-
-    struct ggml_tensor * c = get_random_tensor(ctx, 2, ne3, -1, +1);
-
-    struct ggml_tensor * ab = ggml_mul_mat(ctx, a, b);
-    struct ggml_tensor * d  = ggml_sub(ctx, c, ab);
-    struct ggml_tensor * e  = ggml_sum(ctx, ggml_sqr(ctx, d));
-
-    struct ggml_cgraph * ge = ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, true);
-    ggml_build_forward_expand(ge, e);
-    ggml_graph_reset(ge);
-
-    ggml_graph_compute_with_ctx(ctx, ge, /*n_threads*/ 1);
-
-    const float fe = ggml_get_f32_1d(e, 0);
-    printf("%s: e = %.4f\n", __func__, fe);
-
-    struct ggml_opt_params opt_params = ggml_opt_default_params(GGML_OPT_ADAM);
-
-    ggml_opt(ctx, opt_params, e);
-
-    ggml_graph_reset(ge);
-
-    ggml_graph_compute_with_ctx(ctx, ge, /*n_threads*/ 1);
-
-    const float fe_opt = ggml_get_f32_1d(e, 0);
-    printf("%s: original  e = %.4f\n", __func__, fe);
-    printf("%s: optimized e = %.4f\n", __func__, fe_opt);
-
-    const bool success = (fe_opt <= fe);
-    assert(success);
-
-    ggml_free(ctx);
-    return success ? 0 : -1;
-}
-// int64_t ne1[4] = {4, 128, 1, 1};
-// int64_t ne2[4] = {4, 256, 1, 1};;
-// int64_t ne3[4] = {128, 256, 1, 1};
-// main: original  e = 25890.9375
-// main: optimized e = 10094.7031
-
-// int64_t ne1[4] = {8, 128, 1, 1};
-// int64_t ne2[4] = {8, 256, 1, 1};;
-// int64_t ne3[4] = {128, 256, 1, 1};
-// main: original  e = 39429.5078
-// main: optimized e = 9275.8936
-
-// int64_t ne1[4] = {16, 128, 1, 1};
-// int64_t ne2[4] = {16, 256, 1, 1};;
-// int64_t ne3[4] = {128, 256, 1, 1};
-// main: original  e = 68371.1328
-// main: optimized e = 7854.4502
-
-
-// int64_t ne1[4] = {32, 128, 1, 1};
-// int64_t ne2[4] = {32, 256, 1, 1};;
-// int64_t ne3[4] = {128, 256, 1, 1};
-// main: original  e = 126061.1953
-// main: optimized e = 5451.0166
-
-// int64_t ne1[4] = {4, 1024, 1, 1};
-// int64_t ne2[4] = {4, 2048, 1, 1};;
-// int64_t ne3[4] = {1024, 2048, 1, 1};
-// main: original  e = 1620817.8750
-// main: optimized e = 698387.6875
-
-// another run on M1
-// int64_t ne1[4] = {4, 1024, 1, 1};
-// int64_t ne2[4] = {4, 2048, 1, 1};;
-// int64_t ne3[4] = {1024, 2048, 1, 1};
-// main: original  e = 1629595.6250
-// main: optimized e = 698169.1250
-
-// int64_t ne1[4] = {32, 1024, 1, 1};
-// int64_t ne2[4] = {32, 2048, 1, 1};;
-// int64_t ne3[4] = {1024, 2048, 1, 1};
-// main: original  e = 8146770.5000
-// main: optimized e = 651119.1250
diff --git a/ggml/tests/test-pool.c b/ggml/tests/test-pool.c
deleted file mode 100644
index e4626ec..0000000
--- a/ggml/tests/test-pool.c
+++ /dev/null
@@ -1,147 +0,0 @@
-#include "ggml/ggml.h"
-
-#include <string.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-struct ggml_context* make_ctx(void) {
-    struct ggml_init_params params = {
-        .mem_size = 2 * 1024 * 1024,
-    };
-
-    return ggml_init(params);
-}
-
-int main(int argc, const char** argv) {
-
-    float buf_f32[1024];
-    for (int i = 0; i < 1024; ++i) {
-        buf_f32[i] = (float)(i + 1);
-    }
-
-    // avg pool 1d
-    {
-        struct ggml_context * ctx = make_ctx();
-        struct ggml_tensor * t = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 10, 2);
-        memcpy(t->data, buf_f32, ggml_nbytes(t));
-
-        struct ggml_tensor * t_pooled = ggml_pool_1d(ctx, t, GGML_OP_POOL_AVG, 3, 3, 0);
-        GGML_ASSERT(t_pooled->ne[0] == 3);
-        GGML_ASSERT(t_pooled->ne[1] == 2);
-        GGML_ASSERT(t_pooled->ne[2] == 1);
-
-        struct ggml_cgraph * graph = ggml_new_graph(ctx);
-        ggml_build_forward_expand(graph, t_pooled);
-
-        ggml_graph_compute_with_ctx(ctx, graph, 4);
-
-        const float * output = ggml_get_data_f32(t_pooled);
-
-        GGML_ASSERT(output[0] == 2);
-        GGML_ASSERT(output[1] == 5);
-        GGML_ASSERT(output[2] == 8);
-        GGML_ASSERT(output[3] == 12);
-        GGML_ASSERT(output[4] == 15);
-        GGML_ASSERT(output[5] == 18);
-
-        ggml_free(ctx);
-    }
-
-    // max pool 1d
-    {
-        struct ggml_context * ctx = make_ctx();
-        struct ggml_tensor * t = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 10, 2);
-        memcpy(t->data, buf_f32, ggml_nbytes(t));
-
-        struct ggml_tensor * t_pooled = ggml_pool_1d(ctx, t, GGML_OP_POOL_MAX, 3, 3, 0);
-        GGML_ASSERT(t_pooled->ne[0] == 3);
-        GGML_ASSERT(t_pooled->ne[1] == 2);
-        GGML_ASSERT(t_pooled->ne[2] == 1);
-
-        struct ggml_cgraph * graph = ggml_new_graph(ctx);
-        ggml_build_forward_expand(graph, t_pooled);
-
-        ggml_graph_compute_with_ctx(ctx, graph, 4);
-
-        const float * output = ggml_get_data_f32(t_pooled);
-        GGML_ASSERT(output[0] == 3);
-        GGML_ASSERT(output[1] == 6);
-        GGML_ASSERT(output[2] == 9);
-        GGML_ASSERT(output[3] == 13);
-        GGML_ASSERT(output[4] == 16);
-        GGML_ASSERT(output[5] == 19);
-
-        ggml_free(ctx);
-    }
-
-    // avg pool 2d
-    {
-        struct ggml_context * ctx = make_ctx();
-        struct ggml_tensor * t = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 10, 10, 2);
-        memcpy(t->data, buf_f32, ggml_nbytes(t));
-
-        struct ggml_tensor * t_pooled = ggml_pool_2d(ctx, t, GGML_OP_POOL_AVG, 3, 4, 3, 4, 0, 0);
-        GGML_ASSERT(t_pooled->ne[0] == 3);
-        GGML_ASSERT(t_pooled->ne[1] == 2);
-        GGML_ASSERT(t_pooled->ne[2] == 2);
-        GGML_ASSERT(t_pooled->ne[3] == 1);
-
-        struct ggml_cgraph * graph = ggml_new_graph(ctx);
-        ggml_build_forward_expand(graph, t_pooled);
-
-        ggml_graph_compute_with_ctx(ctx, graph, 4);
-
-        const float * output = ggml_get_data_f32(t_pooled);
-        GGML_ASSERT(output[0] == 17);
-        GGML_ASSERT(output[1] == 20);
-        GGML_ASSERT(output[2] == 23);
-        GGML_ASSERT(output[3] == 57);
-        GGML_ASSERT(output[4] == 60);
-        GGML_ASSERT(output[5] == 63);
-        GGML_ASSERT(output[6] == 117);
-        GGML_ASSERT(output[7] == 120);
-        GGML_ASSERT(output[8] == 123);
-        GGML_ASSERT(output[9] == 157);
-        GGML_ASSERT(output[10] == 160);
-        GGML_ASSERT(output[11] == 163);
-
-
-        ggml_free(ctx);
-    }
-
-    // max pool 2d
-    {
-        struct ggml_context * ctx = make_ctx();
-        struct ggml_tensor * t = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 10, 10, 2);
-        memcpy(t->data, buf_f32, ggml_nbytes(t));
-
-        struct ggml_tensor * t_pooled = ggml_pool_2d(ctx, t, GGML_OP_POOL_MAX, 3, 4, 3, 4, 0, 0);
-        GGML_ASSERT(t_pooled->ne[0] == 3);
-        GGML_ASSERT(t_pooled->ne[1] == 2);
-        GGML_ASSERT(t_pooled->ne[2] == 2);
-        GGML_ASSERT(t_pooled->ne[3] == 1);
-
-        struct ggml_cgraph * graph = ggml_new_graph(ctx);
-        ggml_build_forward_expand(graph, t_pooled);
-
-        ggml_graph_compute_with_ctx(ctx, graph, 4);
-
-        const float * output = ggml_get_data_f32(t_pooled);
-        GGML_ASSERT(output[0] == 33);
-        GGML_ASSERT(output[1] == 36);
-        GGML_ASSERT(output[2] == 39);
-        GGML_ASSERT(output[3] == 73);
-        GGML_ASSERT(output[4] == 76);
-        GGML_ASSERT(output[5] == 79);
-        GGML_ASSERT(output[6] == 133);
-        GGML_ASSERT(output[7] == 136);
-        GGML_ASSERT(output[8] == 139);
-        GGML_ASSERT(output[9] == 173);
-        GGML_ASSERT(output[10] == 176);
-        GGML_ASSERT(output[11] == 179);
-
-        ggml_free(ctx);
-    }
-
-    return 0;
-}
diff --git a/ggml/tests/test-quantize-fns.cpp b/ggml/tests/test-quantize-fns.cpp
deleted file mode 100644
index 31a78c6..0000000
--- a/ggml/tests/test-quantize-fns.cpp
+++ /dev/null
@@ -1,179 +0,0 @@
-// Unit tests for quantization specific functions - quantize, dequantize and dot product
-
-#include "ggml.h"
-
-#undef NDEBUG
-#include <assert.h>
-#include <math.h>
-#include <stdio.h>
-#include <string>
-#include <vector>
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-constexpr float MAX_QUANTIZATION_REFERENCE_ERROR = 0.0001f;
-constexpr float MAX_QUANTIZATION_TOTAL_ERROR = 0.002f;
-constexpr float MAX_QUANTIZATION_TOTAL_ERROR_2BITS = 0.0075f;
-constexpr float MAX_QUANTIZATION_TOTAL_ERROR_3BITS = 0.0040f;
-constexpr float MAX_DOT_PRODUCT_ERROR = 0.02f;
-
-static const char* RESULT_STR[] = {"ok", "FAILED"};
-
-
-// Generate synthetic data
-static void generate_data(float offset, size_t n, float * dst) {
-    for (size_t i = 0; i < n; i++) {
-        dst[i] = 0.1 + 2*cosf(i + offset);
-    }
-}
-
-// Calculate RMSE between two float arrays
-static float array_rmse(const float * a1, const float * a2, size_t n) {
-    double sum = 0;
-    for (size_t i = 0; i < n; i++) {
-        double diff = a1[i] - a2[i];
-        sum += diff * diff;
-    }
-    return sqrtf(sum) / n;
-}
-
-// Total quantization error on test data
-static float total_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) {
-    std::vector<uint8_t> tmp_q(2*test_size);
-    std::vector<float> tmp_out(test_size);
-
-    qfns.from_float(test_data, tmp_q.data(), test_size);
-    qfns.to_float(tmp_q.data(), tmp_out.data(), test_size);
-    return array_rmse(test_data, tmp_out.data(), test_size);
-}
-
-// Total quantization error on test data
-static float reference_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) {
-    std::vector<uint8_t> tmp_q(2*test_size);
-    std::vector<float> tmp_out(test_size);
-    std::vector<float> tmp_out_ref(test_size);
-
-    qfns.from_float(test_data, tmp_q.data(), test_size);
-    qfns.to_float(tmp_q.data(), tmp_out.data(), test_size);
-
-    qfns.from_float_reference(test_data, tmp_q.data(), test_size);
-    qfns.to_float(tmp_q.data(), tmp_out_ref.data(), test_size);
-
-    return array_rmse(tmp_out.data(), tmp_out_ref.data(), test_size);
-}
-
-static float dot_product(const float * a1, const float * a2, size_t test_size) {
-    double sum = 0;
-    for (size_t i = 0; i < test_size; i++) {
-        sum += a1[i] * a2[i];
-    }
-    return sum;
-}
-
-// Total dot product error
-static float dot_product_error(
-    ggml_type_traits_t & qfns, size_t test_size, const float * test_data1, const float *test_data2
-) {
-    std::vector<uint8_t> tmp_q1(2*test_size);
-    std::vector<uint8_t> tmp_q2(2*test_size);
-
-    auto vdot = ggml_internal_get_type_traits(qfns.vec_dot_type);
-
-    qfns.from_float(test_data1, tmp_q1.data(), test_size);
-    vdot.from_float(test_data2, tmp_q2.data(), test_size);
-
-    float result = INFINITY;
-    qfns.vec_dot(test_size, &result, tmp_q1.data(), tmp_q2.data());
-
-    const float dot_ref = dot_product(test_data1, test_data2, test_size);
-
-    return fabsf(result - dot_ref) / test_size;
-}
-
-int main(int argc, char * argv[]) {
-    bool verbose = false;
-    const size_t test_size = 32 * 128;
-
-    std::string arg;
-    for (int i = 1; i < argc; i++) {
-        arg = argv[i];
-
-        if (arg == "-v") {
-            verbose = true;
-        } else {
-            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
-            return 1;
-        }
-    }
-
-    std::vector<float> test_data(test_size);
-    std::vector<float> test_data2(test_size);
-
-    generate_data(0.0, test_data.size(), test_data.data());
-    generate_data(1.0, test_data2.size(), test_data2.data());
-
-    // Initialize GGML, ensures float conversion tables are initialized
-    struct ggml_init_params ggml_params = {
-        /* .mem_size   = */ 1*1024,
-        /* .mem_buffer = */ NULL,
-        /* .no_alloc   = */ true,
-    };
-    struct ggml_context * ctx = ggml_init(ggml_params);
-
-    int num_failed = 0;
-    bool failed = false;
-
-    for (int i = 0; i < GGML_TYPE_COUNT; i++) {
-        ggml_type type = (ggml_type) i;
-        ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
-
-        // deprecated - skip
-        if (qfns.blck_size == 0) {
-            continue;
-        }
-
-        const ggml_type ei = (ggml_type)i;
-        if (ei == GGML_TYPE_IQ2_XXS || ei == GGML_TYPE_IQ2_XS) {
-            printf("Skip %s due to missing quantization functionality\n", ggml_type_name(ei));
-            continue;
-        }
-
-        printf("Testing %s\n", ggml_type_name((ggml_type) i));
-
-        if (qfns.from_float && qfns.to_float) {
-            const float total_error = total_quantization_error(qfns, test_size, test_data.data());
-            const float max_quantization_error =
-                type == GGML_TYPE_Q2_K ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS :
-                type == GGML_TYPE_Q3_K ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS : MAX_QUANTIZATION_TOTAL_ERROR;
-            failed = !(total_error < max_quantization_error);
-            num_failed += failed;
-            if (failed || verbose) {
-                printf("%5s absolute quantization error:    %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], total_error);
-            }
-
-            const float reference_error = reference_quantization_error(qfns, test_size, test_data.data());
-            failed = !(reference_error < MAX_QUANTIZATION_REFERENCE_ERROR);
-            num_failed += failed;
-            if (failed || verbose) {
-                printf("%5s reference implementation error: %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], reference_error);
-            }
-
-            const float vec_dot_error = dot_product_error(qfns, test_size, test_data.data(), test_data2.data());
-            failed = !(vec_dot_error < MAX_DOT_PRODUCT_ERROR);
-            num_failed += failed;
-            if (failed || verbose) {
-                printf("%5s dot product error:              %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], vec_dot_error);
-            }
-        }
-    }
-
-    if (num_failed || verbose) {
-        printf("%d tests failed\n", num_failed);
-    }
-
-    ggml_free(ctx);
-
-    return num_failed > 0;
-}
diff --git a/ggml/tests/test-quantize-perf.cpp b/ggml/tests/test-quantize-perf.cpp
deleted file mode 100644
index 09d410b..0000000
--- a/ggml/tests/test-quantize-perf.cpp
+++ /dev/null
@@ -1,361 +0,0 @@
-// Benchmark quantization specific functions on synthetic data
-
-#include "ggml.h"
-
-#undef NDEBUG
-#include <algorithm>
-#include <assert.h>
-#include <functional>
-#include <inttypes.h>
-#include <math.h>
-#include <memory>
-#include <stdio.h>
-#include <string>
-#include <vector>
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-#define MAX_ALIGNMENT 64
-#define QK 32
-#define WARMUP 5
-#define ITERATIONS 10
-#define MAX_ITERATIONS 100000000
-
-#define L1_SIZE      32*128
-#define L2_SIZE     32*2048
-#define L3_SIZE    32*20480
-#define MEM_SIZE 32*2048000
-
-struct quantize_perf_params {
-    std::vector<std::string> include_types;
-    std::vector<size_t> test_sizes;
-    size_t alignment_offset = 0;
-    bool op_quantize_row_q_reference = false;
-    bool op_quantize_row_q = false;
-    bool op_dequantize_row_q = false;
-    bool op_quantize_row_q_dot = false;
-    bool op_vec_dot_q = false;
-    int64_t iterations = ITERATIONS;
-};
-
-#if defined(__x86_64__) || defined(__i386__)
-
-#include <x86intrin.h>
-inline int64_t cpu_cycles() {
-// Rough way to detect new-ish CPUs
-#ifdef __POPCNT__
-    unsigned int dummy;
-    return __rdtscp(&dummy);
-#else
-    return __rdtsc();
-#endif
-}
-
-#else
-
-#define cpu_cycles() 0
-
-#endif
-
-
-// Generate synthetic data
-static void generate_data(float offset, size_t n, float * dst) {
-    for (size_t i = 0; i < n; i++) {
-        dst[i] = 0.1 + 2*cosf(i + offset);
-    }
-}
-
-static float gigabytes_per_second(size_t bytes, int64_t usecs) {
-    return bytes / (float) usecs * 1000000 / (1024*1024*1024);
-}
-
-static void * align_with_offset(void * ptr, int offset) {
-    size_t dummy_size = MAX_ALIGNMENT * 4;
-    return (char *) std::align(MAX_ALIGNMENT, MAX_ALIGNMENT, ptr, dummy_size) + offset;
-}
-
-static void benchmark_function(size_t size, size_t q_size, int64_t iterations, const std::function<float(void)> & func) {
-    int64_t min_time_us = INT64_MAX;
-    int64_t total_time_us = 0;
-    int64_t min_time_cycles = INT64_MAX;
-    int64_t total_time_cycles = 0;
-
-    for (int i = 0; i < WARMUP; i++) {
-        func();
-    }
-
-    for (int i = 0; i < iterations; i++) {
-        const int64_t start_time = ggml_time_us();
-        const int64_t start_cycles = cpu_cycles();
-
-        func();
-
-        const int64_t end_cycles = cpu_cycles();
-        const int64_t end_time = ggml_time_us();
-
-        total_time_cycles += end_cycles - start_cycles;
-        min_time_cycles = std::min(min_time_cycles, end_cycles - start_cycles);
-        total_time_us += end_time - start_time;
-        min_time_us = std::min(min_time_us, end_time - start_time);
-    }
-
-    printf("      min cycles/%d vals   : %9.2f\n",  QK, QK * min_time_cycles / (float) size);
-    printf("      avg cycles/%d vals   : %9.2f\n",  QK, QK * total_time_cycles / (float) (size * iterations));
-    printf("      float32 throughput   : %9.2f GB/s\n",  gigabytes_per_second(4 * size * iterations, total_time_us));
-    printf("      quantized throughput : %9.2f GB/s\n",  gigabytes_per_second(q_size * iterations, total_time_us));
-}
-
-static void usage(char * argv[]) {
-    printf("Benchmark quantization specific functions on synthetic data\n");
-    printf("\n");
-    printf("usage: %s [options]\n", argv[0]);
-    printf("\n");
-    printf("options: (default)\n");
-    printf("  -h, --help            show this help message and exit\n");
-    printf("  --size SIZE           set test size, divisible by 32 (L1_SIZE:%d)\n", L1_SIZE);
-    printf("  -3                    use size as L1, L2, L3 sizes (L1:%d L2:%d L3:%d)\n", L1_SIZE, L2_SIZE, L3_SIZE);
-    printf("  -4                    use size as L1, L2, L3, MEM sizes (L1:%d L2:%d L3:%d MEM:%d)\n", L1_SIZE, L2_SIZE, L3_SIZE, MEM_SIZE);
-    printf("  --op OP               set test operation as quantize_row_q_reference, quantize_row_q, dequantize_row_q,\n");
-    printf("                        quantize_row_q_dot, vec_dot_q (all)\n");
-    printf("  --type TYPE           set test type as");
-    for (int i = 0; i < GGML_TYPE_COUNT; i++) {
-        ggml_type type = (ggml_type) i;
-        ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
-        if (ggml_type_name(type) != NULL) {
-            if (qfns.from_float && qfns.to_float) {
-                printf(" %s", ggml_type_name(type));
-            }
-        }
-    }
-    printf(" (all)\n");
-    printf("  --alignment-offset OFFSET\n");
-    printf("                        set alignment offset as OFFSET (0)\n");
-    printf("  -i NUM, --iterations NUM\n");
-    printf("                        set test iteration number (%d)\n", ITERATIONS);
-}
-
-int main(int argc, char * argv[]) {
-    quantize_perf_params params {};
-
-    // read command line
-
-    bool invalid_param = false;
-    std::string arg;
-    for (int i = 1; i < argc; i++) {
-        arg = argv[i];
-
-        if (arg == "--size") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            size_t size = std::stoi(argv[i]);
-            if (size % 32 != 0) {
-                fprintf(stderr, "error: size %zu not divisible by 32\n", size);
-                invalid_param = true;
-                break;
-            }
-            params.test_sizes.push_back(size);
-        } else if (arg == "-3") {
-            // quick select sizes that probably fit in CPU caches
-            params.test_sizes.push_back(L1_SIZE);
-            params.test_sizes.push_back(L2_SIZE);
-            params.test_sizes.push_back(L3_SIZE);
-        } else if (arg == "-4") {
-            // quick select cache sizes + memory
-            params.test_sizes.push_back(L1_SIZE);
-            params.test_sizes.push_back(L2_SIZE);
-            params.test_sizes.push_back(L3_SIZE);
-            params.test_sizes.push_back(MEM_SIZE);
-        } else if (arg == "--op") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            std::string op {argv[i]};
-            if (op == "quantize_row_q_reference") {
-                params.op_quantize_row_q_reference = true;
-            } else if (op == "quantize_row_q") {
-                params.op_quantize_row_q = true;
-            } else if (op == "dequantize_row_q") {
-                params.op_dequantize_row_q = true;
-            } else if (op == "quantize_row_q_dot") {
-                params.op_quantize_row_q_dot = true;
-            } else if (op == "vec_dot_q") {
-                params.op_vec_dot_q = true;
-            } else {
-                invalid_param = true;
-                break;
-            }
-        } else if (arg == "--type") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.include_types.push_back(argv[i]);
-        } else if (arg == "--alignment-offset") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            int alignment = std::stoi(argv[i]);
-            if (alignment < 0 || alignment > MAX_ALIGNMENT) {
-            fprintf(stderr, "error: alignment-offset must be less than %d\n", MAX_ALIGNMENT);
-                invalid_param = true;
-                break;
-            }
-            params.alignment_offset = alignment;
-        } else if ((arg == "-i") || (arg == "--iterations")) {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            int number = std::stoi(argv[i]);
-            if (number < 0 || number > MAX_ITERATIONS) {
-            fprintf(stderr, "error: iterations must be less than %d\n", MAX_ITERATIONS);
-                invalid_param = true;
-                break;
-            }
-            params.iterations = number;
-        } else if ((arg == "-h") || (arg == "--help")) {
-            usage(argv);
-            return 1;
-        } else {
-            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
-            return 1;
-        }
-    }
-    if (invalid_param) {
-        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
-        return 1;
-    }
-
-    if (params.test_sizes.empty()) {
-        params.test_sizes.push_back(L1_SIZE);
-    }
-    if (!(params.op_quantize_row_q_reference || params.op_quantize_row_q || params.op_dequantize_row_q || params.op_quantize_row_q_dot || params.op_vec_dot_q)) {
-        params.op_quantize_row_q_reference = params.op_quantize_row_q = params.op_dequantize_row_q = params.op_quantize_row_q_dot = params.op_vec_dot_q = true;
-    }
-
-    std::sort(params.test_sizes.begin(), params.test_sizes.end());
-    size_t largest = params.test_sizes.back();
-
-    std::vector<uint8_t> test_data1_v(largest*4 + MAX_ALIGNMENT*2);
-    std::vector<uint8_t> test_data2_v(largest*4 + MAX_ALIGNMENT*2);
-    std::vector<uint8_t> test_q1_v   (largest*4 + MAX_ALIGNMENT*2);
-    std::vector<uint8_t> test_q2_v   (largest*4 + MAX_ALIGNMENT*2);
-    std::vector<uint8_t> test_out_v  (largest*4 + MAX_ALIGNMENT*2);
-
-    float * test_data1 = (float *) align_with_offset(test_data1_v.data(), params.alignment_offset);
-    float * test_data2 = (float *) align_with_offset(test_data2_v.data(), params.alignment_offset);
-    float * test_q1    = (float *) align_with_offset(test_q1_v.data(),    params.alignment_offset);
-    float * test_q2    = (float *) align_with_offset(test_q2_v.data(),    params.alignment_offset);
-    float * test_out   = (float *) align_with_offset(test_out_v.data(),   params.alignment_offset);
-
-    generate_data(0, largest, test_data1);
-    generate_data(1, largest, test_data2);
-
-    int64_t iterations = params.iterations;
-
-
-    // Initialize GGML, ensures float conversion tables are initialized
-    struct ggml_init_params ggml_params = {
-        /* .mem_size   = */ 1*1024,
-        /* .mem_buffer = */ NULL,
-        /* .no_alloc   = */ true,
-    };
-    struct ggml_context * ctx = ggml_init(ggml_params);
-
-    for (int i = 0; i < GGML_TYPE_COUNT; i++) {
-        ggml_type type = (ggml_type) i;
-        ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
-        if (!params.include_types.empty() && ggml_type_name(type) && std::find(params.include_types.begin(), params.include_types.end(), ggml_type_name(type)) == params.include_types.end()) {
-            continue;
-        }
-
-        if (qfns.from_float && qfns.to_float) {
-            printf("%s\n", ggml_type_name(type));
-
-            if (params.op_quantize_row_q_reference) {
-                printf("  quantize_row_q_reference\n");
-                for (size_t size : params.test_sizes) {
-                    printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
-                    auto quantize_fn = [&](void) -> float {
-                        qfns.from_float_reference(test_data1, test_q1, size);
-                        return test_q1[0];
-                    };
-                    size_t quantized_size = ggml_row_size(type, size);
-                    benchmark_function(size, quantized_size, iterations, quantize_fn);
-                }
-                printf("\n");
-            }
-
-            if (params.op_quantize_row_q) {
-                printf("  quantize_row_q\n");
-                for (size_t size : params.test_sizes) {
-                    printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
-                    auto quantize_fn = [&](void) -> float {
-                        qfns.from_float(test_data1, test_q1, size);
-                        return test_q1[0];
-                    };
-                    size_t quantized_size = ggml_row_size(type, size);
-                    benchmark_function(size, quantized_size, iterations, quantize_fn);
-                }
-                printf("\n");
-            }
-
-            if (params.op_dequantize_row_q) {
-                printf("  dequantize_row_q\n");
-                qfns.from_float(test_data1, test_q1, largest);
-                for (size_t size : params.test_sizes) {
-                    printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
-                    auto quantize_fn = [&](void) -> float {
-                        qfns.to_float(test_q1, test_out, size);
-                        return test_out[0];
-                    };
-                    size_t quantized_size = ggml_row_size(type, size);
-                    benchmark_function(size, quantized_size, iterations, quantize_fn);
-                }
-                printf("\n");
-            }
-
-            if (params.op_quantize_row_q_dot) {
-                printf("  quantize_row_q_dot\n");
-                for (size_t size : params.test_sizes) {
-                    printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
-                    auto quantize_fn = [&](void) -> float {
-                        auto vdot = ggml_internal_get_type_traits(qfns.vec_dot_type);
-                        vdot.from_float(test_data1, test_q1, size);
-                        return test_q1[0];
-                    };
-                    size_t quantized_size = ggml_row_size(type, size);
-                    benchmark_function(size, quantized_size, iterations, quantize_fn);
-                }
-                printf("\n");
-            }
-
-            if (params.op_vec_dot_q) {
-                printf("  vec_dot_q\n");
-                qfns.from_float(test_data1, test_q1, largest);
-                qfns.from_float(test_data2, test_q2, largest);
-                for (size_t size : params.test_sizes) {
-                    printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
-                    auto quantize_fn = [&](void) -> float {
-                        float result;
-                        qfns.vec_dot(size, &result, test_q1, test_q2);
-                        return result;
-                    };
-                    size_t quantized_size = ggml_row_size(type, size);
-                    benchmark_function(size, quantized_size, iterations, quantize_fn);
-                }
-                printf("\n");
-            }
-        }
-    }
-
-    ggml_free(ctx);
-
-    return 0;
-}
diff --git a/ggml/tests/test-rel-pos.c b/ggml/tests/test-rel-pos.c
deleted file mode 100644
index 47c8438..0000000
--- a/ggml/tests/test-rel-pos.c
+++ /dev/null
@@ -1,86 +0,0 @@
-#include "ggml/ggml.h"
-
-#include <string.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-struct ggml_context* make_ctx(void) {
-    struct ggml_init_params params = {
-        .mem_size = 2 * 1024 * 1024,
-    };
-
-    return ggml_init(params);
-}
-
-void check_tensor(struct ggml_tensor * t, float * expected_t_d, int ne0, int ne1, int ne2) {
-    GGML_ASSERT(t->type == GGML_TYPE_F32);
-    GGML_ASSERT(t->ne[0] == ne0);
-    GGML_ASSERT(t->ne[1] == ne1);
-    GGML_ASSERT(t->ne[2] == ne2);
-    for (int i2 = 0; i2 < ne2; ++i2) {
-        for (int i1 = 0; i1 < ne1; ++i1) {
-            for (int i0 = 0; i0 < ne0; ++i0) {
-                float expected = *(expected_t_d + i2 * ne1 * ne0 + i1 * ne0 + i0);
-                float actual = ggml_get_data_f32(t)[i2 * ne1 * ne0 + i1 * ne0 + i0];
-                GGML_ASSERT(expected == actual);
-            }
-        }
-    }
-}
-
-int main(int argc, const char** argv) {
-    ggml_fp16_t buf_f16[1024];
-    for (int i = 0; i < 1024; ++i) {
-        buf_f16[i] = ggml_fp32_to_fp16((float)i);
-    }
-
-    float expected_out[4][9] = {
-        { 8.0, 9.0, 10.0, 9.0, 10.0, 11.0, 10.0, 11.0, 12.0 },
-        { 2.0, 3.0, 4.0, 3.0, 4.0, 5.0, 4.0, 5.0, 6.0 },
-        { 14.0, 15.0, 16.0, 15.0, 16.0, 17.0, 16.0, 17.0, 18.0 },
-        { 8.0, 9.0, 10.0, 9.0, 10.0, 11.0, 10.0, 11.0, 12.0 },
-    };
-
-    {
-        struct ggml_context * ctx = make_ctx();
-
-
-        struct ggml_tensor * t = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, 3, 3);
-        ggml_fp16_t* t_d = (ggml_fp16_t*)t->data;
-        memcpy(t_d, buf_f16, ggml_nbytes(t));
-
-        struct ggml_tensor * t_2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, 3, 3);
-        ggml_fp16_t* t_d_2 = (ggml_fp16_t*)t_2->data;
-        memcpy(t_d_2, buf_f16 + 1, ggml_nbytes(t_2));
-
-        struct ggml_tensor * rw = ggml_get_rel_pos(ctx, t, 2, 2);
-        struct ggml_tensor * rh = ggml_get_rel_pos(ctx, t_2, 2, 2);
-
-        struct ggml_tensor * rw_f32 = ggml_cpy(ctx, rw, ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 3, 2, 2));
-        struct ggml_tensor * rh_f32 = ggml_cpy(ctx, rh, ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 3, 2, 2));
-
-        struct ggml_tensor * in = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 9, 4);
-        struct ggml_tensor * out_inplace = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 9, 4);
-        float * in_d          = (float*)in->data;
-        float * out_inplace_d = (float*)out_inplace->data;
-        for (int i = 0; i < ggml_nelements(in); ++i) {
-            in_d[i]          = 1.f;
-            out_inplace_d[i] = 1.f;
-        }
-
-        struct ggml_tensor * out = ggml_add_rel_pos(ctx, in, rw_f32, rh_f32);
-        struct ggml_cgraph * gf = ggml_new_graph(ctx);
-        ggml_build_forward_expand(gf, out);
-        ggml_graph_compute_with_ctx(ctx, gf, 1);
-
-        out_inplace = ggml_add_rel_pos_inplace(ctx, out_inplace, rw_f32, rh_f32);
-        struct ggml_cgraph * gf_2 = ggml_new_graph(ctx);
-        ggml_build_forward_expand(gf_2, out_inplace);
-        ggml_graph_compute_with_ctx(ctx, gf_2, 1);
-
-        check_tensor(out, (float*)expected_out, 9, 4, 1);
-        check_tensor(out_inplace, (float*)expected_out, 9, 4, 1);
-    }
-
-    return 0;
-}
diff --git a/ggml/tests/test-svd0.c b/ggml/tests/test-svd0.c
deleted file mode 100644
index 8160bd3..0000000
--- a/ggml/tests/test-svd0.c
+++ /dev/null
@@ -1,218 +0,0 @@
-// SVD dimensionality reduction
-
-#include <float.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <assert.h>
-#include <stdlib.h>
-#include <string.h>
-#include <time.h>
-#include <math.h>
-
-#include <sys/time.h>
-
-#ifdef GGML_USE_ACCELERATE
-#include <Accelerate/Accelerate.h>
-#endif
-
-float frand(void) {
-    return (float) rand() / (float) RAND_MAX;
-}
-
-//int sgesvd_(char *__jobu, char *__jobvt, __CLPK_integer *__m,
-//        __CLPK_integer *__n, __CLPK_real *__a, __CLPK_integer *__lda,
-//        __CLPK_real *__s, __CLPK_real *__u, __CLPK_integer *__ldu,
-//        __CLPK_real *__vt, __CLPK_integer *__ldvt, __CLPK_real *__work,
-//        __CLPK_integer *__lwork,
-//        __CLPK_integer *__info)
-
-int main(int argc, const char ** argv) {
-    int m = 10;
-    int n = 5;
-
-    float * A  = malloc(n * m * sizeof(float));
-    float * A0 = malloc(n * m * sizeof(float));
-
-    for (int i = 0; i < n; ++i) {
-        for (int j = 0; j < m; ++j) {
-            A[i * m + j] = (float) (10.0f*(i + 1) + 1.0f * frand());
-            //A[i * m + j] = (float) (10.0f*(i%2 + 1) + 0.1f * frand());
-            //if (i == 2) {
-            //    A[i * m + j] += 20*frand();
-            //}
-            if ((i == 1 || i == 3) && j > m/2) {
-                A[i * m + j] = -A[i * m + j];
-            }
-        }
-    }
-
-    // average vector
-    //float * M = malloc(m * sizeof(float));
-
-    //{
-    //    for (int j = 0; j < m; ++j) {
-    //        M[j] = 0.0f;
-    //    }
-    //    for (int i = 0; i < n; ++i) {
-    //        for (int j = 0; j < m; ++j) {
-    //            M[j] += A[i * m + j];
-    //        }
-    //    }
-    //    for (int j = 0; j < m; ++j) {
-    //        M[j] /= (float) n;
-    //    }
-    //}
-
-    //// subtract average vector
-    //for (int i = 0; i < n; ++i) {
-    //    for (int j = 0; j < m; ++j) {
-    //        A[i * m + j] -= M[j];
-    //    }
-    //}
-
-    memcpy(A0, A, n * m * sizeof(float));
-
-    // print A
-    printf("A:\n");
-    for (int i = 0; i < n; ++i) {
-        printf("col %d : ", i);
-        for (int j = 0; j < m; ++j) {
-            printf("%9.5f ", A[i * m + j]);
-        }
-        printf("\n");
-    }
-    printf("\n");
-
-    // SVD
-    // A = U * S * V^T
-
-    float * U = malloc(n * m * sizeof(float));
-    float * S = malloc(n * sizeof(float));
-    float * V = malloc(n * n * sizeof(float));
-
-    int lda = m;
-    int ldu = m;
-    int ldvt = n;
-
-    float work_size;
-    int lwork = -1;
-    int info = 0;
-
-    sgesvd_("S", "S", &m, &n, A, &lda, S, U, &ldu, V, &ldvt, &work_size, &lwork, &info);
-
-    lwork = (int) work_size;
-
-    printf("work_size = %f, info = %d, lwork = %d\n", work_size, info, lwork);
-
-    float * work = malloc(lwork * sizeof(float));
-
-    sgesvd_("S", "S", &m, &n, A, &lda, S, U, &ldu, V, &ldvt, work, &lwork, &info);
-
-    // print U
-    printf("U:\n");
-    for (int i = 0; i < n; ++i) {
-        printf("col %d : ", i);
-        for (int j = 0; j < m; ++j) {
-            printf("%9.5f ", U[i * m + j]);
-        }
-        printf("\n");
-    }
-    printf("\n");
-
-    // normalize S
-    {
-        double sum = 0.0;
-        for (int i = 0; i < n; ++i) {
-            sum += S[i];
-        }
-        sum *= sqrt((double) m);
-        for (int i = 0; i < n; ++i) {
-            S[i] /= sum;
-        }
-    }
-
-    // print S
-    printf("S:\n");
-    for (int i = 0; i < n; ++i) {
-        printf("- %d = %9.5f\n", i, S[i]);
-    }
-    printf("\n");
-
-    // print V
-    printf("V:\n");
-    for (int i = 0; i < n; ++i) {
-        printf("col %d : ", i);
-        for (int j = 0; j < n; ++j) {
-            printf("%9.5f ", V[i * n + j]);
-        }
-        printf("\n");
-    }
-    printf("\n");
-
-    // print A
-    printf("A:\n");
-    for (int i = 0; i < n; ++i) {
-        printf("col %d : ", i);
-        for (int j = 0; j < m; ++j) {
-            printf("%9.5f ", A[i * m + j]);
-        }
-        printf("\n");
-    }
-    printf("\n");
-
-    // compute singular vectors in U
-    for (int i = 0; i < n; ++i) {
-        for (int j = 0; j < m; ++j) {
-            U[i * m + j] *= S[i];
-        }
-    }
-
-    // normalize U
-    for (int i = 0; i < n; ++i) {
-        double sum = 0.0;
-        for (int j = 0; j < m; ++j) {
-            sum += U[i * m + j] * U[i * m + j];
-        }
-        sum = sqrt(sum);
-        for (int j = 0; j < m; ++j) {
-            U[i * m + j] /= sum*sqrt((double) m);
-        }
-    }
-
-    // print U
-    printf("U:\n");
-    for (int i = 0; i < n; ++i) {
-        printf("col %d : ", i);
-        for (int j = 0; j < m; ++j) {
-            printf("%9.5f ", U[i * m + j]);
-        }
-        printf("\n");
-    }
-    printf("\n");
-
-
-    // project A0 onto U
-    float * A1 = malloc(n * n * sizeof(float));
-
-    for (int i = 0; i < n; ++i) {
-        for (int j = 0; j < n; ++j) {
-            A1[i * n + j] = 0.0f;
-            for (int k = 0; k < m; ++k) {
-                A1[i * n + j] += A0[i * m + k] * U[j * m + k];
-            }
-        }
-    }
-
-    // print A1
-    printf("A1:\n");
-    for (int i = 0; i < n; ++i) {
-        printf("col %d : ", i);
-        for (int j = 0; j < n; ++j) {
-            printf("%9.5f ", A1[i * n + j]);
-        }
-        printf("\n");
-    }
-    printf("\n");
-
-    return 0;
-}
diff --git a/ggml/tests/test-vec0.c b/ggml/tests/test-vec0.c
deleted file mode 100644
index fc7b8ee..0000000
--- a/ggml/tests/test-vec0.c
+++ /dev/null
@@ -1,133 +0,0 @@
-#include <stdio.h>
-#include <assert.h>
-#include <stdlib.h>
-#include <time.h>
-
-const int N = 1 << 14;
-const int M = 1 << 14;
-
-void mul_mat_vec_f32_0(
-    const float * src0,
-    const float * src1,
-    float * dst,
-    unsigned nrows,
-    unsigned ncols) {
-    for (unsigned i = 0; i < nrows; i++) {
-        float sum = 0.0f;
-        for (unsigned j = 0; j < ncols; j++) {
-            sum += src0[i*ncols + j]*src1[j];
-        }
-        dst[i] = sum;
-    }
-}
-#if defined(_MSC_VER)
-typedef float __declspec(align(32)) afloat;
-#else
-typedef float afloat __attribute__((__aligned__(32)));
-#endif
-void mul_mat_vec_f32_1(
-    const afloat *restrict src0,
-    const afloat *restrict src1,
-    afloat *restrict dst,
-    unsigned nrows,
-    unsigned ncols) {
-    for (unsigned i = 0; i < nrows; i++) {
-        const afloat * restrict row = src0 + i*ncols;
-        const afloat * restrict col = src1;
-
-        float sum = 0.0f;
-
-        for (unsigned j = 0; j < ncols; j++) {
-            sum += *row++ * *col++;
-        }
-
-        dst[i] = sum;
-
-        //float sum[8] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
-
-        //for (unsigned j = 0; j < ncols; j += 8) {
-        //    sum[0] += row[0]*col[0];
-        //    sum[1] += row[1]*col[1];
-        //    sum[2] += row[2]*col[2];
-        //    sum[3] += row[3]*col[3];
-        //    sum[4] += row[4]*col[4];
-        //    sum[5] += row[5]*col[5];
-        //    sum[6] += row[6]*col[6];
-        //    sum[7] += row[7]*col[7];
-
-        //    row += 8;
-        //    col += 8;
-        //}
-
-        //dst[i] = sum[0] + sum[1] + sum[2] + sum[3] + sum[4] + sum[5] + sum[6] + sum[7];
-    }
-}
-
-void mul_mat_vec_f32_2(
-    const void * src0,
-    const void * src1,
-    void * dst,
-    unsigned nrows,
-    unsigned ncols) {
-    void * d = dst;
-    for (unsigned i = 0; i < nrows; i++) {
-        float sum = 0.0f;
-
-        const char * row = (const char*)src0 + i*ncols*sizeof(float);
-        const char * col = (const char*)src1;
-        for (unsigned j = 0; j < ncols; j++) {
-            sum += (*(float *)row) * (*(float *)col);
-            row += sizeof(float);
-            col += sizeof(float);
-        }
-        *(float *)d = sum;
-        d = (char*)d + sizeof(float);
-    }
-}
-
-#if defined(_MSC_VER) || defined(__MINGW32__) || defined(__MINGW64__)
-void* aligned_alloc(size_t alignment, size_t size) {
-    return _aligned_malloc(size, alignment);
-}
-#endif
-
-int main(int argc, const char ** argv) {
-    //float * src0 = malloc(sizeof(float)*N*M);
-    //float * src1 = malloc(sizeof(float)*M);
-    //float * dst  = malloc(sizeof(float)*N);
-
-    afloat * src0 = (float *)(aligned_alloc(32, sizeof(float)*N*M));
-    afloat * src1 = (float *)(aligned_alloc(32, sizeof(float)*M));
-    afloat * dst  = (float *)(aligned_alloc(32, sizeof(float)*N));
-
-    for (int i = 0; i < N*M; i++) {
-        src0[i] = (afloat)i;
-    }
-
-    for (int i = 0; i < M; i++) {
-        src1[i] = (afloat)i;
-    }
-
-    const int nIter = 10;
-
-    const clock_t start = clock();
-
-    double sum = 0.0f;
-    for (int i = 0; i < nIter; i++) {
-        //mul_mat_vec_f32_0(src0, src1, dst, N, M);
-        mul_mat_vec_f32_1(src0, src1, dst, N, M);
-        //mul_mat_vec_f32_2(src0, src1, dst, N, M);
-        for (int  i = 0; i < N; i++) {
-            sum += dst[i];
-        }
-    }
-
-    {
-        const clock_t end = clock();
-        printf("%s: elapsed ticks: %ld\n", __func__, end - start);
-    }
-
-    printf("%f\n", sum);
-
-    return 0;
-}
diff --git a/ggml/tests/test-vec1.c b/ggml/tests/test-vec1.c
deleted file mode 100644
index 567cb06..0000000
--- a/ggml/tests/test-vec1.c
+++ /dev/null
@@ -1,576 +0,0 @@
-#include <stdint.h>
-#include <stdio.h>
-#include <assert.h>
-#include <stdlib.h>
-#include <time.h>
-#include <math.h>
-
-#include <sys/time.h>
-
-#include <immintrin.h>
-
-const int N = 1 << 14;
-const int M = 768;
-
-//
-// naive implementation
-//
-
-void mul_mat_vec_f32_0(
-    const float * restrict src0,
-    const float * restrict src1,
-    float * dst,
-    int nrows,
-    int ncols) {
-    for (int i = 0; i < nrows; i++) {
-        float sum = 0.0f;
-        for (int j = 0; j < ncols; j++) {
-            sum += src0[i*ncols + j]*src1[j];
-        }
-        dst[i] = sum;
-    }
-}
-
-//
-// SIMD with 8 32-bit floats
-//
-
-float reduce_vector8_0(__m256 v) {
-    __m128 v1 = _mm256_extractf128_ps(v, 0);
-    __m128 v2 = _mm256_extractf128_ps(v, 1);
-    __m128 v3 = _mm_add_ps(v1, v2);
-    __m128 v4 = _mm_shuffle_ps(v3, v3, 0x4e);
-    __m128 v5 = _mm_add_ps(v3, v4);
-    __m128 v6 = _mm_shuffle_ps(v5, v5, 0x11);
-    __m128 v7 = _mm_add_ps(v5, v6);
-    return _mm_cvtss_f32(v7);
-}
-
-// vectorized implementation using AVX
-void mul_mat_vec_f32_1(
-    const float * restrict src0,
-    const float * restrict src1,
-    float * dst,
-    int nrows,
-    int ncols) {
-
-    const int ncols8 = ncols & ~7;
-
-    for (int i = 0; i < nrows; i++) {
-        __m256 sum = _mm256_setzero_ps();
-        for (int j = 0; j < ncols8; j += 8) {
-            __m256 a = _mm256_loadu_ps(src0 + i*ncols + j);
-            __m256 b = _mm256_loadu_ps(src1 + j);
-            __m256 c = _mm256_mul_ps(a, b);
-            sum = _mm256_add_ps(sum, c);
-        }
-        dst[i] = reduce_vector8_0(sum);
-
-        for (int j = ncols8; j < ncols; j++) {
-            dst[i] += src0[i*ncols + j]*src1[j];
-        }
-    }
-}
-
-void mul_mat_vec_f32_2(
-    const float * restrict src0,
-    const float * restrict src1,
-    float * dst,
-    int nrows,
-    int ncols) {
-
-    const int ncols32 = ncols & ~31;
-
-    for (int i = 0; i < nrows; i++) {
-        __m256 sum0 = _mm256_setzero_ps();
-        __m256 sum1 = _mm256_setzero_ps();
-        __m256 sum2 = _mm256_setzero_ps();
-        __m256 sum3 = _mm256_setzero_ps();
-
-        const float * restrict src0_row = src0 + i*ncols;
-        for (int j = 0; j < ncols32; j += 32) {
-            __m256 a0 = _mm256_loadu_ps(src0_row + j + 0);
-            __m256 a1 = _mm256_loadu_ps(src0_row + j + 8);
-            __m256 a2 = _mm256_loadu_ps(src0_row + j + 16);
-            __m256 a3 = _mm256_loadu_ps(src0_row + j + 24);
-            __m256 b0 = _mm256_loadu_ps(src1 + j + 0);
-            __m256 b1 = _mm256_loadu_ps(src1 + j + 8);
-            __m256 b2 = _mm256_loadu_ps(src1 + j + 16);
-            __m256 b3 = _mm256_loadu_ps(src1 + j + 24);
-#if defined(__FMA__)
-            sum0 = _mm256_fmadd_ps(a0, b0, sum0);
-            sum1 = _mm256_fmadd_ps(a1, b1, sum1);
-            sum2 = _mm256_fmadd_ps(a2, b2, sum2);
-            sum3 = _mm256_fmadd_ps(a3, b3, sum3);
-#else
-            sum0 = _mm256_add_ps(_mm256_mul_ps(a0, b0), sum0);
-            sum1 = _mm256_add_ps(_mm256_mul_ps(a1, b1), sum1);
-            sum2 = _mm256_add_ps(_mm256_mul_ps(a2, b2), sum2);
-            sum3 = _mm256_add_ps(_mm256_mul_ps(a3, b3), sum3);
-#endif
-        }
-        dst[i] = reduce_vector8_0(_mm256_add_ps(_mm256_add_ps(sum0, sum1), _mm256_add_ps(sum2, sum3)));
-
-        for (int j = ncols32; j < ncols; j++) {
-            dst[i] += src0[i*ncols + j]*src1[j];
-        }
-    }
-}
-
-//
-// SIMD with 8 16-bit floats
-//
-
-static inline float fp32_from_bits(uint32_t w) {
-#if defined(__OPENCL_VERSION__)
-    return as_float(w);
-#elif defined(__CUDA_ARCH__)
-    return __uint_as_float((unsigned int) w);
-#elif defined(__INTEL_COMPILER)
-    return _castu32_f32(w);
-#elif defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64))
-    return _CopyFloatFromInt32((__int32) w);
-#else
-    union {
-        uint32_t as_bits;
-        float as_value;
-    } fp32 = { w };
-    return fp32.as_value;
-#endif
-}
-
-static inline uint32_t fp32_to_bits(float f) {
-#if defined(__OPENCL_VERSION__)
-	return as_uint(f);
-#elif defined(__CUDA_ARCH__)
-	return (uint32_t) __float_as_uint(f);
-#elif defined(__INTEL_COMPILER)
-	return _castf32_u32(f);
-#elif defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64))
-	return (uint32_t) _CopyInt32FromFloat(f);
-#else
-	union {
-		float as_value;
-		uint32_t as_bits;
-	} fp32 = { f };
-	return fp32.as_bits;
-#endif
-}
-
-/*
- * Convert a 16-bit floating-point number in IEEE half-precision format, in bit representation, to
- * a 32-bit floating-point number in IEEE single-precision format.
- *
- * @note The implementation relies on IEEE-like (no assumption about rounding mode and no operations on denormals)
- * floating-point operations and bitcasts between integer and floating-point variables.
- */
-static inline float fp16_ieee_to_fp32_value(uint16_t h) {
-    /*
-     * Extend the half-precision floating-point number to 32 bits and shift to the upper part of the 32-bit word:
-     *      +---+-----+------------+-------------------+
-     *      | S |EEEEE|MM MMMM MMMM|0000 0000 0000 0000|
-     *      +---+-----+------------+-------------------+
-     * Bits  31  26-30    16-25            0-15
-     *
-     * S - sign bit, E - bits of the biased exponent, M - bits of the mantissa, 0 - zero bits.
-     */
-    const uint32_t w = (uint32_t) h << 16;
-    /*
-     * Extract the sign of the input number into the high bit of the 32-bit word:
-     *
-     *      +---+----------------------------------+
-     *      | S |0000000 00000000 00000000 00000000|
-     *      +---+----------------------------------+
-     * Bits  31                 0-31
-     */
-    const uint32_t sign = w & UINT32_C(0x80000000);
-    /*
-     * Extract mantissa and biased exponent of the input number into the high bits of the 32-bit word:
-     *
-     *      +-----+------------+---------------------+
-     *      |EEEEE|MM MMMM MMMM|0 0000 0000 0000 0000|
-     *      +-----+------------+---------------------+
-     * Bits  27-31    17-26            0-16
-     */
-    const uint32_t two_w = w + w;
-
-    /*
-     * Shift mantissa and exponent into bits 23-28 and bits 13-22 so they become mantissa and exponent
-     * of a single-precision floating-point number:
-     *
-     *       S|Exponent |          Mantissa
-     *      +-+---+-----+------------+----------------+
-     *      |0|000|EEEEE|MM MMMM MMMM|0 0000 0000 0000|
-     *      +-+---+-----+------------+----------------+
-     * Bits   | 23-31   |           0-22
-     *
-     * Next, there are some adjustments to the exponent:
-     * - The exponent needs to be corrected by the difference in exponent bias between single-precision and half-precision
-     *   formats (0x7F - 0xF = 0x70)
-     * - Inf and NaN values in the inputs should become Inf and NaN values after conversion to the single-precision number.
-     *   Therefore, if the biased exponent of the half-precision input was 0x1F (max possible value), the biased exponent
-     *   of the single-precision output must be 0xFF (max possible value). We do this correction in two steps:
-     *   - First, we adjust the exponent by (0xFF - 0x1F) = 0xE0 (see exp_offset below) rather than by 0x70 suggested
-     *     by the difference in the exponent bias (see above).
-     *   - Then we multiply the single-precision result of exponent adjustment by 2**(-112) to reverse the effect of
-     *     exponent adjustment by 0xE0 less the necessary exponent adjustment by 0x70 due to difference in exponent bias.
-     *     The floating-point multiplication hardware would ensure than Inf and NaN would retain their value on at least
-     *     partially IEEE754-compliant implementations.
-     *
-     * Note that the above operations do not handle denormal inputs (where biased exponent == 0). However, they also do not
-     * operate on denormal inputs, and do not produce denormal results.
-     */
-    const uint32_t exp_offset = UINT32_C(0xE0) << 23;
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
-    const float exp_scale = 0x1.0p-112f;
-#else
-    const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
-#endif
-    const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
-
-    /*
-     * Convert denormalized half-precision inputs into single-precision results (always normalized).
-     * Zero inputs are also handled here.
-     *
-     * In a denormalized number the biased exponent is zero, and mantissa has on-zero bits.
-     * First, we shift mantissa into bits 0-9 of the 32-bit word.
-     *
-     *                  zeros           |  mantissa
-     *      +---------------------------+------------+
-     *      |0000 0000 0000 0000 0000 00|MM MMMM MMMM|
-     *      +---------------------------+------------+
-     * Bits             10-31                0-9
-     *
-     * Now, remember that denormalized half-precision numbers are represented as:
-     *    FP16 = mantissa * 2**(-24).
-     * The trick is to construct a normalized single-precision number with the same mantissa and thehalf-precision input
-     * and with an exponent which would scale the corresponding mantissa bits to 2**(-24).
-     * A normalized single-precision floating-point number is represented as:
-     *    FP32 = (1 + mantissa * 2**(-23)) * 2**(exponent - 127)
-     * Therefore, when the biased exponent is 126, a unit change in the mantissa of the input denormalized half-precision
-     * number causes a change of the constructud single-precision number by 2**(-24), i.e. the same ammount.
-     *
-     * The last step is to adjust the bias of the constructed single-precision number. When the input half-precision number
-     * is zero, the constructed single-precision number has the value of
-     *    FP32 = 1 * 2**(126 - 127) = 2**(-1) = 0.5
-     * Therefore, we need to subtract 0.5 from the constructed single-precision number to get the numerical equivalent of
-     * the input half-precision number.
-     */
-    const uint32_t magic_mask = UINT32_C(126) << 23;
-    const float magic_bias = 0.5f;
-    const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
-
-    /*
-     * - Choose either results of conversion of input as a normalized number, or as a denormalized number, depending on the
-     *   input exponent. The variable two_w contains input exponent in bits 27-31, therefore if its smaller than 2**27, the
-     *   input is either a denormal number, or zero.
-     * - Combine the result of conversion of exponent and mantissa with the sign of the input number.
-     */
-    const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
-    const uint32_t result = sign |
-        (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
-    return fp32_from_bits(result);
-}
-
-/*
- * Convert a 32-bit floating-point number in IEEE single-precision format to a 16-bit floating-point number in
- * IEEE half-precision format, in bit representation.
- *
- * @note The implementation relies on IEEE-like (no assumption about rounding mode and no operations on denormals)
- * floating-point operations and bitcasts between integer and floating-point variables.
- */
-static inline uint16_t fp16_ieee_from_fp32_value(float f) {
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
-    const float scale_to_inf = 0x1.0p+112f;
-    const float scale_to_zero = 0x1.0p-110f;
-#else
-    const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
-    const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
-#endif
-    float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
-
-    const uint32_t w = fp32_to_bits(f);
-    const uint32_t shl1_w = w + w;
-    const uint32_t sign = w & UINT32_C(0x80000000);
-    uint32_t bias = shl1_w & UINT32_C(0xFF000000);
-    if (bias < UINT32_C(0x71000000)) {
-        bias = UINT32_C(0x71000000);
-    }
-
-    base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
-    const uint32_t bits = fp32_to_bits(base);
-    const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
-    const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
-    const uint32_t nonsign = exp_bits + mantissa_bits;
-    return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
-}
-
-void mul_mat_vec_f16_0(
-    const uint16_t * src0,
-    const uint16_t * src1,
-             float * dst,
-    int nrows,
-    int ncols) {
-
-    const int ncols8 = ncols & ~7;
-
-    for (int i = 0; i < nrows; i++) {
-        __m256 sum = _mm256_setzero_ps();
-
-        const uint16_t * src0_row = src0 + i * ncols;
-        for (int j = 0; j < ncols8; j += 8) {
-            __m256 a = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src0_row + j)));
-            __m256 b = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src1 + j)));
-#if defined(__FMA__)
-            sum = _mm256_fmadd_ps(a, b, sum);
-#else
-            sum = _mm256_add_ps(_mm256_mul_ps(a, b), sum);
-#endif
-        }
-        dst[i] = reduce_vector8_0(sum);
-
-        for (int j = ncols8; j < ncols; j++) {
-            dst[i] += fp16_ieee_to_fp32_value(src0_row[j]) * fp16_ieee_to_fp32_value(src1[j]);
-        }
-    }
-}
-
-void mul_mat_vec_f16_1(
-    const uint16_t * src0,
-    const uint16_t * src1,
-             float * dst,
-    int nrows,
-    int ncols) {
-
-    const int ncols16 = ncols & ~15;
-
-    for (int i = 0; i < nrows; i++) {
-        __m256 sum0 = _mm256_setzero_ps();
-        __m256 sum1 = _mm256_setzero_ps();
-
-        const uint16_t * src0_row = src0 + i * ncols;
-        for (int j = 0; j < ncols16; j += 16) {
-            __m256 a0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src0_row + j + 0)));
-            __m256 a1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src0_row + j + 8)));
-            __m256 b0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src1 + j)));
-            __m256 b1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src1 + j + 8)));
-#if defined(__FMA__)
-            sum0 = _mm256_fmadd_ps(a0, b0, sum0);
-            sum1 = _mm256_fmadd_ps(a1, b1, sum1);
-#else
-            sum0 = _mm256_add_ps(_mm256_mul_ps(a0, b0), sum0);
-            sum1 = _mm256_add_ps(_mm256_mul_ps(a1, b1), sum1);
-#endif
-        }
-        dst[i] = reduce_vector8_0(sum0) + reduce_vector8_0(sum1);
-
-        for (int j = ncols16; j < ncols; j++) {
-            dst[i] += fp16_ieee_to_fp32_value(src0_row[j]) * fp16_ieee_to_fp32_value(src1[j]);
-        }
-    }
-}
-
-void mul_mat_vec_f16_2(
-    const uint16_t * src0,
-    const uint16_t * src1,
-             float * dst,
-    int nrows,
-    int ncols) {
-
-    const int ncols32 = ncols & ~31;
-
-    for (int i = 0; i < nrows; i++) {
-        __m256 sum0 = _mm256_setzero_ps();
-        __m256 sum1 = _mm256_setzero_ps();
-        __m256 sum2 = _mm256_setzero_ps();
-        __m256 sum3 = _mm256_setzero_ps();
-
-        const uint16_t * src0_row = src0 + i * ncols;
-        for (int j = 0; j < ncols32; j += 32) {
-            __m256 a0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src0_row + j + 0)));
-            __m256 a1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src0_row + j + 8)));
-            __m256 a2 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src0_row + j + 16)));
-            __m256 a3 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src0_row + j + 24)));
-            __m256 b0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src1 + j)));
-            __m256 b1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src1 + j + 8)));
-            __m256 b2 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src1 + j + 16)));
-            __m256 b3 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src1 + j + 24)));
-#if defined(__FMA__)
-            sum0 = _mm256_fmadd_ps(a0, b0, sum0);
-            sum1 = _mm256_fmadd_ps(a1, b1, sum1);
-            sum2 = _mm256_fmadd_ps(a2, b2, sum2);
-            sum3 = _mm256_fmadd_ps(a3, b3, sum3);
-#else
-            sum0 = _mm256_add_ps(_mm256_mul_ps(a0, b0), sum0);
-            sum1 = _mm256_add_ps(_mm256_mul_ps(a1, b1), sum1);
-            sum2 = _mm256_add_ps(_mm256_mul_ps(a2, b2), sum2);
-            sum3 = _mm256_add_ps(_mm256_mul_ps(a3, b3), sum3);
-#endif
-        }
-        dst[i] = reduce_vector8_0(sum0) + reduce_vector8_0(sum1) + reduce_vector8_0(sum2) + reduce_vector8_0(sum3);
-
-        for (int j = ncols32; j < ncols; j++) {
-            dst[i] += fp16_ieee_to_fp32_value(src0_row[j]) * fp16_ieee_to_fp32_value(src1[j]);
-        }
-    }
-}
-
-void mul_mat_vec_f16_3(
-    const uint16_t * src0,
-    const    float * src1,
-             float * dst,
-    int nrows,
-    int ncols) {
-
-    const int ncols32 = ncols & ~31;
-
-    for (int i = 0; i < nrows; i++) {
-        __m256 sum0 = _mm256_setzero_ps();
-        __m256 sum1 = _mm256_setzero_ps();
-        __m256 sum2 = _mm256_setzero_ps();
-        __m256 sum3 = _mm256_setzero_ps();
-
-        const uint16_t * src0_row = src0 + i * ncols;
-        for (int j = 0; j < ncols32; j += 32) {
-            __m256 a0 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src0_row + j + 0)));
-            __m256 a1 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src0_row + j + 8)));
-            __m256 a2 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src0_row + j + 16)));
-            __m256 a3 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(src0_row + j + 24)));
-            __m256 b0 = _mm256_loadu_ps(src1 + j);
-            __m256 b1 = _mm256_loadu_ps(src1 + j + 8);
-            __m256 b2 = _mm256_loadu_ps(src1 + j + 16);
-            __m256 b3 = _mm256_loadu_ps(src1 + j + 24);
-#if defined(__FMA__)
-            sum0 = _mm256_fmadd_ps(a0, b0, sum0);
-            sum1 = _mm256_fmadd_ps(a1, b1, sum1);
-            sum2 = _mm256_fmadd_ps(a2, b2, sum2);
-            sum3 = _mm256_fmadd_ps(a3, b3, sum3);
-#else
-            sum0 = _mm256_add_ps(_mm256_mul_ps(a0, b0), sum0);
-            sum1 = _mm256_add_ps(_mm256_mul_ps(a1, b1), sum1);
-            sum2 = _mm256_add_ps(_mm256_mul_ps(a2, b2), sum2);
-            sum3 = _mm256_add_ps(_mm256_mul_ps(a3, b3), sum3);
-#endif
-        }
-        dst[i] = reduce_vector8_0(sum0) + reduce_vector8_0(sum1) + reduce_vector8_0(sum2) + reduce_vector8_0(sum3);
-
-        for (int j = ncols32; j < ncols; j++) {
-            dst[i] += fp16_ieee_to_fp32_value(src0_row[j]) * fp16_ieee_to_fp32_value(src1[j]);
-        }
-    }
-}
-
-uint64_t get_time_us(void) {
-    struct timeval tv;
-    gettimeofday(&tv, NULL);
-    return tv.tv_sec * 1000000 + tv.tv_usec;
-}
-
-int main(int argc, const char ** argv) {
-    float * src0 = malloc(sizeof(float)*N*M);
-    float * src1 = malloc(sizeof(float)*M);
-    float * dst  = malloc(sizeof(float)*N);
-
-    //float * src0 = (float *)(aligned_alloc(64, sizeof(float)*N*M));
-    //float * src1 = (float *)(aligned_alloc(64, sizeof(float)*M));
-    //float * dst  = (float *)(aligned_alloc(64, sizeof(float)*N));
-
-    for (int i = 0; i < N*M; i++) {
-        src0[i] = rand() / (float)RAND_MAX;
-    }
-
-    for (int i = 0; i < M; i++) {
-        src1[i] = rand() / (float)RAND_MAX;
-    }
-
-    // convert src0 and src1 to __fp16
-    uint16_t * src0_fp16 = (uint16_t *)(malloc(sizeof(uint16_t)*N*M));
-    uint16_t * src1_fp16 = (uint16_t *)(malloc(sizeof(uint16_t)*M));
-    //uint16_t * src0_fp16 = (uint16_t *)(aligned_alloc(64, sizeof(uint16_t)*N*M));
-    //uint16_t * src1_fp16 = (uint16_t *)(aligned_alloc(64, sizeof(uint16_t)*M));
-
-    {
-        const uint64_t t_start = get_time_us();
-
-        for (int i = 0; i < N*M; i++) {
-            src0_fp16[i] = fp16_ieee_from_fp32_value(src0[i]);
-            //printf("%f %f\n", src0[i], fp16_ieee_to_fp32_value(src0_fp16[i]));
-            //assert(!isnan(fp16_ieee_to_fp32_value(src0_fp16[i])));
-        }
-
-        for (int i = 0; i < M; i++) {
-            src1_fp16[i] = fp16_ieee_from_fp32_value(src1[i]);
-        }
-
-        const uint64_t t_end = get_time_us();
-        printf("convert time: %f ms\n", (t_end - t_start) / 1000.0);
-    }
-
-    for (int i = 0; i < 16; ++i) {
-        printf("%f %f\n", src0[i], fp16_ieee_to_fp32_value(src0_fp16[i]));
-    }
-
-    int method = 0;
-    if (argc > 1) {
-        method = atoi(argv[1]);
-    }
-
-    const int nIter = 1000;
-
-    const clock_t start = clock();
-    const uint64_t start_us = get_time_us();
-
-    double iM = 1.0/M;
-    double sum = 0.0f;
-    for (int i = 0; i < nIter; i++) {
-        if (method == 0) {
-            mul_mat_vec_f32_0(src0, src1, dst, N, M);
-        }
-
-        if (method == 1) {
-            mul_mat_vec_f32_1(src0, src1, dst, N, M);
-        }
-
-        if (method == 2) {
-            mul_mat_vec_f32_2(src0, src1, dst, N, M);
-        }
-
-        if (method == 3) {
-            mul_mat_vec_f16_0(src0_fp16, src1_fp16, dst, N, M);
-        }
-
-        if (method == 4) {
-            mul_mat_vec_f16_1(src0_fp16, src1_fp16, dst, N, M);
-        }
-
-        if (method == 5) {
-            mul_mat_vec_f16_2(src0_fp16, src1_fp16, dst, N, M);
-        }
-
-        if (method == 6) {
-            mul_mat_vec_f16_3(src0_fp16, src1, dst, N, M);
-        }
-    }
-
-    for (int i = 0; i < N; i++) {
-        sum += dst[i]*iM;
-    }
-
-    {
-        const clock_t end = clock();
-        const uint64_t end_us = get_time_us();
-        printf("%s: elapsed ticks: %ld\n", __func__, end - start);
-        printf("%s: elapsed us: %ld\n", __func__, end_us - start_us);
-    }
-
-    printf("%f\n", sum);
-
-    free(src0);
-    free(src1);
-    free(dst);
-
-    free(src0_fp16);
-    free(src1_fp16);
-
-    return 0;
-}
diff --git a/ggml/tests/test-vec2.c b/ggml/tests/test-vec2.c
deleted file mode 100644
index 4fa364c..0000000
--- a/ggml/tests/test-vec2.c
+++ /dev/null
@@ -1,268 +0,0 @@
-#include <stdint.h>
-#include <stdio.h>
-#include <assert.h>
-#include <stdlib.h>
-#include <time.h>
-#include <math.h>
-
-#include <sys/time.h>
-
-#include <arm_neon.h>
-
-const int N = 1 << 12;
-const int M = 1 << 12;
-
-//
-// naive implementation
-//
-
-void mul_mat_vec_f32_0(
-    const float * restrict src0,
-    const float * restrict src1,
-    float * dst,
-    int nrows,
-    int ncols) {
-    for (int i = 0; i < nrows; i++) {
-        float sum = 0.0f;
-        for (int j = 0; j < ncols; j++) {
-            sum += src0[i*ncols + j]*src1[j];
-        }
-        dst[i] = sum;
-    }
-}
-
-void mul_mat_vec_f16_0(
-    const __fp16 * src0,
-    const __fp16 * src1,
-           float * dst,
-    int nrows,
-    int ncols) {
-
-    const int n64 = ncols & ~63;
-
-    for (int r = 0; r < nrows; r++) {
-        float sumf = 0.0;
-
-        float16x8_t sum0 = vdupq_n_f16(0.0f);
-        float16x8_t sum1 = vdupq_n_f16(0.0f);
-        float16x8_t sum2 = vdupq_n_f16(0.0f);
-        float16x8_t sum3 = vdupq_n_f16(0.0f);
-        float16x8_t sum4 = vdupq_n_f16(0.0f);
-        float16x8_t sum5 = vdupq_n_f16(0.0f);
-        float16x8_t sum6 = vdupq_n_f16(0.0f);
-        float16x8_t sum7 = vdupq_n_f16(0.0f);
-
-        float16x8_t x0, x1, x2, x3, x4, x5, x6, x7;
-        float16x8_t y0, y1, y2, y3, y4, y5, y6, y7;
-
-        const __fp16 * restrict p0 = src0 + r*ncols;
-
-        for (int i = 0; i < n64; i += 64) {
-            x0 = vld1q_f16(p0 + i + 0 );
-            x1 = vld1q_f16(p0 + i + 8 );
-            x2 = vld1q_f16(p0 + i + 16);
-            x3 = vld1q_f16(p0 + i + 24);
-            x4 = vld1q_f16(p0 + i + 32);
-            x5 = vld1q_f16(p0 + i + 40);
-            x6 = vld1q_f16(p0 + i + 48);
-            x7 = vld1q_f16(p0 + i + 56);
-
-            y0 = vld1q_f16(src1 + i + 0 );
-            y1 = vld1q_f16(src1 + i + 8 );
-            y2 = vld1q_f16(src1 + i + 16);
-            y3 = vld1q_f16(src1 + i + 24);
-            y4 = vld1q_f16(src1 + i + 32);
-            y5 = vld1q_f16(src1 + i + 40);
-            y6 = vld1q_f16(src1 + i + 48);
-            y7 = vld1q_f16(src1 + i + 56);
-
-            sum0 = vfmaq_f16(sum0, x0, y0);
-            sum1 = vfmaq_f16(sum1, x1, y1);
-            sum2 = vfmaq_f16(sum2, x2, y2);
-            sum3 = vfmaq_f16(sum3, x3, y3);
-            sum4 = vfmaq_f16(sum4, x4, y4);
-            sum5 = vfmaq_f16(sum5, x5, y5);
-            sum6 = vfmaq_f16(sum6, x6, y6);
-            sum7 = vfmaq_f16(sum7, x7, y7);
-        }
-
-        // TODO: F16 - better way to reduce this ?
-        float16x8_t sum = vaddq_f16(sum0, sum1);
-
-        sum = vaddq_f16(sum, sum2);
-        sum = vaddq_f16(sum, sum3);
-        sum = vaddq_f16(sum, sum4);
-        sum = vaddq_f16(sum, sum5);
-        sum = vaddq_f16(sum, sum6);
-        sum = vaddq_f16(sum, sum7);
-
-        sumf += sum[0] + sum[1] + sum[2] + sum[3] + sum[4] + sum[5] + sum[6] + sum[7];
-
-        for (int j = n64; j < n64; j++) {
-            sumf += src0[r*ncols + j]*src1[j];
-        }
-
-        dst[r] = sumf;
-    }
-}
-
-void mul_mat_vec_f16_1(
-    const __fp16 * src0,
-    const __fp16 * src1,
-           float * dst,
-    int nrows,
-    int ncols) {
-
-    const int n32 = ncols & ~31;
-
-    for (int r = 0; r < nrows; r++) {
-        float sumf = 0.0;
-
-        float16x8_t sum0 = vdupq_n_f16(0.0f);
-        float16x8_t sum1 = vdupq_n_f16(0.0f);
-        float16x8_t sum2 = vdupq_n_f16(0.0f);
-        float16x8_t sum3 = vdupq_n_f16(0.0f);
-
-        float16x8_t x0, x1, x2, x3;
-        float16x8_t y0, y1, y2, y3;
-
-        const __fp16 * restrict p0 = src0 + r*ncols;
-
-        for (int i = 0; i < n32; i += 32) {
-            x0 = vld1q_f16(p0 + i + 0 );
-            x1 = vld1q_f16(p0 + i + 8 );
-            x2 = vld1q_f16(p0 + i + 16);
-            x3 = vld1q_f16(p0 + i + 24);
-
-            y0 = vld1q_f16(src1 + i + 0 );
-            y1 = vld1q_f16(src1 + i + 8 );
-            y2 = vld1q_f16(src1 + i + 16);
-            y3 = vld1q_f16(src1 + i + 24);
-
-            sum0 = vfmaq_f16(sum0, x0, y0);
-            sum1 = vfmaq_f16(sum1, x1, y1);
-            sum2 = vfmaq_f16(sum2, x2, y2);
-            sum3 = vfmaq_f16(sum3, x3, y3);
-        }
-
-        // reduce sum0..sum3 to sum0
-        sum0 = vaddq_f16(sum0, sum1);
-        sum2 = vaddq_f16(sum2, sum3);
-        sum0 = vaddq_f16(sum0, sum2);
-
-        // load sum0 into 2 float32x4_t
-        float32x4_t sum0f32 = vcvt_f32_f16(vget_low_f16(sum0));
-        float32x4_t sum1f32 = vcvt_f32_f16(vget_high_f16(sum0));
-
-        // reduce sum0f32 and sum1f32 to sumf
-        sum0f32 = vaddq_f32(sum0f32, sum1f32);
-
-        float32x2_t sumf32 = vadd_f32(vget_low_f32(sum0f32), vget_high_f32(sum0f32));
-        sumf = vget_lane_f32(sumf32, 0) + vget_lane_f32(sumf32, 1);
-
-        //sumf = sum0[0] + sum0[1] + sum0[2] + sum0[3] + sum0[4] + sum0[5] + sum0[6] + sum0[7];
-
-        for (int j = n32; j < n32; j++) {
-            sumf += src0[r*ncols + j]*src1[j];
-        }
-
-        dst[r] = sumf;
-    }
-}
-
-uint64_t get_time_us(void) {
-    struct timeval tv;
-    gettimeofday(&tv, NULL);
-    return tv.tv_sec * 1000000 + tv.tv_usec;
-}
-
-int main(int argc, const char ** argv) {
-    float * src0 = malloc(sizeof(float)*N*M);
-    float * src1 = malloc(sizeof(float)*M);
-    float * dst  = malloc(sizeof(float)*N);
-
-    //float * src0 = (float *)(aligned_alloc(64, sizeof(float)*N*M));
-    //float * src1 = (float *)(aligned_alloc(64, sizeof(float)*M));
-    //float * dst  = (float *)(aligned_alloc(64, sizeof(float)*N));
-
-    for (int i = 0; i < N*M; i++) {
-        src0[i] = rand() / (float)RAND_MAX;
-    }
-
-    for (int i = 0; i < M; i++) {
-        src1[i] = rand() / (float)RAND_MAX;
-    }
-
-    // convert src0 and src1 to __fp16
-    __fp16 * src0_fp16 = (__fp16 *)(malloc(sizeof(__fp16)*N*M));
-    __fp16 * src1_fp16 = (__fp16 *)(malloc(sizeof(__fp16)*M));
-
-    {
-        const uint64_t t_start = get_time_us();
-
-        for (int i = 0; i < N*M; i++) {
-            src0_fp16[i] = src0[i];
-            //printf("%f %f\n", src0[i], src0_fp16[i]);
-            //assert(!isnan(src0_fp16[i]));
-        }
-
-        for (int i = 0; i < M; i++) {
-            src1_fp16[i] = src1[i];
-        }
-
-        const uint64_t t_end = get_time_us();
-        printf("convert time: %f ms\n", (t_end - t_start) / 1000.0);
-    }
-
-    for (int i = 0; i < 16; ++i) {
-        printf("%f %f\n", src0[i], src0_fp16[i]);
-    }
-
-    int method = 0;
-    if (argc > 1) {
-        method = atoi(argv[1]);
-    }
-
-    const int nIter = 1000;
-
-    const clock_t start = clock();
-    const uint64_t start_us = get_time_us();
-
-    double iM = 1.0/M;
-    double sum = 0.0f;
-    for (int i = 0; i < nIter; i++) {
-        if (method == 0) {
-            mul_mat_vec_f32_0(src0, src1, dst, N, M);
-        }
-
-        if (method == 1) {
-            mul_mat_vec_f16_0(src0_fp16, src1_fp16, dst, N, M);
-        }
-
-        if (method == 2) {
-            mul_mat_vec_f16_1(src0_fp16, src1_fp16, dst, N, M);
-        }
-    }
-
-    for (int i = 0; i < N; i++) {
-        sum += dst[i]*iM;
-    }
-
-    {
-        const clock_t end = clock();
-        const uint64_t end_us = get_time_us();
-        printf("%s: elapsed ticks: %ld\n",  __func__, end - start);
-        printf("%s: elapsed us:    %llu / %f ms\n",  __func__, end_us - start_us, (end_us - start_us) / 1000.0 / nIter);
-    }
-
-    printf("%f\n", sum);
-
-    free(src0);
-    free(src1);
-    free(dst);
-
-    free(src0_fp16);
-    free(src1_fp16);
-
-    return 0;
-}
diff --git a/ggml/tests/test-xpos.c b/ggml/tests/test-xpos.c
deleted file mode 100644
index 5f33110..0000000
--- a/ggml/tests/test-xpos.c
+++ /dev/null
@@ -1,94 +0,0 @@
-#include "ggml/ggml.h"
-
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-bool is_close(float a, float b, float epsilon) {
-    return fabs(a - b) < epsilon;
-}
-
-int main(int argc, char ** argv) {
-    const int n_threads = 1;
-    const int n_embd_head = 4; // aka head_dim
-    const int n_head = 1;
-    const int N = 8;
-
-    struct ggml_init_params params = {
-        .mem_size   = 16*1024*1024,
-        .mem_buffer = NULL,
-    };
-
-    // memory allocation happens here
-    struct ggml_context * ctx = ggml_init(params);
-
-    struct ggml_tensor * Q = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd_head, n_head, N);
-    struct ggml_tensor * K = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd_head, n_head, N);
-
-    for (int i = 0; i < ggml_nelements(Q); i++) {
-        ((float*) Q->data)[i] = 2.0f;
-        ((float*) K->data)[i] = 2.0f;
-    }
-
-    struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N);
-    int * data = (int *) KQ_pos->data;
-    for (int i = 0; i < N; ++i) {
-        data[i] = 1 + i;
-    }
-
-    struct ggml_tensor * Qx = ggml_rope_xpos_inplace(ctx, Q, KQ_pos, n_embd_head, 512.0f, false);
-    struct ggml_tensor * Kx = ggml_rope_xpos_inplace(ctx, K, KQ_pos, n_embd_head, 512.0f, true);
-
-    struct ggml_cgraph * gf = ggml_new_graph(ctx);
-    ggml_build_forward_expand(gf, Qx);
-    ggml_build_forward_expand(gf, Kx);
-    ggml_graph_compute_with_ctx(ctx, gf, n_threads);
-
-	// expected output for Qx:
-    // -0.6009  2.7568  1.9782  2.0182
-    // -2.6379  0.9815  1.9562  2.0361
-    // -2.2457 -1.6853  1.9341  2.0538
-    //  0.2043 -2.7934  1.9118  2.0712
-    //  2.4550 -1.3341  1.8894  2.0884
-    //  2.4430  1.3417  1.8668  2.1054
-    //  0.1905  2.7739  1.8440  2.1221
-    // -2.2257  1.6550  1.8212  2.1386
-
-    for (int i = 0; i < ggml_nelements(Q); i++) {
-        if (((float*) Qx->data)[i] > 0) printf(" ");
-        printf("%.4f ", ((float*) Qx->data)[i]);
-        if ((i+1) % n_embd_head == 0) printf("\n");
-    }
-    printf("\n");
-
-    GGML_ASSERT(is_close(((float*) Qx->data)[7 * n_embd_head + 0], -2.2257f, 0.0001f));
-    GGML_ASSERT(is_close(((float*) Qx->data)[7 * n_embd_head + 1],  1.6550f, 0.0001f));
-    GGML_ASSERT(is_close(((float*) Qx->data)[7 * n_embd_head + 2],  1.8212f, 0.0001f));
-    GGML_ASSERT(is_close(((float*) Qx->data)[7 * n_embd_head + 3],  2.1386f, 0.0001f));
-
-    // expected output for Kx:
-	// -0.6038  2.7703  1.9816  2.0216
-    // -2.6639  0.9911  1.9630  2.0431
-    // -2.2789 -1.7103  1.9441  2.0644
-    //  0.2083 -2.8486  1.9251  2.0856
-    //  2.5158 -1.3671  1.9057  2.1065
-    //  2.5158  1.3816  1.8862  2.1273
-    //  0.1972  2.8705  1.8665  2.1479
-    // -2.3146  1.7211  1.8465  2.1684
-
-    for (int i = 0; i < ggml_nelements(K); i++) {
-        if (((float*) Kx->data)[i] > 0) printf(" ");
-        printf("%.4f ", ((float*) Kx->data)[i]);
-        if ((i+1) % n_embd_head == 0) printf("\n");
-    }
-    printf("\n");
-
-    GGML_ASSERT(is_close(((float*) Kx->data)[7 * n_embd_head + 0], -2.3146f, 0.0001f));
-    GGML_ASSERT(is_close(((float*) Kx->data)[7 * n_embd_head + 1],  1.7211f, 0.0001f));
-    GGML_ASSERT(is_close(((float*) Kx->data)[7 * n_embd_head + 2],  1.8465f, 0.0001f));
-    GGML_ASSERT(is_close(((float*) Kx->data)[7 * n_embd_head + 3],  2.1684f, 0.0001f));
-
-    ggml_free(ctx);
-
-    return 0;
-}
diff --git a/ggml/tests/test0.c b/ggml/tests/test0.c
deleted file mode 100644
index 2d2fa85..0000000
--- a/ggml/tests/test0.c
+++ /dev/null
@@ -1,42 +0,0 @@
-#include "ggml/ggml.h"
-
-#include <stdio.h>
-#include <stdlib.h>
-
-int main(int argc, const char ** argv) {
-    struct ggml_init_params params = {
-        .mem_size   = 128*1024*1024,
-        .mem_buffer = NULL,
-        .no_alloc   = false,
-    };
-
-    struct ggml_context * ctx0 = ggml_init(params);
-
-    struct ggml_tensor * t1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 10);
-    struct ggml_tensor * t2 = ggml_new_tensor_2d(ctx0, GGML_TYPE_I16, 10, 20);
-    struct ggml_tensor * t3 = ggml_new_tensor_3d(ctx0, GGML_TYPE_I32, 10, 20, 30);
-
-    GGML_ASSERT(ggml_n_dims(t1) == 1);
-    GGML_ASSERT(t1->ne[0]  == 10);
-    GGML_ASSERT(t1->nb[1]  == 10*sizeof(float));
-
-    GGML_ASSERT(ggml_n_dims(t2) == 2);
-    GGML_ASSERT(t2->ne[0]  == 10);
-    GGML_ASSERT(t2->ne[1]  == 20);
-    GGML_ASSERT(t2->nb[1]  == 10*sizeof(int16_t));
-    GGML_ASSERT(t2->nb[2]  == 10*20*sizeof(int16_t));
-
-    GGML_ASSERT(ggml_n_dims(t3) == 3);
-    GGML_ASSERT(t3->ne[0]  == 10);
-    GGML_ASSERT(t3->ne[1]  == 20);
-    GGML_ASSERT(t3->ne[2]  == 30);
-    GGML_ASSERT(t3->nb[1]  == 10*sizeof(int32_t));
-    GGML_ASSERT(t3->nb[2]  == 10*20*sizeof(int32_t));
-    GGML_ASSERT(t3->nb[3]  == 10*20*30*sizeof(int32_t));
-
-    ggml_print_objects(ctx0);
-
-    ggml_free(ctx0);
-
-    return 0;
-}
diff --git a/ggml/tests/test0.zig b/ggml/tests/test0.zig
deleted file mode 100644
index 01bd601..0000000
--- a/ggml/tests/test0.zig
+++ /dev/null
@@ -1,41 +0,0 @@
-const std = @import("std");
-const c = @cImport({
-    @cInclude("ggml/ggml.h");
-});
-
-pub fn main() !void {
-    const params = .{
-        .mem_size   = 128*1024*1024,
-        .mem_buffer = null,
-        .no_alloc   = false,
-    };
-
-    const ctx0 = c.ggml_init(params);
-    defer c.ggml_free(ctx0);
-
-    const t1 = c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, 10);
-    const t2 = c.ggml_new_tensor_2d(ctx0, c.GGML_TYPE_I16, 10, 20);
-    const t3 = c.ggml_new_tensor_3d(ctx0, c.GGML_TYPE_I32, 10, 20, 30);
-
-    try std.testing.expect(t1.*.n_dims == 1);
-    try std.testing.expect(t1.*.ne[0]  == 10);
-    try std.testing.expect(t1.*.nb[1]  == 10*@sizeOf(f32));
-
-    try std.testing.expect(t2.*.n_dims == 2);
-    try std.testing.expect(t2.*.ne[0]  == 10);
-    try std.testing.expect(t2.*.ne[1]  == 20);
-    try std.testing.expect(t2.*.nb[1]  == 10*@sizeOf(i16));
-    try std.testing.expect(t2.*.nb[2]  == 10*20*@sizeOf(i16));
-
-    try std.testing.expect(t3.*.n_dims == 3);
-    try std.testing.expect(t3.*.ne[0]  == 10);
-    try std.testing.expect(t3.*.ne[1]  == 20);
-    try std.testing.expect(t3.*.ne[2]  == 30);
-    try std.testing.expect(t3.*.nb[1]  == 10*@sizeOf(i32));
-    try std.testing.expect(t3.*.nb[2]  == 10*20*@sizeOf(i32));
-    try std.testing.expect(t3.*.nb[3]  == 10*20*30*@sizeOf(i32));
-
-    c.ggml_print_objects(ctx0);
-
-    _ = try std.io.getStdIn().reader().readByte();
-}
diff --git a/ggml/tests/test1.c b/ggml/tests/test1.c
deleted file mode 100644
index 230aaed..0000000
--- a/ggml/tests/test1.c
+++ /dev/null
@@ -1,458 +0,0 @@
-#include "ggml/ggml.h"
-
-#include <stdio.h>
-#include <stdlib.h>
-
-int main(int argc, const char ** argv) {
-    const int n_threads = 2;
-
-    struct ggml_init_params params = {
-        .mem_size   = 128*1024*1024,
-        .mem_buffer = NULL,
-        .no_alloc   = false,
-    };
-
-    struct ggml_context * ctx0 = ggml_init(params);
-
-    {
-        struct ggml_tensor * x = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
-
-        ggml_set_param(ctx0, x);
-
-        struct ggml_tensor * a = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
-        struct ggml_tensor * b = ggml_mul(ctx0, x, x);
-        struct ggml_tensor * f = ggml_mul(ctx0, b, a);
-
-        // a*x^2
-        // 2*a*x
-
-        ggml_print_objects(ctx0);
-
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, GGML_DEFAULT_GRAPH_SIZE, true);
-        ggml_build_forward_expand(gf, f);
-        struct ggml_cgraph * gb = ggml_graph_dup(ctx0, gf);
-        ggml_build_backward_expand(ctx0, gf, gb, false);
-
-        ggml_set_f32(x, 2.0f);
-        ggml_set_f32(a, 3.0f);
-
-        ggml_graph_reset(gf);
-        ggml_set_f32(f->grad, 1.0f);
-
-        ggml_graph_compute_with_ctx(ctx0, gb, n_threads);
-
-        printf("f     = %f\n", ggml_get_f32_1d(f, 0));
-        printf("df/dx = %f\n", ggml_get_f32_1d(x->grad, 0));
-
-        GGML_ASSERT(ggml_get_f32_1d(f, 0)       == 12.0f);
-        GGML_ASSERT(ggml_get_f32_1d(x->grad, 0) == 12.0f);
-
-        ggml_set_f32(x, 3.0f);
-
-        ggml_graph_reset(gf);
-        ggml_set_f32(f->grad, 1.0f);
-
-        ggml_graph_compute_with_ctx(ctx0, gb, n_threads);
-
-        printf("f     = %f\n", ggml_get_f32_1d(f, 0));
-        printf("df/dx = %f\n", ggml_get_f32_1d(x->grad, 0));
-
-        GGML_ASSERT(ggml_get_f32_1d(f, 0)       == 27.0f);
-        GGML_ASSERT(ggml_get_f32_1d(x->grad, 0) == 18.0f);
-
-        ggml_graph_dump_dot(gf, NULL, "test1-1-forward.dot");
-        ggml_graph_dump_dot(gb, gf,   "test1-1-backward.dot");
-    }
-
-    ///////////////////////////////////////////////////////////////
-
-    {
-        struct ggml_tensor * x1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
-        struct ggml_tensor * x2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
-        struct ggml_tensor * x3 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
-
-        ggml_set_f32(x1, 3.0f);
-        ggml_set_f32(x2, 1.0f);
-        ggml_set_f32(x3, 0.0f);
-
-        ggml_set_param(ctx0, x1);
-        ggml_set_param(ctx0, x2);
-
-        struct ggml_tensor * y = ggml_add(ctx0, ggml_mul(ctx0, x1, x1), ggml_mul(ctx0, x1, x2));
-
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, GGML_DEFAULT_GRAPH_SIZE, true);
-        ggml_build_forward_expand(gf, y);
-        struct ggml_cgraph * gb = ggml_graph_dup(ctx0, gf);
-        ggml_build_backward_expand(ctx0, gf, gb, false);
-
-        ggml_graph_reset(gf);
-        ggml_set_f32(y->grad, 1.0f);
-
-        ggml_graph_compute_with_ctx(ctx0, gb, n_threads);
-
-        printf("y      = %f\n", ggml_get_f32_1d(y, 0));
-        printf("df/dx1 = %f\n", ggml_get_f32_1d(x1->grad, 0));
-        printf("df/dx2 = %f\n", ggml_get_f32_1d(x2->grad, 0));
-
-        GGML_ASSERT(ggml_get_f32_1d(y, 0)        == 12.0f);
-        GGML_ASSERT(ggml_get_f32_1d(x1->grad, 0) == 7.0f);
-        GGML_ASSERT(ggml_get_f32_1d(x2->grad, 0) == 3.0f);
-
-        struct ggml_tensor * g1 = x1->grad;
-        struct ggml_tensor * g2 = x2->grad;
-
-        struct ggml_cgraph * gbb = ggml_graph_dup(ctx0, gb);
-
-        ggml_build_backward_expand(ctx0, gb, gbb, true);
-
-        ggml_graph_reset(gb);
-        ggml_set_f32(g1->grad, 1.0f);
-        ggml_set_f32(g2->grad, 1.0f);
-
-        ggml_graph_compute_with_ctx(ctx0, gbb, n_threads);
-
-        printf("H * [1, 1] = [ %f %f ]\n", ggml_get_f32_1d(x1->grad, 0), ggml_get_f32_1d(x2->grad, 0));
-
-        GGML_ASSERT(ggml_get_f32_1d(x1->grad, 0) == 3.0f);
-        GGML_ASSERT(ggml_get_f32_1d(x2->grad, 0) == 1.0f);
-
-        ggml_graph_dump_dot(gf, NULL, "test1-2-forward.dot");
-        ggml_graph_dump_dot(gb, gf,   "test1-2-backward.dot");
-    }
-
-    ///////////////////////////////////////////////////////////////
-
-    {
-        struct ggml_tensor * x1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
-        struct ggml_tensor * x2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
-
-        ggml_set_param(ctx0, x1);
-        ggml_set_param(ctx0, x2);
-
-        struct ggml_tensor * y = ggml_mul(ctx0, ggml_add(ctx0, ggml_mul(ctx0, x1, x1), ggml_mul(ctx0, x1, x2)), x1);
-
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, GGML_DEFAULT_GRAPH_SIZE, true);
-        ggml_build_forward_expand(gf, y);
-        struct ggml_cgraph * gb = ggml_graph_dup(ctx0, gf);
-        ggml_build_backward_expand(ctx0, gf, gb, false);
-
-        ggml_set_f32(x1, 3.0f);
-        ggml_set_f32(x2, 4.0f);
-
-        ggml_graph_reset(gf);
-        ggml_set_f32(y->grad, 1.0f);
-
-        ggml_graph_compute_with_ctx(ctx0, gb, n_threads);
-
-        printf("y      = %f\n", ggml_get_f32_1d(y, 0));
-        printf("df/dx1 = %f\n", ggml_get_f32_1d(x1->grad, 0));
-        printf("df/dx2 = %f\n", ggml_get_f32_1d(x2->grad, 0));
-
-        GGML_ASSERT(ggml_get_f32_1d(y, 0)        == 63.0f);
-        GGML_ASSERT(ggml_get_f32_1d(x1->grad, 0) == 51.0f);
-        GGML_ASSERT(ggml_get_f32_1d(x2->grad, 0) == 9.0f);
-
-        ggml_graph_dump_dot(gf, NULL, "test1-3-forward.dot");
-        ggml_graph_dump_dot(gb, gf,   "test1-3-backward.dot");
-    }
-
-    ///////////////////////////////////////////////////////////////
-
-    {
-        struct ggml_tensor * x1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
-        struct ggml_tensor * x2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
-        struct ggml_tensor * x3 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
-
-        ggml_set_param(ctx0, x1);
-        ggml_set_param(ctx0, x2);
-        ggml_set_param(ctx0, x3);
-
-        struct ggml_tensor * y = ggml_mul(ctx0, ggml_mul(ctx0, ggml_mul(ctx0, x1, x1), ggml_mul(ctx0, x2, x2)), x3);
-
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, GGML_DEFAULT_GRAPH_SIZE, true);
-        ggml_build_forward_expand(gf, y);
-        struct ggml_cgraph * gb = ggml_graph_dup(ctx0, gf);
-        ggml_build_backward_expand(ctx0, gf, gb, false);
-
-        ggml_set_f32(x1, 1.0f);
-        ggml_set_f32(x2, 2.0f);
-        ggml_set_f32(x3, 3.0f);
-
-        ggml_graph_reset(gf);
-        ggml_set_f32(y->grad, 1.0f);
-
-        ggml_graph_compute_with_ctx(ctx0, gb, n_threads);
-
-        printf("y      = %f\n", ggml_get_f32_1d(y, 0));
-        printf("df/dx1 = %f\n", ggml_get_f32_1d(x1->grad, 0));
-        printf("df/dx2 = %f\n", ggml_get_f32_1d(x2->grad, 0));
-        printf("df/dx3 = %f\n", ggml_get_f32_1d(x3->grad, 0));
-
-        GGML_ASSERT(ggml_get_f32_1d(y, 0)        == 12.0f);
-        GGML_ASSERT(ggml_get_f32_1d(x1->grad, 0) == 24.0f);
-        GGML_ASSERT(ggml_get_f32_1d(x2->grad, 0) == 12.0f);
-        GGML_ASSERT(ggml_get_f32_1d(x3->grad, 0) == 4.0f);
-
-        struct ggml_tensor * g1 = x1->grad;
-        struct ggml_tensor * g2 = x2->grad;
-        struct ggml_tensor * g3 = x3->grad;
-
-        struct ggml_cgraph * gbb = ggml_graph_dup(ctx0, gb);
-
-        ggml_build_backward_expand(ctx0, gb, gbb, true);
-
-        ggml_graph_reset(gb);
-        ggml_set_f32(g1->grad, 1.0f);
-        ggml_set_f32(g2->grad, 1.0f);
-        ggml_set_f32(g3->grad, 1.0f);
-
-        ggml_graph_compute_with_ctx(ctx0, gbb, n_threads);
-
-        printf("H * [1, 1, 1] = [ %f %f %f ]\n",
-                ggml_get_f32_1d(x1->grad, 0),
-                ggml_get_f32_1d(x2->grad, 0),
-                ggml_get_f32_1d(x3->grad, 0));
-
-        GGML_ASSERT(ggml_get_f32_1d(x1->grad, 0) == 56.0f);
-        GGML_ASSERT(ggml_get_f32_1d(x2->grad, 0) == 34.0f);
-        GGML_ASSERT(ggml_get_f32_1d(x3->grad, 0) == 12.0f);
-
-        ggml_graph_dump_dot(gf, NULL, "test1-4-forward.dot");
-        ggml_graph_dump_dot(gb, gf,   "test1-4-backward.dot");
-    }
-
-    ///////////////////////////////////////////////////////////////
-
-    {
-        struct ggml_tensor * x1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 3);
-        struct ggml_tensor * x2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 3);
-
-        ggml_set_param(ctx0, x1);
-        ggml_set_param(ctx0, x2);
-
-        struct ggml_tensor * y = ggml_sum(ctx0, ggml_mul(ctx0, x1, x2));
-
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, GGML_DEFAULT_GRAPH_SIZE, true);
-        ggml_build_forward_expand(gf, y);
-        struct ggml_cgraph * gb = ggml_graph_dup(ctx0, gf);
-        ggml_build_backward_expand(ctx0, gf, gb, false);
-
-        ggml_set_f32(x1, 3.0f);
-        ggml_set_f32(x2, 5.0f);
-
-        ggml_graph_reset(gf);
-        ggml_set_f32(y->grad, 1.0f);
-
-        ggml_graph_compute_with_ctx(ctx0, gb, n_threads);
-
-        printf("y      = %f\n", ggml_get_f32_1d(y, 0));
-        printf("df/dx1 = %f %f %f\n",
-                ggml_get_f32_1d(x1->grad, 0),
-                ggml_get_f32_1d(x1->grad, 1),
-                ggml_get_f32_1d(x1->grad, 2));
-        printf("df/dx2 = %f %f %f\n",
-                ggml_get_f32_1d(x2->grad, 0),
-                ggml_get_f32_1d(x2->grad, 1),
-                ggml_get_f32_1d(x2->grad, 2));
-
-        GGML_ASSERT(ggml_get_f32_1d(y, 0)        == 45.0f);
-        GGML_ASSERT(ggml_get_f32_1d(x1->grad, 0) == 5.0f);
-        GGML_ASSERT(ggml_get_f32_1d(x2->grad, 0) == 3.0f);
-        GGML_ASSERT(ggml_get_f32_1d(x1->grad, 1) == 5.0f);
-        GGML_ASSERT(ggml_get_f32_1d(x2->grad, 1) == 3.0f);
-        GGML_ASSERT(ggml_get_f32_1d(x1->grad, 2) == 5.0f);
-        GGML_ASSERT(ggml_get_f32_1d(x2->grad, 2) == 3.0f);
-
-        ggml_graph_dump_dot(gf, NULL, "test1-5-forward.dot");
-        ggml_graph_dump_dot(gb, gf,   "test1-5-backward.dot");
-    }
-
-    ///////////////////////////////////////////////////////////////
-
-    {
-        struct ggml_tensor * x1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 3);
-        struct ggml_tensor * x2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 3);
-
-        ggml_set_param(ctx0, x1);
-        ggml_set_param(ctx0, x2);
-
-        struct ggml_tensor * y =
-            ggml_sum(ctx0,
-                    ggml_add(ctx0,
-                        ggml_mul(ctx0, x1, x2),
-                        ggml_mul(ctx0,
-                            ggml_repeat(ctx0, ggml_new_f32(ctx0, -2.0f), x1),
-                            ggml_mul(ctx0, x1, x1)
-                            )
-                        )
-                    );
-
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, GGML_DEFAULT_GRAPH_SIZE, true);
-        ggml_build_forward_expand(gf, y);
-        struct ggml_cgraph * gb = ggml_graph_dup(ctx0, gf);
-        ggml_build_backward_expand(ctx0, gf, gb, false);
-
-        ggml_set_f32(x1, 3.0f);
-        ggml_set_f32(x2, 5.0f);
-
-        ggml_graph_reset(gf);
-        ggml_set_f32(y->grad, 1.0f);
-
-        ggml_graph_compute_with_ctx(ctx0, gb, n_threads);
-
-        printf("y      = %f\n", ggml_get_f32_1d(y, 0));
-        printf("df/dx1 = %f %f %f\n",
-                ggml_get_f32_1d(x1->grad, 0),
-                ggml_get_f32_1d(x1->grad, 1),
-                ggml_get_f32_1d(x1->grad, 2));
-        printf("df/dx2 = %f %f %f\n",
-                ggml_get_f32_1d(x2->grad, 0),
-                ggml_get_f32_1d(x2->grad, 1),
-                ggml_get_f32_1d(x2->grad, 2));
-
-        GGML_ASSERT(ggml_get_f32_1d(y, 0)              == -9.0f);
-        GGML_ASSERT(ggml_get_f32_1d(x1->grad, 0) == -7.0f);
-        GGML_ASSERT(ggml_get_f32_1d(x1->grad, 1) == -7.0f);
-        GGML_ASSERT(ggml_get_f32_1d(x1->grad, 2) == -7.0f);
-        GGML_ASSERT(ggml_get_f32_1d(x2->grad, 0) ==  3.0f);
-        GGML_ASSERT(ggml_get_f32_1d(x2->grad, 1) ==  3.0f);
-        GGML_ASSERT(ggml_get_f32_1d(x2->grad, 2) ==  3.0f);
-
-        ggml_graph_dump_dot(gf, NULL, "test1-6-forward.dot");
-        ggml_graph_dump_dot(gb, gf,   "test1-6-backward.dot");
-    }
-
-    ///////////////////////////////////////////////////////////////
-
-    {
-        struct ggml_tensor * x1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 3);
-        struct ggml_tensor * x2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 3);
-
-        ggml_set_param(ctx0, x1);
-        ggml_set_param(ctx0, x2);
-
-        struct ggml_tensor * y =
-            ggml_sum(ctx0,
-                    ggml_sub(ctx0,
-                        ggml_mul(ctx0, x1, x2),
-                        ggml_mul(ctx0,
-                            ggml_mul(ctx0, x1, x1),
-                            ggml_repeat(ctx0, ggml_new_f32(ctx0, -2.0f), x1)
-                            )
-                        )
-                    );
-
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, GGML_DEFAULT_GRAPH_SIZE, true);
-        ggml_build_forward_expand(gf, y);
-        struct ggml_cgraph * gb = ggml_graph_dup(ctx0, gf);
-        ggml_build_backward_expand(ctx0, gf, gb, false);
-
-        ggml_set_f32(x1, 3.0f);
-        ggml_set_f32(x2, 5.0f);
-
-        ggml_graph_reset(gf);
-        ggml_set_f32(y->grad, 1.0f);
-
-        ggml_graph_compute_with_ctx(ctx0, gb, n_threads);
-
-        printf("y      = %f\n", ggml_get_f32_1d(y, 0));
-        printf("df/dx1 = %f %f %f\n",
-                ggml_get_f32_1d(x1->grad, 0),
-                ggml_get_f32_1d(x1->grad, 1),
-                ggml_get_f32_1d(x1->grad, 2));
-        printf("df/dx2 = %f %f %f\n",
-                ggml_get_f32_1d(x2->grad, 0),
-                ggml_get_f32_1d(x2->grad, 1),
-                ggml_get_f32_1d(x2->grad, 2));
-
-        GGML_ASSERT(ggml_get_f32_1d(y, 0)        == 99.0f);
-        GGML_ASSERT(ggml_get_f32_1d(x1->grad, 0) == 17.0f);
-        GGML_ASSERT(ggml_get_f32_1d(x1->grad, 1) == 17.0f);
-        GGML_ASSERT(ggml_get_f32_1d(x1->grad, 2) == 17.0f);
-        GGML_ASSERT(ggml_get_f32_1d(x2->grad, 0) ==  3.0f);
-        GGML_ASSERT(ggml_get_f32_1d(x2->grad, 1) ==  3.0f);
-        GGML_ASSERT(ggml_get_f32_1d(x2->grad, 2) ==  3.0f);
-
-        ggml_graph_dump_dot(gf, NULL, "test1-7-forward.dot");
-        ggml_graph_dump_dot(gb, gf,   "test1-7-backward.dot");
-    }
-
-    ///////////////////////////////////////////////////////////////
-
-    {
-        struct ggml_tensor * x1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 3);
-        struct ggml_tensor * x2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 3);
-
-        ggml_set_param(ctx0, x1);
-        ggml_set_param(ctx0, x2);
-
-        struct ggml_tensor * y =
-            ggml_abs(ctx0,
-                    ggml_sub(ctx0, x1, x2)
-                    );
-
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, GGML_DEFAULT_GRAPH_SIZE, true);
-        ggml_build_forward_expand(gf, y);
-        struct ggml_cgraph * gb = ggml_graph_dup(ctx0, gf);
-        ggml_build_backward_expand(ctx0, gf, gb, false);
-
-        ggml_set_f32(x1, 3.0f);
-        ggml_set_f32(x2, 5.0f);
-
-        ggml_graph_reset(gf);
-        ggml_set_f32(y->grad, 1.0f);
-
-        ggml_graph_compute_with_ctx(ctx0, gb, n_threads);
-
-        printf("y      = %f\n", ggml_get_f32_1d(y, 0));
-        printf("df/dx1 = %f %f %f\n",
-                ggml_get_f32_1d(x1->grad, 0),
-                ggml_get_f32_1d(x1->grad, 1),
-                ggml_get_f32_1d(x1->grad, 2));
-        printf("df/dx2 = %f %f %f\n",
-                ggml_get_f32_1d(x2->grad, 0),
-                ggml_get_f32_1d(x2->grad, 1),
-                ggml_get_f32_1d(x2->grad, 2));
-
-        GGML_ASSERT(ggml_get_f32_1d(y, 0)        ==  2.0f);
-        GGML_ASSERT(ggml_get_f32_1d(x1->grad, 0) == -1.0f);
-        GGML_ASSERT(ggml_get_f32_1d(x1->grad, 1) == -1.0f);
-        GGML_ASSERT(ggml_get_f32_1d(x1->grad, 2) == -1.0f);
-        GGML_ASSERT(ggml_get_f32_1d(x2->grad, 0) ==  1.0f);
-        GGML_ASSERT(ggml_get_f32_1d(x2->grad, 1) ==  1.0f);
-        GGML_ASSERT(ggml_get_f32_1d(x2->grad, 2) ==  1.0f);
-
-        ggml_set_f32(x1, 7.0f);
-        ggml_set_f32(x2, 5.0f);
-
-        ggml_graph_reset(gf);
-        ggml_set_f32(y->grad, 1.0f);
-
-        ggml_graph_compute_with_ctx(ctx0, gb, n_threads);
-
-        printf("y      = %f\n", ggml_get_f32_1d(y, 0));
-        printf("df/dx1 = %f %f %f\n",
-                ggml_get_f32_1d(x1->grad, 0),
-                ggml_get_f32_1d(x1->grad, 1),
-                ggml_get_f32_1d(x1->grad, 2));
-        printf("df/dx2 = %f %f %f\n",
-                ggml_get_f32_1d(x2->grad, 0),
-                ggml_get_f32_1d(x2->grad, 1),
-                ggml_get_f32_1d(x2->grad, 2));
-
-        GGML_ASSERT(ggml_get_f32_1d(y, 0)        ==  2.0f);
-        GGML_ASSERT(ggml_get_f32_1d(x1->grad, 0) ==  1.0f);
-        GGML_ASSERT(ggml_get_f32_1d(x1->grad, 1) ==  1.0f);
-        GGML_ASSERT(ggml_get_f32_1d(x1->grad, 2) ==  1.0f);
-        GGML_ASSERT(ggml_get_f32_1d(x2->grad, 0) == -1.0f);
-        GGML_ASSERT(ggml_get_f32_1d(x2->grad, 1) == -1.0f);
-        GGML_ASSERT(ggml_get_f32_1d(x2->grad, 2) == -1.0f);
-
-        ggml_graph_dump_dot(gf, NULL, "test1-8-forward.dot");
-        ggml_graph_dump_dot(gb, gf,   "test1-8-backward.dot");
-    }
-
-    ggml_free(ctx0);
-
-    return 0;
-}
diff --git a/ggml/tests/test1.zig b/ggml/tests/test1.zig
deleted file mode 100644
index e472e00..0000000
--- a/ggml/tests/test1.zig
+++ /dev/null
@@ -1,459 +0,0 @@
-const std = @import("std");
-const c = @cImport({
-    @cInclude("ggml/ggml.h");
-});
-
-pub fn main() !void {
-    const n_threads = 2;
-
-    const params = .{
-        .mem_size   = 128*1024*1024,
-        .mem_buffer = null,
-        .no_alloc   = false,
-    };
-
-    const ctx0 = c.ggml_init(params);
-    defer c.ggml_free(ctx0);
-
-    {
-        const x = c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, 1);
-
-        c.ggml_set_param(ctx0, x);
-
-        const a = c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, 1);
-        const b = c.ggml_mul(ctx0, x, x);
-        const f = c.ggml_mul(ctx0, b, a);
-
-        // a*x^2
-        // 2*a*x
-
-        c.ggml_print_objects(ctx0);
-
-        const gf = c.ggml_build_forward(f);
-        const gb = c.ggml_build_backward(ctx0, @constCast(&gf), false);
-
-        _ = c.ggml_set_f32(x, 2.0);
-        _ = c.ggml_set_f32(a, 3.0);
-
-        c.ggml_graph_reset(@constCast(&gf));
-        _ = c.ggml_set_f32(f.*.grad, 1.0);
-
-        c.ggml_graph_compute_with_ctx(ctx0, @constCast(&gb), n_threads);
-
-        std.debug.print("f     = {d:.6}\n", .{c.ggml_get_f32_1d(f, 0)});
-        std.debug.print("df/dx = {d:.6}\n", .{c.ggml_get_f32_1d(x.*.grad, 0)});
-
-        try std.testing.expect(c.ggml_get_f32_1d(f, 0)          ==  12.0);
-        try std.testing.expect(c.ggml_get_f32_1d(x.*.grad, 0)   ==  12.0);
-
-        _ = c.ggml_set_f32(x, 3.0);
-
-        c.ggml_graph_reset(@constCast(&gf));
-        _ = c.ggml_set_f32(f.*.grad, 1.0);
-
-        c.ggml_graph_compute_with_ctx(ctx0, @constCast(&gb), n_threads);
-
-        std.debug.print("f     = {d:.6}\n", .{c.ggml_get_f32_1d(f, 0)});
-        std.debug.print("df/dx = {d:.6}\n", .{c.ggml_get_f32_1d(x.*.grad, 0)});
-
-        try std.testing.expect(c.ggml_get_f32_1d(f, 0)          ==  27.0);
-        try std.testing.expect(c.ggml_get_f32_1d(x.*.grad, 0)   ==  18.0);
-
-        c.ggml_graph_dump_dot(&gf, null, "test1-1-forward.dot");
-        c.ggml_graph_dump_dot(&gb, &gf,  "test1-1-backward.dot");
-    }
-
-    /////////////////////////////////////////////////////////////
-
-    {
-        const x1 = c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, 1);
-        const x2 = c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, 1);
-        const x3 = c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, 1);
-
-        _ = c.ggml_set_f32(x1, 3.0);
-        _ = c.ggml_set_f32(x2, 1.0);
-        _ = c.ggml_set_f32(x3, 0.0);
-
-        c.ggml_set_param(ctx0, x1);
-        c.ggml_set_param(ctx0, x2);
-
-        const y = c.ggml_add(ctx0, c.ggml_mul(ctx0, x1, x1), c.ggml_mul(ctx0, x1, x2));
-
-        const gf = c.ggml_build_forward(y);
-        const gb = c.ggml_build_backward(ctx0, @constCast(&gf), false);
-
-        c.ggml_graph_reset(@constCast(&gf));
-        _ = c.ggml_set_f32(y.*.grad, 1.0);
-
-        c.ggml_graph_compute_with_ctx(ctx0, @constCast(&gb), n_threads);
-
-        std.debug.print("y      = {d:.6}\n", .{c.ggml_get_f32_1d(y, 0)});
-        std.debug.print("df/dx1 = {d:.6}\n", .{c.ggml_get_f32_1d(x1.*.grad, 0)});
-        std.debug.print("df/dx2 = {d:.6}\n", .{c.ggml_get_f32_1d(x2.*.grad, 0)});
-
-        try std.testing.expect(c.ggml_get_f32_1d(y, 0)          ==  12.0);
-        try std.testing.expect(c.ggml_get_f32_1d(x1.*.grad, 0)  ==  7.0);
-        try std.testing.expect(c.ggml_get_f32_1d(x2.*.grad, 0)  ==  3.0);
-
-        const g1 = x1.*.grad;
-        const g2 = x2.*.grad;
-
-        const gbb = c.ggml_build_backward(ctx0, @constCast(&gb), true);
-
-        c.ggml_graph_reset(@constCast(&gb));
-        _ = c.ggml_set_f32(g1.*.grad, 1.0);
-        _ = c.ggml_set_f32(g2.*.grad, 1.0);
-
-        c.ggml_graph_compute_with_ctx(ctx0, @constCast(&gbb), n_threads);
-
-        std.debug.print("H * [1, 1] = [ {d:.6} {d:.6} ]\n", .{c.ggml_get_f32_1d(x1.*.grad, 0), c.ggml_get_f32_1d(x2.*.grad, 0)});
-
-        try std.testing.expect(c.ggml_get_f32_1d(x1.*.grad, 0)  ==  3.0);
-        try std.testing.expect(c.ggml_get_f32_1d(x2.*.grad, 0)  ==  1.0);
-
-        c.ggml_graph_dump_dot(&gf, null, "test1-2-forward.dot");
-        c.ggml_graph_dump_dot(&gb, &gf,  "test1-2-backward.dot");
-    }
-
-    ///////////////////////////////////////////////////////////////
-
-    {
-        const x1 = c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, 1);
-        const x2 = c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, 1);
-
-        c.ggml_set_param(ctx0, x1);
-        c.ggml_set_param(ctx0, x2);
-
-        const y = c.ggml_mul(ctx0, c.ggml_add(ctx0, c.ggml_mul(ctx0, x1, x1), c.ggml_mul(ctx0, x1, x2)), x1);
-
-        const gf = c.ggml_build_forward(y);
-        const gb = c.ggml_build_backward(ctx0, @constCast(&gf), false);
-
-        _ = c.ggml_set_f32(x1, 3.0);
-        _ = c.ggml_set_f32(x2, 4.0);
-
-        c.ggml_graph_reset(@constCast(&gf));
-        _ = c.ggml_set_f32(y.*.grad, 1.0);
-
-        c.ggml_graph_compute_with_ctx(ctx0, @constCast(&gb), n_threads);
-
-        std.debug.print("y      = {d:.6}\n", .{c.ggml_get_f32_1d(y, 0)});
-        std.debug.print("df/dx1 = {d:.6}\n", .{c.ggml_get_f32_1d(x1.*.grad, 0)});
-        std.debug.print("df/dx2 = {d:.6}\n", .{c.ggml_get_f32_1d(x2.*.grad, 0)});
-
-        try std.testing.expect(c.ggml_get_f32_1d(y, 0)          ==  63.0);
-        try std.testing.expect(c.ggml_get_f32_1d(x1.*.grad, 0)  ==  51.0);
-        try std.testing.expect(c.ggml_get_f32_1d(x2.*.grad, 0)  ==  9.0);
-
-        c.ggml_graph_dump_dot(&gf, null, "test1-3-forward.dot");
-        c.ggml_graph_dump_dot(&gb, &gf,  "test1-3-backward.dot");
-    }
-
-    ///////////////////////////////////////////////////////////////
-
-    {
-        const x1 = c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, 1);
-        const x2 = c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, 1);
-        const x3 = c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, 1);
-
-        c.ggml_set_param(ctx0, x1);
-        c.ggml_set_param(ctx0, x2);
-        c.ggml_set_param(ctx0, x3);
-
-        const y = c.ggml_mul(ctx0, c.ggml_mul(ctx0, c.ggml_mul(ctx0, x1, x1), c.ggml_mul(ctx0, x2, x2)), x3);
-
-        const gf = c.ggml_build_forward(y);
-        const gb = c.ggml_build_backward(ctx0, @constCast(&gf), false);
-
-        _ = c.ggml_set_f32(x1, 1.0);
-        _ = c.ggml_set_f32(x2, 2.0);
-        _ = c.ggml_set_f32(x3, 3.0);
-
-        c.ggml_graph_reset(@constCast(&gf));
-        _ = c.ggml_set_f32(y.*.grad, 1.0);
-
-        c.ggml_graph_compute_with_ctx(ctx0, @constCast(&gb), n_threads);
-
-        std.debug.print("y      = {d:.6}\n", .{c.ggml_get_f32_1d(y, 0)});
-        std.debug.print("df/dx1 = {d:.6}\n", .{c.ggml_get_f32_1d(x1.*.grad, 0)});
-        std.debug.print("df/dx2 = {d:.6}\n", .{c.ggml_get_f32_1d(x2.*.grad, 0)});
-        std.debug.print("df/dx3 = {d:.6}\n", .{c.ggml_get_f32_1d(x3.*.grad, 0)});
-
-        try std.testing.expect(c.ggml_get_f32_1d(y, 0)          ==  12.0);
-        try std.testing.expect(c.ggml_get_f32_1d(x1.*.grad, 0)  ==  24.0);
-        try std.testing.expect(c.ggml_get_f32_1d(x2.*.grad, 0)  ==  12.0);
-        try std.testing.expect(c.ggml_get_f32_1d(x3.*.grad, 0)  ==  4.0);
-
-        const g1 = x1.*.grad;
-        const g2 = x2.*.grad;
-        const g3 = x3.*.grad;
-
-        const gbb = c.ggml_build_backward(ctx0, @constCast(&gb), true);
-
-        c.ggml_graph_reset(@constCast(&gb));
-        _ = c.ggml_set_f32(g1.*.grad, 1.0);
-        _ = c.ggml_set_f32(g2.*.grad, 1.0);
-        _ = c.ggml_set_f32(g3.*.grad, 1.0);
-
-        c.ggml_graph_compute_with_ctx(ctx0, @constCast(&gbb), n_threads);
-
-        std.debug.print("H * [1, 1, 1] = [ {d:.6} {d:.6} {d:.6}]\n",
-            .{
-                c.ggml_get_f32_1d(x1.*.grad, 0),
-                c.ggml_get_f32_1d(x2.*.grad, 0),
-                c.ggml_get_f32_1d(x3.*.grad, 0),
-            });
-
-        try std.testing.expect(c.ggml_get_f32_1d(x1.*.grad, 0)  ==  56.0);
-        try std.testing.expect(c.ggml_get_f32_1d(x2.*.grad, 0)  ==  34.0);
-        try std.testing.expect(c.ggml_get_f32_1d(x3.*.grad, 0)  ==  12.0);
-
-        c.ggml_graph_dump_dot(&gf, null, "test1-4-forward.dot");
-        c.ggml_graph_dump_dot(&gb, &gf,  "test1-4-backward.dot");
-    }
-
-    ///////////////////////////////////////////////////////////////
-
-    {
-        const x1 = c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, 3);
-        const x2 = c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, 3);
-
-        c.ggml_set_param(ctx0, x1);
-        c.ggml_set_param(ctx0, x2);
-
-        const y = c.ggml_sum(ctx0, c.ggml_mul(ctx0, x1, x2));
-
-        const gf = c.ggml_build_forward(y);
-        const gb = c.ggml_build_backward(ctx0, @constCast(&gf), false);
-
-        _ = c.ggml_set_f32(x1, 3.0);
-        _ = c.ggml_set_f32(x2, 5.0);
-
-        c.ggml_graph_reset(@constCast(&gf));
-        _ = c.ggml_set_f32(y.*.grad, 1.0);
-
-        c.ggml_graph_compute_with_ctx(ctx0, @constCast(&gb), n_threads);
-
-        std.debug.print("y      = {d:.6}\n", .{c.ggml_get_f32_1d(y, 0)});
-        std.debug.print("df/dx1 = {d:.6} {d:.6} {d:.6}\n",
-            .{
-                c.ggml_get_f32_1d(x1.*.grad, 0),
-                c.ggml_get_f32_1d(x1.*.grad, 1),
-                c.ggml_get_f32_1d(x1.*.grad, 2),
-            });
-        std.debug.print("df/dx2 = {d:.6} {d:.6} {d:.6}\n",
-            .{
-                c.ggml_get_f32_1d(x2.*.grad, 0),
-                c.ggml_get_f32_1d(x2.*.grad, 1),
-                c.ggml_get_f32_1d(x2.*.grad, 2),
-            });
-
-        try std.testing.expect(c.ggml_get_f32_1d(y, 0)          ==  45.0);
-        try std.testing.expect(c.ggml_get_f32_1d(x1.*.grad, 0)  ==  5.0);
-        try std.testing.expect(c.ggml_get_f32_1d(x2.*.grad, 0)  ==  3.0);
-        try std.testing.expect(c.ggml_get_f32_1d(x1.*.grad, 1)  ==  5.0);
-        try std.testing.expect(c.ggml_get_f32_1d(x2.*.grad, 1)  ==  3.0);
-        try std.testing.expect(c.ggml_get_f32_1d(x1.*.grad, 2)  ==  5.0);
-        try std.testing.expect(c.ggml_get_f32_1d(x2.*.grad, 2)  ==  3.0);
-
-        c.ggml_graph_dump_dot(&gf, null, "test1-5-forward.dot");
-        c.ggml_graph_dump_dot(&gb, &gf,  "test1-5-backward.dot");
-    }
-
-    ///////////////////////////////////////////////////////////////
-
-    {
-        const x1 = c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, 3);
-        const x2 = c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, 3);
-
-        c.ggml_set_param(ctx0, x1);
-        c.ggml_set_param(ctx0, x2);
-
-        const y =
-            c.ggml_sum(ctx0,
-                    c.ggml_add(ctx0,
-                        c.ggml_mul(ctx0, x1, x2),
-                        c.ggml_mul(ctx0,
-                            c.ggml_repeat(ctx0, c.ggml_new_f32(ctx0, -2.0), x1),
-                            c.ggml_mul(ctx0, x1, x1)
-                            )
-                        )
-                    );
-
-        const gf = c.ggml_build_forward(y);
-        const gb = c.ggml_build_backward(ctx0, @constCast(&gf), false);
-
-        _ = c.ggml_set_f32(x1, 3.0);
-        _ = c.ggml_set_f32(x2, 5.0);
-
-        c.ggml_graph_reset(@constCast(&gf));
-        _ = c.ggml_set_f32(y.*.grad, 1.0);
-
-        c.ggml_graph_compute_with_ctx(ctx0, @constCast(&gb), n_threads);
-
-        std.debug.print("y      = {d:.6}\n", .{c.ggml_get_f32_1d(y, 0)});
-        std.debug.print("df/dx1 = {d:.6} {d:.6} {d:.6}\n",
-            .{
-                c.ggml_get_f32_1d(x1.*.grad, 0),
-                c.ggml_get_f32_1d(x1.*.grad, 1),
-                c.ggml_get_f32_1d(x1.*.grad, 2),
-            });
-        std.debug.print("df/dx2 = {d:.6} {d:.6} {d:.6}\n",
-            .{
-                c.ggml_get_f32_1d(x2.*.grad, 0),
-                c.ggml_get_f32_1d(x2.*.grad, 1),
-                c.ggml_get_f32_1d(x2.*.grad, 2),
-            });
-
-        try std.testing.expect(c.ggml_get_f32_1d(y, 0)          ==  -9.0);
-        try std.testing.expect(c.ggml_get_f32_1d(x1.*.grad, 0)  ==  -7.0);
-        try std.testing.expect(c.ggml_get_f32_1d(x1.*.grad, 1)  ==  -7.0);
-        try std.testing.expect(c.ggml_get_f32_1d(x1.*.grad, 2)  ==  -7.0);
-        try std.testing.expect(c.ggml_get_f32_1d(x2.*.grad, 0)  ==  3.0);
-        try std.testing.expect(c.ggml_get_f32_1d(x2.*.grad, 1)  ==  3.0);
-        try std.testing.expect(c.ggml_get_f32_1d(x2.*.grad, 2)  ==  3.0);
-
-        c.ggml_graph_dump_dot(&gf, null, "test1-6-forward.dot");
-        c.ggml_graph_dump_dot(&gb, &gf,  "test1-6-backward.dot");
-    }
-
-    ///////////////////////////////////////////////////////////////
-
-    {
-        const x1 = c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, 3);
-        const x2 = c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, 3);
-
-        c.ggml_set_param(ctx0, x1);
-        c.ggml_set_param(ctx0, x2);
-
-        const y =
-            c.ggml_sum(ctx0,
-                    c.ggml_sub(ctx0,
-                        c.ggml_mul(ctx0, x1, x2),
-                        c.ggml_mul(ctx0,
-                            c.ggml_mul(ctx0, x1, x1),
-                            c.ggml_repeat(ctx0, c.ggml_new_f32(ctx0, -2.0), x1)
-                            )
-                        )
-                    );
-
-        const gf = c.ggml_build_forward(y);
-        const gb = c.ggml_build_backward(ctx0, @constCast(&gf), false);
-
-        _ = c.ggml_set_f32(x1, 3.0);
-        _ = c.ggml_set_f32(x2, 5.0);
-
-        c.ggml_graph_reset(@constCast(&gf));
-        _ = c.ggml_set_f32(y.*.grad, 1.0);
-
-        c.ggml_graph_compute_with_ctx(ctx0, @constCast(&gb), n_threads);
-
-        std.debug.print("y      = {d:.6}\n", .{c.ggml_get_f32_1d(y, 0)});
-        std.debug.print("df/dx1 = {d:.6} {d:.6} {d:.6}\n",
-            .{
-                c.ggml_get_f32_1d(x1.*.grad, 0),
-                c.ggml_get_f32_1d(x1.*.grad, 1),
-                c.ggml_get_f32_1d(x1.*.grad, 2),
-            });
-        std.debug.print("df/dx2 = {d:.6} {d:.6} {d:.6}\n",
-            .{
-                c.ggml_get_f32_1d(x2.*.grad, 0),
-                c.ggml_get_f32_1d(x2.*.grad, 1),
-                c.ggml_get_f32_1d(x2.*.grad, 2),
-            });
-
-        try std.testing.expect(c.ggml_get_f32_1d(y, 0)          ==  99.0);
-        try std.testing.expect(c.ggml_get_f32_1d(x1.*.grad, 0)  ==  17.0);
-        try std.testing.expect(c.ggml_get_f32_1d(x1.*.grad, 1)  ==  17.0);
-        try std.testing.expect(c.ggml_get_f32_1d(x1.*.grad, 2)  ==  17.0);
-        try std.testing.expect(c.ggml_get_f32_1d(x2.*.grad, 0)  ==  3.0);
-        try std.testing.expect(c.ggml_get_f32_1d(x2.*.grad, 1)  ==  3.0);
-        try std.testing.expect(c.ggml_get_f32_1d(x2.*.grad, 2)  ==  3.0);
-
-        c.ggml_graph_dump_dot(&gf, null, "test1-7-forward.dot");
-        c.ggml_graph_dump_dot(&gb, &gf,  "test1-7-backward.dot");
-    }
-
-    ///////////////////////////////////////////////////////////////
-
-    {
-        const x1 = c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, 3);
-        const x2 = c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, 3);
-
-        c.ggml_set_param(ctx0, x1);
-        c.ggml_set_param(ctx0, x2);
-
-        const y =
-            c.ggml_abs(ctx0,
-                    c.ggml_sub(ctx0, x1, x2)
-                    );
-
-        const gf = c.ggml_build_forward(y);
-        const gb = c.ggml_build_backward(ctx0, @constCast(&gf), false);
-
-        _ = c.ggml_set_f32(x1, 3.0);
-        _ = c.ggml_set_f32(x2, 5.0);
-
-        c.ggml_graph_reset(@constCast(&gf));
-        _ = c.ggml_set_f32(y.*.grad, 1.0);
-
-        c.ggml_graph_compute_with_ctx(ctx0, @constCast(&gb), n_threads);
-
-        std.debug.print("y      = {d:.6}\n", .{c.ggml_get_f32_1d(y, 0)});
-        std.debug.print("df/dx1 = {d:.6} {d:.6} {d:.6}\n",
-            .{
-                c.ggml_get_f32_1d(x1.*.grad, 0),
-                c.ggml_get_f32_1d(x1.*.grad, 1),
-                c.ggml_get_f32_1d(x1.*.grad, 2),
-            });
-        std.debug.print("df/dx2 = {d:.6} {d:.6} {d:.6}\n",
-            .{
-                c.ggml_get_f32_1d(x2.*.grad, 0),
-                c.ggml_get_f32_1d(x2.*.grad, 1),
-                c.ggml_get_f32_1d(x2.*.grad, 2),
-            });
-
-        try std.testing.expect(c.ggml_get_f32_1d(y, 0)          ==  2.0);
-        try std.testing.expect(c.ggml_get_f32_1d(x1.*.grad, 0)  ==  -1.0);
-        try std.testing.expect(c.ggml_get_f32_1d(x1.*.grad, 1)  ==  -1.0);
-        try std.testing.expect(c.ggml_get_f32_1d(x1.*.grad, 2)  ==  -1.0);
-        try std.testing.expect(c.ggml_get_f32_1d(x2.*.grad, 0)  ==  1.0);
-        try std.testing.expect(c.ggml_get_f32_1d(x2.*.grad, 1)  ==  1.0);
-        try std.testing.expect(c.ggml_get_f32_1d(x2.*.grad, 2)  ==  1.0);
-
-        _ = c.ggml_set_f32(x1, 7.0);
-        _ = c.ggml_set_f32(x2, 5.0);
-
-        c.ggml_graph_reset(@constCast(&gf));
-        _ = c.ggml_set_f32(y.*.grad, 1.0);
-
-        c.ggml_graph_compute_with_ctx(ctx0, @constCast(&gb), n_threads);
-
-        std.debug.print("y      = {d:.6}\n", .{c.ggml_get_f32_1d(y, 0)});
-        std.debug.print("df/dx1 = {d:.6} {d:.6} {d:.6}\n",
-            .{
-                c.ggml_get_f32_1d(x1.*.grad, 0),
-                c.ggml_get_f32_1d(x1.*.grad, 1),
-                c.ggml_get_f32_1d(x1.*.grad, 2),
-            });
-        std.debug.print("df/dx2 = {d:.6} {d:.6} {d:.6}\n",
-            .{
-                c.ggml_get_f32_1d(x2.*.grad, 0),
-                c.ggml_get_f32_1d(x2.*.grad, 1),
-                c.ggml_get_f32_1d(x2.*.grad, 2),
-            });
-
-        try std.testing.expect(c.ggml_get_f32_1d(y, 0)          ==  2.0);
-        try std.testing.expect(c.ggml_get_f32_1d(x1.*.grad, 0)  ==  1.0);
-        try std.testing.expect(c.ggml_get_f32_1d(x1.*.grad, 1)  ==  1.0);
-        try std.testing.expect(c.ggml_get_f32_1d(x1.*.grad, 2)  ==  1.0);
-        try std.testing.expect(c.ggml_get_f32_1d(x2.*.grad, 0)  ==  -1.0);
-        try std.testing.expect(c.ggml_get_f32_1d(x2.*.grad, 1)  ==  -1.0);
-        try std.testing.expect(c.ggml_get_f32_1d(x2.*.grad, 2)  ==  -1.0);
-
-        c.ggml_graph_dump_dot(&gf, null, "test1-8-forward.dot");
-        c.ggml_graph_dump_dot(&gb, &gf,  "test1-8-backward.dot");
-    }
-
-    _ = try std.io.getStdIn().reader().readByte();
-}
diff --git a/ggml/tests/test2.c b/ggml/tests/test2.c
deleted file mode 100644
index 839e3e6..0000000
--- a/ggml/tests/test2.c
+++ /dev/null
@@ -1,181 +0,0 @@
-#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows
-#include "ggml/ggml.h"
-
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-bool is_close(float a, float b, float epsilon) {
-    return fabs(a - b) < epsilon;
-}
-
-int main(int argc, const char ** argv) {
-    struct ggml_init_params params = {
-        .mem_size   = 128*1024*1024,
-        .mem_buffer = NULL,
-        .no_alloc   = false,
-    };
-
-    //struct ggml_opt_params opt_params = ggml_opt_default_params(GGML_OPT_ADAM);
-    //opt_params.adam.alpha = 0.01f;
-
-    struct ggml_opt_params opt_params = ggml_opt_default_params(GGML_OPT_LBFGS);
-
-    // original threads: 8
-    int nthreads = 8;
-    const char *env = getenv("GGML_NTHREADS");
-    if (env != NULL) {
-        nthreads = atoi(env);
-    }
-    if (argc > 1) {
-        nthreads = atoi(argv[1]);
-    }
-    opt_params.n_threads = nthreads;
-    printf("test2: n_threads:%d\n", opt_params.n_threads);
-
-    const float xi[] = {  1.0f,  2.0f,  3.0f,  4.0f,  5.0f , 6.0f,  7.0f,  8.0f,  9.0f,  10.0f, };
-          float yi[] = { 15.0f, 25.0f, 35.0f, 45.0f, 55.0f, 65.0f, 75.0f, 85.0f, 95.0f, 105.0f, };
-
-    const int n = sizeof(xi)/sizeof(xi[0]);
-
-    struct ggml_context * ctx0 = ggml_init(params);
-
-    struct ggml_tensor * x = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n);
-    struct ggml_tensor * y = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n);
-
-    for (int i = 0; i < n; i++) {
-        ((float *) x->data)[i] = xi[i];
-        ((float *) y->data)[i] = yi[i];
-    }
-
-    {
-        struct ggml_tensor * t0 = ggml_new_f32(ctx0, 0.0f);
-        struct ggml_tensor * t1 = ggml_new_f32(ctx0, 0.0f);
-
-        // initialize auto-diff parameters:
-        ggml_set_param(ctx0, t0);
-        ggml_set_param(ctx0, t1);
-
-        // f = sum_i[(t0 + t1*x_i - y_i)^2]/(2n)
-        struct ggml_tensor * f =
-            ggml_div(ctx0,
-                    ggml_sum(ctx0,
-                        ggml_sqr(ctx0,
-                            ggml_sub(ctx0,
-                                ggml_add(ctx0,
-                                    ggml_mul(ctx0, x, ggml_repeat(ctx0, t1, x)),
-                                    ggml_repeat(ctx0, t0, x)),
-                                y)
-                            )
-                        ),
-                    ggml_new_f32(ctx0, 2.0f*n));
-
-        enum ggml_opt_result res = ggml_opt(NULL, opt_params, f);
-
-        printf("t0 = %f\n", ggml_get_f32_1d(t0, 0));
-        printf("t1 = %f\n", ggml_get_f32_1d(t1, 0));
-
-        GGML_ASSERT(res == GGML_OPT_OK);
-
-        GGML_ASSERT(is_close(ggml_get_f32_1d(t0, 0),  5.0f, 1e-3f));
-        GGML_ASSERT(is_close(ggml_get_f32_1d(t1, 0), 10.0f, 1e-3f));
-    }
-
-    {
-        struct ggml_tensor * t0 = ggml_new_f32(ctx0, -1.0f);
-        struct ggml_tensor * t1 = ggml_new_f32(ctx0,  9.0f);
-
-        ggml_set_param(ctx0, t0);
-        ggml_set_param(ctx0, t1);
-
-        // f = 0.5*sum_i[abs(t0 + t1*x_i - y_i)]/n
-        struct ggml_tensor * f =
-            ggml_mul(ctx0,
-                    ggml_new_f32(ctx0, 1.0/(2*n)),
-                    ggml_sum(ctx0,
-                        ggml_abs(ctx0,
-                            ggml_sub(ctx0,
-                                ggml_add(ctx0,
-                                    ggml_mul(ctx0, x, ggml_repeat(ctx0, t1, x)),
-                                    ggml_repeat(ctx0, t0, x)),
-                                y)
-                            )
-                        )
-                    );
-
-
-        enum ggml_opt_result res = ggml_opt(NULL, opt_params, f);
-
-        GGML_ASSERT(res == GGML_OPT_OK);
-        GGML_ASSERT(is_close(ggml_get_f32_1d(t0, 0),  5.0f, 1e-2f));
-        GGML_ASSERT(is_close(ggml_get_f32_1d(t1, 0), 10.0f, 1e-2f));
-    }
-
-    {
-        struct ggml_tensor * t0 = ggml_new_f32(ctx0,  5.0f);
-        struct ggml_tensor * t1 = ggml_new_f32(ctx0, -4.0f);
-
-        ggml_set_param(ctx0, t0);
-        ggml_set_param(ctx0, t1);
-
-        // f = t0^2 + t1^2
-        struct ggml_tensor * f =
-            ggml_add(ctx0,
-                    ggml_sqr(ctx0, t0),
-                    ggml_sqr(ctx0, t1)
-                    );
-
-        enum ggml_opt_result res = ggml_opt(NULL, opt_params, f);
-
-        GGML_ASSERT(res == GGML_OPT_OK);
-        GGML_ASSERT(is_close(ggml_get_f32_1d(f,  0), 0.0f, 1e-3f));
-        GGML_ASSERT(is_close(ggml_get_f32_1d(t0, 0), 0.0f, 1e-3f));
-        GGML_ASSERT(is_close(ggml_get_f32_1d(t1, 0), 0.0f, 1e-3f));
-    }
-
-    /////////////////////////////////////////
-
-    {
-        struct ggml_tensor * t0 = ggml_new_f32(ctx0, -7.0f);
-        struct ggml_tensor * t1 = ggml_new_f32(ctx0,  8.0f);
-
-        ggml_set_param(ctx0, t0);
-        ggml_set_param(ctx0, t1);
-
-        // f = (t0 + 2*t1 - 7)^2 + (2*t0 + t1 - 5)^2
-        struct ggml_tensor * f =
-            ggml_add(ctx0,
-                    ggml_sqr(ctx0,
-                        ggml_sub(ctx0,
-                            ggml_add(ctx0,
-                                t0,
-                                ggml_mul(ctx0, t1, ggml_new_f32(ctx0, 2.0f))),
-                            ggml_new_f32(ctx0, 7.0f)
-                            )
-                        ),
-                    ggml_sqr(ctx0,
-                        ggml_sub(ctx0,
-                            ggml_add(ctx0,
-                                ggml_mul(ctx0, t0, ggml_new_f32(ctx0, 2.0f)),
-                                t1),
-                            ggml_new_f32(ctx0, 5.0f)
-                            )
-                        )
-                    );
-
-        enum ggml_opt_result res = ggml_opt(NULL, opt_params, f);
-
-        GGML_ASSERT(res == GGML_OPT_OK);
-        GGML_ASSERT(is_close(ggml_get_f32_1d(f,  0), 0.0f, 1e-3f));
-        GGML_ASSERT(is_close(ggml_get_f32_1d(t0, 0), 1.0f, 1e-3f));
-        GGML_ASSERT(is_close(ggml_get_f32_1d(t1, 0), 3.0f, 1e-3f));
-    }
-
-    ggml_free(ctx0);
-
-    return 0;
-}
diff --git a/ggml/tests/test2.zig b/ggml/tests/test2.zig
deleted file mode 100644
index 974de0d..0000000
--- a/ggml/tests/test2.zig
+++ /dev/null
@@ -1,165 +0,0 @@
-const std = @import("std");
-const Thread = std.Thread;
-const c = @cImport({
-    @cInclude("ggml/ggml.h");
-});
-
-fn is_close(a: f32, b: f32, epsilon: f32) bool {
-    return std.math.fabs(a - b) < epsilon;
-}
-
-pub fn main() !void {
-    const params = .{
-        .mem_size   = 128*1024*1024,
-        .mem_buffer = null,
-        .no_alloc   = false,
-    };
-
-    var opt_params = c.ggml_opt_default_params(c.GGML_OPT_LBFGS);
-    
-    const nthreads = try Thread.getCpuCount();
-    opt_params.n_threads = @intCast(nthreads);
-    std.debug.print("test2: n_threads:{}\n", .{opt_params.n_threads});
-
-    const xi = [_]f32{  1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0,  9.0,  10.0 };
-    const yi = [_]f32{ 15.0, 25.0, 35.0, 45.0, 55.0, 65.0, 75.0, 85.0, 95.0, 105.0 };
-
-    const n = xi.len;
-
-    const ctx0 = c.ggml_init(params);
-    defer c.ggml_free(ctx0);
-
-    const x = c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, n);
-    const y = c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, n);
-
-    for (0..n) |i| {
-        const x_data_pointer: [*]f32 = @ptrCast(@alignCast(x.*.data));
-        x_data_pointer[i] = xi[i];
-        const y_data_pointer: [*]f32 = @ptrCast(@alignCast(y.*.data));
-        y_data_pointer[i] = yi[i];
-    }
-
-    {
-        const t0 = c.ggml_new_f32(ctx0, 0.0);
-        const t1 = c.ggml_new_f32(ctx0, 0.0);
-
-        // initialize auto-diff parameters:
-        _ = c.ggml_set_param(ctx0, t0);
-        _ = c.ggml_set_param(ctx0, t1);
-
-        // f = sum_i[(t0 + t1*x_i - y_i)^2]/(2n)
-        const f =
-            c.ggml_div(ctx0,
-                    c.ggml_sum(ctx0,
-                        c.ggml_sqr(ctx0,
-                            c.ggml_sub(ctx0,
-                                c.ggml_add(ctx0,
-                                    c.ggml_mul(ctx0, x, c.ggml_repeat(ctx0, t1, x)),
-                                    c.ggml_repeat(ctx0, t0, x)),
-                                y)
-                            )
-                        ),
-                    c.ggml_new_f32(ctx0, @as(f32, 2.0)*n));
-
-        const res = c.ggml_opt(null, opt_params, f);
-
-        std.debug.print("t0 = {d:.6}\n", .{c.ggml_get_f32_1d(t0, 0)});
-        std.debug.print("t1 = {d:.6}\n", .{c.ggml_get_f32_1d(t1, 0)});
-
-        try std.testing.expect(res == c.GGML_OPT_OK);
-        try std.testing.expect(is_close(c.ggml_get_f32_1d(t0, 0),  5.0, 1e-3));
-        try std.testing.expect(is_close(c.ggml_get_f32_1d(t1, 0), 10.0, 1e-3));
-    }
-
-    {
-        const t0 = c.ggml_new_f32(ctx0, -1.0);
-        const t1 = c.ggml_new_f32(ctx0,  9.0);
-
-        _ = c.ggml_set_param(ctx0, t0);
-        _ = c.ggml_set_param(ctx0, t1);
-
-        // f = 0.5*sum_i[abs(t0 + t1*x_i - y_i)]/n
-        const f =
-            c.ggml_mul(ctx0,
-                    c.ggml_new_f32(ctx0, @as(f32, 1.0)/(2*n)),
-                    c.ggml_sum(ctx0,
-                        c.ggml_abs(ctx0,
-                            c.ggml_sub(ctx0,
-                                c.ggml_add(ctx0,
-                                    c.ggml_mul(ctx0, x, c.ggml_repeat(ctx0, t1, x)),
-                                    c.ggml_repeat(ctx0, t0, x)),
-                                y)
-                            )
-                        )
-                    );
-
-
-        const res = c.ggml_opt(null, opt_params, f);
-
-        try std.testing.expect(res == c.GGML_OPT_OK);
-        try std.testing.expect(is_close(c.ggml_get_f32_1d(t0, 0),  5.0, 1e-2));
-        try std.testing.expect(is_close(c.ggml_get_f32_1d(t1, 0), 10.0, 1e-2));
-    }
-
-    {
-        const t0 = c.ggml_new_f32(ctx0,  5.0);
-        const t1 = c.ggml_new_f32(ctx0, -4.0);
-
-        _ = c.ggml_set_param(ctx0, t0);
-        _ = c.ggml_set_param(ctx0, t1);
-
-        // f = t0^2 + t1^2
-        const f =
-            c.ggml_add(ctx0,
-                    c.ggml_sqr(ctx0, t0),
-                    c.ggml_sqr(ctx0, t1)
-                    );
-
-        const res = c.ggml_opt(null, opt_params, f);
-
-        try std.testing.expect(res == c.GGML_OPT_OK);
-        try std.testing.expect(is_close(c.ggml_get_f32_1d(f,  0), 0.0, 1e-3));
-        try std.testing.expect(is_close(c.ggml_get_f32_1d(t0, 0), 0.0, 1e-3));
-        try std.testing.expect(is_close(c.ggml_get_f32_1d(t1, 0), 0.0, 1e-3));
-    }
-
-    /////////////////////////////////////////
-
-    {
-        const t0 = c.ggml_new_f32(ctx0, -7.0);
-        const t1 = c.ggml_new_f32(ctx0,  8.0);
-
-        _ = c.ggml_set_param(ctx0, t0);
-        _ = c.ggml_set_param(ctx0, t1);
-
-        // f = (t0 + 2*t1 - 7)^2 + (2*t0 + t1 - 5)^2
-        const f =
-            c.ggml_add(ctx0,
-                    c.ggml_sqr(ctx0,
-                        c.ggml_sub(ctx0,
-                            c.ggml_add(ctx0,
-                                t0,
-                                c.ggml_mul(ctx0, t1, c.ggml_new_f32(ctx0, 2.0))),
-                            c.ggml_new_f32(ctx0, 7.0)
-                            )
-                        ),
-                    c.ggml_sqr(ctx0,
-                        c.ggml_sub(ctx0,
-                            c.ggml_add(ctx0,
-                                c.ggml_mul(ctx0, t0, c.ggml_new_f32(ctx0, 2.0)),
-                                t1),
-                            c.ggml_new_f32(ctx0, 5.0)
-                            )
-                        )
-                    );
-
-        const res = c.ggml_opt(null, opt_params, f);
-
-        try std.testing.expect(res == c.GGML_OPT_OK);
-        try std.testing.expect(is_close(c.ggml_get_f32_1d(f,  0), 0.0, 1e-3));
-        try std.testing.expect(is_close(c.ggml_get_f32_1d(t0, 0), 1.0, 1e-3));
-        try std.testing.expect(is_close(c.ggml_get_f32_1d(t1, 0), 3.0, 1e-3));
-    }
-
-    _ = try std.io.getStdIn().reader().readByte();
-}
diff --git a/ggml/tests/test3.c b/ggml/tests/test3.c
deleted file mode 100644
index b92d623..0000000
--- a/ggml/tests/test3.c
+++ /dev/null
@@ -1,95 +0,0 @@
-#include "ggml/ggml.h"
-
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-bool is_close(float a, float b, float epsilon) {
-    return fabs(a - b) < epsilon;
-}
-
-int main(int argc, const char ** argv) {
-    struct ggml_init_params params = {
-        .mem_size   = 1024*1024*1024,
-        .mem_buffer = NULL,
-        .no_alloc   = false,
-    };
-
-    //struct ggml_opt_params opt_params = ggml_opt_default_params(GGML_OPT_ADAM);
-    struct ggml_opt_params opt_params = ggml_opt_default_params(GGML_OPT_LBFGS);
-
-    opt_params.n_threads = (argc > 1) ? atoi(argv[1]) : 8;
-
-    const int NP = 1 << 12;
-    const int NF = 1 << 8;
-
-    struct ggml_context * ctx0 = ggml_init(params);
-
-    struct ggml_tensor * F = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, NF, NP);
-    struct ggml_tensor * l = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, NP);
-
-    // regularization weight
-    struct ggml_tensor * lambda = ggml_new_f32(ctx0, 1e-5f);
-
-    srand(0);
-
-    for (int j = 0; j < NP; j++) {
-        const float ll = j < NP/2 ? 1.0f : -1.0f;
-        ((float *)l->data)[j] = ll;
-
-        for (int i = 0; i < NF; i++) {
-            ((float *)F->data)[j*NF + i] = ((ll > 0 && i < NF/2 ? 1.0f : ll < 0 && i >= NF/2 ? 1.0f : 0.0f) + ((float)rand()/(float)RAND_MAX - 0.5f)*0.1f)/(0.5f*NF);
-        }
-    }
-
-    {
-        // initial guess
-        struct ggml_tensor * x = ggml_set_f32(ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, NF), 0.0f);
-
-        ggml_set_param(ctx0, x);
-
-        // f = sum[(fj*x - l)^2]/n + lambda*|x^2|
-        struct ggml_tensor * f =
-            ggml_add(ctx0,
-                    ggml_div(ctx0,
-                        ggml_sum(ctx0,
-                            ggml_sqr(ctx0,
-                                ggml_sub(ctx0,
-                                    ggml_mul_mat(ctx0, F, x),
-                                    l)
-                                )
-                            ),
-                        ggml_new_f32(ctx0, (float)NP)
-                        ),
-                    ggml_mul(ctx0,
-                        ggml_sum(ctx0, ggml_sqr(ctx0, x)),
-                        lambda)
-                    );
-
-        enum ggml_opt_result res = ggml_opt(NULL, opt_params, f);
-
-        GGML_ASSERT(res == GGML_OPT_OK);
-
-        // print results
-        for (int i = 0; i < 16; i++) {
-            printf("x[%3d] = %g\n", i, ((float *)x->data)[i]);
-        }
-        printf("...\n");
-        for (int i = NF - 16; i < NF; i++) {
-            printf("x[%3d] = %g\n", i, ((float *)x->data)[i]);
-        }
-        printf("\n");
-
-        for (int i = 0; i < NF; ++i) {
-            if (i < NF/2) {
-                GGML_ASSERT(is_close(((float *)x->data)[i],  1.0f, 1e-2f));
-            } else {
-                GGML_ASSERT(is_close(((float *)x->data)[i], -1.0f, 1e-2f));
-            }
-        }
-    }
-
-    ggml_free(ctx0);
-
-    return 0;
-}
diff --git a/ggml/tests/test3.zig b/ggml/tests/test3.zig
deleted file mode 100644
index 2c9f002..0000000
--- a/ggml/tests/test3.zig
+++ /dev/null
@@ -1,102 +0,0 @@
-const std = @import("std");
-const Thread = std.Thread;
-const c = @cImport({
-    @cInclude("stdlib.h");
-    @cInclude("ggml/ggml.h");
-});
-
-fn is_close(a: f32, b: f32, epsilon: f32) bool {
-    return std.math.fabs(a - b) < epsilon;
-}
-
-pub fn main() !void {
-    const params = .{
-        .mem_size   = 128*1024*1024,
-        .mem_buffer = null,
-        .no_alloc   = false,
-    };
-
-    var opt_params = c.ggml_opt_default_params(c.GGML_OPT_LBFGS);
-    
-    const nthreads = try Thread.getCpuCount();
-    opt_params.n_threads = @intCast(nthreads);
-
-    const NP = 1 << 12;
-    const NF = 1 << 8;
-
-    const ctx0 = c.ggml_init(params);
-    defer c.ggml_free(ctx0);
-
-    const F = c.ggml_new_tensor_2d(ctx0, c.GGML_TYPE_F32, NF, NP);
-    const l = c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, NP);
-
-    // regularization weight
-    const lambda = c.ggml_new_f32(ctx0, 1e-5);
-
-    c.srand(0);
-
-    const l_data_pointer: [*]f32 = @ptrCast(@alignCast(l.*.data));
-    const f_data_pointer: [*]f32 = @ptrCast(@alignCast(F.*.data));
-    for (0..NP) |j| {
-        const ll = if (j < NP/2) @as(f32, 1.0) else @as(f32, -1.0);
-        l_data_pointer[j] = ll;
-        
-        for (0..NF) |i| {
-            const c_rand: f32 = @floatFromInt(c.rand());
-            f_data_pointer[j*NF + i] = 
-                ((if (ll > 0 and i < NF/2) @as(f32, 1.0) else 
-                    if (ll < 0 and i >= NF/2) @as(f32, 1.0) else @as(f32, 0.0)) + 
-                        (c_rand/c.RAND_MAX - 0.5) * 0.1) / (0.5 * NF);
-        }
-    }
-
-    {
-        // initial guess
-        const x = c.ggml_set_f32(c.ggml_new_tensor_1d(ctx0, c.GGML_TYPE_F32, NF), 0.0);
-
-        c.ggml_set_param(ctx0, x);
-
-        // f = sum[(fj*x - l)^2]/n + lambda*|x^2|
-        const f =
-            c.ggml_add(ctx0,
-                    c.ggml_div(ctx0,
-                        c.ggml_sum(ctx0,
-                            c.ggml_sqr(ctx0,
-                                c.ggml_sub(ctx0,
-                                    c.ggml_mul_mat(ctx0, F, x),
-                                    l)
-                                )
-                            ),
-                        c.ggml_new_f32(ctx0, @as(f32, NP))
-                        ),
-                    c.ggml_mul(ctx0,
-                        c.ggml_sum(ctx0, c.ggml_sqr(ctx0, x)),
-                        lambda)
-                    );
-
-        const res = c.ggml_opt(null, opt_params, f);
-
-        try std.testing.expect(res == c.GGML_OPT_OK);
-
-        const x_data_pointer: [*]f32 = @ptrCast(@alignCast(x.*.data));
-        // print results
-        for (0..16) |i| {
-            std.debug.print("x[{d:3}] = {d:.6}\n", .{i, x_data_pointer[i]});
-        }
-        std.debug.print("...\n", .{});
-        for (NF - 16..NF) |i| {
-            std.debug.print("x[{d:3}] = {d:.6}\n", .{i, x_data_pointer[i]});
-        }
-        std.debug.print("\n", .{});
-
-        for (0..NF) |i| {
-            if (i < NF/2) {
-                try std.testing.expect(is_close(x_data_pointer[i], 1.0, 1e-2));
-            } else {
-                try std.testing.expect(is_close(x_data_pointer[i], -1.0, 1e-2));
-            }
-        }
-    }
-
-    _ = try std.io.getStdIn().reader().readByte();
-}

--
Gitblit v1.9.3