mirror of https://github.com/ollama/ollama
Compare commits
50 Commits
v0.13.2-rc
...
main
| Author | SHA1 | Date |
|---|---|---|
|
|
a013693f80 | |
|
|
f6a016f49d | |
|
|
45c4739374 | |
|
|
2dd029de12 | |
|
|
903b1fc97f | |
|
|
89eb795293 | |
|
|
7e3ea813c1 | |
|
|
7b95087b9d | |
|
|
971d62595a | |
|
|
ffbe8e076d | |
|
|
2c639431b1 | |
|
|
aacd1cb394 | |
|
|
e3731fb160 | |
|
|
8dbc9e7b68 | |
|
|
abe67acf8a | |
|
|
4ff8a691bc | |
|
|
1b308e1d2a | |
|
|
bd6c1d6b49 | |
|
|
3af5d3b738 | |
|
|
7730895158 | |
|
|
de9ecfd01c | |
|
|
95fdd8d619 | |
|
|
9f7822851c | |
|
|
9b2035d194 | |
|
|
93d45d7a04 | |
|
|
709f842457 | |
|
|
2dfb74410d | |
|
|
1eb5e75972 | |
|
|
3475d915cb | |
|
|
48e78e9be1 | |
|
|
a838421ea3 | |
|
|
1c4e85b4df | |
|
|
dac4f17fea | |
|
|
56b8fb024c | |
|
|
b95693056c | |
|
|
c34fc64688 | |
|
|
7cf6f18c1f | |
|
|
bbbb6b2a01 | |
|
|
76f88caf43 | |
|
|
2bccf8c624 | |
|
|
0c5e5f6630 | |
|
|
d475d1f081 | |
|
|
d2f334c1f7 | |
|
|
603ceefaa6 | |
|
|
e082d60a24 | |
|
|
5dae738067 | |
|
|
0c78723174 | |
|
|
5a41d69b2a | |
|
|
c146a138e3 | |
|
|
31b8c6a214 |
|
|
@ -16,13 +16,15 @@ jobs:
|
|||
outputs:
|
||||
GOFLAGS: ${{ steps.goflags.outputs.GOFLAGS }}
|
||||
VERSION: ${{ steps.goflags.outputs.VERSION }}
|
||||
vendorsha: ${{ steps.changes.outputs.vendorsha }}
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- name: Set environment
|
||||
id: goflags
|
||||
run: |
|
||||
echo GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${GITHUB_REF_NAME#v}\" \"-X=github.com/ollama/ollama/server.mode=release\"'" >>$GITHUB_OUTPUT
|
||||
echo VERSION="${GITHUB_REF_NAME#v}" >>$GITHUB_OUTPUT
|
||||
echo GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${GITHUB_REF_NAME#v}\" \"-X=github.com/ollama/ollama/server.mode=release\"'" | tee -a $GITHUB_OUTPUT
|
||||
echo VERSION="${GITHUB_REF_NAME#v}" | tee -a $GITHUB_OUTPUT
|
||||
echo vendorsha=$(make -f Makefile.sync print-base) | tee -a $GITHUB_OUTPUT
|
||||
|
||||
darwin-build:
|
||||
runs-on: macos-14-xlarge
|
||||
|
|
@ -53,6 +55,9 @@ jobs:
|
|||
- uses: actions/setup-go@v5
|
||||
with:
|
||||
go-version-file: go.mod
|
||||
cache-dependency-path: |
|
||||
go.sum
|
||||
Makefile.sync
|
||||
- run: |
|
||||
./scripts/build_darwin.sh
|
||||
- name: Log build results
|
||||
|
|
@ -185,7 +190,7 @@ jobs:
|
|||
- uses: actions/cache@v4
|
||||
with:
|
||||
path: ${{ github.workspace }}\.ccache
|
||||
key: ccache-${{ matrix.os }}-${{ matrix.arch }}-${{ matrix.preset }}
|
||||
key: ccache-${{ matrix.os }}-${{ matrix.arch }}-${{ matrix.preset }}-${{ needs.setup-environment.outputs.vendorsha }}
|
||||
- name: Build target "${{ matrix.preset }}"
|
||||
run: |
|
||||
Import-Module 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
|
||||
|
|
@ -249,6 +254,9 @@ jobs:
|
|||
- uses: actions/setup-go@v5
|
||||
with:
|
||||
go-version-file: go.mod
|
||||
cache-dependency-path: |
|
||||
go.sum
|
||||
Makefile.sync
|
||||
- name: Verify gcc is actually clang
|
||||
run: |
|
||||
$ErrorActionPreference='Continue'
|
||||
|
|
@ -302,6 +310,9 @@ jobs:
|
|||
- uses: actions/setup-go@v5
|
||||
with:
|
||||
go-version-file: go.mod
|
||||
cache-dependency-path: |
|
||||
go.sum
|
||||
Makefile.sync
|
||||
- uses: actions/download-artifact@v4
|
||||
with:
|
||||
pattern: depends-windows*
|
||||
|
|
|
|||
|
|
@ -22,6 +22,7 @@ jobs:
|
|||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
changed: ${{ steps.changes.outputs.changed }}
|
||||
vendorsha: ${{ steps.changes.outputs.vendorsha }}
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
|
|
@ -37,6 +38,7 @@ jobs:
|
|||
}
|
||||
|
||||
echo changed=$(changed 'llama/llama.cpp/**/*' 'ml/backend/ggml/ggml/**/*') | tee -a $GITHUB_OUTPUT
|
||||
echo vendorsha=$(make -f Makefile.sync print-base) | tee -a $GITHUB_OUTPUT
|
||||
|
||||
linux:
|
||||
needs: [changes]
|
||||
|
|
@ -83,7 +85,7 @@ jobs:
|
|||
- uses: actions/cache@v4
|
||||
with:
|
||||
path: /github/home/.cache/ccache
|
||||
key: ccache-${{ runner.os }}-${{ runner.arch }}-${{ matrix.preset }}
|
||||
key: ccache-${{ runner.os }}-${{ runner.arch }}-${{ matrix.preset }}-${{ needs.changes.outputs.vendorsha }}
|
||||
- run: |
|
||||
cmake --preset ${{ matrix.preset }} ${{ matrix.flags }}
|
||||
cmake --build --preset ${{ matrix.preset }} --parallel
|
||||
|
|
@ -178,7 +180,7 @@ jobs:
|
|||
- uses: actions/cache@v4
|
||||
with:
|
||||
path: ${{ github.workspace }}\.ccache
|
||||
key: ccache-${{ runner.os }}-${{ runner.arch }}-${{ matrix.preset }}
|
||||
key: ccache-${{ runner.os }}-${{ runner.arch }}-${{ matrix.preset }}-${{ needs.changes.outputs.vendorsha }}
|
||||
- run: |
|
||||
Import-Module 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
|
||||
Enter-VsDevShell -VsInstallPath 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise' -SkipAutomaticLocation -DevCmdArguments '-arch=x64 -no_logo'
|
||||
|
|
@ -206,6 +208,9 @@ jobs:
|
|||
- uses: actions/setup-go@v5
|
||||
with:
|
||||
go-version-file: 'go.mod'
|
||||
cache-dependency-path: |
|
||||
go.sum
|
||||
Makefile.sync
|
||||
- uses: actions/setup-node@v4
|
||||
with:
|
||||
node-version: '20'
|
||||
|
|
|
|||
|
|
@ -54,6 +54,13 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cp
|
|||
|
||||
add_compile_definitions(NDEBUG GGML_VERSION=0x0 GGML_COMMIT=0x0)
|
||||
|
||||
# Define GGML version variables for shared library SOVERSION
|
||||
# These are required by ggml/src/CMakeLists.txt for proper library versioning
|
||||
set(GGML_VERSION_MAJOR 0)
|
||||
set(GGML_VERSION_MINOR 0)
|
||||
set(GGML_VERSION_PATCH 0)
|
||||
set(GGML_VERSION "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")
|
||||
|
||||
set(GGML_CPU ON)
|
||||
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src)
|
||||
set_property(TARGET ggml PROPERTY EXCLUDE_FROM_ALL TRUE)
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
UPSTREAM=https://github.com/ggml-org/llama.cpp.git
|
||||
WORKDIR=llama/vendor
|
||||
FETCH_HEAD=7f8ef50cce40e3e7e4526a3696cb45658190e69a
|
||||
FETCH_HEAD=17f7f4baad8b3a716ee139da7bb56ae984e8c0fa
|
||||
|
||||
.PHONY: help
|
||||
help:
|
||||
|
|
@ -57,7 +57,7 @@ checkout: $(WORKDIR)
|
|||
$(WORKDIR):
|
||||
git clone $(UPSTREAM) $(WORKDIR)
|
||||
|
||||
.PHONE: format-patches
|
||||
.PHONY: format-patches
|
||||
format-patches: llama/patches
|
||||
git -C $(WORKDIR) format-patch \
|
||||
--no-signature \
|
||||
|
|
@ -66,7 +66,11 @@ format-patches: llama/patches
|
|||
-o $(realpath $<) \
|
||||
$(FETCH_HEAD)
|
||||
|
||||
.PHONE: clean
|
||||
.PHONY: clean
|
||||
clean: checkout
|
||||
@git -C $(WORKDIR) am --abort || true
|
||||
$(RM) llama/patches/.*.patched
|
||||
|
||||
.PHONY: print-base
|
||||
print-base:
|
||||
@echo $(FETCH_HEAD)
|
||||
|
|
@ -555,7 +555,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
|||
- [Parakeet](https://github.com/parakeet-nest/parakeet) is a GoLang library, made to simplify the development of small generative AI applications with Ollama.
|
||||
- [Haverscript](https://github.com/andygill/haverscript) with [examples](https://github.com/andygill/haverscript/tree/main/examples)
|
||||
- [Ollama for Swift](https://github.com/mattt/ollama-swift)
|
||||
- [Swollama for Swift](https://github.com/marcusziade/Swollama) with [DocC](https://marcusziade.github.io/Swollama/documentation/swollama/)
|
||||
- [Swollama for Swift](https://github.com/guitaripod/Swollama) with [DocC](https://guitaripod.github.io/Swollama/documentation/swollama)
|
||||
- [GoLamify](https://github.com/prasad89/golamify)
|
||||
- [Ollama for Haskell](https://github.com/tusharad/ollama-haskell)
|
||||
- [multi-llm-ts](https://github.com/nbonamy/multi-llm-ts) (A Typescript/JavaScript library allowing access to different LLM in a unified API)
|
||||
|
|
|
|||
|
|
@ -347,7 +347,7 @@ type CreateProgressFunc func(ProgressResponse) error
|
|||
// Create creates a model from a [Modelfile]. fn is a progress function that
|
||||
// behaves similarly to other methods (see [Client.Pull]).
|
||||
//
|
||||
// [Modelfile]: https://github.com/ollama/ollama/blob/main/docs/modelfile.md
|
||||
// [Modelfile]: https://github.com/ollama/ollama/blob/main/docs/modelfile.mdx
|
||||
func (c *Client) Create(ctx context.Context, req *CreateRequest, fn CreateProgressFunc) error {
|
||||
return c.stream(ctx, http.MethodPost, "/api/create", req, func(bts []byte) error {
|
||||
var resp ProgressResponse
|
||||
|
|
|
|||
|
|
@ -273,10 +273,6 @@ func main() {
|
|||
Handler: uiServer.Handler(),
|
||||
}
|
||||
|
||||
if _, err := uiServer.UserData(ctx); err != nil {
|
||||
slog.Warn("failed to load user data", "error", err)
|
||||
}
|
||||
|
||||
// Start the UI server
|
||||
slog.Info("starting ui server", "port", port)
|
||||
go func() {
|
||||
|
|
@ -320,6 +316,17 @@ func main() {
|
|||
slog.Debug("no URL scheme request to handle")
|
||||
}
|
||||
|
||||
go func() {
|
||||
slog.Debug("waiting for ollama server to be ready")
|
||||
if err := ui.WaitForServer(ctx, 10*time.Second); err != nil {
|
||||
slog.Warn("ollama server not ready, continuing anyway", "error", err)
|
||||
}
|
||||
|
||||
if _, err := uiServer.UserData(ctx); err != nil {
|
||||
slog.Warn("failed to load user data", "error", err)
|
||||
}
|
||||
}()
|
||||
|
||||
osRun(cancel, hasCompletedFirstRun, startHidden)
|
||||
|
||||
slog.Info("shutting down desktop server")
|
||||
|
|
@ -361,7 +368,7 @@ func checkUserLoggedIn(uiServerPort int) bool {
|
|||
return false
|
||||
}
|
||||
|
||||
resp, err := http.Get(fmt.Sprintf("http://127.0.0.1:%d/api/v1/me", uiServerPort))
|
||||
resp, err := http.Post(fmt.Sprintf("http://127.0.0.1:%d/api/me", uiServerPort), "application/json", nil)
|
||||
if err != nil {
|
||||
slog.Debug("failed to call local auth endpoint", "error", err)
|
||||
return false
|
||||
|
|
|
|||
|
|
@ -191,13 +191,6 @@ func LaunchNewApp() {
|
|||
C.launchApp(appName)
|
||||
}
|
||||
|
||||
// Send a request to the main app thread to load a UI page
|
||||
func sendUIRequestMessage(path string) {
|
||||
p := C.CString(path)
|
||||
defer C.free(unsafe.Pointer(p))
|
||||
C.uiRequest(p)
|
||||
}
|
||||
|
||||
func registerLaunchAgent(hasCompletedFirstRun bool) {
|
||||
// Remove any stale Login Item registrations
|
||||
C.unregisterSelfFromLoginItem()
|
||||
|
|
|
|||
|
|
@ -263,11 +263,6 @@ func createLoginShortcut() error {
|
|||
return nil
|
||||
}
|
||||
|
||||
// Send a request to the main app thread to load a UI page
|
||||
func sendUIRequestMessage(path string) {
|
||||
wintray.SendUIRequestMessage(path)
|
||||
}
|
||||
|
||||
func LaunchNewApp() {
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -169,37 +169,47 @@ DlgResult fileDlg(FileDlgParams* params) {
|
|||
}
|
||||
|
||||
NSArray* urls = [panel URLs];
|
||||
if(self->params->allowMultiple && [urls count] >= 1) {
|
||||
if([urls count] == 0) {
|
||||
return DLG_CANCEL;
|
||||
}
|
||||
|
||||
if(self->params->allowMultiple) {
|
||||
// For multiple files, we need to return all paths separated by null bytes
|
||||
char* bufPtr = self->params->buf;
|
||||
int remainingBuf = self->params->nbuf;
|
||||
|
||||
// Calculate total required buffer size first
|
||||
int totalSize = 0;
|
||||
for(NSURL* url in urls) {
|
||||
char tempBuf[PATH_MAX];
|
||||
if(![url getFileSystemRepresentation:tempBuf maxLength:PATH_MAX]) {
|
||||
return DLG_URLFAIL;
|
||||
}
|
||||
totalSize += strlen(tempBuf) + 1; // +1 for null terminator
|
||||
}
|
||||
totalSize += 1; // Final null terminator
|
||||
// Calculate total required buffer size first
|
||||
int totalSize = 0;
|
||||
for(NSURL* url in urls) {
|
||||
char tempBuf[PATH_MAX];
|
||||
if(![url getFileSystemRepresentation:tempBuf maxLength:PATH_MAX]) {
|
||||
return DLG_URLFAIL;
|
||||
}
|
||||
totalSize += strlen(tempBuf) + 1; // +1 for null terminator
|
||||
}
|
||||
totalSize += 1; // Final null terminator
|
||||
|
||||
if(totalSize > self->params->nbuf) {
|
||||
// Not enough buffer space
|
||||
return DLG_URLFAIL;
|
||||
}
|
||||
if(totalSize > self->params->nbuf) {
|
||||
// Not enough buffer space
|
||||
return DLG_URLFAIL;
|
||||
}
|
||||
|
||||
// Now actually copy the paths (we know we have space)
|
||||
bufPtr = self->params->buf;
|
||||
for(NSURL* url in urls) {
|
||||
char tempBuf[PATH_MAX];
|
||||
[url getFileSystemRepresentation:tempBuf maxLength:PATH_MAX];
|
||||
int pathLen = strlen(tempBuf);
|
||||
strcpy(bufPtr, tempBuf);
|
||||
bufPtr += pathLen + 1;
|
||||
}
|
||||
*bufPtr = '\0'; // Final null terminator
|
||||
// Now actually copy the paths (we know we have space)
|
||||
bufPtr = self->params->buf;
|
||||
for(NSURL* url in urls) {
|
||||
char tempBuf[PATH_MAX];
|
||||
[url getFileSystemRepresentation:tempBuf maxLength:PATH_MAX];
|
||||
int pathLen = strlen(tempBuf);
|
||||
strcpy(bufPtr, tempBuf);
|
||||
bufPtr += pathLen + 1;
|
||||
}
|
||||
*bufPtr = '\0'; // Final null terminator
|
||||
} else {
|
||||
// Single file/directory selection - write path to buffer
|
||||
NSURL* url = [urls firstObject];
|
||||
if(![url getFileSystemRepresentation:self->params->buf maxLength:self->params->nbuf]) {
|
||||
return DLG_URLFAIL;
|
||||
}
|
||||
}
|
||||
|
||||
return DLG_OK;
|
||||
|
|
|
|||
|
|
@ -15,7 +15,7 @@ const multiFileBufferSize = w32.MAX_PATH * 10
|
|||
type WinDlgError int
|
||||
|
||||
func (e WinDlgError) Error() string {
|
||||
return fmt.Sprintf("CommDlgExtendedError: %#x", e)
|
||||
return fmt.Sprintf("CommDlgExtendedError: %#x", int(e))
|
||||
}
|
||||
|
||||
func err() error {
|
||||
|
|
|
|||
|
|
@ -224,9 +224,7 @@ func (s *Server) cmd(ctx context.Context) (*exec.Cmd, error) {
|
|||
if _, err := os.Stat(settings.Models); err == nil {
|
||||
env["OLLAMA_MODELS"] = settings.Models
|
||||
} else {
|
||||
slog.Warn("models path not accessible, clearing models setting", "path", settings.Models, "err", err)
|
||||
settings.Models = ""
|
||||
s.store.SetSettings(settings)
|
||||
slog.Warn("models path not accessible, using default", "path", settings.Models, "err", err)
|
||||
}
|
||||
}
|
||||
if settings.ContextLength > 0 {
|
||||
|
|
|
|||
|
|
@ -469,26 +469,24 @@ export class HealthResponse {
|
|||
}
|
||||
export class User {
|
||||
id: string;
|
||||
name: string;
|
||||
email: string;
|
||||
avatarURL: string;
|
||||
plan: string;
|
||||
bio: string;
|
||||
firstName: string;
|
||||
lastName: string;
|
||||
overThreshold: boolean;
|
||||
name: string;
|
||||
bio?: string;
|
||||
avatarurl?: string;
|
||||
firstname?: string;
|
||||
lastname?: string;
|
||||
plan?: string;
|
||||
|
||||
constructor(source: any = {}) {
|
||||
if ('string' === typeof source) source = JSON.parse(source);
|
||||
this.id = source["id"];
|
||||
this.name = source["name"];
|
||||
this.email = source["email"];
|
||||
this.avatarURL = source["avatarURL"];
|
||||
this.plan = source["plan"];
|
||||
this.name = source["name"];
|
||||
this.bio = source["bio"];
|
||||
this.firstName = source["firstName"];
|
||||
this.lastName = source["lastName"];
|
||||
this.overThreshold = source["overThreshold"];
|
||||
this.avatarurl = source["avatarurl"];
|
||||
this.firstname = source["firstname"];
|
||||
this.lastname = source["lastname"];
|
||||
this.plan = source["plan"];
|
||||
}
|
||||
}
|
||||
export class Attachment {
|
||||
|
|
|
|||
|
|
@ -15,7 +15,7 @@ import {
|
|||
import { parseJsonlFromResponse } from "./util/jsonl-parsing";
|
||||
import { ollamaClient as ollama } from "./lib/ollama-client";
|
||||
import type { ModelResponse } from "ollama/browser";
|
||||
import { API_BASE } from "./lib/config";
|
||||
import { API_BASE, OLLAMA_DOT_COM } from "./lib/config";
|
||||
|
||||
// Extend Model class with utility methods
|
||||
declare module "@/gotypes" {
|
||||
|
|
@ -27,7 +27,6 @@ declare module "@/gotypes" {
|
|||
Model.prototype.isCloud = function (): boolean {
|
||||
return this.model.endsWith("cloud");
|
||||
};
|
||||
|
||||
// Helper function to convert Uint8Array to base64
|
||||
function uint8ArrayToBase64(uint8Array: Uint8Array): string {
|
||||
const chunkSize = 0x8000; // 32KB chunks to avoid stack overflow
|
||||
|
|
@ -42,44 +41,50 @@ function uint8ArrayToBase64(uint8Array: Uint8Array): string {
|
|||
}
|
||||
|
||||
export async function fetchUser(): Promise<User | null> {
|
||||
try {
|
||||
const response = await fetch(`${API_BASE}/api/v1/me`, {
|
||||
method: "GET",
|
||||
headers: {
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
});
|
||||
|
||||
if (response.ok) {
|
||||
const userData: User = await response.json();
|
||||
return userData;
|
||||
}
|
||||
|
||||
return null;
|
||||
} catch (error) {
|
||||
console.error("Error fetching user:", error);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
export async function fetchConnectUrl(): Promise<string> {
|
||||
const response = await fetch(`${API_BASE}/api/v1/connect`, {
|
||||
method: "GET",
|
||||
const response = await fetch(`${API_BASE}/api/me`, {
|
||||
method: "POST",
|
||||
headers: {
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
throw new Error("Failed to fetch connect URL");
|
||||
if (response.ok) {
|
||||
const userData: User = await response.json();
|
||||
|
||||
if (userData.avatarurl && !userData.avatarurl.startsWith("http")) {
|
||||
userData.avatarurl = `${OLLAMA_DOT_COM}${userData.avatarurl}`;
|
||||
}
|
||||
|
||||
return userData;
|
||||
}
|
||||
|
||||
const data = await response.json();
|
||||
return data.connect_url;
|
||||
if (response.status === 401 || response.status === 403) {
|
||||
return null;
|
||||
}
|
||||
|
||||
throw new Error(`Failed to fetch user: ${response.status}`);
|
||||
}
|
||||
|
||||
export async function fetchConnectUrl(): Promise<string> {
|
||||
const response = await fetch(`${API_BASE}/api/me`, {
|
||||
method: "POST",
|
||||
headers: {
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
});
|
||||
|
||||
if (response.status === 401) {
|
||||
const data = await response.json();
|
||||
if (data.signin_url) {
|
||||
return data.signin_url;
|
||||
}
|
||||
}
|
||||
|
||||
throw new Error("Failed to fetch connect URL");
|
||||
}
|
||||
|
||||
export async function disconnectUser(): Promise<void> {
|
||||
const response = await fetch(`${API_BASE}/api/v1/disconnect`, {
|
||||
const response = await fetch(`${API_BASE}/api/signout`, {
|
||||
method: "POST",
|
||||
headers: {
|
||||
"Content-Type": "application/json",
|
||||
|
|
@ -389,7 +394,8 @@ export async function getInferenceCompute(): Promise<InferenceCompute[]> {
|
|||
|
||||
export async function fetchHealth(): Promise<boolean> {
|
||||
try {
|
||||
const response = await fetch(`${API_BASE}/api/v1/health`, {
|
||||
// Use the /api/version endpoint as a health check
|
||||
const response = await fetch(`${API_BASE}/api/version`, {
|
||||
method: "GET",
|
||||
headers: {
|
||||
"Content-Type": "application/json",
|
||||
|
|
@ -398,7 +404,8 @@ export async function fetchHealth(): Promise<boolean> {
|
|||
|
||||
if (response.ok) {
|
||||
const data = await response.json();
|
||||
return data.healthy || false;
|
||||
// If we get a version back, the server is healthy
|
||||
return !!data.version;
|
||||
}
|
||||
|
||||
return false;
|
||||
|
|
|
|||
|
|
@ -299,9 +299,9 @@ export default function Settings() {
|
|||
</Button>
|
||||
</div>
|
||||
</div>
|
||||
{user?.avatarURL && (
|
||||
{user?.avatarurl && (
|
||||
<img
|
||||
src={user.avatarURL}
|
||||
src={user.avatarurl}
|
||||
alt={user?.name}
|
||||
className="h-10 w-10 rounded-full bg-neutral-200 dark:bg-neutral-700 flex-shrink-0"
|
||||
onError={(e) => {
|
||||
|
|
|
|||
|
|
@ -50,21 +50,33 @@ export default function Thinking({
|
|||
// Position content to show bottom when collapsed
|
||||
useEffect(() => {
|
||||
if (isCollapsed && contentRef.current && wrapperRef.current) {
|
||||
const contentHeight = contentRef.current.scrollHeight;
|
||||
const wrapperHeight = wrapperRef.current.clientHeight;
|
||||
if (contentHeight > wrapperHeight) {
|
||||
const translateY = -(contentHeight - wrapperHeight);
|
||||
contentRef.current.style.transform = `translateY(${translateY}px)`;
|
||||
setHasOverflow(true);
|
||||
} else {
|
||||
setHasOverflow(false);
|
||||
}
|
||||
requestAnimationFrame(() => {
|
||||
if (!contentRef.current || !wrapperRef.current) return;
|
||||
|
||||
const contentHeight = contentRef.current.scrollHeight;
|
||||
const wrapperHeight = wrapperRef.current.clientHeight;
|
||||
if (contentHeight > wrapperHeight) {
|
||||
const translateY = -(contentHeight - wrapperHeight);
|
||||
contentRef.current.style.transform = `translateY(${translateY}px)`;
|
||||
setHasOverflow(true);
|
||||
} else {
|
||||
contentRef.current.style.transform = "translateY(0)";
|
||||
setHasOverflow(false);
|
||||
}
|
||||
});
|
||||
} else if (contentRef.current) {
|
||||
contentRef.current.style.transform = "translateY(0)";
|
||||
setHasOverflow(false);
|
||||
}
|
||||
}, [thinking, isCollapsed]);
|
||||
|
||||
useEffect(() => {
|
||||
if (activelyThinking && wrapperRef.current && !isCollapsed) {
|
||||
// When expanded and actively thinking, scroll to bottom
|
||||
wrapperRef.current.scrollTop = wrapperRef.current.scrollHeight;
|
||||
}
|
||||
}, [thinking, activelyThinking, isCollapsed]);
|
||||
|
||||
const handleToggle = () => {
|
||||
setIsCollapsed(!isCollapsed);
|
||||
setHasUserInteracted(true);
|
||||
|
|
|
|||
|
|
@ -7,6 +7,7 @@ import { createQueryBatcher } from "./useQueryBatcher";
|
|||
import { useRefetchModels } from "./useModels";
|
||||
import { useStreamingContext } from "@/contexts/StreamingContext";
|
||||
import { useSettings } from "./useSettings";
|
||||
import { getModelCapabilities } from "@/api";
|
||||
|
||||
export const useChats = () => {
|
||||
return useQuery({
|
||||
|
|
@ -606,6 +607,24 @@ export const useSendMessage = (chatId: string) => {
|
|||
queryClient.setQueryData(["staleModels"], newStaleMap);
|
||||
|
||||
queryClient.invalidateQueries({ queryKey: ["models"] });
|
||||
|
||||
// Fetch fresh capabilities for the downloaded model
|
||||
getModelCapabilities(selectedModel.model)
|
||||
.then((capabilities) => {
|
||||
queryClient.setQueryData(
|
||||
["modelCapabilities", selectedModel.model],
|
||||
capabilities,
|
||||
);
|
||||
})
|
||||
.catch((error) => {
|
||||
console.error(
|
||||
"Failed to fetch capabilities after download:",
|
||||
error,
|
||||
);
|
||||
queryClient.invalidateQueries({
|
||||
queryKey: ["modelCapabilities", selectedModel.model],
|
||||
});
|
||||
});
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,114 +0,0 @@
|
|||
import { useMutation, useQueryClient } from "@tanstack/react-query";
|
||||
import { useState } from "react";
|
||||
import { pullModel } from "@/api";
|
||||
import { useSelectedModel } from "./useSelectedModel";
|
||||
import { useSettings } from "./useSettings";
|
||||
|
||||
interface DownloadProgress {
|
||||
status: string;
|
||||
digest?: string;
|
||||
total?: number;
|
||||
completed?: number;
|
||||
done?: boolean;
|
||||
}
|
||||
|
||||
export function useDownloadModel(chatId?: string) {
|
||||
const queryClient = useQueryClient();
|
||||
const { selectedModel } = useSelectedModel(chatId);
|
||||
const { setSettings } = useSettings();
|
||||
const [downloadProgress, setDownloadProgress] =
|
||||
useState<DownloadProgress | null>(null);
|
||||
const [abortController, setAbortController] =
|
||||
useState<AbortController | null>(null);
|
||||
const [downloadingChatIds, setDownloadingChatIds] = useState<Set<string>>(
|
||||
new Set(),
|
||||
);
|
||||
|
||||
const mutation = useMutation({
|
||||
mutationFn: async (modelName: string) => {
|
||||
const controller = new AbortController();
|
||||
setAbortController(controller);
|
||||
setDownloadProgress({ status: "Starting download..." });
|
||||
if (chatId) {
|
||||
setDownloadingChatIds((prev) => new Set(prev).add(chatId));
|
||||
}
|
||||
|
||||
try {
|
||||
for await (const progress of pullModel(modelName, controller.signal)) {
|
||||
setDownloadProgress(progress);
|
||||
|
||||
if (progress.status === "success") {
|
||||
// Update selected model to indicate it's now available locally
|
||||
if (selectedModel && selectedModel.model === modelName) {
|
||||
setSettings({ SelectedModel: modelName });
|
||||
}
|
||||
// Invalidate models query to refresh the list
|
||||
await queryClient.invalidateQueries({ queryKey: ["models"] });
|
||||
break;
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
setAbortController(null);
|
||||
if (chatId) {
|
||||
setDownloadingChatIds((prev) => {
|
||||
const newSet = new Set(prev);
|
||||
newSet.delete(chatId);
|
||||
return newSet;
|
||||
});
|
||||
}
|
||||
}
|
||||
},
|
||||
onSuccess: () => {
|
||||
setDownloadProgress(null);
|
||||
if (chatId) {
|
||||
setDownloadingChatIds((prev) => {
|
||||
const newSet = new Set(prev);
|
||||
newSet.delete(chatId);
|
||||
return newSet;
|
||||
});
|
||||
}
|
||||
},
|
||||
onError: (error: Error) => {
|
||||
const status =
|
||||
error.name === "AbortError" ? "Download cancelled" : "Download failed";
|
||||
setDownloadProgress({ status, done: true });
|
||||
|
||||
// Clear error message after delay
|
||||
const delay = error.name === "AbortError" ? 1500 : 3000;
|
||||
setTimeout(() => {
|
||||
setDownloadProgress(null);
|
||||
if (chatId) {
|
||||
setDownloadingChatIds((prev) => {
|
||||
const newSet = new Set(prev);
|
||||
newSet.delete(chatId);
|
||||
return newSet;
|
||||
});
|
||||
}
|
||||
}, delay);
|
||||
},
|
||||
});
|
||||
|
||||
const cancelDownload = () => {
|
||||
if (abortController) {
|
||||
abortController.abort();
|
||||
setAbortController(null);
|
||||
if (chatId) {
|
||||
setDownloadingChatIds((prev) => {
|
||||
const newSet = new Set(prev);
|
||||
newSet.delete(chatId);
|
||||
return newSet;
|
||||
});
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
return {
|
||||
downloadModel: mutation.mutate,
|
||||
isDownloading:
|
||||
mutation.isPending && chatId ? downloadingChatIds.has(chatId) : false,
|
||||
downloadProgress:
|
||||
chatId && downloadingChatIds.has(chatId) ? downloadProgress : null,
|
||||
error: mutation.error,
|
||||
cancelDownload,
|
||||
};
|
||||
}
|
||||
|
|
@ -1,29 +1,20 @@
|
|||
import { useQuery, useMutation, useQueryClient } from "@tanstack/react-query";
|
||||
import { useEffect, useState } from "react";
|
||||
import { fetchUser, fetchConnectUrl, disconnectUser } from "@/api";
|
||||
|
||||
export function useUser() {
|
||||
const queryClient = useQueryClient();
|
||||
const [initialDataLoaded, setInitialDataLoaded] = useState(false);
|
||||
|
||||
// Wait for initial data to be loaded
|
||||
useEffect(() => {
|
||||
const initialPromise = window.__initialUserDataPromise;
|
||||
if (initialPromise) {
|
||||
initialPromise.finally(() => {
|
||||
setInitialDataLoaded(true);
|
||||
});
|
||||
} else {
|
||||
setInitialDataLoaded(true);
|
||||
}
|
||||
}, []);
|
||||
|
||||
const userQuery = useQuery({
|
||||
queryKey: ["user"],
|
||||
queryFn: () => fetchUser(),
|
||||
queryFn: async () => {
|
||||
const result = await fetchUser();
|
||||
return result;
|
||||
},
|
||||
staleTime: 5 * 60 * 1000, // Consider data stale after 5 minutes
|
||||
gcTime: 10 * 60 * 1000, // Keep in cache for 10 minutes
|
||||
initialData: null, // Start with null to prevent flashing
|
||||
retry: 10,
|
||||
retryDelay: (attemptIndex) => Math.min(500 * attemptIndex, 2000),
|
||||
refetchOnMount: true, // Always fetch when component mounts
|
||||
});
|
||||
|
||||
// Mutation to refresh user data
|
||||
|
|
@ -49,14 +40,15 @@ export function useUser() {
|
|||
},
|
||||
});
|
||||
|
||||
const isLoading = userQuery.isLoading || userQuery.isFetching;
|
||||
const isAuthenticated = Boolean(userQuery.data?.name);
|
||||
|
||||
return {
|
||||
user: userQuery.data,
|
||||
isLoading:
|
||||
!initialDataLoaded ||
|
||||
(userQuery.isLoading && userQuery.data === undefined), // Show loading until initial data is loaded
|
||||
isLoading,
|
||||
isError: userQuery.isError,
|
||||
error: userQuery.error,
|
||||
isAuthenticated: Boolean(userQuery.data?.name),
|
||||
isAuthenticated,
|
||||
refreshUser: refreshUser.mutate,
|
||||
isRefreshing: refreshUser.isPending,
|
||||
refetchUser: userQuery.refetch,
|
||||
|
|
|
|||
|
|
@ -8,3 +8,6 @@ export const API_BASE = import.meta.env.DEV ? DEV_API_URL : "";
|
|||
export const OLLAMA_HOST = import.meta.env.DEV
|
||||
? DEV_API_URL
|
||||
: window.location.origin;
|
||||
|
||||
export const OLLAMA_DOT_COM =
|
||||
import.meta.env.VITE_OLLAMA_DOT_COM_URL || "https://ollama.com";
|
||||
|
|
|
|||
|
|
@ -5,13 +5,6 @@ import { QueryClient, QueryClientProvider } from "@tanstack/react-query";
|
|||
import { routeTree } from "./routeTree.gen";
|
||||
import { fetchUser } from "./api";
|
||||
import { StreamingProvider } from "./contexts/StreamingContext";
|
||||
import { User } from "@/gotypes";
|
||||
|
||||
declare global {
|
||||
interface Window {
|
||||
__initialUserDataPromise?: Promise<User | null>;
|
||||
}
|
||||
}
|
||||
|
||||
const queryClient = new QueryClient({
|
||||
defaultOptions: {
|
||||
|
|
@ -24,27 +17,11 @@ const queryClient = new QueryClient({
|
|||
},
|
||||
});
|
||||
|
||||
// Track initial user data fetch
|
||||
let initialUserDataPromise: Promise<User | null> | null = null;
|
||||
|
||||
// Initialize user data on app startup
|
||||
const initializeUserData = async () => {
|
||||
try {
|
||||
const userData = await fetchUser();
|
||||
fetchUser().then((userData) => {
|
||||
if (userData) {
|
||||
queryClient.setQueryData(["user"], userData);
|
||||
return userData;
|
||||
} catch (error) {
|
||||
console.error("Error initializing user data:", error);
|
||||
queryClient.setQueryData(["user"], null);
|
||||
return null;
|
||||
}
|
||||
};
|
||||
|
||||
// Start initialization immediately and track the promise
|
||||
initialUserDataPromise = initializeUserData();
|
||||
|
||||
// Export the promise so hooks can await it
|
||||
window.__initialUserDataPromise = initialUserDataPromise;
|
||||
});
|
||||
|
||||
const router = createRouter({
|
||||
routeTree,
|
||||
|
|
|
|||
|
|
@ -101,15 +101,14 @@ type HealthResponse struct {
|
|||
}
|
||||
|
||||
type User struct {
|
||||
ID string `json:"id"`
|
||||
Name string `json:"name"`
|
||||
Email string `json:"email"`
|
||||
AvatarURL string `json:"avatarURL"`
|
||||
Plan string `json:"plan"`
|
||||
Bio string `json:"bio"`
|
||||
FirstName string `json:"firstName"`
|
||||
LastName string `json:"lastName"`
|
||||
OverThreshold bool `json:"overThreshold"`
|
||||
ID string `json:"id"`
|
||||
Email string `json:"email"`
|
||||
Name string `json:"name"`
|
||||
Bio string `json:"bio,omitempty"`
|
||||
AvatarURL string `json:"avatarurl,omitempty"`
|
||||
FirstName string `json:"firstname,omitempty"`
|
||||
LastName string `json:"lastname,omitempty"`
|
||||
Plan string `json:"plan,omitempty"`
|
||||
}
|
||||
|
||||
type Attachment struct {
|
||||
|
|
|
|||
241
app/ui/ui.go
241
app/ui/ui.go
|
|
@ -12,18 +12,17 @@ import (
|
|||
"log/slog"
|
||||
"net/http"
|
||||
"net/http/httputil"
|
||||
"net/url"
|
||||
"os"
|
||||
"runtime"
|
||||
"runtime/debug"
|
||||
"slices"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/google/uuid"
|
||||
"github.com/ollama/ollama/api"
|
||||
"github.com/ollama/ollama/app/auth"
|
||||
"github.com/ollama/ollama/app/server"
|
||||
"github.com/ollama/ollama/app/store"
|
||||
"github.com/ollama/ollama/app/tools"
|
||||
|
|
@ -118,40 +117,66 @@ func (s *Server) log() *slog.Logger {
|
|||
|
||||
// ollamaProxy creates a reverse proxy handler to the Ollama server
|
||||
func (s *Server) ollamaProxy() http.Handler {
|
||||
ollamaHost := os.Getenv("OLLAMA_HOST")
|
||||
if ollamaHost == "" {
|
||||
ollamaHost = "http://127.0.0.1:11434"
|
||||
}
|
||||
var (
|
||||
proxy http.Handler
|
||||
proxyMu sync.Mutex
|
||||
)
|
||||
|
||||
if !strings.HasPrefix(ollamaHost, "http://") && !strings.HasPrefix(ollamaHost, "https://") {
|
||||
ollamaHost = "http://" + ollamaHost
|
||||
}
|
||||
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
proxyMu.Lock()
|
||||
p := proxy
|
||||
proxyMu.Unlock()
|
||||
|
||||
target, err := url.Parse(ollamaHost)
|
||||
if err != nil {
|
||||
s.log().Error("failed to parse OLLAMA_HOST", "error", err, "host", ollamaHost)
|
||||
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
http.Error(w, "failed to configure proxy", http.StatusInternalServerError)
|
||||
})
|
||||
}
|
||||
if p == nil {
|
||||
proxyMu.Lock()
|
||||
if proxy == nil {
|
||||
var err error
|
||||
for i := range 2 {
|
||||
if i > 0 {
|
||||
s.log().Warn("ollama server not ready, retrying", "attempt", i+1)
|
||||
time.Sleep(1 * time.Second)
|
||||
}
|
||||
|
||||
s.log().Info("configuring ollama proxy", "target", target.String())
|
||||
err = WaitForServer(context.Background(), 10*time.Second)
|
||||
if err == nil {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
proxy := httputil.NewSingleHostReverseProxy(target)
|
||||
if err != nil {
|
||||
proxyMu.Unlock()
|
||||
s.log().Error("ollama server not ready after retries", "error", err)
|
||||
http.Error(w, "Ollama server is not ready", http.StatusServiceUnavailable)
|
||||
return
|
||||
}
|
||||
|
||||
originalDirector := proxy.Director
|
||||
proxy.Director = func(req *http.Request) {
|
||||
originalDirector(req)
|
||||
req.Host = target.Host
|
||||
s.log().Debug("proxying request", "method", req.Method, "path", req.URL.Path, "target", target.Host)
|
||||
}
|
||||
target := envconfig.Host()
|
||||
s.log().Info("configuring ollama proxy", "target", target.String())
|
||||
|
||||
proxy.ErrorHandler = func(w http.ResponseWriter, r *http.Request, err error) {
|
||||
s.log().Error("proxy error", "error", err, "path", r.URL.Path, "target", target.String())
|
||||
http.Error(w, "proxy error: "+err.Error(), http.StatusBadGateway)
|
||||
}
|
||||
newProxy := httputil.NewSingleHostReverseProxy(target)
|
||||
|
||||
return proxy
|
||||
originalDirector := newProxy.Director
|
||||
newProxy.Director = func(req *http.Request) {
|
||||
originalDirector(req)
|
||||
req.Host = target.Host
|
||||
s.log().Debug("proxying request", "method", req.Method, "path", req.URL.Path, "target", target.Host)
|
||||
}
|
||||
|
||||
newProxy.ErrorHandler = func(w http.ResponseWriter, r *http.Request, err error) {
|
||||
s.log().Error("proxy error", "error", err, "path", r.URL.Path, "target", target.String())
|
||||
http.Error(w, "proxy error: "+err.Error(), http.StatusBadGateway)
|
||||
}
|
||||
|
||||
proxy = newProxy
|
||||
p = newProxy
|
||||
} else {
|
||||
p = proxy
|
||||
}
|
||||
proxyMu.Unlock()
|
||||
}
|
||||
|
||||
p.ServeHTTP(w, r)
|
||||
})
|
||||
}
|
||||
|
||||
type errHandlerFunc func(http.ResponseWriter, *http.Request) error
|
||||
|
|
@ -264,11 +289,10 @@ func (s *Server) Handler() http.Handler {
|
|||
ollamaProxy := s.ollamaProxy()
|
||||
mux.Handle("GET /api/tags", ollamaProxy)
|
||||
mux.Handle("POST /api/show", ollamaProxy)
|
||||
|
||||
mux.Handle("GET /api/v1/me", handle(s.me))
|
||||
mux.Handle("POST /api/v1/disconnect", handle(s.disconnect))
|
||||
mux.Handle("GET /api/v1/connect", handle(s.connectURL))
|
||||
mux.Handle("GET /api/v1/health", handle(s.health))
|
||||
mux.Handle("GET /api/version", ollamaProxy)
|
||||
mux.Handle("HEAD /api/version", ollamaProxy)
|
||||
mux.Handle("POST /api/me", ollamaProxy)
|
||||
mux.Handle("POST /api/signout", ollamaProxy)
|
||||
|
||||
// React app - catch all non-API routes and serve the React app
|
||||
mux.Handle("GET /", s.appHandler())
|
||||
|
|
@ -338,7 +362,7 @@ func (s *Server) doSelfSigned(ctx context.Context, method, path string) (*http.R
|
|||
}
|
||||
|
||||
// UserData fetches user data from ollama.com API for the current ollama key
|
||||
func (s *Server) UserData(ctx context.Context) (*responses.User, error) {
|
||||
func (s *Server) UserData(ctx context.Context) (*api.UserResponse, error) {
|
||||
resp, err := s.doSelfSigned(ctx, http.MethodPost, "/api/me")
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to call ollama.com/api/me: %w", err)
|
||||
|
|
@ -349,7 +373,7 @@ func (s *Server) UserData(ctx context.Context) (*responses.User, error) {
|
|||
return nil, fmt.Errorf("unexpected status code: %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
var user responses.User
|
||||
var user api.UserResponse
|
||||
if err := json.NewDecoder(resp.Body).Decode(&user); err != nil {
|
||||
return nil, fmt.Errorf("failed to parse user response: %w", err)
|
||||
}
|
||||
|
|
@ -368,29 +392,27 @@ func (s *Server) UserData(ctx context.Context) (*responses.User, error) {
|
|||
return &user, nil
|
||||
}
|
||||
|
||||
func waitForServer(ctx context.Context) error {
|
||||
timeout := time.Now().Add(10 * time.Second)
|
||||
// TODO: this avoids an error on first load of the app
|
||||
// however we should either show a loading state or
|
||||
// wait for the Ollama server to be ready before redirecting
|
||||
for {
|
||||
// WaitForServer waits for the Ollama server to be ready
|
||||
func WaitForServer(ctx context.Context, timeout time.Duration) error {
|
||||
deadline := time.Now().Add(timeout)
|
||||
for time.Now().Before(deadline) {
|
||||
c, err := api.ClientFromEnvironment()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if _, err := c.Version(ctx); err == nil {
|
||||
break
|
||||
}
|
||||
if time.Now().After(timeout) {
|
||||
return fmt.Errorf("timeout waiting for Ollama server to be ready")
|
||||
slog.Debug("ollama server is ready")
|
||||
return nil
|
||||
}
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
}
|
||||
return nil
|
||||
return errors.New("timeout waiting for Ollama server to be ready")
|
||||
}
|
||||
|
||||
func (s *Server) createChat(w http.ResponseWriter, r *http.Request) error {
|
||||
waitForServer(r.Context())
|
||||
if err := WaitForServer(r.Context(), 10*time.Second); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
id, err := uuid.NewV7()
|
||||
if err != nil {
|
||||
|
|
@ -1438,129 +1460,6 @@ func (s *Server) settings(w http.ResponseWriter, r *http.Request) error {
|
|||
})
|
||||
}
|
||||
|
||||
func (s *Server) me(w http.ResponseWriter, r *http.Request) error {
|
||||
if r.Method != http.MethodGet {
|
||||
http.Error(w, "Method Not Allowed", http.StatusMethodNotAllowed)
|
||||
return nil
|
||||
}
|
||||
|
||||
user, err := s.UserData(r.Context())
|
||||
if err != nil {
|
||||
// If fetching from API fails, try to return cached user data if available
|
||||
if cachedUser, cacheErr := s.Store.User(); cacheErr == nil && cachedUser != nil {
|
||||
s.log().Info("API request failed, returning cached user data", "error", err)
|
||||
responseUser := &responses.User{
|
||||
Name: cachedUser.Name,
|
||||
Email: cachedUser.Email,
|
||||
Plan: cachedUser.Plan,
|
||||
}
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
w.WriteHeader(http.StatusOK)
|
||||
return json.NewEncoder(w).Encode(responseUser)
|
||||
}
|
||||
|
||||
s.log().Error("failed to get user data", "error", err)
|
||||
w.WriteHeader(http.StatusInternalServerError)
|
||||
return json.NewEncoder(w).Encode(responses.Error{
|
||||
Error: "failed to get user data",
|
||||
})
|
||||
}
|
||||
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
w.WriteHeader(http.StatusOK)
|
||||
return json.NewEncoder(w).Encode(user)
|
||||
}
|
||||
|
||||
func (s *Server) disconnect(w http.ResponseWriter, r *http.Request) error {
|
||||
if r.Method != http.MethodPost {
|
||||
http.Error(w, "Method Not Allowed", http.StatusMethodNotAllowed)
|
||||
return nil
|
||||
}
|
||||
|
||||
if err := s.Store.ClearUser(); err != nil {
|
||||
s.log().Warn("failed to clear cached user data", "error", err)
|
||||
}
|
||||
|
||||
// Get the SSH public key to encode for the delete request
|
||||
pubKey, err := ollamaAuth.GetPublicKey()
|
||||
if err != nil {
|
||||
s.log().Error("failed to get public key", "error", err)
|
||||
w.WriteHeader(http.StatusInternalServerError)
|
||||
return json.NewEncoder(w).Encode(responses.Error{
|
||||
Error: "failed to get public key",
|
||||
})
|
||||
}
|
||||
|
||||
// Encode the key using base64 URL encoding
|
||||
encodedKey := base64.RawURLEncoding.EncodeToString([]byte(pubKey))
|
||||
|
||||
// Call the /api/user/keys/{encodedKey} endpoint with DELETE
|
||||
resp, err := s.doSelfSigned(r.Context(), http.MethodDelete, fmt.Sprintf("/api/user/keys/%s", encodedKey))
|
||||
if err != nil {
|
||||
s.log().Error("failed to call ollama.com/api/user/keys", "error", err)
|
||||
w.WriteHeader(http.StatusInternalServerError)
|
||||
return json.NewEncoder(w).Encode(responses.Error{
|
||||
Error: "failed to disconnect from ollama.com",
|
||||
})
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
s.log().Error("disconnect request failed", "status", resp.StatusCode)
|
||||
w.WriteHeader(http.StatusInternalServerError)
|
||||
return json.NewEncoder(w).Encode(responses.Error{
|
||||
Error: "failed to disconnect from ollama.com",
|
||||
})
|
||||
}
|
||||
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
w.WriteHeader(http.StatusOK)
|
||||
return json.NewEncoder(w).Encode(map[string]string{"status": "disconnected"})
|
||||
}
|
||||
|
||||
func (s *Server) connectURL(w http.ResponseWriter, r *http.Request) error {
|
||||
if r.Method != http.MethodGet {
|
||||
http.Error(w, "Method Not Allowed", http.StatusMethodNotAllowed)
|
||||
return nil
|
||||
}
|
||||
|
||||
connectURL, err := auth.BuildConnectURL(OllamaDotCom)
|
||||
if err != nil {
|
||||
s.log().Error("failed to build connect URL", "error", err)
|
||||
w.WriteHeader(http.StatusInternalServerError)
|
||||
return json.NewEncoder(w).Encode(responses.Error{
|
||||
Error: "failed to build connect URL",
|
||||
})
|
||||
}
|
||||
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
w.WriteHeader(http.StatusOK)
|
||||
return json.NewEncoder(w).Encode(map[string]string{
|
||||
"connect_url": connectURL,
|
||||
})
|
||||
}
|
||||
|
||||
func (s *Server) health(w http.ResponseWriter, r *http.Request) error {
|
||||
if r.Method != http.MethodGet {
|
||||
http.Error(w, "Method Not Allowed", http.StatusMethodNotAllowed)
|
||||
return nil
|
||||
}
|
||||
|
||||
healthy := false
|
||||
c, err := api.ClientFromEnvironment()
|
||||
if err == nil {
|
||||
if _, err := c.Version(r.Context()); err == nil {
|
||||
healthy = true
|
||||
}
|
||||
}
|
||||
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
w.WriteHeader(http.StatusOK)
|
||||
return json.NewEncoder(w).Encode(responses.HealthResponse{
|
||||
Healthy: healthy,
|
||||
})
|
||||
}
|
||||
|
||||
func (s *Server) getInferenceCompute(w http.ResponseWriter, r *http.Request) error {
|
||||
ctx, cancel := context.WithTimeout(r.Context(), 500*time.Millisecond)
|
||||
defer cancel()
|
||||
|
|
|
|||
|
|
@ -158,16 +158,16 @@ func (t *winTray) wndProc(hWnd windows.Handle, message uint32, wParam, lParam ui
|
|||
case uint32(UI_REQUEST_MSG_ID):
|
||||
// Requests for the UI must always come from the main event thread
|
||||
l := int(wParam)
|
||||
path := unsafe.String((*byte)(unsafe.Pointer(lParam)), l)
|
||||
path := unsafe.String((*byte)(unsafe.Pointer(lParam)), l) //nolint:govet,gosec
|
||||
t.app.UIRun(path)
|
||||
case WM_COPYDATA:
|
||||
// Handle URL scheme requests from other instances
|
||||
if lParam != 0 {
|
||||
cds := (*COPYDATASTRUCT)(unsafe.Pointer(lParam))
|
||||
if cds.DwData == 1 { // Our identifier for URL scheme messages
|
||||
cds := (*COPYDATASTRUCT)(unsafe.Pointer(lParam)) //nolint:govet,gosec
|
||||
if cds.DwData == 1 { // Our identifier for URL scheme messages
|
||||
// Convert the data back to string
|
||||
data := make([]byte, cds.CbData)
|
||||
copy(data, (*[1 << 30]byte)(unsafe.Pointer(cds.LpData))[:cds.CbData:cds.CbData])
|
||||
copy(data, (*[1 << 30]byte)(unsafe.Pointer(cds.LpData))[:cds.CbData:cds.CbData]) //nolint:govet,gosec
|
||||
urlScheme := string(data)
|
||||
handleURLSchemeRequest(urlScheme)
|
||||
lResult = 1 // Return non-zero to indicate success
|
||||
|
|
|
|||
|
|
@ -15,7 +15,7 @@ A Go-based command-line tool for benchmarking Ollama models with configurable pa
|
|||
|
||||
```
|
||||
go build -o ollama-bench bench.go
|
||||
./bench -model gpt-oss:20b -epochs 6 -format csv
|
||||
./ollama-bench -model gpt-oss:20b -epochs 6 -format csv
|
||||
```
|
||||
|
||||
Using Go Run (without building)
|
||||
|
|
@ -29,31 +29,32 @@ go run bench.go -model gpt-oss:20b -epochs 3
|
|||
### Basic Example
|
||||
|
||||
```
|
||||
./bench -model gemma3 -epochs 6
|
||||
./ollama-bench -model gemma3 -epochs 6
|
||||
```
|
||||
|
||||
### Benchmark Multiple Models
|
||||
|
||||
```
|
||||
./bench -model gemma3,gemma3n -epochs 6 -max-tokens 100 -p "Write me a short story" | tee gemma.bench
|
||||
./ollama-bench -model gemma3,gemma3n -epochs 6 -max-tokens 100 -p "Write me a short story" | tee gemma.bench
|
||||
benchstat -col /name gemma.bench
|
||||
```
|
||||
|
||||
### With Image Prompt
|
||||
|
||||
```
|
||||
./bench -model qwen3-vl -image photo.jpg -epochs 6 -max-tokens 100 -p "Describe this image"
|
||||
./ollama-bench -model qwen3-vl -image photo.jpg -epochs 6 -max-tokens 100 -p "Describe this image"
|
||||
```
|
||||
|
||||
### Advanced Example
|
||||
|
||||
```
|
||||
./bench -model llama3 -epochs 10 -temperature 0.7 -max-tokens 500 -seed 42 -format csv -output results.csv
|
||||
./ollama-bench -model llama3 -epochs 10 -temperature 0.7 -max-tokens 500 -seed 42 -format csv -output results.csv
|
||||
```
|
||||
|
||||
## Command Line Options
|
||||
|
||||
| Option | Description | Default |
|
||||
|----------|-------------|---------|
|
||||
| -model | Comma-separated list of models to benchmark | (required) |
|
||||
| -epochs | Number of iterations per model | 1 |
|
||||
| -max-tokens | Maximum tokens for model response | 0 (unlimited) |
|
||||
|
|
|
|||
|
|
@ -182,6 +182,8 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
|
|||
conv = &llama4Model{}
|
||||
case "Mistral3ForConditionalGeneration":
|
||||
conv = &mistral3Model{}
|
||||
case "Ministral3ForCausalLM":
|
||||
conv = &mistral3CausalModel{}
|
||||
case "MixtralForCausalLM":
|
||||
conv = &mixtralModel{}
|
||||
case "GemmaForCausalLM":
|
||||
|
|
@ -200,8 +202,12 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
|
|||
conv = &qwen25VLModel{}
|
||||
case "Qwen3VLForConditionalGeneration", "Qwen3VLMoeForConditionalGeneration":
|
||||
conv = &qwen3VLModel{}
|
||||
case "Olmo3ForCausalLM":
|
||||
conv = &olmoModel{}
|
||||
case "BertModel":
|
||||
conv = &bertModel{}
|
||||
case "NomicBertModel", "NomicBertMoEModel":
|
||||
conv = &nomicbertModel{}
|
||||
case "CohereForCausalLM":
|
||||
conv = &commandrModel{}
|
||||
case "GptOssForCausalLM":
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ package convert
|
|||
|
||||
import (
|
||||
"cmp"
|
||||
"slices"
|
||||
|
||||
"github.com/ollama/ollama/fs/ggml"
|
||||
)
|
||||
|
|
@ -26,16 +27,26 @@ type gemma3Model struct {
|
|||
NumChannels uint32 `json:"num_channels"` // num_channels 3
|
||||
PatchSize uint32 `json:"patch_size"` // patch_size 14
|
||||
} `json:"vision_config"`
|
||||
MaxPositionEmbeddings uint32 `json:"max_position_embeddings"`
|
||||
NumAttentionHeads uint32 `json:"num_attention_heads"`
|
||||
NumKeyValueHeads uint32 `json:"num_key_value_heads"`
|
||||
RMSNormEPS float32 `json:"rms_norm_eps"`
|
||||
HeadDim uint32 `json:"head_dim"`
|
||||
FinalLogitSoftcap float32 `json:"final_logit_softcapping"`
|
||||
RopeLocalTheta float32 `json:"rope_local_base_freq"`
|
||||
RopeGlobalTheta float32 `json:"rope_global_base_freq"`
|
||||
SlidingWindow uint32 `json:"sliding_window"`
|
||||
MultiModalTokensPerImage uint32 `json:"mm_tokens_per_image"`
|
||||
MaxPositionEmbeddings uint32 `json:"max_position_embeddings"`
|
||||
NumAttentionHeads uint32 `json:"num_attention_heads"`
|
||||
NumKeyValueHeads uint32 `json:"num_key_value_heads"`
|
||||
RMSNormEPS float32 `json:"rms_norm_eps"`
|
||||
HeadDim uint32 `json:"head_dim"`
|
||||
FinalLogitSoftcap float32 `json:"final_logit_softcapping"`
|
||||
RopeLocalTheta float32 `json:"rope_local_base_freq"`
|
||||
RopeTheta float32 `json:"rope_theta"`
|
||||
SlidingWindow uint32 `json:"sliding_window"`
|
||||
SlidingWindowPattern *uint32 `json:"sliding_window_pattern"`
|
||||
LayerTypes []string `json:"layer_types"`
|
||||
MultiModalTokensPerImage uint32 `json:"mm_tokens_per_image"`
|
||||
RopeScaling *struct {
|
||||
Type string `json:"rope_type"`
|
||||
Factor float32 `json:"factor"`
|
||||
OriginalMaxPositionEmbeddings uint32 `json:"original_max_position_embeddings"`
|
||||
ExtrapolationFactor float32 `json:"extrapolation_factor"`
|
||||
BetaFast float32 `json:"beta_fast"`
|
||||
BetaSlow float32 `json:"beta_slow"`
|
||||
} `json:"rope_scaling"`
|
||||
}
|
||||
|
||||
const (
|
||||
|
|
@ -81,9 +92,38 @@ func (p *gemma3Model) KV(t *Tokenizer) ggml.KV {
|
|||
kv["gemma3.attention.key_length"] = p.HeadDim
|
||||
kv["gemma3.attention.value_length"] = p.HeadDim
|
||||
kv["gemma3.attention.sliding_window"] = p.SlidingWindow
|
||||
kv["gemma3.final_logit_softcapping"] = cmp.Or(p.FinalLogitSoftcap, 30)
|
||||
|
||||
// The sliding window pattern is either provided as the sliding_window_pattern
|
||||
// key (an int) or as the layer_types key (a list of strings).
|
||||
if p.SlidingWindowPattern != nil || len(p.LayerTypes) > 0 {
|
||||
kv["gemma3.attention.sliding_window_pattern"] = slices.Collect(func(yield func(bool) bool) {
|
||||
for i := range numBlocks {
|
||||
var isLocal bool
|
||||
if len(p.LayerTypes) > 0 && int(i) < len(p.LayerTypes) {
|
||||
isLocal = p.LayerTypes[i] == "sliding_attention"
|
||||
} else if p.SlidingWindowPattern != nil && *p.SlidingWindowPattern > 0 {
|
||||
isLocal = (i+1)%*p.SlidingWindowPattern != 0
|
||||
}
|
||||
if !yield(isLocal) {
|
||||
break
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
if p.FinalLogitSoftcap > 0 {
|
||||
kv["gemma3.final_logit_softcapping"] = p.FinalLogitSoftcap
|
||||
}
|
||||
kv["gemma3.rope.local.freq_base"] = cmp.Or(p.RopeLocalTheta, 10000.0)
|
||||
kv["gemma3.rope.global.freq_base"] = cmp.Or(p.RopeGlobalTheta, 1000000.0)
|
||||
kv["gemma3.rope.freq_base"] = cmp.Or(p.RopeTheta, 1000000.0)
|
||||
if p.RopeScaling != nil && p.RopeScaling.Type == "yarn" && p.RopeScaling.Factor > 0 {
|
||||
kv["gemma3.rope.scaling.type"] = "yarn"
|
||||
kv["gemma3.rope.scaling.factor"] = p.RopeScaling.Factor
|
||||
kv["gemma3.rope.scaling.original_context_length"] = p.RopeScaling.OriginalMaxPositionEmbeddings
|
||||
kv["gemma3.rope.scaling.extrapolation_factor"] = cmp.Or(p.RopeScaling.ExtrapolationFactor, float32(1.0))
|
||||
kv["gemma3.rope.scaling.beta_fast"] = cmp.Or(p.RopeScaling.BetaFast, float32(64.0))
|
||||
kv["gemma3.rope.scaling.beta_slow"] = cmp.Or(p.RopeScaling.BetaSlow, float32(1.0))
|
||||
}
|
||||
|
||||
kv["gemma3.embedding_length"] = p.HiddenSize
|
||||
kv["gemma3.feed_forward_length"] = p.IntermediateSize
|
||||
default:
|
||||
|
|
|
|||
|
|
@ -30,13 +30,15 @@ type mistral3Model struct {
|
|||
HiddenAct string `json:"hidden_act"`
|
||||
VocabSize uint32 `json:"vocab_size"`
|
||||
RopeParameters struct {
|
||||
BetaFast float32 `json:"beta_fast"`
|
||||
BetaSlow float32 `json:"beta_slow"`
|
||||
Factor float32 `json:"factor"`
|
||||
ScalingBeta float32 `json:"llama_4_scaling_beta"`
|
||||
OrigMaxPositionEmbeddings uint32 `json:"original_max_position_embeddings"`
|
||||
RopeType string `json:"rope_type"`
|
||||
RopeTheta float32 `json:"rope_theta"`
|
||||
BetaFast float32 `json:"beta_fast"`
|
||||
BetaSlow float32 `json:"beta_slow"`
|
||||
Factor float32 `json:"factor"`
|
||||
Llama4ScalingBeta *float32 `json:"llama_4_scaling_beta"`
|
||||
OrigMaxPositionEmbeddings uint32 `json:"original_max_position_embeddings"`
|
||||
RopeType string `json:"rope_type"`
|
||||
RopeTheta float32 `json:"rope_theta"`
|
||||
Mscale *float32 `json:"mscale"`
|
||||
MscaleAllDim *float32 `json:"mscale_all_dim"`
|
||||
} `json:"rope_parameters"`
|
||||
} `json:"text_config"`
|
||||
VisionModel struct {
|
||||
|
|
@ -50,6 +52,9 @@ type mistral3Model struct {
|
|||
HeadDim uint32 `json:"head_dim"`
|
||||
HiddenAct string `json:"hidden_act"`
|
||||
RopeTheta float32 `json:"rope_theta"`
|
||||
RopeParameters struct {
|
||||
RopeTheta float32 `json:"rope_theta"`
|
||||
} `json:"rope_parameters"`
|
||||
} `json:"vision_config"`
|
||||
MultiModalProjectorBias bool `json:"multimodal_projector_bias"`
|
||||
ProjectorHiddenAct string `json:"projector_hidden_act"`
|
||||
|
|
@ -72,10 +77,22 @@ func (p *mistral3Model) KV(t *Tokenizer) ggml.KV {
|
|||
kv["mistral3.attention.value_length"] = p.TextModel.HeadDim
|
||||
kv["mistral3.rope.dimension_count"] = cmp.Or(p.TextModel.HeadDim, p.TextModel.HiddenSize/p.TextModel.NumAttentionHeads)
|
||||
kv["mistral3.rope.freq_base"] = cmp.Or(p.TextModel.RopeTheta, p.TextModel.RopeParameters.RopeTheta)
|
||||
kv["mistral3.rope.scaling.factor"] = p.TextModel.RopeParameters.Factor
|
||||
kv["mistral3.rope.scaling.type"] = p.TextModel.RopeParameters.RopeType
|
||||
kv["mistral3.rope.scaling.beta_fast"] = p.TextModel.RopeParameters.BetaFast
|
||||
kv["mistral3.rope.scaling.beta_slow"] = p.TextModel.RopeParameters.BetaSlow
|
||||
|
||||
if p.TextModel.RopeParameters.Mscale != nil {
|
||||
kv["mistral3.rope.scaling.mscale"] = *p.TextModel.RopeParameters.Mscale
|
||||
}
|
||||
if p.TextModel.RopeParameters.MscaleAllDim != nil {
|
||||
kv["mistral3.rope.scaling.mscale_all_dim"] = *p.TextModel.RopeParameters.MscaleAllDim
|
||||
}
|
||||
if p.TextModel.RopeParameters.OrigMaxPositionEmbeddings > 0 {
|
||||
kv["mistral3.rope.scaling.original_context_length"] = p.TextModel.RopeParameters.OrigMaxPositionEmbeddings
|
||||
kv["mistral3.rope.scaling_beta"] = p.TextModel.RopeParameters.ScalingBeta
|
||||
}
|
||||
if p.TextModel.RopeParameters.Llama4ScalingBeta != nil {
|
||||
kv["mistral3.rope.scaling_beta"] = *p.TextModel.RopeParameters.Llama4ScalingBeta
|
||||
}
|
||||
|
||||
// Vision configuration
|
||||
|
|
@ -88,7 +105,7 @@ func (p *mistral3Model) KV(t *Tokenizer) ggml.KV {
|
|||
kv["mistral3.vision.patch_size"] = p.VisionModel.PatchSize
|
||||
kv["mistral3.vision.num_channels"] = p.VisionModel.NumChannels
|
||||
// kv["mistral3.vision.attention.layer_norm_epsilon"] = 1e-05 // Default value
|
||||
kv["mistral3.vision.rope.freq_base"] = p.VisionModel.RopeTheta
|
||||
kv["mistral3.vision.rope.freq_base"] = cmp.Or(p.VisionModel.RopeTheta, p.VisionModel.RopeParameters.RopeTheta)
|
||||
|
||||
// Multimodal configuration
|
||||
kv["mistral3.image_token_index"] = p.ImageTokenIndex
|
||||
|
|
|
|||
|
|
@ -0,0 +1,181 @@
|
|||
package convert
|
||||
|
||||
import (
|
||||
"cmp"
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"github.com/pdevine/tensor"
|
||||
"github.com/pdevine/tensor/native"
|
||||
|
||||
"github.com/ollama/ollama/fs/ggml"
|
||||
)
|
||||
|
||||
type mistral3CausalModel struct {
|
||||
ModelParameters
|
||||
|
||||
NumHiddenLayers uint32 `json:"num_hidden_layers"`
|
||||
MaxPositionEmbeddings uint32 `json:"max_position_embeddings"`
|
||||
HiddenSize uint32 `json:"hidden_size"`
|
||||
IntermediateSize uint32 `json:"intermediate_size"`
|
||||
NumAttentionHeads uint32 `json:"num_attention_heads"`
|
||||
NumKeyValueHeads uint32 `json:"num_key_value_heads"`
|
||||
RopeTheta float32 `json:"rope_theta"`
|
||||
RMSNormEPS float32 `json:"rms_norm_eps"`
|
||||
HeadDim uint32 `json:"head_dim"`
|
||||
SlidingWindow *uint32 `json:"sliding_window"`
|
||||
HiddenAct string `json:"hidden_act"`
|
||||
VocabSize uint32 `json:"vocab_size"`
|
||||
RopeParameters struct {
|
||||
BetaFast float32 `json:"beta_fast"`
|
||||
BetaSlow float32 `json:"beta_slow"`
|
||||
Factor float32 `json:"factor"`
|
||||
Llama4ScalingBeta *float32 `json:"llama_4_scaling_beta"`
|
||||
OrigMaxPositionEmbeddings uint32 `json:"original_max_position_embeddings"`
|
||||
RopeType string `json:"rope_type"`
|
||||
RopeTheta float32 `json:"rope_theta"`
|
||||
Mscale *float32 `json:"mscale"`
|
||||
MscaleAllDim *float32 `json:"mscale_all_dim"`
|
||||
} `json:"rope_parameters"`
|
||||
}
|
||||
|
||||
func (p *mistral3CausalModel) KV(t *Tokenizer) ggml.KV {
|
||||
kv := p.ModelParameters.KV(t)
|
||||
kv["general.architecture"] = "mistral3"
|
||||
kv["mistral3.vocab_size"] = p.VocabSize
|
||||
|
||||
// Text configuration
|
||||
kv["mistral3.block_count"] = p.NumHiddenLayers
|
||||
kv["mistral3.context_length"] = p.MaxPositionEmbeddings
|
||||
kv["mistral3.embedding_length"] = p.HiddenSize
|
||||
kv["mistral3.feed_forward_length"] = p.IntermediateSize
|
||||
kv["mistral3.attention.head_count"] = p.NumAttentionHeads
|
||||
kv["mistral3.attention.head_count_kv"] = p.NumKeyValueHeads
|
||||
kv["mistral3.attention.layer_norm_rms_epsilon"] = p.RMSNormEPS
|
||||
kv["mistral3.attention.key_length"] = p.HeadDim
|
||||
kv["mistral3.attention.value_length"] = p.HeadDim
|
||||
kv["mistral3.rope.dimension_count"] = cmp.Or(p.HeadDim, p.HiddenSize/p.NumAttentionHeads)
|
||||
kv["mistral3.rope.freq_base"] = cmp.Or(p.RopeTheta, p.RopeParameters.RopeTheta)
|
||||
kv["mistral3.rope.scaling.factor"] = p.RopeParameters.Factor
|
||||
kv["mistral3.rope.scaling.type"] = p.RopeParameters.RopeType
|
||||
kv["mistral3.rope.scaling.beta_fast"] = p.RopeParameters.BetaFast
|
||||
kv["mistral3.rope.scaling.beta_slow"] = p.RopeParameters.BetaSlow
|
||||
|
||||
if p.RopeParameters.Mscale != nil {
|
||||
kv["mistral3.rope.scaling.mscale"] = *p.RopeParameters.Mscale
|
||||
}
|
||||
|
||||
if p.RopeParameters.MscaleAllDim != nil {
|
||||
kv["mistral3.rope.scaling.mscale_all_dim"] = *p.RopeParameters.MscaleAllDim
|
||||
}
|
||||
|
||||
if p.RopeParameters.OrigMaxPositionEmbeddings > 0 {
|
||||
kv["mistral3.rope.scaling.original_context_length"] = p.RopeParameters.OrigMaxPositionEmbeddings
|
||||
kv["mistral3.rope.scaling_beta"] = *p.RopeParameters.Llama4ScalingBeta
|
||||
}
|
||||
|
||||
if p.RopeParameters.Llama4ScalingBeta != nil {
|
||||
kv["mistral3.rope.scaling_beta"] = *p.RopeParameters.Llama4ScalingBeta
|
||||
}
|
||||
|
||||
return kv
|
||||
}
|
||||
|
||||
func (p *mistral3CausalModel) Tensors(ts []Tensor) []*ggml.Tensor {
|
||||
var out []*ggml.Tensor
|
||||
|
||||
for _, t := range ts {
|
||||
if !strings.HasPrefix(t.Name(), "v.") {
|
||||
if strings.HasSuffix(t.Name(), ".attn_q.weight") ||
|
||||
strings.HasSuffix(t.Name(), ".attn_k.weight") {
|
||||
t.SetRepacker(p.repack)
|
||||
}
|
||||
}
|
||||
|
||||
out = append(out, &ggml.Tensor{
|
||||
Name: t.Name(),
|
||||
Kind: t.Kind(),
|
||||
Shape: t.Shape(),
|
||||
WriterTo: t,
|
||||
})
|
||||
}
|
||||
|
||||
return out
|
||||
}
|
||||
|
||||
func (p *mistral3CausalModel) Replacements() []string {
|
||||
return []string{
|
||||
"model.norm", "output_norm",
|
||||
"model.", "",
|
||||
"layers", "blk",
|
||||
"transformer.layers", "blk",
|
||||
"vision_tower", "v",
|
||||
"ln_pre", "encoder_norm",
|
||||
"input_layernorm", "attn_norm",
|
||||
"post_attention_layernorm", "ffn_norm",
|
||||
"embed_tokens", "token_embd",
|
||||
"self_attn.q_proj", "attn_q",
|
||||
"self_attn.k_proj", "attn_k",
|
||||
"self_attn.v_proj", "attn_v",
|
||||
"self_attn.o_proj", "attn_output",
|
||||
"mlp.down_proj", "ffn_down",
|
||||
"mlp.gate_proj", "ffn_gate",
|
||||
"mlp.up_proj", "ffn_up",
|
||||
"attention.q_proj", "attn_q",
|
||||
"attention.k_proj", "attn_k",
|
||||
"attention.v_proj", "attn_v",
|
||||
"attention.o_proj", "attn_output",
|
||||
"attention_norm", "attn_norm",
|
||||
"feed_forward.gate_proj", "ffn_gate",
|
||||
"feed_forward.down_proj", "ffn_down",
|
||||
"feed_forward.up_proj", "ffn_up",
|
||||
"multi_modal_projector", "mm",
|
||||
"ffn_norm", "ffn_norm",
|
||||
"lm_head", "output",
|
||||
}
|
||||
}
|
||||
|
||||
func (p *mistral3CausalModel) repack(name string, data []float32, shape []uint64) ([]float32, error) {
|
||||
var dims []int
|
||||
for _, dim := range shape {
|
||||
dims = append(dims, int(dim))
|
||||
}
|
||||
|
||||
var heads uint32
|
||||
if strings.HasSuffix(name, ".attn_q.weight") {
|
||||
heads = p.NumAttentionHeads
|
||||
} else if strings.HasSuffix(name, ".attn_k.weight") {
|
||||
heads = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads)
|
||||
} else {
|
||||
return nil, fmt.Errorf("unknown tensor for repack: %s", name)
|
||||
}
|
||||
|
||||
n := tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
|
||||
if err := n.Reshape(append([]int{int(heads), 2, dims[0] / int(heads) / 2}, dims[1:]...)...); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if err := n.T(0, 2, 1, 3); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if err := n.Reshape(dims...); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if err := n.Transpose(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
ts, err := native.SelectF32(n, 1)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var f32s []float32
|
||||
for _, t := range ts {
|
||||
f32s = append(f32s, t...)
|
||||
}
|
||||
|
||||
return f32s, nil
|
||||
}
|
||||
|
|
@ -0,0 +1,213 @@
|
|||
package convert
|
||||
|
||||
import (
|
||||
"cmp"
|
||||
"encoding/json"
|
||||
"io/fs"
|
||||
"path/filepath"
|
||||
"slices"
|
||||
"strings"
|
||||
|
||||
"github.com/ollama/ollama/fs/ggml"
|
||||
)
|
||||
|
||||
type nomicbertModel struct {
|
||||
ModelParameters
|
||||
NLayers uint32 `json:"n_layers"`
|
||||
NumHiddenLayers uint32 `json:"num_hidden_layers"`
|
||||
MaxPositionEmbeddings uint32 `json:"max_position_embeddings"`
|
||||
HiddenSize uint32 `json:"hidden_size"`
|
||||
IntermediateSize uint32 `json:"intermediate_size"`
|
||||
NumAttentionHeads uint32 `json:"num_attention_heads"`
|
||||
NumKeyValueHeads uint32 `json:"num_key_value_heads"`
|
||||
LayerNormEPS float32 `json:"layer_norm_eps"`
|
||||
LayerNormEpsilon float32 `json:"layer_norm_epsilon"`
|
||||
RopeFreqBase float32 `json:"rope_theta"`
|
||||
normalizeEmbeddings bool
|
||||
PoolingType uint32
|
||||
|
||||
// MoE parameters (only present in v2 models)
|
||||
NumExperts uint32 `json:"num_local_experts"`
|
||||
NumExpertsUsed uint32 `json:"num_experts_per_tok"`
|
||||
MoEEveryNLayers uint32 `json:"moe_every_n_layers"`
|
||||
}
|
||||
|
||||
var (
|
||||
_ ModelConverter = (*nomicbertModel)(nil)
|
||||
_ moreParser = (*nomicbertModel)(nil)
|
||||
)
|
||||
|
||||
func (p *nomicbertModel) parseMore(fsys fs.FS) error {
|
||||
bts, err := fs.ReadFile(fsys, "modules.json")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
var modules []struct {
|
||||
Type string `json:"type"`
|
||||
Path string `json:"path"`
|
||||
}
|
||||
|
||||
if err := json.Unmarshal(bts, &modules); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
var pooling string
|
||||
for _, m := range modules {
|
||||
switch m.Type {
|
||||
case "sentence_transformers.models.Pooling":
|
||||
pooling = m.Path
|
||||
case "sentence_transformers.models.Normalize":
|
||||
p.normalizeEmbeddings = true
|
||||
}
|
||||
}
|
||||
|
||||
if pooling != "" {
|
||||
bts, err := fs.ReadFile(fsys, filepath.Join(pooling, "config.json"))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
var pc struct {
|
||||
PoolingModeCLSToken bool `json:"pooling_mode_cls_token"`
|
||||
PoolingModeMeanTokens bool `json:"pooling_mode_mean_tokens"`
|
||||
}
|
||||
|
||||
if err := json.Unmarshal(bts, &pc); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if pc.PoolingModeMeanTokens {
|
||||
p.PoolingType = 1
|
||||
} else if pc.PoolingModeCLSToken {
|
||||
p.PoolingType = 2
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (p *nomicbertModel) KV(t *Tokenizer) ggml.KV {
|
||||
kv := p.ModelParameters.KV(t)
|
||||
|
||||
// Determine architecture based on MoE parameters (following qwen3 pattern)
|
||||
arch := "nomic-bert"
|
||||
if p.MoEEveryNLayers > 0 {
|
||||
arch += "-moe"
|
||||
}
|
||||
|
||||
kv["general.architecture"] = arch
|
||||
kv["attention.causal"] = false
|
||||
kv["pooling_type"] = p.PoolingType
|
||||
kv["normalize_embeddings"] = p.normalizeEmbeddings
|
||||
|
||||
kv["block_count"] = cmp.Or(p.NLayers, p.NumHiddenLayers)
|
||||
|
||||
if contextLength := p.MaxPositionEmbeddings; contextLength > 0 {
|
||||
kv["context_length"] = contextLength
|
||||
}
|
||||
|
||||
if embeddingLength := p.HiddenSize; embeddingLength > 0 {
|
||||
kv["embedding_length"] = p.HiddenSize
|
||||
}
|
||||
|
||||
if feedForwardLength := p.IntermediateSize; feedForwardLength > 0 {
|
||||
kv["feed_forward_length"] = p.IntermediateSize
|
||||
}
|
||||
|
||||
if headCount := p.NumAttentionHeads; headCount > 0 {
|
||||
kv["attention.head_count"] = p.NumAttentionHeads
|
||||
}
|
||||
|
||||
if kvHeadCount := p.NumKeyValueHeads; kvHeadCount > 0 {
|
||||
kv["attention.head_count_kv"] = p.NumKeyValueHeads
|
||||
}
|
||||
|
||||
if layerNormEpsilon := cmp.Or(p.LayerNormEPS, p.LayerNormEpsilon); layerNormEpsilon > 0 {
|
||||
kv["attention.layer_norm_epsilon"] = layerNormEpsilon
|
||||
}
|
||||
|
||||
if p.RopeFreqBase > 0 {
|
||||
kv["rope.freq_base"] = p.RopeFreqBase
|
||||
}
|
||||
|
||||
// MoE specific parameters (only if MoE is enabled)
|
||||
if p.NumExperts > 0 {
|
||||
kv["expert_count"] = p.NumExperts
|
||||
}
|
||||
|
||||
if p.NumExpertsUsed > 0 {
|
||||
kv["expert_used_count"] = p.NumExpertsUsed
|
||||
}
|
||||
|
||||
if p.MoEEveryNLayers > 0 {
|
||||
kv["moe_every_n_layers"] = p.MoEEveryNLayers
|
||||
}
|
||||
|
||||
kv["tokenizer.ggml.model"] = "bert"
|
||||
kv["tokenizer.ggml.token_type_count"] = uint32(2)
|
||||
|
||||
// convert to phantom space tokens
|
||||
for i, e := range t.Tokens {
|
||||
switch {
|
||||
case strings.HasPrefix(e, "[") && strings.HasSuffix(e, "]"):
|
||||
// noop - keep special tokens as-is
|
||||
case strings.HasPrefix(e, "##"):
|
||||
t.Tokens[i] = e[2:]
|
||||
default:
|
||||
t.Tokens[i] = "\u2581" + e
|
||||
}
|
||||
}
|
||||
|
||||
kv["tokenizer.ggml.tokens"] = t.Tokens
|
||||
|
||||
return kv
|
||||
}
|
||||
|
||||
func (p *nomicbertModel) Tensors(ts []Tensor) []*ggml.Tensor {
|
||||
out := make([]*ggml.Tensor, 0, len(ts))
|
||||
for _, t := range ts {
|
||||
if slices.Contains([]string{
|
||||
"embeddings.position_ids",
|
||||
"pooler.dense.weight",
|
||||
"pooler.dense.bias",
|
||||
}, t.Name()) {
|
||||
continue
|
||||
}
|
||||
|
||||
out = append(out, &ggml.Tensor{
|
||||
Name: t.Name(),
|
||||
Kind: t.Kind(),
|
||||
Shape: t.Shape(),
|
||||
WriterTo: t,
|
||||
})
|
||||
}
|
||||
|
||||
return out
|
||||
}
|
||||
|
||||
func (nomicbertModel) Replacements() []string {
|
||||
return []string{
|
||||
"encoder.layer", "blk",
|
||||
"encoder.layers", "blk",
|
||||
"embeddings.word_embeddings", "token_embd",
|
||||
"embeddings.token_type_embeddings", "token_types",
|
||||
"embeddings.LayerNorm", "token_embd_norm",
|
||||
|
||||
"attention.self.qkv", "attn_qkv",
|
||||
|
||||
"attention.output.dense", "attn_output",
|
||||
"attention.output.LayerNorm", "attn_output_norm",
|
||||
|
||||
"mlp.up", "ffn_up",
|
||||
"mlp.down", "ffn_down",
|
||||
|
||||
"mlp.router", "ffn_gate_inp",
|
||||
"mlp.experts.up", "ffn_up_exps",
|
||||
"mlp.experts.down", "ffn_down_exps",
|
||||
|
||||
"intermediate.dense", "ffn_up",
|
||||
"output.dense", "ffn_down",
|
||||
"output.LayerNorm", "layer_output_norm",
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,117 @@
|
|||
package convert
|
||||
|
||||
import (
|
||||
"cmp"
|
||||
|
||||
"github.com/ollama/ollama/fs/ggml"
|
||||
)
|
||||
|
||||
type ropeScaling struct {
|
||||
Factor float32 `json:"factor"`
|
||||
OriginalMaxPositionEmbeds uint32 `json:"original_max_position_embeddings"`
|
||||
AttentionFactor float32 `json:"attention_factor"`
|
||||
BetaFast float32 `json:"beta_fast"`
|
||||
BetaSlow float32 `json:"beta_slow"`
|
||||
RopeType string `json:"rope_type"`
|
||||
ExtrapolationFactor float32 `json:"extrapolation_factor"`
|
||||
}
|
||||
|
||||
type olmoModel struct {
|
||||
ModelParameters
|
||||
|
||||
HiddenSize uint32 `json:"hidden_size"`
|
||||
NumHiddenLayers uint32 `json:"num_hidden_layers"`
|
||||
IntermediateSize uint32 `json:"intermediate_size"`
|
||||
NumAttentionHeads uint32 `json:"num_attention_heads"`
|
||||
NumKeyValueHeads uint32 `json:"num_key_value_heads"`
|
||||
MaxPositionEmbeddings uint32 `json:"max_position_embeddings"`
|
||||
RMSNormEPS float32 `json:"rms_norm_eps"`
|
||||
RopeTheta float32 `json:"rope_theta"`
|
||||
RopeScaling *ropeScaling `json:"rope_scaling"`
|
||||
SlidingWindow uint32 `json:"sliding_window"`
|
||||
LayerTypes []string `json:"layer_types"`
|
||||
}
|
||||
|
||||
var _ ModelConverter = (*olmoModel)(nil)
|
||||
|
||||
func (p *olmoModel) KV(t *Tokenizer) ggml.KV {
|
||||
kv := p.ModelParameters.KV(t)
|
||||
kv["general.architecture"] = "olmo3"
|
||||
kv["olmo3.block_count"] = p.NumHiddenLayers
|
||||
kv["olmo3.context_length"] = p.MaxPositionEmbeddings
|
||||
kv["olmo3.embedding_length"] = p.HiddenSize
|
||||
kv["olmo3.feed_forward_length"] = p.IntermediateSize
|
||||
kv["olmo3.attention.head_count"] = p.NumAttentionHeads
|
||||
kv["olmo3.attention.head_count_kv"] = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads)
|
||||
|
||||
if p.RopeTheta > 0 {
|
||||
kv["olmo3.rope.freq_base"] = p.RopeTheta
|
||||
}
|
||||
|
||||
if p.RopeScaling != nil {
|
||||
if p.RopeScaling.Factor > 0 {
|
||||
kv["olmo3.rope.scaling.factor"] = p.RopeScaling.Factor
|
||||
}
|
||||
if p.RopeScaling.OriginalMaxPositionEmbeds > 0 {
|
||||
kv["olmo3.rope.scaling.original_context_length"] = p.RopeScaling.OriginalMaxPositionEmbeds
|
||||
}
|
||||
if p.RopeScaling.AttentionFactor > 0 {
|
||||
kv["olmo3.rope.scaling.attn_factor"] = p.RopeScaling.AttentionFactor
|
||||
}
|
||||
if p.RopeScaling.RopeType != "" {
|
||||
kv["olmo3.rope.scaling.type"] = p.RopeScaling.RopeType
|
||||
}
|
||||
}
|
||||
|
||||
if p.RMSNormEPS > 0 {
|
||||
kv["olmo3.attention.layer_norm_rms_epsilon"] = p.RMSNormEPS
|
||||
}
|
||||
|
||||
if p.SlidingWindow > 0 {
|
||||
kv["olmo3.attention.sliding_window"] = p.SlidingWindow
|
||||
}
|
||||
|
||||
if len(p.LayerTypes) > 0 {
|
||||
slidingPattern := make([]bool, len(p.LayerTypes))
|
||||
for i, layerType := range p.LayerTypes {
|
||||
slidingPattern[i] = (layerType == "sliding_attention")
|
||||
}
|
||||
kv["olmo3.attention.sliding_window_pattern"] = slidingPattern
|
||||
}
|
||||
|
||||
return kv
|
||||
}
|
||||
|
||||
func (p *olmoModel) Tensors(ts []Tensor) []*ggml.Tensor {
|
||||
out := make([]*ggml.Tensor, 0, len(ts))
|
||||
for _, t := range ts {
|
||||
out = append(out, &ggml.Tensor{
|
||||
Name: t.Name(),
|
||||
Kind: t.Kind(),
|
||||
Shape: t.Shape(),
|
||||
WriterTo: t,
|
||||
})
|
||||
}
|
||||
|
||||
return out
|
||||
}
|
||||
|
||||
func (p *olmoModel) Replacements() []string {
|
||||
return []string{
|
||||
"lm_head", "output",
|
||||
"model.embed_tokens", "token_embd",
|
||||
"model.layers", "blk",
|
||||
"model.norm", "output_norm",
|
||||
"self_attn.q_proj", "attn_q",
|
||||
"self_attn.k_proj", "attn_k",
|
||||
"self_attn.v_proj", "attn_v",
|
||||
"self_attn.o_proj", "attn_output",
|
||||
"self_attn.q_norm", "attn_q_norm",
|
||||
"self_attn.k_norm", "attn_k_norm",
|
||||
"post_attention_layernorm", "post_attention_norm",
|
||||
"post_feedforward_layernorm", "post_ffw_norm",
|
||||
"mlp.gate_proj", "ffn_gate",
|
||||
"mlp.down_proj", "ffn_down",
|
||||
"mlp.up_proj", "ffn_up",
|
||||
}
|
||||
}
|
||||
10
docs/api.md
10
docs/api.md
|
|
@ -50,7 +50,7 @@ Generate a response for a given prompt with a provided model. This is a streamin
|
|||
Advanced parameters (optional):
|
||||
|
||||
- `format`: the format to return a response in. Format can be `json` or a JSON schema
|
||||
- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
|
||||
- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.mdx#valid-parameters-and-values) such as `temperature`
|
||||
- `system`: system message to (overrides what is defined in the `Modelfile`)
|
||||
- `template`: the prompt template to use (overrides what is defined in the `Modelfile`)
|
||||
- `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects
|
||||
|
|
@ -507,7 +507,7 @@ The `message` object has the following fields:
|
|||
Advanced parameters (optional):
|
||||
|
||||
- `format`: the format to return a response in. Format can be `json` or a JSON schema.
|
||||
- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
|
||||
- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.mdx#valid-parameters-and-values) such as `temperature`
|
||||
- `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects
|
||||
- `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)
|
||||
|
||||
|
|
@ -1189,7 +1189,7 @@ If you are creating a model from a safetensors directory or from a GGUF file, yo
|
|||
- `template`: (optional) the prompt template for the model
|
||||
- `license`: (optional) a string or list of strings containing the license or licenses for the model
|
||||
- `system`: (optional) a string containing the system prompt for the model
|
||||
- `parameters`: (optional) a dictionary of parameters for the model (see [Modelfile](./modelfile.md#valid-parameters-and-values) for a list of parameters)
|
||||
- `parameters`: (optional) a dictionary of parameters for the model (see [Modelfile](./modelfile.mdx#valid-parameters-and-values) for a list of parameters)
|
||||
- `messages`: (optional) a list of message objects used to create a conversation
|
||||
- `stream`: (optional) if `false` the response will be returned as a single response object, rather than a stream of objects
|
||||
- `quantize` (optional): quantize a non-quantized (e.g. float16) model
|
||||
|
|
@ -1698,7 +1698,7 @@ Generate embeddings from a model
|
|||
Advanced parameters:
|
||||
|
||||
- `truncate`: truncates the end of each input to fit within context length. Returns error if `false` and context length is exceeded. Defaults to `true`
|
||||
- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
|
||||
- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.mdx#valid-parameters-and-values) such as `temperature`
|
||||
- `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)
|
||||
- `dimensions`: number of dimensions for the embedding
|
||||
|
||||
|
|
@ -1817,7 +1817,7 @@ Generate embeddings from a model
|
|||
|
||||
Advanced parameters:
|
||||
|
||||
- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
|
||||
- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.mdx#valid-parameters-and-values) such as `temperature`
|
||||
- `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)
|
||||
|
||||
### Examples
|
||||
|
|
|
|||
File diff suppressed because one or more lines are too long
|
|
@ -0,0 +1,46 @@
|
|||
# extract-examples
|
||||
|
||||
Extracts code examples from MDX files to a temp directory so you can run them.
|
||||
|
||||
## Usage
|
||||
|
||||
```shell
|
||||
go run docs/tools/extract-examples/main.go <mdx-file>
|
||||
```
|
||||
|
||||
## Example
|
||||
|
||||
```shell
|
||||
go run docs/tools/extract-examples/main.go docs/api/openai-compatibility.mdx
|
||||
```
|
||||
|
||||
Output:
|
||||
|
||||
```
|
||||
Extracting code examples to: /var/folders/vq/wfm2g6k917d3ldzpjdxc8ph00000gn/T/mdx-examples-3271754368
|
||||
|
||||
- 01_basic.py
|
||||
- 01_basic.js
|
||||
- 01_basic.sh
|
||||
- 02_responses.py
|
||||
- 02_responses.js
|
||||
- 02_responses.sh
|
||||
- 03_vision.py
|
||||
- 03_vision.js
|
||||
- 03_vision.sh
|
||||
|
||||
Extracted 9 file(s) to /var/folders/vq/wfm2g6k917d3ldzpjdxc8ph00000gn/T/mdx-examples-3271754368
|
||||
|
||||
To run examples:
|
||||
|
||||
cd /var/folders/vq/wfm2g6k917d3ldzpjdxc8ph00000gn/T/mdx-examples-3271754368
|
||||
npm install # for JS examples
|
||||
|
||||
then run individual files with `node file.js`, `python file.py`, `bash file.sh`
|
||||
```
|
||||
|
||||
## How it works
|
||||
|
||||
- Parses MDX files looking for fenced code blocks with filenames (e.g., ` ```python basic.py `)
|
||||
- Groups examples by their `<CodeGroup>` and prefixes filenames with `01_`, `02_`, etc.
|
||||
- Writes all extracted files to a temp directory
|
||||
|
|
@ -0,0 +1,137 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func main() {
|
||||
if len(os.Args) < 2 {
|
||||
fmt.Fprintln(os.Stderr, "Usage: go run extract-examples.go <mdx-file>")
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
mdxFile := os.Args[1]
|
||||
|
||||
f, err := os.Open(mdxFile)
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
// Create temp directory
|
||||
tempDir, err := os.MkdirTemp("", "mdx-examples-*")
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "Error creating temp dir: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
fmt.Printf("Extracting code examples to: %s\n\n", tempDir)
|
||||
|
||||
// Patterns
|
||||
codeBlockStart := regexp.MustCompile("^```([a-zA-Z0-9_-]+)\\s+([^\\s]+)$")
|
||||
codeGroupStart := regexp.MustCompile("^<CodeGroup")
|
||||
codeGroupEnd := regexp.MustCompile("^</CodeGroup>")
|
||||
|
||||
scanner := bufio.NewScanner(f)
|
||||
inCodeBlock := false
|
||||
inCodeGroup := false
|
||||
var currentFile string
|
||||
var content strings.Builder
|
||||
count := 0
|
||||
codeGroupNum := 0
|
||||
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
|
||||
// Track CodeGroup boundaries
|
||||
if codeGroupStart.MatchString(line) {
|
||||
inCodeGroup = true
|
||||
codeGroupNum++
|
||||
continue
|
||||
}
|
||||
if codeGroupEnd.MatchString(line) {
|
||||
inCodeGroup = false
|
||||
continue
|
||||
}
|
||||
|
||||
if inCodeBlock {
|
||||
if line == "```" {
|
||||
// End of code block - write file
|
||||
if currentFile != "" {
|
||||
outPath := filepath.Join(tempDir, currentFile)
|
||||
if err := os.WriteFile(outPath, []byte(content.String()), 0o644); err != nil {
|
||||
fmt.Fprintf(os.Stderr, "Error writing %s: %v\n", currentFile, err)
|
||||
} else {
|
||||
fmt.Printf(" - %s\n", currentFile)
|
||||
count++
|
||||
}
|
||||
}
|
||||
inCodeBlock = false
|
||||
currentFile = ""
|
||||
content.Reset()
|
||||
} else {
|
||||
content.WriteString(line)
|
||||
content.WriteString("\n")
|
||||
}
|
||||
} else {
|
||||
if matches := codeBlockStart.FindStringSubmatch(line); matches != nil {
|
||||
inCodeBlock = true
|
||||
filename := matches[2]
|
||||
// Prefix with CodeGroup number if inside a CodeGroup
|
||||
if inCodeGroup {
|
||||
currentFile = fmt.Sprintf("%02d_%s", codeGroupNum, filename)
|
||||
} else {
|
||||
currentFile = filename
|
||||
}
|
||||
content.Reset()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if err := scanner.Err(); err != nil {
|
||||
fmt.Fprintf(os.Stderr, "Error reading file: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
// Write package.json for JavaScript dependencies
|
||||
packageJSON := `{
|
||||
"name": "mdx-examples",
|
||||
"type": "module",
|
||||
"dependencies": {
|
||||
"openai": "^4",
|
||||
"ollama": "^0.5"
|
||||
}
|
||||
}
|
||||
`
|
||||
if err := os.WriteFile(filepath.Join(tempDir, "package.json"), []byte(packageJSON), 0o644); err != nil {
|
||||
fmt.Fprintf(os.Stderr, "Error writing package.json: %v\n", err)
|
||||
}
|
||||
|
||||
// Write pyproject.toml for Python dependencies
|
||||
pyprojectTOML := `[project]
|
||||
name = "mdx-examples"
|
||||
version = "0.0.0"
|
||||
dependencies = [
|
||||
"openai",
|
||||
"ollama",
|
||||
]
|
||||
`
|
||||
if err := os.WriteFile(filepath.Join(tempDir, "pyproject.toml"), []byte(pyprojectTOML), 0o644); err != nil {
|
||||
fmt.Fprintf(os.Stderr, "Error writing pyproject.toml: %v\n", err)
|
||||
}
|
||||
|
||||
fmt.Printf("\n")
|
||||
fmt.Printf("Extracted %d file(s) to %s\n", count, tempDir)
|
||||
fmt.Printf("\n")
|
||||
fmt.Printf("To run examples:\n")
|
||||
fmt.Printf("\n")
|
||||
fmt.Printf(" cd %s\n npm install # for JS examples\n", tempDir)
|
||||
fmt.Printf("\n")
|
||||
fmt.Printf("then run individual files with `node file.js`, `python file.py`, `bash file.sh`\n")
|
||||
}
|
||||
|
|
@ -13,6 +13,7 @@ import (
|
|||
|
||||
"github.com/ollama/ollama/format"
|
||||
"github.com/ollama/ollama/fs/util/bufioutil"
|
||||
"github.com/ollama/ollama/ml"
|
||||
)
|
||||
|
||||
type GGML struct {
|
||||
|
|
@ -240,18 +241,20 @@ func (kv KV) Bools(key string, defaultValue ...[]bool) []bool {
|
|||
|
||||
func (kv KV) OllamaEngineRequired() bool {
|
||||
return slices.Contains([]string{
|
||||
"bert",
|
||||
"deepseek2",
|
||||
"deepseekocr",
|
||||
"gemma3",
|
||||
"gemma3n",
|
||||
"gptoss", "gpt-oss",
|
||||
"llama4",
|
||||
"mistral3",
|
||||
"mllama",
|
||||
"nomic-bert",
|
||||
"olmo3",
|
||||
"qwen25vl",
|
||||
"qwen3", "qwen3moe",
|
||||
"qwen3vl", "qwen3vlmoe",
|
||||
"deepseekocr",
|
||||
"deepseek2",
|
||||
"nomic-bert",
|
||||
}, kv.Architecture())
|
||||
}
|
||||
|
||||
|
|
@ -550,7 +553,7 @@ func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, error) {
|
|||
}, nil
|
||||
}
|
||||
|
||||
func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string, useFlashAttention bool) (kv []uint64, partialOffload, fullOffload uint64) {
|
||||
func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string, useFlashAttention ml.FlashAttentionType) (kv []uint64, partialOffload, fullOffload uint64) {
|
||||
context *= uint64(numParallel)
|
||||
|
||||
embedding := f.KV().EmbeddingLength()
|
||||
|
|
@ -791,7 +794,7 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
|
|||
}
|
||||
|
||||
partialOffload = 2 * f.KV().HeadCountMax() / cmp.Or(f.KV().HeadCountKVMin(), 1) * kvTotal / 6
|
||||
if useFlashAttention {
|
||||
if useFlashAttention == ml.FlashAttentionEnabled {
|
||||
// rough estimate of graph size with flash attention on
|
||||
partialOffload = (4*uint64(numParallel) + context>>10 + 110) * format.MebiByte
|
||||
}
|
||||
|
|
@ -809,6 +812,14 @@ func (f GGML) SupportsKVCacheType(cacheType string) bool {
|
|||
return slices.Contains([]string{"q8_0", "q4_0"}, cacheType)
|
||||
}
|
||||
|
||||
// KVCacheTypeIsQuantized checks if the requested cache type is a quantized type
|
||||
func (f GGML) KVCacheTypeIsQuantized(cacheType string) bool {
|
||||
if cacheType == "" || cacheType == "f16" || cacheType == "f32" || cacheType == "bf16" {
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// SupportsFlashAttention checks if the model supports flash attention
|
||||
func (f GGML) SupportsFlashAttention() bool {
|
||||
_, isEmbedding := f.KV()[fmt.Sprintf("%s.pooling_type", f.KV().Architecture())]
|
||||
|
|
@ -829,9 +840,11 @@ func (f GGML) SupportsFlashAttention() bool {
|
|||
// FlashAttention checks if the model should enable flash attention
|
||||
func (f GGML) FlashAttention() bool {
|
||||
return slices.Contains([]string{
|
||||
"bert",
|
||||
"gemma3",
|
||||
"gptoss", "gpt-oss",
|
||||
"mistral3",
|
||||
"olmo3",
|
||||
"qwen3", "qwen3moe",
|
||||
"qwen3vl", "qwen3vlmoe",
|
||||
}, f.KV().String("general.architecture"))
|
||||
|
|
|
|||
|
|
@ -597,6 +597,10 @@ func ggufWriteKV(ws io.WriteSeeker, arch, k string, v any) error {
|
|||
|
||||
var err error
|
||||
switch v := v.(type) {
|
||||
case int32:
|
||||
err = writeGGUF(ws, ggufTypeInt32, v)
|
||||
case int64:
|
||||
err = writeGGUF(ws, ggufTypeInt64, v)
|
||||
case uint32, FileType:
|
||||
err = writeGGUF(ws, ggufTypeUint32, v)
|
||||
case uint64:
|
||||
|
|
@ -611,6 +615,10 @@ func ggufWriteKV(ws io.WriteSeeker, arch, k string, v any) error {
|
|||
err = writeGGUFArray(ws, ggufTypeInt32, v)
|
||||
case *array[int32]:
|
||||
err = writeGGUFArray(ws, ggufTypeInt32, v.values)
|
||||
case []int64:
|
||||
err = writeGGUFArray(ws, ggufTypeInt64, v)
|
||||
case *array[int64]:
|
||||
err = writeGGUFArray(ws, ggufTypeInt64, v.values)
|
||||
case []uint32:
|
||||
err = writeGGUFArray(ws, ggufTypeUint32, v)
|
||||
case *array[uint32]:
|
||||
|
|
|
|||
|
|
@ -42,6 +42,10 @@ func TestWriteGGUF(t *testing.T) {
|
|||
"general.architecture": "test",
|
||||
"general.alignment": uint32(16),
|
||||
"test.key": "value",
|
||||
"test.int32_key": int32(-42),
|
||||
"test.int64_key": int64(-9223372036854775808),
|
||||
"test.int32_array": []int32{-1, 0, 1, 2147483647, -2147483648},
|
||||
"test.int64_array": []int64{-1, 0, 1, 9223372036854775807, -9223372036854775808},
|
||||
"attention.key": "value2",
|
||||
"tokenizer.key": "value3",
|
||||
"adapter.key": "value4",
|
||||
|
|
@ -55,7 +59,7 @@ func TestWriteGGUF(t *testing.T) {
|
|||
}
|
||||
defer r.Close()
|
||||
|
||||
ff, err := Decode(r, 0)
|
||||
ff, err := Decode(r, -1)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
|
@ -65,15 +69,19 @@ func TestWriteGGUF(t *testing.T) {
|
|||
"general.alignment": uint32(16),
|
||||
"general.parameter_count": uint64(54),
|
||||
"test.key": "value",
|
||||
"test.int32_key": int32(-42),
|
||||
"test.int64_key": int64(-9223372036854775808),
|
||||
"test.int32_array": &array[int32]{size: 5, values: []int32{-1, 0, 1, 2147483647, -2147483648}},
|
||||
"test.int64_array": &array[int64]{size: 5, values: []int64{-1, 0, 1, 9223372036854775807, -9223372036854775808}},
|
||||
"test.attention.key": "value2",
|
||||
"tokenizer.key": "value3",
|
||||
"adapter.key": "value4",
|
||||
}, ff.KV()); diff != "" {
|
||||
}, ff.KV(), cmp.AllowUnexported(array[int32]{}, array[int64]{})); diff != "" {
|
||||
t.Errorf("Mismatch (-want +got):\n%s", diff)
|
||||
}
|
||||
|
||||
if diff := cmp.Diff(Tensors{
|
||||
Offset: 800,
|
||||
Offset: 992,
|
||||
items: []*Tensor{
|
||||
{Name: "blk.0.attn_k.weight", Offset: 0, Shape: []uint64{2, 3}},
|
||||
{Name: "blk.0.attn_norm.weight", Offset: 32, Shape: []uint64{2, 3}},
|
||||
|
|
|
|||
|
|
@ -4,7 +4,9 @@ package integration
|
|||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"math"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
|
|
@ -204,8 +206,8 @@ func TestAllMiniLMEmbed(t *testing.T) {
|
|||
t.Fatalf("expected %v, got %v (similarity: %f)", expected[0:5], res.Embeddings[0][0:5], sim)
|
||||
}
|
||||
|
||||
if res.PromptEvalCount != 6 {
|
||||
t.Fatalf("expected 6 prompt tokens, got %d", res.PromptEvalCount)
|
||||
if res.PromptEvalCount != 8 {
|
||||
t.Fatalf("expected 8 prompt tokens, got %d", res.PromptEvalCount)
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -251,8 +253,8 @@ func TestAllMiniLMBatchEmbed(t *testing.T) {
|
|||
t.Fatalf("expected %v, got %v (similarity: %f)", expected[1][0:5], res.Embeddings[1][0:5], sim)
|
||||
}
|
||||
|
||||
if res.PromptEvalCount != 12 {
|
||||
t.Fatalf("expected 12 prompt tokens, got %d", res.PromptEvalCount)
|
||||
if res.PromptEvalCount != 16 {
|
||||
t.Fatalf("expected 16 prompt tokens, got %d", res.PromptEvalCount)
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -275,7 +277,7 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
|
|||
cases := []struct {
|
||||
name string
|
||||
request api.EmbedRequest
|
||||
check func(*api.EmbedResponse, error)
|
||||
check func(*testing.T, *api.EmbedResponse, error)
|
||||
}{
|
||||
{
|
||||
name: "target truncation",
|
||||
|
|
@ -283,7 +285,7 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
|
|||
Model: "all-minilm",
|
||||
Input: "why",
|
||||
},
|
||||
check: func(got *api.EmbedResponse, err error) {
|
||||
check: func(t *testing.T, got *api.EmbedResponse, err error) {
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
|
@ -300,10 +302,11 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
|
|||
Input: "why is the sky blue?",
|
||||
Options: map[string]any{"num_ctx": 3},
|
||||
},
|
||||
check: func(got *api.EmbedResponse, err error) {
|
||||
check: func(t *testing.T, got *api.EmbedResponse, err error) {
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
t.Logf("PromptEvalCount: want=%d got=%d", want.PromptEvalCount, got.PromptEvalCount)
|
||||
if diff := cmp.Diff(want.Embeddings[0], got.Embeddings[0]); diff != "" {
|
||||
t.Errorf("embedding mismatch (-want +got):\n%s", diff)
|
||||
}
|
||||
|
|
@ -317,10 +320,11 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
|
|||
Truncate: &truncTrue,
|
||||
Options: map[string]any{"num_ctx": 3},
|
||||
},
|
||||
check: func(got *api.EmbedResponse, err error) {
|
||||
check: func(t *testing.T, got *api.EmbedResponse, err error) {
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
t.Logf("PromptEvalCount: want=%d got=%d", want.PromptEvalCount, got.PromptEvalCount)
|
||||
if diff := cmp.Diff(want.Embeddings[0], got.Embeddings[0]); diff != "" {
|
||||
t.Errorf("embedding mismatch (-want +got):\n%s", diff)
|
||||
}
|
||||
|
|
@ -334,21 +338,21 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
|
|||
Truncate: &truncFalse,
|
||||
Options: map[string]any{"num_ctx": 3},
|
||||
},
|
||||
check: func(res *api.EmbedResponse, err error) {
|
||||
if err.Error() != "input exceeds maximum context length" {
|
||||
check: func(t *testing.T, res *api.EmbedResponse, err error) {
|
||||
if err.Error() != "the input length exceeds the context length" {
|
||||
t.Fatalf("expected truncation error, got: %v", err)
|
||||
}
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "input after truncate error",
|
||||
name: "input after truncate error with context length of 1",
|
||||
request: api.EmbedRequest{
|
||||
Model: "all-minilm",
|
||||
Input: "why is the sky blue?",
|
||||
Truncate: &truncTrue,
|
||||
Options: map[string]any{"num_ctx": 1},
|
||||
},
|
||||
check: func(res *api.EmbedResponse, err error) {
|
||||
check: func(t *testing.T, res *api.EmbedResponse, err error) {
|
||||
if err.Error() != "input after truncation exceeds maximum context length" {
|
||||
t.Fatalf("expected truncation error, got: %v", err)
|
||||
}
|
||||
|
|
@ -362,7 +366,7 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
|
|||
Truncate: &truncTrue,
|
||||
Options: map[string]any{"num_ctx": 0},
|
||||
},
|
||||
check: func(res *api.EmbedResponse, err error) {
|
||||
check: func(t *testing.T, res *api.EmbedResponse, err error) {
|
||||
if err.Error() != "input after truncation exceeds maximum context length" {
|
||||
t.Fatalf("expected truncation error, got: %v", err)
|
||||
}
|
||||
|
|
@ -375,7 +379,7 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
|
|||
Input: "why is the sky blue? Why is the sky blue? hi there my",
|
||||
Options: map[string]any{"num_ctx": 16},
|
||||
},
|
||||
check: func(res *api.EmbedResponse, err error) {
|
||||
check: func(t *testing.T, res *api.EmbedResponse, err error) {
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
|
@ -385,7 +389,8 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
|
|||
|
||||
for _, req := range cases {
|
||||
t.Run(req.name, func(t *testing.T) {
|
||||
req.check(embedTestHelper(ctx, client, t, req.request))
|
||||
resp, err := embedTestHelper(ctx, client, t, req.request)
|
||||
req.check(t, resp, err)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
|
@ -409,3 +414,230 @@ func embedTestHelper(ctx context.Context, client *api.Client, t *testing.T, req
|
|||
|
||||
return client.Embed(ctx, &req)
|
||||
}
|
||||
|
||||
func TestEmbedTruncation(t *testing.T) {
|
||||
// Use test deadline if set, otherwise default to 2 minutes
|
||||
timeout := 2 * time.Minute
|
||||
if deadline, ok := t.Deadline(); ok {
|
||||
timeout = time.Until(deadline) - 10*time.Second // Reserve 10s buffer
|
||||
}
|
||||
ctx, cancel := context.WithTimeout(context.Background(), timeout)
|
||||
defer cancel()
|
||||
client, _, cleanup := InitServerConnection(ctx, t)
|
||||
defer cleanup()
|
||||
|
||||
for _, model := range libraryEmbedModels {
|
||||
model := model
|
||||
t.Run(model, func(t *testing.T) {
|
||||
// Check if we're running out of time (reserve 20s for current model)
|
||||
if deadline, ok := t.Deadline(); ok && time.Until(deadline) < 20*time.Second {
|
||||
t.Skip("skipping remaining tests to avoid timeout")
|
||||
}
|
||||
|
||||
// Give each model its own budget to account for first-time pulls/loads
|
||||
mctx, mcancel := context.WithTimeout(ctx, 3*time.Minute)
|
||||
defer mcancel()
|
||||
|
||||
t.Run("truncation batch", func(t *testing.T) {
|
||||
truncTrue := true
|
||||
req := api.EmbedRequest{
|
||||
Model: model,
|
||||
Input: []string{"short", strings.Repeat("long ", 100), "medium text"},
|
||||
Truncate: &truncTrue,
|
||||
Options: map[string]any{"num_ctx": 30},
|
||||
}
|
||||
|
||||
res, err := embedTestHelper(mctx, client, t, req)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if len(res.Embeddings) != 3 {
|
||||
t.Fatalf("expected 3 embeddings, got %d", len(res.Embeddings))
|
||||
}
|
||||
|
||||
if res.PromptEvalCount > 90 {
|
||||
t.Fatalf("expected tokens <= 90 (3 × 30 max), got %d", res.PromptEvalCount)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("runner token count accuracy", func(t *testing.T) {
|
||||
baseline := api.EmbedRequest{Model: model, Input: "test"}
|
||||
baseRes, err := embedTestHelper(mctx, client, t, baseline)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
batch := api.EmbedRequest{
|
||||
Model: model,
|
||||
Input: []string{"test", "test", "test"},
|
||||
}
|
||||
batchRes, err := embedTestHelper(mctx, client, t, batch)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
expectedCount := baseRes.PromptEvalCount * 3
|
||||
if batchRes.PromptEvalCount < expectedCount-2 || batchRes.PromptEvalCount > expectedCount+2 {
|
||||
t.Fatalf("expected ~%d tokens (3 × %d), got %d",
|
||||
expectedCount, baseRes.PromptEvalCount, batchRes.PromptEvalCount)
|
||||
}
|
||||
})
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestEmbedLargeInput tests that embedding models can handle large inputs that would exceed typical batch sizes.
|
||||
func TestEmbedLargeInput(t *testing.T) {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute)
|
||||
defer cancel()
|
||||
client, _, cleanup := InitServerConnection(ctx, t)
|
||||
defer cleanup()
|
||||
|
||||
for _, model := range libraryEmbedModels {
|
||||
model := model
|
||||
t.Run(model, func(t *testing.T) {
|
||||
mctx, mcancel := context.WithTimeout(ctx, 2*time.Minute)
|
||||
defer mcancel()
|
||||
|
||||
// Test with progressively larger inputs
|
||||
testCases := []struct {
|
||||
name string
|
||||
inputWords int
|
||||
}{
|
||||
{"medium_input_256_words", 256},
|
||||
{"large_input_512_words", 512},
|
||||
{"very_large_input_800_words", 800},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
words := make([]string, tc.inputWords)
|
||||
for i := range words {
|
||||
words[i] = "word"
|
||||
}
|
||||
input := strings.Join(words, " ")
|
||||
|
||||
req := api.EmbedRequest{
|
||||
Model: model,
|
||||
Input: input,
|
||||
KeepAlive: &api.Duration{Duration: 30 * time.Second},
|
||||
}
|
||||
|
||||
res, err := embedTestHelper(mctx, client, t, req)
|
||||
if err != nil {
|
||||
t.Fatalf("embedding failed for %d words: %v", tc.inputWords, err)
|
||||
}
|
||||
|
||||
if len(res.Embeddings) != 1 {
|
||||
t.Fatalf("expected 1 embedding, got %d", len(res.Embeddings))
|
||||
}
|
||||
|
||||
if len(res.Embeddings[0]) == 0 {
|
||||
t.Fatal("expected non-empty embedding")
|
||||
}
|
||||
|
||||
t.Logf("Successfully embedded %d words (%d tokens)", tc.inputWords, res.PromptEvalCount)
|
||||
})
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestEmbedStatusCode tests that errors from the embedding endpoint
|
||||
// properly preserve their HTTP status codes when returned to the client.
|
||||
// This test specifically checks the error handling path in EmbedHandler
|
||||
// where api.StatusError errors should maintain their original status code.
|
||||
func TestEmbedStatusCode(t *testing.T) {
|
||||
// Use test deadline if set, otherwise default to 2 minutes
|
||||
timeout := 2 * time.Minute
|
||||
if deadline, ok := t.Deadline(); ok {
|
||||
timeout = time.Until(deadline) - 10*time.Second // Reserve 10s buffer
|
||||
}
|
||||
ctx, cancel := context.WithTimeout(context.Background(), timeout)
|
||||
defer cancel()
|
||||
client, _, cleanup := InitServerConnection(ctx, t)
|
||||
defer cleanup()
|
||||
|
||||
for _, model := range libraryEmbedModels {
|
||||
model := model
|
||||
t.Run(model, func(t *testing.T) {
|
||||
// Check if we're running out of time (reserve 20s for current model)
|
||||
if deadline, ok := t.Deadline(); ok && time.Until(deadline) < 20*time.Second {
|
||||
t.Skip("skipping remaining tests to avoid timeout")
|
||||
}
|
||||
|
||||
mctx, mcancel := context.WithTimeout(ctx, 3*time.Minute)
|
||||
defer mcancel()
|
||||
|
||||
// Pull the model if needed
|
||||
if err := PullIfMissing(mctx, client, model); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
t.Run("truncation error status code", func(t *testing.T) {
|
||||
truncFalse := false
|
||||
longInput := strings.Repeat("word ", 100)
|
||||
|
||||
req := api.EmbedRequest{
|
||||
Model: model,
|
||||
Input: longInput,
|
||||
Truncate: &truncFalse,
|
||||
Options: map[string]any{"num_ctx": 10},
|
||||
}
|
||||
|
||||
_, err := embedTestHelper(mctx, client, t, req)
|
||||
if err == nil {
|
||||
t.Fatal("expected error when truncate=false with long input")
|
||||
}
|
||||
|
||||
// Check that it's a StatusError with the correct status code
|
||||
var statusErr api.StatusError
|
||||
if !errors.As(err, &statusErr) {
|
||||
t.Fatalf("expected api.StatusError, got %T: %v", err, err)
|
||||
}
|
||||
|
||||
// The error should be a 4xx client error (likely 400 Bad Request)
|
||||
// not a 500 Internal Server Error
|
||||
if statusErr.StatusCode < 400 || statusErr.StatusCode >= 500 {
|
||||
t.Errorf("expected 4xx status code, got %d", statusErr.StatusCode)
|
||||
}
|
||||
|
||||
// Verify the error message is meaningful
|
||||
if !strings.Contains(err.Error(), "context length") {
|
||||
t.Errorf("expected error message to mention context length, got: %v", err)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("batch truncation error status code", func(t *testing.T) {
|
||||
truncFalse := false
|
||||
req := api.EmbedRequest{
|
||||
Model: model,
|
||||
Input: []string{
|
||||
"short input",
|
||||
strings.Repeat("very long input ", 100),
|
||||
"another short input",
|
||||
},
|
||||
Truncate: &truncFalse,
|
||||
Options: map[string]any{"num_ctx": 10},
|
||||
}
|
||||
|
||||
_, err := embedTestHelper(mctx, client, t, req)
|
||||
if err == nil {
|
||||
t.Fatal("expected error when one input exceeds context with truncate=false")
|
||||
}
|
||||
|
||||
// Check that it's a StatusError with the correct status code
|
||||
var statusErr api.StatusError
|
||||
if !errors.As(err, &statusErr) {
|
||||
t.Fatalf("expected api.StatusError, got %T: %v", err, err)
|
||||
}
|
||||
|
||||
// The error should be a 4xx client error, not a 500 Internal Server Error
|
||||
if statusErr.StatusCode < 400 || statusErr.StatusCode >= 500 {
|
||||
t.Errorf("expected 4xx status code, got %d", statusErr.StatusCode)
|
||||
}
|
||||
})
|
||||
})
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
int LLAMA_BUILD_NUMBER = 0;
|
||||
char const *LLAMA_COMMIT = "7f8ef50cce40e3e7e4526a3696cb45658190e69a";
|
||||
char const *LLAMA_COMMIT = "17f7f4baad8b3a716ee139da7bb56ae984e8c0fa";
|
||||
char const *LLAMA_COMPILER = "";
|
||||
char const *LLAMA_BUILD_TARGET = "";
|
||||
|
|
|
|||
|
|
@ -694,7 +694,7 @@ bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_over
|
|||
|
||||
// Validate if a filename is safe to use
|
||||
// To validate a full path, split the path by the OS-specific path separator, and validate each part with this function
|
||||
bool fs_validate_filename(const std::string & filename) {
|
||||
bool fs_validate_filename(const std::string & filename, bool allow_subdirs) {
|
||||
if (!filename.length()) {
|
||||
// Empty filename invalid
|
||||
return false;
|
||||
|
|
@ -754,10 +754,14 @@ bool fs_validate_filename(const std::string & filename) {
|
|||
|| (c >= 0xD800 && c <= 0xDFFF) // UTF-16 surrogate pairs
|
||||
|| c == 0xFFFD // Replacement Character (UTF-8)
|
||||
|| c == 0xFEFF // Byte Order Mark (BOM)
|
||||
|| c == '/' || c == '\\' || c == ':' || c == '*' // Illegal characters
|
||||
|| c == ':' || c == '*' // Illegal characters
|
||||
|| c == '?' || c == '"' || c == '<' || c == '>' || c == '|') {
|
||||
return false;
|
||||
}
|
||||
if (!allow_subdirs && (c == '/' || c == '\\')) {
|
||||
// Subdirectories not allowed, reject path separators
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Reject any leading or trailing ' ', or any trailing '.', these are stripped on Windows and will cause a different filename
|
||||
|
|
@ -782,11 +786,29 @@ bool fs_validate_filename(const std::string & filename) {
|
|||
#include <iostream>
|
||||
|
||||
|
||||
#ifdef _WIN32
|
||||
static std::wstring utf8_to_wstring(const std::string & str) {
|
||||
if (str.empty()) {
|
||||
return std::wstring();
|
||||
}
|
||||
|
||||
int size = MultiByteToWideChar(CP_UTF8, 0, str.c_str(), (int)str.size(), NULL, 0);
|
||||
|
||||
if (size <= 0) {
|
||||
return std::wstring();
|
||||
}
|
||||
|
||||
std::wstring wstr(size, 0);
|
||||
MultiByteToWideChar(CP_UTF8, 0, str.c_str(), (int)str.size(), &wstr[0], size);
|
||||
|
||||
return wstr;
|
||||
}
|
||||
#endif
|
||||
|
||||
// returns true if successful, false otherwise
|
||||
bool fs_create_directory_with_parents(const std::string & path) {
|
||||
#ifdef _WIN32
|
||||
std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
|
||||
std::wstring wpath = converter.from_bytes(path);
|
||||
std::wstring wpath = utf8_to_wstring(path);
|
||||
|
||||
// if the path already exists, check whether it's a directory
|
||||
const DWORD attributes = GetFileAttributesW(wpath.c_str());
|
||||
|
|
@ -859,6 +881,11 @@ bool fs_create_directory_with_parents(const std::string & path) {
|
|||
#endif // _WIN32
|
||||
}
|
||||
|
||||
bool fs_is_directory(const std::string & path) {
|
||||
std::filesystem::path dir(path);
|
||||
return std::filesystem::exists(dir) && std::filesystem::is_directory(dir);
|
||||
}
|
||||
|
||||
std::string fs_get_cache_directory() {
|
||||
std::string cache_directory = "";
|
||||
auto ensure_trailing_slash = [](std::string p) {
|
||||
|
|
@ -893,6 +920,8 @@ std::string fs_get_cache_directory() {
|
|||
cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
|
||||
#elif defined(_WIN32)
|
||||
cache_directory = std::getenv("LOCALAPPDATA");
|
||||
#elif defined(__EMSCRIPTEN__)
|
||||
GGML_ABORT("not implemented on this platform");
|
||||
#else
|
||||
# error Unknown architecture
|
||||
#endif
|
||||
|
|
@ -912,7 +941,7 @@ std::string fs_get_cache_file(const std::string & filename) {
|
|||
return cache_directory + filename;
|
||||
}
|
||||
|
||||
std::vector<common_file_info> fs_list_files(const std::string & path) {
|
||||
std::vector<common_file_info> fs_list(const std::string & path, bool include_directories) {
|
||||
std::vector<common_file_info> files;
|
||||
if (path.empty()) return files;
|
||||
|
||||
|
|
@ -927,14 +956,22 @@ std::vector<common_file_info> fs_list_files(const std::string & path) {
|
|||
const auto & p = entry.path();
|
||||
if (std::filesystem::is_regular_file(p)) {
|
||||
common_file_info info;
|
||||
info.path = p.string();
|
||||
info.name = p.filename().string();
|
||||
info.path = p.string();
|
||||
info.name = p.filename().string();
|
||||
info.is_dir = false;
|
||||
try {
|
||||
info.size = static_cast<size_t>(std::filesystem::file_size(p));
|
||||
} catch (const std::filesystem::filesystem_error &) {
|
||||
info.size = 0;
|
||||
}
|
||||
files.push_back(std::move(info));
|
||||
} else if (include_directories && std::filesystem::is_directory(p)) {
|
||||
common_file_info info;
|
||||
info.path = p.string();
|
||||
info.name = p.filename().string();
|
||||
info.size = 0; // Directories have no size
|
||||
info.is_dir = true;
|
||||
files.push_back(std::move(info));
|
||||
}
|
||||
} catch (const std::filesystem::filesystem_error &) {
|
||||
// skip entries we cannot inspect
|
||||
|
|
@ -945,6 +982,32 @@ std::vector<common_file_info> fs_list_files(const std::string & path) {
|
|||
return files;
|
||||
}
|
||||
|
||||
//
|
||||
// TTY utils
|
||||
//
|
||||
|
||||
bool tty_can_use_colors() {
|
||||
// Check NO_COLOR environment variable (https://no-color.org/)
|
||||
if (const char * no_color = std::getenv("NO_COLOR")) {
|
||||
if (no_color[0] != '\0') {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Check TERM environment variable
|
||||
if (const char * term = std::getenv("TERM")) {
|
||||
if (std::strcmp(term, "dumb") == 0) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Check if stdout and stderr are connected to a terminal
|
||||
// We check both because log messages can go to either
|
||||
bool stdout_is_tty = isatty(fileno(stdout));
|
||||
bool stderr_is_tty = isatty(fileno(stderr));
|
||||
|
||||
return stdout_is_tty || stderr_is_tty;
|
||||
}
|
||||
|
||||
//
|
||||
// Model utils
|
||||
|
|
|
|||
|
|
@ -12,6 +12,10 @@
|
|||
#include <vector>
|
||||
#include <map>
|
||||
|
||||
#if defined(_WIN32) && !defined(_WIN32_WINNT)
|
||||
#define _WIN32_WINNT 0x0A00
|
||||
#endif
|
||||
|
||||
#ifdef _WIN32
|
||||
#define DIRECTORY_SEPARATOR '\\'
|
||||
#else
|
||||
|
|
@ -26,8 +30,6 @@
|
|||
fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
|
||||
} while(0)
|
||||
|
||||
#define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
|
||||
|
||||
struct common_time_meas {
|
||||
common_time_meas(int64_t & t_acc, bool disable = false);
|
||||
~common_time_meas();
|
||||
|
|
@ -223,6 +225,7 @@ struct common_params_model {
|
|||
std::string hf_repo = ""; // HF repo // NOLINT
|
||||
std::string hf_file = ""; // HF file // NOLINT
|
||||
std::string docker_repo = ""; // Docker repo // NOLINT
|
||||
std::string name = ""; // in format <user>/<model>[:<tag>] (tag is optional) // NOLINT
|
||||
};
|
||||
|
||||
struct common_params_speculative {
|
||||
|
|
@ -369,7 +372,7 @@ struct common_params {
|
|||
|
||||
std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale
|
||||
|
||||
int32_t verbosity = 0;
|
||||
int32_t verbosity = 3; // LOG_LEVEL_INFO
|
||||
int32_t control_vector_layer_start = -1; // layer range for control vector
|
||||
int32_t control_vector_layer_end = -1; // layer range for control vector
|
||||
bool offline = false;
|
||||
|
|
@ -478,9 +481,15 @@ struct common_params {
|
|||
bool endpoint_props = false; // only control POST requests, not GET
|
||||
bool endpoint_metrics = false;
|
||||
|
||||
// router server configs
|
||||
std::string models_dir = ""; // directory containing models for the router server
|
||||
int models_max = 4; // maximum number of models to load simultaneously
|
||||
bool models_autoload = true; // automatically load models when requested via the router server
|
||||
|
||||
bool log_json = false;
|
||||
|
||||
std::string slot_save_path;
|
||||
std::string media_path; // path to directory for loading media files
|
||||
|
||||
float slot_prompt_similarity = 0.1f;
|
||||
|
||||
|
|
@ -631,8 +640,9 @@ std::string string_from(const struct llama_context * ctx, const struct llama_bat
|
|||
// Filesystem utils
|
||||
//
|
||||
|
||||
bool fs_validate_filename(const std::string & filename);
|
||||
bool fs_validate_filename(const std::string & filename, bool allow_subdirs = false);
|
||||
bool fs_create_directory_with_parents(const std::string & path);
|
||||
bool fs_is_directory(const std::string & path);
|
||||
|
||||
std::string fs_get_cache_directory();
|
||||
std::string fs_get_cache_file(const std::string & filename);
|
||||
|
|
@ -641,8 +651,16 @@ struct common_file_info {
|
|||
std::string path;
|
||||
std::string name;
|
||||
size_t size = 0; // in bytes
|
||||
bool is_dir = false;
|
||||
};
|
||||
std::vector<common_file_info> fs_list_files(const std::string & path);
|
||||
std::vector<common_file_info> fs_list(const std::string & path, bool include_directories);
|
||||
|
||||
//
|
||||
// TTY utils
|
||||
//
|
||||
|
||||
// Auto-detect if colors can be enabled based on terminal and environment
|
||||
bool tty_can_use_colors();
|
||||
|
||||
//
|
||||
// Model utils
|
||||
|
|
|
|||
|
|
@ -974,7 +974,7 @@ public:
|
|||
|
||||
void check_errors() {
|
||||
if (!_errors.empty()) {
|
||||
throw std::runtime_error("JSON schema conversion failed:\n" + string_join(_errors, "\n"));
|
||||
throw std::invalid_argument("JSON schema conversion failed:\n" + string_join(_errors, "\n"));
|
||||
}
|
||||
if (!_warnings.empty()) {
|
||||
fprintf(stderr, "WARNING: JSON schema conversion was incomplete: %s\n", string_join(_warnings, "; ").c_str());
|
||||
|
|
|
|||
|
|
@ -1,3 +1,4 @@
|
|||
#include "common.h"
|
||||
#include "log.h"
|
||||
|
||||
#include <chrono>
|
||||
|
|
@ -26,30 +27,6 @@ void common_log_set_verbosity_thold(int verbosity) {
|
|||
common_log_verbosity_thold = verbosity;
|
||||
}
|
||||
|
||||
// Auto-detect if colors should be enabled based on terminal and environment
|
||||
static bool common_log_should_use_colors_auto() {
|
||||
// Check NO_COLOR environment variable (https://no-color.org/)
|
||||
if (const char * no_color = std::getenv("NO_COLOR")) {
|
||||
if (no_color[0] != '\0') {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Check TERM environment variable
|
||||
if (const char * term = std::getenv("TERM")) {
|
||||
if (std::strcmp(term, "dumb") == 0) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Check if stdout and stderr are connected to a terminal
|
||||
// We check both because log messages can go to either
|
||||
bool stdout_is_tty = isatty(fileno(stdout));
|
||||
bool stderr_is_tty = isatty(fileno(stderr));
|
||||
|
||||
return stdout_is_tty || stderr_is_tty;
|
||||
}
|
||||
|
||||
static int64_t t_us() {
|
||||
return std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
|
||||
}
|
||||
|
|
@ -391,7 +368,7 @@ struct common_log * common_log_main() {
|
|||
static std::once_flag init_flag;
|
||||
std::call_once(init_flag, [&]() {
|
||||
// Set default to auto-detect colors
|
||||
log.set_colors(common_log_should_use_colors_auto());
|
||||
log.set_colors(tty_can_use_colors());
|
||||
});
|
||||
|
||||
return &log;
|
||||
|
|
@ -422,7 +399,7 @@ void common_log_set_file(struct common_log * log, const char * file) {
|
|||
|
||||
void common_log_set_colors(struct common_log * log, log_colors colors) {
|
||||
if (colors == LOG_COLORS_AUTO) {
|
||||
log->set_colors(common_log_should_use_colors_auto());
|
||||
log->set_colors(tty_can_use_colors());
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
@ -443,8 +420,22 @@ void common_log_set_timestamps(struct common_log * log, bool timestamps) {
|
|||
log->set_timestamps(timestamps);
|
||||
}
|
||||
|
||||
static int common_get_verbosity(enum ggml_log_level level) {
|
||||
switch (level) {
|
||||
case GGML_LOG_LEVEL_DEBUG: return LOG_LEVEL_DEBUG;
|
||||
case GGML_LOG_LEVEL_INFO: return LOG_LEVEL_INFO;
|
||||
case GGML_LOG_LEVEL_WARN: return LOG_LEVEL_WARN;
|
||||
case GGML_LOG_LEVEL_ERROR: return LOG_LEVEL_ERROR;
|
||||
case GGML_LOG_LEVEL_CONT: return LOG_LEVEL_INFO; // same as INFO
|
||||
case GGML_LOG_LEVEL_NONE:
|
||||
default:
|
||||
return LOG_LEVEL_OUTPUT;
|
||||
}
|
||||
}
|
||||
|
||||
void common_log_default_callback(enum ggml_log_level level, const char * text, void * /*user_data*/) {
|
||||
if (LOG_DEFAULT_LLAMA <= common_log_verbosity_thold) {
|
||||
auto verbosity = common_get_verbosity(level);
|
||||
if (verbosity <= common_log_verbosity_thold) {
|
||||
common_log_add(common_log_main(), level, "%s", text);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -21,8 +21,14 @@
|
|||
# define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
|
||||
#endif
|
||||
|
||||
#define LOG_DEFAULT_DEBUG 1
|
||||
#define LOG_DEFAULT_LLAMA 0
|
||||
#define LOG_LEVEL_DEBUG 4
|
||||
#define LOG_LEVEL_INFO 3
|
||||
#define LOG_LEVEL_WARN 2
|
||||
#define LOG_LEVEL_ERROR 1
|
||||
#define LOG_LEVEL_OUTPUT 0 // output data from tools
|
||||
|
||||
#define LOG_DEFAULT_DEBUG LOG_LEVEL_DEBUG
|
||||
#define LOG_DEFAULT_LLAMA LOG_LEVEL_INFO
|
||||
|
||||
enum log_colors {
|
||||
LOG_COLORS_AUTO = -1,
|
||||
|
|
@ -67,10 +73,11 @@ void common_log_add(struct common_log * log, enum ggml_log_level level, const ch
|
|||
// 0.00.090.578 I llm_load_tensors: offloading 32 repeating layers to GPU
|
||||
// 0.00.090.579 I llm_load_tensors: offloading non-repeating layers to GPU
|
||||
//
|
||||
// I - info (stdout, V = 0)
|
||||
// W - warning (stderr, V = 0)
|
||||
// E - error (stderr, V = 0)
|
||||
// D - debug (stderr, V = LOG_DEFAULT_DEBUG)
|
||||
// I - info (stdout, V = LOG_DEFAULT_INFO)
|
||||
// W - warning (stderr, V = LOG_DEFAULT_WARN)
|
||||
// E - error (stderr, V = LOG_DEFAULT_ERROR)
|
||||
// O - output (stdout, V = LOG_DEFAULT_OUTPUT)
|
||||
//
|
||||
|
||||
void common_log_set_file (struct common_log * log, const char * file); // not thread-safe
|
||||
|
|
@ -95,14 +102,14 @@ void common_log_set_timestamps(struct common_log * log, bool timestamps); // w
|
|||
} \
|
||||
} while (0)
|
||||
|
||||
#define LOG(...) LOG_TMPL(GGML_LOG_LEVEL_NONE, 0, __VA_ARGS__)
|
||||
#define LOGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_NONE, verbosity, __VA_ARGS__)
|
||||
#define LOG(...) LOG_TMPL(GGML_LOG_LEVEL_NONE, LOG_LEVEL_OUTPUT, __VA_ARGS__)
|
||||
#define LOGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_NONE, verbosity, __VA_ARGS__)
|
||||
|
||||
#define LOG_INF(...) LOG_TMPL(GGML_LOG_LEVEL_INFO, 0, __VA_ARGS__)
|
||||
#define LOG_WRN(...) LOG_TMPL(GGML_LOG_LEVEL_WARN, 0, __VA_ARGS__)
|
||||
#define LOG_ERR(...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, 0, __VA_ARGS__)
|
||||
#define LOG_DBG(...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, LOG_DEFAULT_DEBUG, __VA_ARGS__)
|
||||
#define LOG_CNT(...) LOG_TMPL(GGML_LOG_LEVEL_CONT, 0, __VA_ARGS__)
|
||||
#define LOG_DBG(...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, LOG_LEVEL_DEBUG, __VA_ARGS__)
|
||||
#define LOG_INF(...) LOG_TMPL(GGML_LOG_LEVEL_INFO, LOG_LEVEL_INFO, __VA_ARGS__)
|
||||
#define LOG_WRN(...) LOG_TMPL(GGML_LOG_LEVEL_WARN, LOG_LEVEL_WARN, __VA_ARGS__)
|
||||
#define LOG_ERR(...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, LOG_LEVEL_ERROR, __VA_ARGS__)
|
||||
#define LOG_CNT(...) LOG_TMPL(GGML_LOG_LEVEL_CONT, LOG_LEVEL_INFO, __VA_ARGS__) // same as INFO
|
||||
|
||||
#define LOG_INFV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_INFO, verbosity, __VA_ARGS__)
|
||||
#define LOG_WRNV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_WARN, verbosity, __VA_ARGS__)
|
||||
|
|
|
|||
|
|
@ -75,6 +75,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
|||
{ LLM_ARCH_JAIS, "jais" },
|
||||
{ LLM_ARCH_NEMOTRON, "nemotron" },
|
||||
{ LLM_ARCH_NEMOTRON_H, "nemotron_h" },
|
||||
{ LLM_ARCH_NEMOTRON_H_MOE, "nemotron_h_moe" },
|
||||
{ LLM_ARCH_EXAONE, "exaone" },
|
||||
{ LLM_ARCH_EXAONE4, "exaone4" },
|
||||
{ LLM_ARCH_RWKV6, "rwkv6" },
|
||||
|
|
@ -112,6 +113,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
|||
{ LLM_ARCH_COGVLM, "cogvlm" },
|
||||
{ LLM_ARCH_RND1, "rnd1" },
|
||||
{ LLM_ARCH_PANGU_EMBED, "pangu-embedded" },
|
||||
{ LLM_ARCH_MISTRAL3, "mistral3" },
|
||||
{ LLM_ARCH_UNKNOWN, "(unknown)" },
|
||||
};
|
||||
|
||||
|
|
@ -205,6 +207,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
|||
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
|
||||
{ LLM_KV_ATTENTION_OUTPUT_SCALE, "%s.attention.output_scale" },
|
||||
{ LLM_KV_ATTENTION_TEMPERATURE_LENGTH, "%s.attention.temperature_length" },
|
||||
{ LLM_KV_ATTENTION_TEMPERATURE_SCALE, "%s.attention.temperature_scale" },
|
||||
{ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
|
||||
{ LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
|
||||
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
|
||||
|
|
@ -855,7 +858,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
|||
{ LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
|
||||
{ LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
|
||||
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
|
||||
{ LLM_TENSOR_SSM_A, "blk.%d.ssm_a" },
|
||||
{ LLM_TENSOR_SSM_A_NOSCAN, "blk.%d.ssm_a" },
|
||||
{ LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" },
|
||||
{ LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" },
|
||||
{ LLM_TENSOR_SSM_BETA_ALPHA, "blk.%d.ssm_ba" },
|
||||
|
|
@ -1763,6 +1766,39 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
|||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_NEMOTRON_H_MOE,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
{ LLM_TENSOR_OUTPUT, "output" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
// mamba(2) ssm layers
|
||||
{ LLM_TENSOR_SSM_IN, "blk.%d.ssm_in" },
|
||||
{ LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" },
|
||||
{ LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" },
|
||||
{ LLM_TENSOR_SSM_A, "blk.%d.ssm_a" },
|
||||
{ LLM_TENSOR_SSM_D, "blk.%d.ssm_d" },
|
||||
{ LLM_TENSOR_SSM_NORM, "blk.%d.ssm_norm" },
|
||||
{ LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
|
||||
// attention layers
|
||||
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
// dense FFN
|
||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
// MoE FFN (for MoE layers)
|
||||
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
|
||||
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
||||
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
|
||||
{ LLM_TENSOR_FFN_EXP_PROBS_B,"blk.%d.exp_probs_b" },
|
||||
// MoE shared expert layer
|
||||
{ LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
|
||||
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_EXAONE,
|
||||
{
|
||||
|
|
@ -2532,6 +2568,32 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
|||
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_MISTRAL3,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
{ LLM_TENSOR_OUTPUT, "output" },
|
||||
{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
|
||||
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
|
||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
{ LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
|
||||
{ LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
|
||||
{ LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
|
||||
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
|
||||
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
|
||||
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_UNKNOWN,
|
||||
{
|
||||
|
|
@ -2631,6 +2693,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
|
|||
{LLM_TENSOR_FFN_ACT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_DIV}},
|
||||
{LLM_TENSOR_SSM_CONV1D, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_CONV}},
|
||||
{LLM_TENSOR_SSM_A, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_SCAN}},
|
||||
{LLM_TENSOR_SSM_A_NOSCAN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, // a version of SSM_A used for MUL instead of SSM_SCAN
|
||||
{LLM_TENSOR_SSM_DT_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
{LLM_TENSOR_SSM_B_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
{LLM_TENSOR_SSM_C_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
|
|
@ -2809,6 +2872,7 @@ bool llm_arch_is_hybrid(const llm_arch & arch) {
|
|||
case LLM_ARCH_LFM2:
|
||||
case LLM_ARCH_LFM2MOE:
|
||||
case LLM_ARCH_NEMOTRON_H:
|
||||
case LLM_ARCH_NEMOTRON_H_MOE:
|
||||
case LLM_ARCH_QWEN3NEXT:
|
||||
return true;
|
||||
default:
|
||||
|
|
|
|||
|
|
@ -79,6 +79,7 @@ enum llm_arch {
|
|||
LLM_ARCH_JAIS,
|
||||
LLM_ARCH_NEMOTRON,
|
||||
LLM_ARCH_NEMOTRON_H,
|
||||
LLM_ARCH_NEMOTRON_H_MOE,
|
||||
LLM_ARCH_EXAONE,
|
||||
LLM_ARCH_EXAONE4,
|
||||
LLM_ARCH_RWKV6,
|
||||
|
|
@ -116,6 +117,7 @@ enum llm_arch {
|
|||
LLM_ARCH_COGVLM,
|
||||
LLM_ARCH_RND1,
|
||||
LLM_ARCH_PANGU_EMBED,
|
||||
LLM_ARCH_MISTRAL3,
|
||||
LLM_ARCH_UNKNOWN,
|
||||
};
|
||||
|
||||
|
|
@ -209,6 +211,7 @@ enum llm_kv {
|
|||
LLM_KV_ATTENTION_SCALE,
|
||||
LLM_KV_ATTENTION_OUTPUT_SCALE,
|
||||
LLM_KV_ATTENTION_TEMPERATURE_LENGTH,
|
||||
LLM_KV_ATTENTION_TEMPERATURE_SCALE,
|
||||
LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
|
||||
LLM_KV_ATTENTION_KEY_LENGTH_MLA,
|
||||
LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
|
||||
|
|
@ -379,6 +382,7 @@ enum llm_tensor {
|
|||
LLM_TENSOR_SSM_DT,
|
||||
LLM_TENSOR_SSM_DT_NORM,
|
||||
LLM_TENSOR_SSM_A,
|
||||
LLM_TENSOR_SSM_A_NOSCAN, // qwen3next special case with MUL instead of SSM_SCAN
|
||||
LLM_TENSOR_SSM_B_NORM,
|
||||
LLM_TENSOR_SSM_C_NORM,
|
||||
LLM_TENSOR_SSM_D,
|
||||
|
|
|
|||
|
|
@ -248,7 +248,10 @@ llama_context::llama_context(
|
|||
|
||||
LLAMA_LOG_DEBUG("%s: backend_ptrs.size() = %zu\n", __func__, backend_ptrs.size());
|
||||
|
||||
const size_t max_nodes = this->graph_max_nodes();
|
||||
const uint32_t n_seqs = cparams.n_seq_max;
|
||||
const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
|
||||
|
||||
const size_t max_nodes = this->graph_max_nodes(n_tokens);
|
||||
|
||||
LLAMA_LOG_DEBUG("%s: max_nodes = %zu\n", __func__, max_nodes);
|
||||
|
||||
|
|
@ -300,9 +303,6 @@ llama_context::llama_context(
|
|||
|
||||
cross.v_embd.clear();
|
||||
|
||||
const uint32_t n_seqs = cparams.n_seq_max;
|
||||
const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
|
||||
|
||||
// avoid reserving graphs with zero outputs - assume one output per sequence
|
||||
n_outputs = n_seqs;
|
||||
|
||||
|
|
@ -1385,9 +1385,9 @@ void llama_context::output_reorder() {
|
|||
// graph
|
||||
//
|
||||
|
||||
uint32_t llama_context::graph_max_nodes() const {
|
||||
uint32_t llama_context::graph_max_nodes(uint32_t n_tokens) const {
|
||||
if (model.arch == LLM_ARCH_QWEN3NEXT) {
|
||||
return std::max<uint32_t>(8192u, 32u*model.n_tensors());
|
||||
return std::max<uint32_t>(n_tokens * 40, 32u * model.n_tensors());
|
||||
}
|
||||
return std::max<uint32_t>(1024u, 8u*model.n_tensors());
|
||||
}
|
||||
|
|
|
|||
|
|
@ -197,7 +197,7 @@ private:
|
|||
//
|
||||
|
||||
public:
|
||||
uint32_t graph_max_nodes() const;
|
||||
uint32_t graph_max_nodes(uint32_t n_tokens) const;
|
||||
|
||||
// can reuse the llm_graph_result instance of the context (for example to update a memory module)
|
||||
llm_graph_result * get_gf_res_reserve() const;
|
||||
|
|
|
|||
|
|
@ -181,6 +181,52 @@ static std::pair<uint32_t, const char *> parse_char(const char * src) {
|
|||
throw std::runtime_error("unexpected end of input");
|
||||
}
|
||||
|
||||
static std::pair<uint32_t, const char *> parse_token(const llama_vocab * vocab, const char * src) {
|
||||
const char * pos = src;
|
||||
if (*pos != '<') {
|
||||
throw std::runtime_error(std::string("expecting '<' at ") + pos);
|
||||
}
|
||||
pos++;
|
||||
|
||||
// Parse <[id]>
|
||||
if (*pos == '[') {
|
||||
pos++;
|
||||
const char * int_end = parse_int(pos);
|
||||
uint32_t token_id = std::stoul(std::string(pos, int_end - pos));
|
||||
pos = int_end;
|
||||
if (*pos != ']') {
|
||||
throw std::runtime_error(std::string("expecting ']' at ") + pos);
|
||||
}
|
||||
pos++;
|
||||
if (*pos != '>') {
|
||||
throw std::runtime_error(std::string("expecting '>' at ") + pos);
|
||||
}
|
||||
pos++;
|
||||
return std::make_pair(token_id, pos);
|
||||
}
|
||||
|
||||
if (vocab == nullptr) {
|
||||
throw std::runtime_error(std::string("no vocab to parse token at ") + src);
|
||||
}
|
||||
|
||||
// Parse <token> and tokenize to obtain the token id
|
||||
while (*pos != 0 && *pos != '>') {
|
||||
pos++;
|
||||
}
|
||||
if (*pos != '>') {
|
||||
throw std::runtime_error(std::string("expecting '>' at ") + pos);
|
||||
}
|
||||
pos++;
|
||||
|
||||
llama_token tokens[2];
|
||||
int32_t n_tokens = vocab->tokenize(src, static_cast<int32_t>(pos - src), tokens, 2, false, true);
|
||||
if (n_tokens != 1) {
|
||||
// must tokenize to exactly 1 token
|
||||
throw std::runtime_error("invalid token '" + std::string(src, pos - src) + "'");
|
||||
}
|
||||
return std::make_pair(tokens[0], pos);
|
||||
}
|
||||
|
||||
static void print_grammar_char(FILE * file, uint32_t c) {
|
||||
if (0x20 <= c && c <= 0x7f) {
|
||||
fprintf(file, "%c", static_cast<char>(c));
|
||||
|
|
@ -212,6 +258,8 @@ static void print_rule_binary(FILE * file, const llama_grammar_rule & rule) {
|
|||
case LLAMA_GRETYPE_CHAR_RNG_UPPER: fprintf(file, "CHAR_RNG_UPPER"); break;
|
||||
case LLAMA_GRETYPE_CHAR_ALT: fprintf(file, "CHAR_ALT"); break;
|
||||
case LLAMA_GRETYPE_CHAR_ANY: fprintf(file, "CHAR_ANY"); break;
|
||||
case LLAMA_GRETYPE_TOKEN: fprintf(file, "TOKEN"); break;
|
||||
case LLAMA_GRETYPE_TOKEN_NOT: fprintf(file, "TOKEN_NOT"); break;
|
||||
}
|
||||
switch (elem.type) {
|
||||
case LLAMA_GRETYPE_END:
|
||||
|
|
@ -228,6 +276,17 @@ static void print_rule_binary(FILE * file, const llama_grammar_rule & rule) {
|
|||
print_grammar_char(file, elem.value);
|
||||
fprintf(file, "\") ");
|
||||
break;
|
||||
case LLAMA_GRETYPE_TOKEN:
|
||||
fprintf(file, "<[");
|
||||
fprintf(file, "%u", elem.value);
|
||||
fprintf(file, "]> ");
|
||||
break;
|
||||
case LLAMA_GRETYPE_TOKEN_NOT:
|
||||
fprintf(file, "!");
|
||||
fprintf(file, "<[");
|
||||
fprintf(file, "%u", elem.value);
|
||||
fprintf(file, "]> ");
|
||||
break;
|
||||
}
|
||||
}
|
||||
fprintf(file, "\n");
|
||||
|
|
@ -284,6 +343,17 @@ static void print_rule(
|
|||
case LLAMA_GRETYPE_CHAR_ANY:
|
||||
fprintf(file, ".");
|
||||
break;
|
||||
case LLAMA_GRETYPE_TOKEN:
|
||||
fprintf(file, "<[");
|
||||
fprintf(file, "%u", elem.value);
|
||||
fprintf(file, "]> ");
|
||||
break;
|
||||
case LLAMA_GRETYPE_TOKEN_NOT:
|
||||
fprintf(file, "!");
|
||||
fprintf(file, "<[");
|
||||
fprintf(file, "%u", elem.value);
|
||||
fprintf(file, "]> ");
|
||||
break;
|
||||
}
|
||||
if (is_char_element(elem)) {
|
||||
switch (rule[i + 1].type) {
|
||||
|
|
@ -444,6 +514,17 @@ const char * llama_grammar_parser::parse_sequence(
|
|||
}
|
||||
}
|
||||
pos = parse_space(pos + 1, is_nested);
|
||||
} else if (*pos == '<' || *pos == '!') { // token
|
||||
auto type = LLAMA_GRETYPE_TOKEN;
|
||||
if (*pos == '!') { // token inverse
|
||||
type = LLAMA_GRETYPE_TOKEN_NOT;
|
||||
pos++;
|
||||
}
|
||||
auto token_pair = parse_token(vocab, pos);
|
||||
const char * token_end = token_pair.second;
|
||||
last_sym_start = rule.size();
|
||||
rule.push_back({type, token_pair.first});
|
||||
pos = parse_space(token_end, is_nested);
|
||||
} else if (is_word_char(*pos)) { // rule reference
|
||||
const char * name_end = parse_name(pos);
|
||||
uint32_t ref_rule_id = get_symbol_id(pos, name_end - pos);
|
||||
|
|
@ -691,6 +772,21 @@ static bool llama_grammar_match_partial_char(
|
|||
return !is_positive_char;
|
||||
}
|
||||
|
||||
// returns true iff token matches the rule at pos (regular or inverse)
|
||||
// asserts that pos is pointing to a token element
|
||||
static bool llama_grammar_match_token(
|
||||
const llama_grammar_element * pos,
|
||||
const llama_token token) {
|
||||
GGML_ASSERT(pos->type == LLAMA_GRETYPE_TOKEN || pos->type == LLAMA_GRETYPE_TOKEN_NOT);
|
||||
if (pos->type == LLAMA_GRETYPE_TOKEN) {
|
||||
return pos->value == static_cast<uint32_t>(token);
|
||||
}
|
||||
if (pos->type == LLAMA_GRETYPE_TOKEN_NOT) {
|
||||
return pos->value != static_cast<uint32_t>(token);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// transforms a grammar pushdown stack into N possible stacks, all ending
|
||||
// at a character range (terminal element)
|
||||
static void llama_grammar_advance_stack(
|
||||
|
|
@ -738,6 +834,8 @@ static void llama_grammar_advance_stack(
|
|||
case LLAMA_GRETYPE_CHAR:
|
||||
case LLAMA_GRETYPE_CHAR_NOT:
|
||||
case LLAMA_GRETYPE_CHAR_ANY:
|
||||
case LLAMA_GRETYPE_TOKEN:
|
||||
case LLAMA_GRETYPE_TOKEN_NOT:
|
||||
if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
|
||||
// only add the stack if it's not a duplicate of one we already have
|
||||
new_stacks.emplace_back(stack);
|
||||
|
|
@ -831,26 +929,38 @@ llama_grammar_stacks & llama_grammar_get_stacks(struct llama_grammar * grammar)
|
|||
return grammar->stacks;
|
||||
}
|
||||
|
||||
static void llama_grammar_accept_chr(
|
||||
struct llama_grammar & grammar,
|
||||
const llama_grammar_stack & stack,
|
||||
uint32_t chr,
|
||||
llama_grammar_stacks & new_stacks) {
|
||||
if (stack.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
const llama_grammar_element * pos = stack.back();
|
||||
|
||||
// ignore if this turns into a token
|
||||
if (pos->type == LLAMA_GRETYPE_TOKEN || pos->type == LLAMA_GRETYPE_TOKEN_NOT) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto match = llama_grammar_match_char(pos, chr);
|
||||
if (match.first) {
|
||||
llama_grammar_stack new_stack(stack.begin(), stack.end() - 1);
|
||||
if (!llama_grammar_is_end_of_sequence(match.second)) {
|
||||
new_stack.push_back(match.second);
|
||||
}
|
||||
llama_grammar_advance_stack(grammar.rules, new_stack, new_stacks);
|
||||
}
|
||||
}
|
||||
|
||||
void llama_grammar_accept(struct llama_grammar * grammar, uint32_t chr) {
|
||||
llama_grammar_stacks stacks_new;
|
||||
stacks_new.reserve(grammar->stacks.size());
|
||||
|
||||
for (const auto & stack : grammar->stacks) {
|
||||
if (stack.empty()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
auto match = llama_grammar_match_char(stack.back(), chr);
|
||||
if (match.first) {
|
||||
const llama_grammar_element * pos = match.second;
|
||||
|
||||
// update top of stack to next element, if any
|
||||
llama_grammar_stack new_stack(stack.begin(), stack.end() - 1);
|
||||
if (!llama_grammar_is_end_of_sequence(pos)) {
|
||||
new_stack.push_back(pos);
|
||||
}
|
||||
llama_grammar_advance_stack(grammar->rules, new_stack, stacks_new);
|
||||
}
|
||||
llama_grammar_accept_chr(*grammar, stack, chr, stacks_new);
|
||||
}
|
||||
|
||||
grammar->stacks = std::move(stacks_new);
|
||||
|
|
@ -875,6 +985,22 @@ llama_grammar_candidates llama_grammar_reject_candidates_for_stack(
|
|||
|
||||
const llama_grammar_element * stack_pos = stack.back();
|
||||
|
||||
// if the top of the stack is a token rule, then we only need to check the token id
|
||||
if (stack_pos->type == LLAMA_GRETYPE_TOKEN || stack_pos->type == LLAMA_GRETYPE_TOKEN_NOT) {
|
||||
for (const auto & tok : candidates) {
|
||||
if (*tok.code_points == 0) {
|
||||
// reached the end of a token consumed by char rules, reject iff it ended
|
||||
// in a partial response
|
||||
if (tok.partial_utf8.n_remain != 0) {
|
||||
rejects.push_back(tok);
|
||||
}
|
||||
} else if (!llama_grammar_match_token(stack_pos, tok.id)) {
|
||||
rejects.push_back(tok);
|
||||
}
|
||||
}
|
||||
return rejects;
|
||||
}
|
||||
|
||||
llama_grammar_candidates next_candidates;
|
||||
next_candidates.reserve(candidates.size());
|
||||
|
||||
|
|
@ -887,7 +1013,7 @@ llama_grammar_candidates llama_grammar_reject_candidates_for_stack(
|
|||
rejects.push_back(tok);
|
||||
}
|
||||
} else if (llama_grammar_match_char(stack_pos, *tok.code_points).first) {
|
||||
next_candidates.push_back({ tok.index, tok.code_points + 1, tok.partial_utf8 });
|
||||
next_candidates.push_back({ tok.index, tok.code_points + 1, tok.partial_utf8, tok.id });
|
||||
} else {
|
||||
rejects.push_back(tok);
|
||||
}
|
||||
|
|
@ -905,7 +1031,7 @@ llama_grammar_candidates llama_grammar_reject_candidates_for_stack(
|
|||
|
||||
auto next_rejects = llama_grammar_reject_candidates(rules, next_stacks, next_candidates);
|
||||
for (const auto & tok : next_rejects) {
|
||||
rejects.push_back({ tok.index, tok.code_points - 1, tok.partial_utf8 });
|
||||
rejects.push_back({ tok.index, tok.code_points - 1, tok.partial_utf8, tok.id });
|
||||
}
|
||||
|
||||
return rejects;
|
||||
|
|
@ -974,12 +1100,13 @@ struct llama_grammar * llama_grammar_init_impl(
|
|||
ollama_vocab,
|
||||
std::move(vec_rules),
|
||||
std::move(stacks),
|
||||
/* .partial_utf8 = */ {},
|
||||
/* .lazy =*/ false,
|
||||
/* .awaiting_trigger = */ false,
|
||||
/* .trigger_buffer = */ "",
|
||||
/* .trigger_tokens = */ {},
|
||||
/* .trigger_patterns = */ {},
|
||||
/* .partial_utf8 = */ {},
|
||||
/* .lazy = */ false,
|
||||
/* .awaiting_trigger = */ false,
|
||||
/* .trigger_buffer = */ "",
|
||||
/* .trigger_buffer_positions = */ {},
|
||||
/* .trigger_tokens = */ {},
|
||||
/* .trigger_patterns = */ {},
|
||||
};
|
||||
}
|
||||
|
||||
|
|
@ -993,7 +1120,7 @@ struct llama_grammar * llama_grammar_init_impl(
|
|||
size_t num_trigger_patterns,
|
||||
const llama_token * trigger_tokens,
|
||||
size_t num_trigger_tokens) {
|
||||
llama_grammar_parser parser;
|
||||
llama_grammar_parser parser(vocab);
|
||||
|
||||
// if there is a grammar, parse it
|
||||
// rules will be empty (default) if there are parse errors
|
||||
|
|
@ -1081,10 +1208,11 @@ struct llama_grammar * llama_grammar_init_impl(
|
|||
ollama_vocab,
|
||||
std::move(vec_rules),
|
||||
std::move(stacks),
|
||||
/* .partial_utf8 = */ {},
|
||||
/* .lazy = */ lazy,
|
||||
/* .awaiting_trigger = */ lazy,
|
||||
/* .trigger_buffer = */ "",
|
||||
/* .partial_utf8 = */ {},
|
||||
/* .lazy = */ lazy,
|
||||
/* .awaiting_trigger = */ lazy,
|
||||
/* .trigger_buffer = */ "",
|
||||
/* .trigger_buffer_positions = */ {},
|
||||
std::move(vec_trigger_tokens),
|
||||
std::move(vec_trigger_patterns),
|
||||
};
|
||||
|
|
@ -1108,6 +1236,7 @@ struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & gra
|
|||
grammar.lazy,
|
||||
grammar.awaiting_trigger,
|
||||
grammar.trigger_buffer,
|
||||
grammar.trigger_buffer_positions,
|
||||
grammar.trigger_tokens,
|
||||
grammar.trigger_patterns,
|
||||
};
|
||||
|
|
@ -1164,7 +1293,7 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
|
|||
cur_p->data[i].logit = -INFINITY;
|
||||
} else {
|
||||
candidates_decoded.push_back(decode_utf8(piece, grammar.partial_utf8));
|
||||
candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second });
|
||||
candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second, id });
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -1184,10 +1313,12 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
|
|||
if (std::find(grammar.trigger_tokens.begin(), grammar.trigger_tokens.end(), token) != grammar.trigger_tokens.end()) {
|
||||
grammar.awaiting_trigger = false;
|
||||
grammar.trigger_buffer.clear();
|
||||
llama_grammar_accept_str(grammar, piece);
|
||||
llama_grammar_accept_token(grammar, token, piece);
|
||||
LLAMA_LOG_DEBUG("Grammar triggered on token %u (`%s`)", token, piece.c_str());
|
||||
return;
|
||||
} else {
|
||||
auto position = std::make_pair(grammar.trigger_buffer.size(), grammar.trigger_buffer.size() + piece.size());
|
||||
grammar.trigger_buffer_positions.push_back(std::make_pair(token, position));
|
||||
grammar.trigger_buffer += piece;
|
||||
|
||||
std::smatch match;
|
||||
|
|
@ -1205,10 +1336,23 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
|
|||
if (start == std::string::npos) {
|
||||
start = match.position(0);
|
||||
}
|
||||
|
||||
// replay tokens that overlap with [start, end)
|
||||
for (const auto & [tok, tok_pos] : grammar.trigger_buffer_positions) {
|
||||
auto [tok_start, tok_end] = tok_pos;
|
||||
if (tok_end <= start) {
|
||||
continue;
|
||||
}
|
||||
|
||||
size_t piece_start = (tok_start < start) ? start : tok_start; // allow for partial token pieces
|
||||
size_t piece_len = tok_end - piece_start;
|
||||
auto tok_piece = grammar.trigger_buffer.substr(piece_start, piece_len);
|
||||
llama_grammar_accept_token(grammar, tok, tok_piece);
|
||||
}
|
||||
|
||||
auto constrained_str = grammar.trigger_buffer.substr(start);
|
||||
// std::string constrained_str(match[1].first, grammar.trigger_buffer.end());
|
||||
grammar.trigger_buffer.clear();
|
||||
llama_grammar_accept_str(grammar, constrained_str);
|
||||
grammar.trigger_buffer_positions.clear();
|
||||
LLAMA_LOG_DEBUG("Grammar triggered on regex: '%s'\n", constrained_str.c_str());
|
||||
return;
|
||||
}
|
||||
|
|
@ -1228,7 +1372,7 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
|
|||
GGML_ABORT("grammar error: end of grammar token received but grammar stack is not empty");
|
||||
}
|
||||
|
||||
llama_grammar_accept_str(grammar, piece);
|
||||
llama_grammar_accept_token(grammar, token, piece);
|
||||
}
|
||||
|
||||
void llama_grammar_accept_str(struct llama_grammar & grammar, const std::string & piece) {
|
||||
|
|
@ -1246,6 +1390,61 @@ void llama_grammar_accept_str(struct llama_grammar & grammar, const std::string
|
|||
}
|
||||
}
|
||||
|
||||
void llama_grammar_accept_token(struct llama_grammar & grammar, llama_token token, const std::string & piece) {
|
||||
// Note terminating 0 in decoded string
|
||||
const auto decoded = decode_utf8(piece, grammar.partial_utf8);
|
||||
const auto & code_points = decoded.first;
|
||||
|
||||
llama_grammar_stacks stacks_new;
|
||||
stacks_new.reserve(grammar.stacks.size());
|
||||
|
||||
for (const auto & stack : grammar.stacks) {
|
||||
if (stack.empty()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const llama_grammar_element * pos = stack.back();
|
||||
|
||||
if (pos->type == LLAMA_GRETYPE_TOKEN || pos->type == LLAMA_GRETYPE_TOKEN_NOT) {
|
||||
if (llama_grammar_match_token(pos, token)) {
|
||||
llama_grammar_stack new_stack(stack.begin(), stack.end() - 1);
|
||||
if (!llama_grammar_is_end_of_sequence(pos + 1)) {
|
||||
new_stack.push_back(pos + 1);
|
||||
}
|
||||
llama_grammar_advance_stack(grammar.rules, new_stack, stacks_new);
|
||||
}
|
||||
} else {
|
||||
llama_grammar_stacks current_stacks = {stack};
|
||||
|
||||
for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
|
||||
llama_grammar_stacks next_stacks;
|
||||
|
||||
for (const auto & cur_stack : current_stacks) {
|
||||
llama_grammar_accept_chr(grammar, cur_stack, *it, next_stacks);
|
||||
}
|
||||
|
||||
current_stacks = std::move(next_stacks);
|
||||
if (current_stacks.empty()) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
for (auto & surviving_stack : current_stacks) {
|
||||
if (std::find(stacks_new.begin(), stacks_new.end(), surviving_stack) == stacks_new.end()) {
|
||||
stacks_new.emplace_back(surviving_stack);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
grammar.stacks = std::move(stacks_new);
|
||||
grammar.partial_utf8 = decoded.second;
|
||||
|
||||
if (grammar.stacks.empty()) {
|
||||
throw std::runtime_error("Unexpected empty grammar stack after accepting piece: " + piece + " (" + std::to_string(token) + ")");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
const std::string & ollama_vocab::token_to_piece(const uint32_t token) const {
|
||||
try {
|
||||
|
|
|
|||
|
|
@ -47,11 +47,17 @@ enum llama_gretype {
|
|||
|
||||
// any character (.)
|
||||
LLAMA_GRETYPE_CHAR_ANY = 7,
|
||||
|
||||
// terminal element: token (<[token-id]>)
|
||||
LLAMA_GRETYPE_TOKEN = 8,
|
||||
|
||||
// inverse token (!<[token-id]>)
|
||||
LLAMA_GRETYPE_TOKEN_NOT = 9,
|
||||
};
|
||||
|
||||
typedef struct llama_grammar_element {
|
||||
enum llama_gretype type;
|
||||
uint32_t value; // Unicode code point or rule ID
|
||||
uint32_t value; // Unicode code point, rule ID, or token ID
|
||||
} llama_grammar_element;
|
||||
|
||||
struct llama_partial_utf8 {
|
||||
|
|
@ -63,6 +69,7 @@ struct llama_grammar_candidate {
|
|||
size_t index;
|
||||
const uint32_t * code_points;
|
||||
llama_partial_utf8 partial_utf8;
|
||||
llama_token id;
|
||||
};
|
||||
|
||||
using llama_grammar_rule = std::vector< llama_grammar_element>;
|
||||
|
|
@ -88,10 +95,13 @@ std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_stack(
|
|||
const llama_grammar_candidates & candidates);
|
||||
|
||||
struct llama_grammar_parser {
|
||||
const llama_vocab * vocab;
|
||||
std::map<std::string, uint32_t> symbol_ids;
|
||||
|
||||
llama_grammar_rules rules;
|
||||
|
||||
llama_grammar_parser(const struct llama_vocab * vocab = nullptr) : vocab(vocab) {}
|
||||
|
||||
llama_grammar_stack c_rules() const;
|
||||
|
||||
uint32_t get_symbol_id(const char * src, size_t len);
|
||||
|
|
@ -123,6 +133,9 @@ struct llama_grammar_trigger_pattern {
|
|||
};
|
||||
|
||||
struct llama_grammar {
|
||||
// maintain a list of llama_tokens and their positions in the trigger_buffer
|
||||
using token_pos = std::pair<llama_token, std::pair<size_t, size_t>>;
|
||||
|
||||
// note: allow null vocab for testing (not great)
|
||||
const llama_vocab * vocab;
|
||||
const ollama_vocab * o_vocab;
|
||||
|
|
@ -139,6 +152,7 @@ struct llama_grammar {
|
|||
bool lazy = false;
|
||||
bool awaiting_trigger = false; // Initialized to true for lazy grammars only
|
||||
std::string trigger_buffer; // Output buffered by lazy grammar. Will be cleared once trigger is found.
|
||||
std::vector<token_pos> trigger_buffer_positions; // Tokens buffered by lazy grammar. Used to replay when a trigger is found.
|
||||
std::vector<llama_token> trigger_tokens; // Tokens that trigger a lazy grammar, or tokens to force printing of (even if special).
|
||||
std::vector<llama_grammar_trigger_pattern>
|
||||
trigger_patterns; // Regular expressions that trigger a lazy grammar. Must be a full match of the entire generated
|
||||
|
|
@ -185,3 +199,8 @@ void llama_grammar_accept_impl(
|
|||
void llama_grammar_accept_str(
|
||||
struct llama_grammar & grammar,
|
||||
const std::string & piece);
|
||||
|
||||
void llama_grammar_accept_token(
|
||||
struct llama_grammar & grammar,
|
||||
llama_token token,
|
||||
const std::string & piece);
|
||||
|
|
|
|||
|
|
@ -71,6 +71,9 @@ void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) {
|
|||
if (ubatch->pos && attn_scale) {
|
||||
const int64_t n_tokens = ubatch->n_tokens;
|
||||
|
||||
GGML_ASSERT(f_attn_temp_scale != 0.0f);
|
||||
GGML_ASSERT(n_attn_temp_floor_scale != 0);
|
||||
|
||||
std::vector<float> attn_scale_data(n_tokens, 0.0f);
|
||||
for (int i = 0; i < n_tokens; ++i) {
|
||||
const float pos = ubatch->pos[i];
|
||||
|
|
@ -810,9 +813,6 @@ ggml_tensor * llm_graph_context::build_ffn(
|
|||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
//expand here so that we can fuse ffn gate
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
|
||||
if (gate && type_gate == LLM_FFN_PAR) {
|
||||
cur = ggml_mul(ctx0, cur, tmp);
|
||||
cb(cur, "ffn_gate_par", il);
|
||||
|
|
@ -973,7 +973,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
|
|||
|
||||
// mask out the other groups
|
||||
selection_probs = ggml_get_rows(ctx0, selection_groups, expert_groups); // [n_exp_per_group, n_group_used, n_tokens]
|
||||
selection_probs = ggml_set_rows(ctx0, ggml_scale_bias(ctx0, selection_groups, 0.0f, -INFINITY), selection_probs, expert_groups); // [n_exp_per_group, n_expert_groups, n_tokens]
|
||||
selection_probs = ggml_set_rows(ctx0, ggml_fill(ctx0, selection_groups, -INFINITY), selection_probs, expert_groups); // [n_exp_per_group, n_expert_groups, n_tokens]
|
||||
selection_probs = ggml_reshape_2d(ctx0, selection_probs, n_expert, n_tokens); // [n_expert, n_tokens]
|
||||
cb(selection_probs, "ffn_moe_probs_masked", il);
|
||||
}
|
||||
|
|
@ -1089,13 +1089,20 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
|
|||
cur = ggml_relu(ctx0, cur);
|
||||
cb(cur, "ffn_moe_relu", il);
|
||||
} break;
|
||||
case LLM_FFN_RELU_SQR:
|
||||
if (gate_exps) {
|
||||
// TODO: add support for gated squared relu
|
||||
GGML_ABORT("fatal error: gated squared relu not implemented");
|
||||
} else {
|
||||
cur = ggml_relu(ctx0, cur);
|
||||
cur = ggml_sqr(ctx0, cur);
|
||||
cb(cur, "ffn_moe_relu_sqr", il);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
//expand here so that we can fuse ffn gate
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
|
||||
experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
|
||||
cb(experts, "ffn_moe_down", il);
|
||||
|
||||
|
|
|
|||
|
|
@ -164,8 +164,8 @@ struct llama_hparams {
|
|||
// llama4 smallthinker
|
||||
uint32_t n_moe_layer_step = 0;
|
||||
uint32_t n_no_rope_layer_step = 4;
|
||||
uint32_t n_attn_temp_floor_scale = 8192;
|
||||
float f_attn_temp_scale = 0.1;
|
||||
uint32_t n_attn_temp_floor_scale = 0;
|
||||
float f_attn_temp_scale = 0.0f;
|
||||
|
||||
// gemma3n altup
|
||||
uint32_t n_altup = 4; // altup_num_inputs
|
||||
|
|
|
|||
|
|
@ -37,7 +37,7 @@ void llama_log_callback_default(ggml_log_level level, const char * text, void *
|
|||
template <typename T>
|
||||
struct no_init {
|
||||
T value;
|
||||
no_init() { /* do nothing */ }
|
||||
no_init() = default;
|
||||
};
|
||||
|
||||
struct time_meas {
|
||||
|
|
|
|||
|
|
@ -485,7 +485,7 @@ struct llama_mlock::impl {
|
|||
if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit)) {
|
||||
suggest = false;
|
||||
}
|
||||
if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size)) {
|
||||
if (suggest && ((uint64_t)lock_limit.rlim_max > (uint64_t)lock_limit.rlim_cur + size)) {
|
||||
suggest = false;
|
||||
}
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -120,6 +120,8 @@ const char * llm_type_name(llm_type type) {
|
|||
case LLM_TYPE_16B_A1B: return "16B.A1B";
|
||||
case LLM_TYPE_21B_A3B: return "21B.A3B";
|
||||
case LLM_TYPE_30B_A3B: return "30B.A3B";
|
||||
case LLM_TYPE_31B_A3_5B: return "31B.A3.5B";
|
||||
case LLM_TYPE_80B_A3B: return "80B.A3B";
|
||||
case LLM_TYPE_100B_A6B: return "100B.A6B";
|
||||
case LLM_TYPE_106B_A12B: return "106B.A12B";
|
||||
case LLM_TYPE_230B_A10B: return "230B.A10B";
|
||||
|
|
@ -423,8 +425,8 @@ static buft_list_t make_gpu_buft_list(ggml_backend_dev_t dev, llama_split_mode s
|
|||
}
|
||||
|
||||
struct llama_model::impl {
|
||||
impl() {}
|
||||
~impl() {}
|
||||
impl() = default;
|
||||
~impl() = default;
|
||||
|
||||
uint64_t n_elements = 0;
|
||||
|
||||
|
|
@ -461,7 +463,7 @@ llama_model::llama_model(const llama_model_params & params) : params(params), pi
|
|||
pimpl->has_tensor_overrides = params.tensor_buft_overrides && params.tensor_buft_overrides[0].pattern;
|
||||
}
|
||||
|
||||
llama_model::~llama_model() {}
|
||||
llama_model::~llama_model() = default;
|
||||
|
||||
void llama_model::load_stats(llama_model_loader & ml) {
|
||||
pimpl->n_elements = ml.n_elements;
|
||||
|
|
@ -663,8 +665,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
hparams.swa_type = LLAMA_SWA_TYPE_NONE;
|
||||
hparams.n_no_rope_layer_step = hparams.n_layer; // always use rope
|
||||
} else {
|
||||
hparams.swa_type = LLAMA_SWA_TYPE_CHUNKED;
|
||||
hparams.n_swa = 8192;
|
||||
hparams.swa_type = LLAMA_SWA_TYPE_CHUNKED;
|
||||
hparams.n_swa = 8192;
|
||||
hparams.n_attn_temp_floor_scale = 8192;
|
||||
hparams.f_attn_temp_scale = 0.1f;
|
||||
hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
|
||||
}
|
||||
|
||||
|
|
@ -1262,18 +1266,25 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
} break;
|
||||
case LLM_ARCH_GEMMA3:
|
||||
{
|
||||
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
||||
hparams.set_swa_pattern(6);
|
||||
const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
|
||||
if (found_swa && hparams.n_swa > 0) {
|
||||
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
||||
hparams.set_swa_pattern(6);
|
||||
|
||||
hparams.rope_freq_base_train_swa = 10000.0f;
|
||||
hparams.rope_freq_scale_train_swa = 1.0f;
|
||||
hparams.rope_freq_base_train_swa = 10000.0f;
|
||||
hparams.rope_freq_scale_train_swa = 1.0f;
|
||||
} else {
|
||||
hparams.swa_type = LLAMA_SWA_TYPE_NONE;
|
||||
}
|
||||
|
||||
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
|
||||
hparams.f_final_logit_softcapping = 0.0f;
|
||||
ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 18: type = LLM_TYPE_270M; break;
|
||||
case 26: type = LLM_TYPE_1B; break;
|
||||
case 32: type = LLM_TYPE_8B; break; // Rnj-1
|
||||
case 34: type = LLM_TYPE_4B; break;
|
||||
case 48: type = LLM_TYPE_12B; break;
|
||||
case 62: type = LLM_TYPE_27B; break;
|
||||
|
|
@ -1597,8 +1608,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
|
||||
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 28: type = LLM_TYPE_20B; break;
|
||||
switch (hparams.n_ff_exp) {
|
||||
case 1408: type = LLM_TYPE_16B; break;
|
||||
case 1792: type = LLM_TYPE_20B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
|
|
@ -1626,6 +1638,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
}
|
||||
ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false);
|
||||
|
||||
// (optional) temperature tuning - used by mistral-large
|
||||
ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false);
|
||||
ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_LENGTH, hparams.n_attn_temp_floor_scale, false);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 27: type = LLM_TYPE_16B; break;
|
||||
case 60: type = LLM_TYPE_236B; break;
|
||||
|
|
@ -1774,6 +1790,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
}
|
||||
} break;
|
||||
case LLM_ARCH_NEMOTRON_H:
|
||||
case LLM_ARCH_NEMOTRON_H_MOE:
|
||||
{
|
||||
ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
|
||||
ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
|
||||
|
|
@ -1789,7 +1806,14 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
|
||||
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
|
||||
ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
|
||||
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared, false);
|
||||
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
|
||||
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 52: type = LLM_TYPE_31B_A3_5B; break; // Nemotron-H_MOE 31B
|
||||
case 56: type = LLM_TYPE_9B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
|
|
@ -2262,6 +2286,42 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_MISTRAL3:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false);
|
||||
|
||||
ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast, false);
|
||||
ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow, false);
|
||||
ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false);
|
||||
|
||||
// TODO: maybe add n_attn_temp_floor_scale as a separate KV?
|
||||
if (hparams.f_attn_temp_scale != 0.0f) {
|
||||
hparams.n_attn_temp_floor_scale = hparams.n_ctx_orig_yarn;
|
||||
if (hparams.n_attn_temp_floor_scale == 0) {
|
||||
throw std::runtime_error("invalid n_ctx_orig_yarn for attention temperature scaling");
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: this seems to be correct with the case of mscale == mscale_all_dims == 1.0f
|
||||
// but may need further verification with other values
|
||||
if (hparams.rope_yarn_log_mul != 0.0f) {
|
||||
float factor = 1.0f / hparams.rope_freq_scale_train;
|
||||
float mscale = 1.0f;
|
||||
float mscale_all_dims = hparams.rope_yarn_log_mul;
|
||||
static auto get_mscale = [](float scale, float mscale) {
|
||||
return scale <= 1.0f ? 1.0f : (0.1f * mscale * logf(scale) + 1.0f);
|
||||
};
|
||||
hparams.yarn_attn_factor = get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dims);
|
||||
}
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 26: type = LLM_TYPE_3B; break;
|
||||
case 34: type = LLM_TYPE_8B; break;
|
||||
case 40: type = LLM_TYPE_14B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
default: throw std::runtime_error("unsupported model architecture");
|
||||
}
|
||||
|
||||
|
|
@ -2575,6 +2635,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
case LLM_ARCH_MINICPM:
|
||||
case LLM_ARCH_GRANITE:
|
||||
case LLM_ARCH_GRANITE_MOE:
|
||||
case LLM_ARCH_MISTRAL3:
|
||||
{
|
||||
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
||||
|
||||
|
|
@ -5124,6 +5185,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
}
|
||||
} break;
|
||||
case LLM_ARCH_NEMOTRON_H:
|
||||
case LLM_ARCH_NEMOTRON_H_MOE:
|
||||
{
|
||||
// mamba2 Mixer SSM params
|
||||
// NOTE: int64_t for tensor dimensions
|
||||
|
|
@ -5134,6 +5196,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
const int64_t n_group = hparams.ssm_n_group;
|
||||
const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_ssm_head;
|
||||
|
||||
const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
|
||||
const int64_t n_ff_shexp = hparams.n_ff_shexp;
|
||||
|
||||
// embeddings
|
||||
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
||||
|
||||
|
|
@ -5183,12 +5248,26 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_k_gqa_i}, TENSOR_NOT_REQUIRED);
|
||||
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_v_gqa_i}, TENSOR_NOT_REQUIRED);
|
||||
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
||||
} else {
|
||||
// mlp layers
|
||||
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { hparams.n_ff(i), n_embd}, 0);
|
||||
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, hparams.n_ff(i)}, 0);
|
||||
layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
||||
layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {hparams.n_ff(i)}, TENSOR_NOT_REQUIRED);
|
||||
} else {
|
||||
if (n_expert != 0) {
|
||||
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert}, 0);
|
||||
layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert }, 0);
|
||||
|
||||
// MoE branch
|
||||
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
|
||||
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
|
||||
|
||||
// Shared expert branch
|
||||
layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
|
||||
layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp}, 0);
|
||||
|
||||
} else {
|
||||
// mlp layers
|
||||
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { hparams.n_ff(i), n_embd}, 0);
|
||||
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, hparams.n_ff(i)}, 0);
|
||||
layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
||||
layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {hparams.n_ff(i)}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
}
|
||||
}
|
||||
} break;
|
||||
|
|
@ -6530,7 +6609,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), { n_embd, qkvz_dim }, 0);
|
||||
layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0);
|
||||
layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0);
|
||||
layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), { hparams.ssm_dt_rank }, 0);
|
||||
layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN, i), { hparams.ssm_dt_rank }, 0);
|
||||
layer.ssm_beta_alpha = create_tensor(tn(LLM_TENSOR_SSM_BETA_ALPHA, "weight", i), { n_embd, ba_dim }, 0);
|
||||
layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), { head_v_dim }, 0);
|
||||
layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), { value_dim, n_embd }, 0);
|
||||
|
|
@ -6819,7 +6898,8 @@ void llama_model::print_info() const {
|
|||
arch == LLM_ARCH_PLAMO2 ||
|
||||
arch == LLM_ARCH_GRANITE_HYBRID ||
|
||||
arch == LLM_ARCH_QWEN3NEXT ||
|
||||
arch == LLM_ARCH_NEMOTRON_H) {
|
||||
arch == LLM_ARCH_NEMOTRON_H ||
|
||||
arch == LLM_ARCH_NEMOTRON_H_MOE) {
|
||||
LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
|
||||
LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
|
||||
LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
|
||||
|
|
@ -6875,7 +6955,8 @@ void llama_model::print_info() const {
|
|||
if (arch == LLM_ARCH_MINICPM ||
|
||||
arch == LLM_ARCH_GRANITE ||
|
||||
arch == LLM_ARCH_GRANITE_MOE ||
|
||||
arch == LLM_ARCH_GRANITE_HYBRID) {
|
||||
arch == LLM_ARCH_GRANITE_HYBRID ||
|
||||
arch == LLM_ARCH_NEMOTRON_H_MOE) {
|
||||
LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
|
||||
LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
|
||||
LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
|
||||
|
|
@ -7056,7 +7137,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
|
|||
if (arch == LLM_ARCH_FALCON_H1) {
|
||||
filter_attn = [&](int32_t) { return true; };
|
||||
filter_recr = [&](int32_t) { return true; };
|
||||
} else if (arch == LLM_ARCH_NEMOTRON_H) {
|
||||
} else if (arch == LLM_ARCH_NEMOTRON_H || arch == LLM_ARCH_NEMOTRON_H_MOE) {
|
||||
filter_attn = [&](int32_t il) {
|
||||
return !hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
|
||||
};
|
||||
|
|
@ -7304,7 +7385,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
|
|||
} break;
|
||||
case LLM_ARCH_GEMMA3:
|
||||
{
|
||||
llm = std::make_unique<llm_build_gemma3_iswa>(*this, params);
|
||||
if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
|
||||
llm = std::make_unique<llm_build_gemma3<true>>(*this, params);
|
||||
} else {
|
||||
llm = std::make_unique<llm_build_gemma3<false>>(*this, params);
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_GEMMA3N:
|
||||
{
|
||||
|
|
@ -7423,6 +7508,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
|
|||
llm = std::make_unique<llm_build_nemotron>(*this, params);
|
||||
} break;
|
||||
case LLM_ARCH_NEMOTRON_H:
|
||||
case LLM_ARCH_NEMOTRON_H_MOE:
|
||||
{
|
||||
llm = std::make_unique<llm_build_nemotron_h>(*this, params);
|
||||
} break;
|
||||
|
|
@ -7569,6 +7655,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
|
|||
{
|
||||
llm = std::make_unique<llm_build_qwen3next>(*this, params);
|
||||
} break;
|
||||
case LLM_ARCH_MISTRAL3:
|
||||
{
|
||||
llm = std::make_unique<llm_build_mistral3>(*this, params);
|
||||
} break;
|
||||
default:
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
|
@ -7706,6 +7796,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
|||
case LLM_ARCH_ARWKV7:
|
||||
case LLM_ARCH_WAVTOKENIZER_DEC:
|
||||
case LLM_ARCH_NEMOTRON_H:
|
||||
case LLM_ARCH_NEMOTRON_H_MOE:
|
||||
return LLAMA_ROPE_TYPE_NONE;
|
||||
|
||||
// use what we call a normal RoPE, operating on pairs of consecutive head values
|
||||
|
|
@ -7738,6 +7829,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
|||
case LLM_ARCH_ARCEE:
|
||||
case LLM_ARCH_ERNIE4_5:
|
||||
case LLM_ARCH_ERNIE4_5_MOE:
|
||||
case LLM_ARCH_MISTRAL3:
|
||||
return LLAMA_ROPE_TYPE_NORM;
|
||||
|
||||
// the pairs of head values are offset by n_rot/2
|
||||
|
|
|
|||
|
|
@ -114,6 +114,7 @@ enum llm_type {
|
|||
LLM_TYPE_16B_A1B,
|
||||
LLM_TYPE_21B_A3B, // Ernie MoE small
|
||||
LLM_TYPE_30B_A3B,
|
||||
LLM_TYPE_31B_A3_5B,
|
||||
LLM_TYPE_80B_A3B, // Qwen3 Next
|
||||
LLM_TYPE_100B_A6B,
|
||||
LLM_TYPE_106B_A12B, // GLM-4.5-Air
|
||||
|
|
|
|||
|
|
@ -666,7 +666,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|||
|
||||
std::map<int, std::string> mapped;
|
||||
int blk_id = 0;
|
||||
int pruned_attention_w = 0;
|
||||
|
||||
// make a list of weights
|
||||
std::vector<const llama_model_loader::llama_tensor_weight *> tensors;
|
||||
|
|
@ -674,11 +673,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|||
for (const auto & it : ml.weights_map) {
|
||||
const std::string remapped_name(remap_layer(it.first, prune_list, mapped, blk_id));
|
||||
if (remapped_name.empty()) {
|
||||
if (it.first.find("attn_v.weight") != std::string::npos ||
|
||||
it.first.find("attn_qkv.weight") != std::string::npos ||
|
||||
it.first.find("attn_kv_b.weight") != std::string::npos) {
|
||||
pruned_attention_w++;
|
||||
}
|
||||
LLAMA_LOG_DEBUG("%s: pruning tensor %s\n", __func__, it.first.c_str());
|
||||
continue;
|
||||
}
|
||||
|
|
@ -703,7 +697,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|||
});
|
||||
}
|
||||
|
||||
bool is_clip_model = false;
|
||||
for (const auto * it : tensors) {
|
||||
const struct ggml_tensor * tensor = it->tensor;
|
||||
|
||||
|
|
@ -717,32 +710,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|||
} else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
|
||||
qs.has_output = true;
|
||||
}
|
||||
|
||||
is_clip_model |= name.rfind("mm.", 0) == 0; // check the "mm." prefix
|
||||
}
|
||||
|
||||
qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
|
||||
|
||||
// sanity checks for models that have attention layers
|
||||
if (qs.n_attention_wv != 0 && !is_clip_model)
|
||||
{
|
||||
const auto & n_head_kv_iter = model.hparams.n_head_kv_arr.begin();
|
||||
// attention layers have a non-zero number of kv heads
|
||||
int32_t n_layer_attn = model.hparams.n_layer - std::count(n_head_kv_iter, n_head_kv_iter + model.hparams.n_layer, 0);
|
||||
if (llama_model_has_encoder(&model)) {
|
||||
// now n_layer_attn is the number of attention layers in the encoder
|
||||
// for each decoder block, there are 2 attention layers
|
||||
n_layer_attn += 2 * model.hparams.dec_n_layer;
|
||||
}
|
||||
|
||||
// note: for linear-attention models (such as Qwen3 Next) this is the number of linear layers
|
||||
const int32_t n_layer_recr = std::count(model.hparams.recurrent_layer_arr.begin(), model.hparams.recurrent_layer_arr.end(), true);
|
||||
|
||||
LLAMA_LOG_INFO("%s: n_layer_attn = %d, n_layer_recr = %d, pruned_attention_w = %d\n", __func__, n_layer_attn, n_layer_recr, pruned_attention_w);
|
||||
|
||||
GGML_ASSERT((qs.n_attention_wv == n_layer_attn - pruned_attention_w - n_layer_recr) && "n_attention_wv is unexpected");
|
||||
}
|
||||
|
||||
size_t total_size_org = 0;
|
||||
size_t total_size_new = 0;
|
||||
|
||||
|
|
|
|||
|
|
@ -3243,8 +3243,7 @@ void llama_vocab::impl::print_info() const {
|
|||
llama_vocab::llama_vocab() : pimpl(new impl(*this)) {
|
||||
}
|
||||
|
||||
llama_vocab::~llama_vocab() {
|
||||
}
|
||||
llama_vocab::~llama_vocab() = default;
|
||||
|
||||
void llama_vocab::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
pimpl->load(ml, kv);
|
||||
|
|
|
|||
|
|
@ -30,6 +30,12 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
|
|||
// {n_embd, n_tokens}
|
||||
inpL = build_inp_embd(model.tok_embd);
|
||||
|
||||
// (optional) temperature tuning - used by mistral-large
|
||||
ggml_tensor * inp_attn_scale = nullptr;
|
||||
if (hparams.f_attn_temp_scale != 0.0f) {
|
||||
inp_attn_scale = build_inp_attn_scale();
|
||||
}
|
||||
|
||||
// inp_pos - contains the positions
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
|
|
@ -128,6 +134,12 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
|
|||
ggml_tensor * Vcur = kv_cmpr;
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
if (inp_attn_scale) {
|
||||
// apply llama 4 temperature scaling
|
||||
Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale);
|
||||
cb(Qcur, "Qcur_attn_temp_scaled", il);
|
||||
}
|
||||
|
||||
// note: MLA with the absorption optimzation converts into MQA (ie: GQA with 1 group)
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, NULL,
|
||||
|
|
@ -160,6 +172,12 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
|
|||
ggml_tensor * Kcur = ggml_concat(ctx0, ggml_repeat(ctx0, k_pe, q_pe), k_nope, 0);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
||||
if (inp_attn_scale) {
|
||||
// apply llama 4 temperature scaling
|
||||
Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale);
|
||||
cb(Qcur, "Qcur_attn_temp_scaled", il);
|
||||
}
|
||||
|
||||
// note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups)
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, NULL,
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_gemma3_iswa::llm_build_gemma3_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
template <bool iswa>
|
||||
llm_build_gemma3<iswa>::llm_build_gemma3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_k;
|
||||
|
||||
ggml_tensor * cur;
|
||||
|
|
@ -17,13 +18,28 @@ llm_build_gemma3_iswa::llm_build_gemma3_iswa(const llama_model & model, const ll
|
|||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
// TODO: is causal == true correct? might need some changes
|
||||
auto * inp_attn = build_attn_inp_kv_iswa();
|
||||
using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
|
||||
inp_attn_type * inp_attn = nullptr;
|
||||
|
||||
if constexpr (iswa) {
|
||||
inp_attn = build_attn_inp_kv_iswa();
|
||||
} else {
|
||||
inp_attn = build_attn_inp_kv();
|
||||
}
|
||||
|
||||
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
const float freq_base_l = model.get_rope_freq_base (cparams, il);
|
||||
const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
|
||||
float freq_base_l = 0.0f;
|
||||
float freq_scale_l = 0.0f;
|
||||
|
||||
if constexpr (iswa) {
|
||||
freq_base_l = model.get_rope_freq_base (cparams, il);
|
||||
freq_scale_l = model.get_rope_freq_scale(cparams, il);
|
||||
} else {
|
||||
freq_base_l = freq_base;
|
||||
freq_scale_l = freq_scale;
|
||||
}
|
||||
|
||||
// norm
|
||||
cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
|
||||
|
|
@ -102,7 +118,7 @@ llm_build_gemma3_iswa::llm_build_gemma3_iswa(const llama_model & model, const ll
|
|||
cur = build_norm(cur,
|
||||
model.layers[il].ffn_post_norm, NULL,
|
||||
LLM_NORM_RMS, -1);
|
||||
cb(cur, "ffn_post_norm", -1);
|
||||
cb(cur, "ffn_post_norm", il);
|
||||
|
||||
cur = ggml_add(ctx0, cur, sa_out);
|
||||
|
||||
|
|
@ -124,8 +140,17 @@ llm_build_gemma3_iswa::llm_build_gemma3_iswa(const llama_model & model, const ll
|
|||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
|
||||
if (hparams.f_final_logit_softcapping) {
|
||||
cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
|
||||
cur = ggml_tanh(ctx0, cur);
|
||||
cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping);
|
||||
}
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
}
|
||||
|
||||
template struct llm_build_gemma3<false>;
|
||||
template struct llm_build_gemma3<true>;
|
||||
|
|
@ -0,0 +1,160 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_mistral3::llm_build_mistral3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
||||
inpL = build_inp_embd(model.tok_embd);
|
||||
|
||||
// inp_pos - contains the positions
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
// (optional) temperature tuning
|
||||
ggml_tensor * inp_attn_scale = nullptr;
|
||||
if (hparams.f_attn_temp_scale != 0.0f) {
|
||||
inp_attn_scale = build_inp_attn_scale();
|
||||
}
|
||||
|
||||
auto * inp_attn = build_attn_inp_kv();
|
||||
|
||||
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
||||
|
||||
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
ggml_tensor * inpSA = inpL;
|
||||
|
||||
// norm
|
||||
cur = build_norm(inpL,
|
||||
model.layers[il].attn_norm, NULL,
|
||||
LLM_NORM_RMS, il);
|
||||
cb(cur, "attn_norm", il);
|
||||
|
||||
// self-attention
|
||||
{
|
||||
// rope freq factors for llama3; may return nullptr for llama2 and other models
|
||||
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
|
||||
|
||||
// compute Q and K and RoPE them
|
||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
||||
cb(Qcur, "Qcur", il);
|
||||
if (model.layers[il].bq) {
|
||||
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
||||
cb(Qcur, "Qcur", il);
|
||||
}
|
||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
||||
cb(Kcur, "Kcur", il);
|
||||
if (model.layers[il].bk) {
|
||||
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
||||
cb(Kcur, "Kcur", il);
|
||||
}
|
||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
||||
cb(Vcur, "Vcur", il);
|
||||
if (model.layers[il].bv) {
|
||||
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
||||
cb(Vcur, "Vcur", il);
|
||||
}
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
||||
|
||||
Qcur = ggml_rope_ext(
|
||||
ctx0, Qcur, inp_pos, rope_factors,
|
||||
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
|
||||
Kcur = ggml_rope_ext(
|
||||
ctx0, Kcur, inp_pos, rope_factors,
|
||||
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
|
||||
cb(Qcur, "Qcur", il);
|
||||
cb(Kcur, "Kcur", il);
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
if (inp_attn_scale) {
|
||||
// apply llama 4 temperature scaling
|
||||
Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale);
|
||||
cb(Qcur, "Qcur_attn_temp_scaled", il);
|
||||
}
|
||||
|
||||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
||||
cb(cur, "attn_out", il);
|
||||
}
|
||||
if (il == n_layer - 1 && inp_out_ids) {
|
||||
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
||||
}
|
||||
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
||||
cb(ffn_inp, "ffn_inp", il);
|
||||
|
||||
// feed-forward network (non-MoE)
|
||||
if (model.layers[il].ffn_gate_inp == nullptr) {
|
||||
|
||||
cur = build_norm(ffn_inp,
|
||||
model.layers[il].ffn_norm, NULL,
|
||||
LLM_NORM_RMS, il);
|
||||
cb(cur, "ffn_norm", il);
|
||||
|
||||
cur = build_ffn(cur,
|
||||
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
|
||||
model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
|
||||
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
|
||||
NULL,
|
||||
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
} else {
|
||||
// MoE branch
|
||||
cur = build_norm(ffn_inp,
|
||||
model.layers[il].ffn_norm, NULL,
|
||||
LLM_NORM_RMS, il);
|
||||
cb(cur, "ffn_norm", il);
|
||||
|
||||
cur = build_moe_ffn(cur,
|
||||
model.layers[il].ffn_gate_inp,
|
||||
model.layers[il].ffn_up_exps,
|
||||
model.layers[il].ffn_gate_exps,
|
||||
model.layers[il].ffn_down_exps,
|
||||
nullptr,
|
||||
n_expert, n_expert_used,
|
||||
LLM_FFN_SILU, true,
|
||||
false, 0.0,
|
||||
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
|
||||
il);
|
||||
cb(cur, "ffn_moe_out", il);
|
||||
}
|
||||
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||
cb(cur, "ffn_out", il);
|
||||
|
||||
cur = build_cvec(cur, il);
|
||||
cb(cur, "l_out", il);
|
||||
|
||||
// input for next layer
|
||||
inpL = cur;
|
||||
}
|
||||
cur = inpL;
|
||||
|
||||
cur = build_norm(cur,
|
||||
model.output_norm, NULL,
|
||||
LLM_NORM_RMS, -1);
|
||||
|
||||
cb(cur, "result_norm", -1);
|
||||
res->t_embd = cur;
|
||||
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
}
|
||||
|
|
@ -179,8 +179,9 @@ struct llm_build_gemma2_iswa : public llm_graph_context {
|
|||
llm_build_gemma2_iswa(const llama_model & model, const llm_graph_params & params);
|
||||
};
|
||||
|
||||
struct llm_build_gemma3_iswa : public llm_graph_context {
|
||||
llm_build_gemma3_iswa(const llama_model & model, const llm_graph_params & params);
|
||||
template <bool iswa>
|
||||
struct llm_build_gemma3 : public llm_graph_context {
|
||||
llm_build_gemma3(const llama_model & model, const llm_graph_params & params);
|
||||
};
|
||||
|
||||
struct llm_build_gemma3n_iswa : public llm_graph_context {
|
||||
|
|
@ -322,6 +323,10 @@ struct llm_build_minimax_m2 : public llm_graph_context {
|
|||
llm_build_minimax_m2(const llama_model & model, const llm_graph_params & params);
|
||||
};
|
||||
|
||||
struct llm_build_mistral3 : public llm_graph_context {
|
||||
llm_build_mistral3(const llama_model & model, const llm_graph_params & params);
|
||||
};
|
||||
|
||||
struct llm_build_mpt : public llm_graph_context {
|
||||
llm_build_mpt(const llama_model & model, const llm_graph_params & params);
|
||||
};
|
||||
|
|
|
|||
|
|
@ -107,12 +107,41 @@ ggml_tensor * llm_build_nemotron_h::build_attention_layer(ggml_tensor *
|
|||
}
|
||||
|
||||
ggml_tensor * llm_build_nemotron_h::build_ffn_layer(ggml_tensor * cur, const llama_model & model, const int il) {
|
||||
cur = build_ffn(cur,
|
||||
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
|
||||
NULL, NULL, NULL,
|
||||
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
|
||||
NULL, LLM_FFN_RELU_SQR, LLM_FFN_PAR, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
if (model.layers[il].ffn_gate_inp == nullptr) {
|
||||
cur = build_ffn(cur,
|
||||
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
|
||||
NULL, NULL, NULL,
|
||||
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
|
||||
NULL,
|
||||
LLM_FFN_RELU_SQR, LLM_FFN_PAR, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
} else {
|
||||
ggml_tensor * ffn_inp = cur;
|
||||
ggml_tensor * moe_out =
|
||||
build_moe_ffn(ffn_inp,
|
||||
model.layers[il].ffn_gate_inp,
|
||||
model.layers[il].ffn_up_exps,
|
||||
nullptr, // no gate
|
||||
model.layers[il].ffn_down_exps,
|
||||
model.layers[il].ffn_exp_probs_b,
|
||||
n_expert, n_expert_used,
|
||||
LLM_FFN_RELU_SQR, hparams.expert_weights_norm,
|
||||
true, hparams.expert_weights_scale,
|
||||
LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID,
|
||||
il);
|
||||
cb(moe_out, "ffn_moe_out", il);
|
||||
|
||||
ggml_tensor * ffn_shexp = build_ffn(ffn_inp,
|
||||
model.layers[il].ffn_up_shexp, NULL, NULL,
|
||||
NULL /* no gate */ , NULL, NULL,
|
||||
model.layers[il].ffn_down_shexp, NULL, NULL,
|
||||
NULL,
|
||||
LLM_FFN_RELU_SQR, LLM_FFN_PAR, il);
|
||||
cb(ffn_shexp, "ffn_shexp", il);
|
||||
|
||||
cur = ggml_add(ctx0, moe_out, ffn_shexp);
|
||||
cb(cur, "ffn_out", il);
|
||||
}
|
||||
|
||||
cur = build_cvec(cur, il);
|
||||
cb(cur, "l_out", il);
|
||||
|
|
|
|||
|
|
@ -520,7 +520,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
|
|||
|
||||
// use std::wregex to split the text
|
||||
static std::vector<size_t> unicode_regex_split_stl(const std::wstring & wtext, const std::wstring & regex_expr, const std::vector<size_t> & offsets) {
|
||||
std::wregex expr(regex_expr);
|
||||
std::wregex expr(regex_expr, std::regex_constants::optimize | std::regex_constants::nosubs);
|
||||
std::vector<size_t> bpe_offsets; // store the offset of each word
|
||||
bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
|
||||
size_t start = 0;
|
||||
|
|
@ -550,7 +550,7 @@ static std::vector<size_t> unicode_regex_split_stl(const std::wstring & wtext, c
|
|||
|
||||
// use std::regex to split the text
|
||||
static std::vector<size_t> unicode_regex_split_stl(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) {
|
||||
std::regex expr(regex_expr);
|
||||
std::regex expr(regex_expr, std::regex_constants::optimize | std::regex_constants::nosubs);
|
||||
std::vector<size_t> bpe_offsets; // store the offset of each word
|
||||
bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
|
||||
size_t start = 0;
|
||||
|
|
|
|||
|
|
@ -441,6 +441,7 @@ struct clip_ctx {
|
|||
int max_nodes = 8192;
|
||||
ggml_backend_sched_ptr sched;
|
||||
clip_flash_attn_type flash_attn_type = CLIP_FLASH_ATTN_TYPE_AUTO;
|
||||
bool is_allocated = false;
|
||||
|
||||
// for debugging
|
||||
bool debug_graph = false;
|
||||
|
|
@ -2033,7 +2034,7 @@ private:
|
|||
ggml_tensor * pos_embd = model.position_embeddings;
|
||||
const int height = img.ny / patch_size;
|
||||
const int width = img.nx / patch_size;
|
||||
const uint32_t mode = GGML_SCALE_MODE_BILINEAR;
|
||||
const uint32_t mode = GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ANTIALIAS;
|
||||
const int n_per_side = (int)std::sqrt(pos_embd->ne[1]);
|
||||
|
||||
GGML_ASSERT(pos_embd);
|
||||
|
|
@ -2812,7 +2813,8 @@ struct clip_model_loader {
|
|||
{
|
||||
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
|
||||
// ref: https://huggingface.co/LiquidAI/LFM2-VL-3B/blob/main/preprocessor_config.json
|
||||
hparams.set_limit_image_tokens(64, 256);
|
||||
// config above specifies number of tokens after downsampling, while here it is before, relax lowerbound to 64
|
||||
hparams.set_limit_image_tokens(64, 1024);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_PIXTRAL:
|
||||
case PROJECTOR_TYPE_LIGHTONOCR:
|
||||
|
|
@ -3347,12 +3349,30 @@ struct clip_model_loader {
|
|||
};
|
||||
|
||||
static void warmup(clip_ctx & ctx_clip) {
|
||||
// create a fake batch
|
||||
const auto & hparams = ctx_clip.model.hparams;
|
||||
clip_image_f32_batch batch;
|
||||
clip_image_f32_ptr img(clip_image_f32_init());
|
||||
if (ctx_clip.model.modality == CLIP_MODALITY_VISION) {
|
||||
img->nx = hparams.warmup_image_size;
|
||||
img->ny = hparams.warmup_image_size;
|
||||
LOG_INF("%s: warmup with image size = %d x %d\n", __func__, img->nx, img->ny);
|
||||
} else {
|
||||
img->nx = hparams.warmup_audio_size;
|
||||
img->ny = hparams.n_mel_bins;
|
||||
LOG_INF("%s: warmup with audio size = %d\n", __func__, img->nx);
|
||||
}
|
||||
batch.entries.push_back(std::move(img));
|
||||
warmup(ctx_clip, batch);
|
||||
}
|
||||
|
||||
static void warmup(clip_ctx & ctx_clip, const clip_image_f32_batch & batch) {
|
||||
support_info_graph info;
|
||||
|
||||
if (ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_AUTO) {
|
||||
// try to enable flash attention to see if it's supported
|
||||
ctx_clip.flash_attn_type = CLIP_FLASH_ATTN_TYPE_ENABLED;
|
||||
info = alloc_compute_meta(ctx_clip);
|
||||
info = alloc_compute_meta(ctx_clip, batch);
|
||||
if (!info.fattn && info.fattn_op) {
|
||||
auto op = info.fattn_op;
|
||||
LOG_WRN("%s: *****************************************************************\n", __func__);
|
||||
|
|
@ -3371,15 +3391,17 @@ struct clip_model_loader {
|
|||
LOG_WRN("%s: please report this on github as an issue\n", __func__);
|
||||
LOG_WRN("%s: *****************************************************************\n", __func__);
|
||||
ctx_clip.flash_attn_type = CLIP_FLASH_ATTN_TYPE_DISABLED;
|
||||
alloc_compute_meta(ctx_clip);
|
||||
alloc_compute_meta(ctx_clip, batch);
|
||||
}
|
||||
} else {
|
||||
info = alloc_compute_meta(ctx_clip);
|
||||
info = alloc_compute_meta(ctx_clip, batch);
|
||||
if (!info.fattn && ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) {
|
||||
LOG_WRN("%s: flash attention is not supported by the current backend; falling back to CPU (performance will be degraded)\n", __func__);
|
||||
}
|
||||
}
|
||||
|
||||
ctx_clip.is_allocated = true; // mark buffers as allocated
|
||||
|
||||
LOG_INF("%s: flash attention is %s\n", __func__,
|
||||
(ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) ? "enabled" : "disabled");
|
||||
|
||||
|
|
@ -3411,24 +3433,9 @@ struct clip_model_loader {
|
|||
}
|
||||
}
|
||||
|
||||
static support_info_graph alloc_compute_meta(clip_ctx & ctx_clip) {
|
||||
const auto & hparams = ctx_clip.model.hparams;
|
||||
static support_info_graph alloc_compute_meta(clip_ctx & ctx_clip, const clip_image_f32_batch & batch) {
|
||||
ctx_clip.buf_compute_meta.resize(ctx_clip.max_nodes * ggml_tensor_overhead() + ggml_graph_overhead());
|
||||
|
||||
// create a fake batch
|
||||
clip_image_f32_batch batch;
|
||||
clip_image_f32_ptr img(clip_image_f32_init());
|
||||
if (ctx_clip.model.modality == CLIP_MODALITY_VISION) {
|
||||
img->nx = hparams.warmup_image_size;
|
||||
img->ny = hparams.warmup_image_size;
|
||||
LOG_INF("%s: warmup with image size = %d x %d\n", __func__, img->nx, img->ny);
|
||||
} else {
|
||||
img->nx = hparams.warmup_audio_size;
|
||||
img->ny = hparams.n_mel_bins;
|
||||
LOG_INF("%s: warmup with audio size = %d\n", __func__, img->nx);
|
||||
}
|
||||
batch.entries.push_back(std::move(img));
|
||||
|
||||
ggml_cgraph * gf = clip_image_build_graph(&ctx_clip, batch);
|
||||
ggml_backend_sched_reserve(ctx_clip.sched.get(), gf);
|
||||
|
||||
|
|
@ -3568,14 +3575,18 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
|
|||
ctx_vision = new clip_ctx(ctx_params);
|
||||
loader.load_hparams(ctx_vision->model, CLIP_MODALITY_VISION);
|
||||
loader.load_tensors(*ctx_vision);
|
||||
loader.warmup(*ctx_vision);
|
||||
if (ctx_params.warmup) {
|
||||
loader.warmup(*ctx_vision);
|
||||
}
|
||||
}
|
||||
|
||||
if (loader.has_audio) {
|
||||
ctx_audio = new clip_ctx(ctx_params);
|
||||
loader.load_hparams(ctx_audio->model, CLIP_MODALITY_AUDIO);
|
||||
loader.load_tensors(*ctx_audio);
|
||||
loader.warmup(*ctx_audio);
|
||||
if (ctx_params.warmup) {
|
||||
loader.warmup(*ctx_audio);
|
||||
}
|
||||
}
|
||||
|
||||
} catch (const std::exception & e) {
|
||||
|
|
@ -3788,12 +3799,13 @@ struct img_tool {
|
|||
const int width = inp_size.width;
|
||||
const int height = inp_size.height;
|
||||
|
||||
auto round_by_factor = [f = align_size](float x) { return static_cast<int>(std::round(x / static_cast<float>(f))) * f; };
|
||||
auto ceil_by_factor = [f = align_size](float x) { return static_cast<int>(std::ceil(x / static_cast<float>(f))) * f; };
|
||||
auto floor_by_factor = [f = align_size](float x) { return static_cast<int>(std::floor(x / static_cast<float>(f))) * f; };
|
||||
|
||||
// always align up first
|
||||
int h_bar = std::max(align_size, ceil_by_factor(height));
|
||||
int w_bar = std::max(align_size, ceil_by_factor(width));
|
||||
int h_bar = std::max(align_size, round_by_factor(height));
|
||||
int w_bar = std::max(align_size, round_by_factor(width));
|
||||
|
||||
if (h_bar * w_bar > max_pixels) {
|
||||
const auto beta = std::sqrt(static_cast<float>(height * width) / max_pixels);
|
||||
|
|
@ -4408,7 +4420,8 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
|
|||
const std::array<uint8_t, 3> pad_color = {122, 116, 104};
|
||||
|
||||
clip_image_u8 resized_img;
|
||||
img_tool::resize(*img, resized_img, target_size, img_tool::RESIZE_ALGO_BILINEAR, true, pad_color);
|
||||
const bool pad = (ctx->proj_type() != PROJECTOR_TYPE_LFM2);
|
||||
img_tool::resize(*img, resized_img, target_size, img_tool::RESIZE_ALGO_BILINEAR, pad, pad_color);
|
||||
clip_image_f32_ptr res(clip_image_f32_init());
|
||||
normalize_image_u8_to_f32(resized_img, *res, params.image_mean, params.image_std);
|
||||
res_imgs->entries.push_back(std::move(res));
|
||||
|
|
@ -4666,6 +4679,11 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
|||
return false; // only support batch size of 1
|
||||
}
|
||||
|
||||
// if buffers are not allocated, we need to do a warmup run to allocate them
|
||||
if (!ctx->is_allocated) {
|
||||
clip_model_loader::warmup(*ctx, *imgs_c_ptr);
|
||||
}
|
||||
|
||||
// build the inference graph
|
||||
ctx->debug_print_tensors.clear();
|
||||
ggml_backend_sched_reset(ctx->sched.get());
|
||||
|
|
|
|||
|
|
@ -34,6 +34,7 @@ struct clip_context_params {
|
|||
enum clip_flash_attn_type flash_attn_type;
|
||||
int image_min_tokens;
|
||||
int image_max_tokens;
|
||||
bool warmup;
|
||||
};
|
||||
|
||||
struct clip_init_result {
|
||||
|
|
|
|||
|
|
@ -118,6 +118,7 @@ mtmd_context_params mtmd_context_params_default() {
|
|||
/* image_marker */ MTMD_DEFAULT_IMAGE_MARKER,
|
||||
/* media_marker */ mtmd_default_marker(),
|
||||
/* flash_attn_type */ LLAMA_FLASH_ATTN_TYPE_AUTO,
|
||||
/* warmup */ true,
|
||||
/* image_min_tokens */ -1,
|
||||
/* image_max_tokens */ -1,
|
||||
};
|
||||
|
|
@ -187,6 +188,7 @@ struct mtmd_context {
|
|||
/* flash_attn_type */ CLIP_FLASH_ATTN_TYPE_AUTO,
|
||||
/* image_min_tokens */ ctx_params.image_min_tokens,
|
||||
/* image_max_tokens */ ctx_params.image_max_tokens,
|
||||
/* warmup */ ctx_params.warmup,
|
||||
};
|
||||
|
||||
auto res = clip_init(mmproj_fname, ctx_clip_params);
|
||||
|
|
@ -314,6 +316,10 @@ struct mtmd_context {
|
|||
img_beg = "<|im_start|>";
|
||||
img_end = "<|im_end|>";
|
||||
|
||||
} else if (proj == PROJECTOR_TYPE_LFM2) {
|
||||
img_beg = "<|image_start|>";
|
||||
img_end = "<|image_end|>";
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -85,6 +85,7 @@ struct mtmd_context_params {
|
|||
const char * image_marker; // deprecated, use media_marker instead
|
||||
const char * media_marker;
|
||||
enum llama_flash_attn_type flash_attn_type;
|
||||
bool warmup; // whether to run a warmup encode pass after initialization
|
||||
|
||||
// limit number of image tokens, only for vision models with dynamic resolution
|
||||
int image_min_tokens; // minimum number of tokens for image input (default: read from metadata)
|
||||
|
|
|
|||
|
|
@ -118,18 +118,22 @@ type ContextParams struct {
|
|||
c C.struct_llama_context_params
|
||||
}
|
||||
|
||||
func NewContextParams(numCtx int, batchSize int, numSeqMax int, threads int, flashAttention bool, kvCacheType string) ContextParams {
|
||||
func NewContextParams(numCtx int, batchSize int, numSeqMax int, threads int, flashAttention ml.FlashAttentionType, kvCacheType string) ContextParams {
|
||||
params := C.llama_context_default_params()
|
||||
params.n_ctx = C.uint(numCtx)
|
||||
params.n_batch = C.uint(batchSize)
|
||||
params.n_batch = C.uint(batchSize * numSeqMax)
|
||||
params.n_ubatch = C.uint(batchSize)
|
||||
params.n_seq_max = C.uint(numSeqMax)
|
||||
params.n_threads = C.int(threads)
|
||||
params.n_threads_batch = params.n_threads
|
||||
params.embeddings = C.bool(true)
|
||||
if flashAttention {
|
||||
params.flash_attn_type = C.LLAMA_FLASH_ATTN_TYPE_ENABLED
|
||||
} else {
|
||||
params.flash_attn_type = C.LLAMA_FLASH_ATTN_TYPE_DISABLED
|
||||
switch flashAttention {
|
||||
case ml.FlashAttentionEnabled:
|
||||
params.flash_attn_type = int32(C.LLAMA_FLASH_ATTN_TYPE_ENABLED)
|
||||
case ml.FlashAttentionDisabled:
|
||||
params.flash_attn_type = int32(C.LLAMA_FLASH_ATTN_TYPE_DISABLED)
|
||||
case ml.FlashAttentionAuto:
|
||||
params.flash_attn_type = int32(C.LLAMA_FLASH_ATTN_TYPE_AUTO)
|
||||
}
|
||||
params.type_k = kvCacheTypeFromStr(strings.ToLower(kvCacheType))
|
||||
params.type_v = kvCacheTypeFromStr(strings.ToLower(kvCacheType))
|
||||
|
|
|
|||
|
|
@ -23,7 +23,7 @@ problem.
|
|||
8 files changed, 21 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
|
||||
index 4cf377e7f..4882541c8 100644
|
||||
index 08681f35e..afde2f0b7 100644
|
||||
--- a/ggml/src/ggml-backend.cpp
|
||||
+++ b/ggml/src/ggml-backend.cpp
|
||||
@@ -113,7 +113,6 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
|
||||
|
|
@ -42,7 +42,7 @@ index 4cf377e7f..4882541c8 100644
|
|||
}
|
||||
|
||||
static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
||||
@@ -2079,6 +2079,11 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
@@ -2106,6 +2106,11 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
GGML_ASSERT(buffer);
|
||||
ggml_aligned_free(buffer->context, buffer->size);
|
||||
|
|
@ -54,7 +54,7 @@ index 4cf377e7f..4882541c8 100644
|
|||
}
|
||||
|
||||
static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
|
||||
@@ -2131,7 +2136,7 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
|
||||
@@ -2158,7 +2163,7 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
|
||||
};
|
||||
|
||||
static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
|
||||
|
|
@ -64,7 +64,7 @@ index 4cf377e7f..4882541c8 100644
|
|||
/* .init_tensor = */ NULL, // no initialization required
|
||||
/* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
|
||||
diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
|
||||
index df28d67fb..1f6a56ba2 100644
|
||||
index 81288464c..866758782 100644
|
||||
--- a/ggml/src/ggml-cann/ggml-cann.cpp
|
||||
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
|
||||
@@ -831,6 +831,7 @@ static bool ggml_backend_buffer_is_cann(ggml_backend_buffer_t buffer) {
|
||||
|
|
@ -84,10 +84,10 @@ index df28d67fb..1f6a56ba2 100644
|
|||
|
||||
/**
|
||||
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
index fa7e1e13a..8f3b1c173 100644
|
||||
index 279679a4e..5145c1e88 100644
|
||||
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
@@ -579,6 +579,7 @@ struct ggml_backend_cuda_buffer_context {
|
||||
@@ -583,6 +583,7 @@ struct ggml_backend_cuda_buffer_context {
|
||||
static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
||||
delete ctx;
|
||||
|
|
@ -95,7 +95,7 @@ index fa7e1e13a..8f3b1c173 100644
|
|||
}
|
||||
|
||||
static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
|
||||
@@ -834,6 +835,7 @@ struct ggml_backend_cuda_split_buffer_context {
|
||||
@@ -838,6 +839,7 @@ struct ggml_backend_cuda_split_buffer_context {
|
||||
static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
|
||||
delete ctx;
|
||||
|
|
@ -103,7 +103,7 @@ index fa7e1e13a..8f3b1c173 100644
|
|||
}
|
||||
|
||||
static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
@@ -1115,6 +1117,7 @@ static bool ggml_backend_buft_is_cuda_host(ggml_backend_buffer_type_t buft) {
|
||||
@@ -1119,6 +1121,7 @@ static bool ggml_backend_buft_is_cuda_host(ggml_backend_buffer_type_t buft) {
|
||||
|
||||
static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
CUDA_CHECK(cudaFreeHost(buffer->context));
|
||||
|
|
@ -132,10 +132,10 @@ index 70bf6f3d9..f2b7fe692 100644
|
|||
|
||||
static void * ggml_backend_metal_buffer_private_get_base(ggml_backend_buffer_t buffer) {
|
||||
diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp
|
||||
index e5302f455..43fa83e8f 100644
|
||||
index 0d37587f6..ff373d413 100644
|
||||
--- a/ggml/src/ggml-opencl/ggml-opencl.cpp
|
||||
+++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
|
||||
@@ -3412,6 +3412,7 @@ struct ggml_backend_opencl_buffer_context {
|
||||
@@ -3417,6 +3417,7 @@ struct ggml_backend_opencl_buffer_context {
|
||||
static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
|
||||
delete ctx;
|
||||
|
|
@ -144,10 +144,10 @@ index e5302f455..43fa83e8f 100644
|
|||
|
||||
static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp
|
||||
index 48fd99a76..da2aab3df 100644
|
||||
index 18a45d2d9..89041805e 100644
|
||||
--- a/ggml/src/ggml-rpc/ggml-rpc.cpp
|
||||
+++ b/ggml/src/ggml-rpc/ggml-rpc.cpp
|
||||
@@ -555,6 +555,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
@@ -556,6 +556,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_FREE_BUFFER, &request, sizeof(request), nullptr, 0);
|
||||
RPC_STATUS_ASSERT(status);
|
||||
delete ctx;
|
||||
|
|
@ -156,7 +156,7 @@ index 48fd99a76..da2aab3df 100644
|
|||
|
||||
static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
|
||||
index 3f1bdfb9f..a95c2f305 100644
|
||||
index 7449a9160..e69a1ff5f 100644
|
||||
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
|
||||
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
|
||||
@@ -355,6 +355,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
|
||||
|
|
@ -184,10 +184,10 @@ index 3f1bdfb9f..a95c2f305 100644
|
|||
|
||||
static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
index 66dd0bfab..83cdec29e 100644
|
||||
index c6f5809cc..c801d2fd2 100644
|
||||
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
@@ -12368,6 +12368,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
@@ -12271,6 +12271,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
|
||||
ggml_vk_destroy_buffer(ctx->dev_buffer);
|
||||
delete ctx;
|
||||
|
|
@ -195,7 +195,7 @@ index 66dd0bfab..83cdec29e 100644
|
|||
}
|
||||
|
||||
static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
@@ -12511,6 +12512,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
|
||||
@@ -12414,6 +12415,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
|
||||
static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()");
|
||||
ggml_vk_host_free(vk_instance.devices[0], buffer->context);
|
||||
|
|
|
|||
|
|
@ -10,7 +10,7 @@ logs instead of throwing an error
|
|||
1 file changed, 3 insertions(+), 11 deletions(-)
|
||||
|
||||
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
|
||||
index a73c4c448..b9f0631f4 100644
|
||||
index e2cca66e4..8246a0a14 100644
|
||||
--- a/src/llama-vocab.cpp
|
||||
+++ b/src/llama-vocab.cpp
|
||||
@@ -1825,16 +1825,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
|
|
|
|||
|
|
@ -10,7 +10,7 @@ filesystems for paths that include wide characters
|
|||
1 file changed, 39 insertions(+)
|
||||
|
||||
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
|
||||
index 05777d2d9..f4c4d2c48 100644
|
||||
index 3ed08a0fe..6be1470ad 100644
|
||||
--- a/tools/mtmd/clip.cpp
|
||||
+++ b/tools/mtmd/clip.cpp
|
||||
@@ -24,6 +24,19 @@
|
||||
|
|
@ -33,7 +33,7 @@ index 05777d2d9..f4c4d2c48 100644
|
|||
struct clip_logger_state g_logger_state = {clip_log_callback_default, NULL};
|
||||
|
||||
enum ffn_op_type {
|
||||
@@ -3255,7 +3268,29 @@ struct clip_model_loader {
|
||||
@@ -3257,7 +3270,29 @@ struct clip_model_loader {
|
||||
{
|
||||
std::vector<uint8_t> read_buf;
|
||||
|
||||
|
|
@ -63,7 +63,7 @@ index 05777d2d9..f4c4d2c48 100644
|
|||
if (!fin) {
|
||||
throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
|
||||
}
|
||||
@@ -3282,7 +3317,11 @@ struct clip_model_loader {
|
||||
@@ -3284,7 +3319,11 @@ struct clip_model_loader {
|
||||
ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -19,7 +19,7 @@ adds support for the Solar Pro architecture
|
|||
create mode 100644 src/models/solar.cpp
|
||||
|
||||
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
|
||||
index 67c7807e0..fda881640 100644
|
||||
index 4192af7c0..bd44d73e7 100644
|
||||
--- a/src/CMakeLists.txt
|
||||
+++ b/src/CMakeLists.txt
|
||||
@@ -125,6 +125,7 @@ add_library(llama
|
||||
|
|
@ -31,7 +31,7 @@ index 67c7807e0..fda881640 100644
|
|||
models/starcoder.cpp
|
||||
models/starcoder2.cpp
|
||||
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
|
||||
index 8571a2e02..b6bde25d5 100644
|
||||
index 64ad1b776..a5fe4f66c 100644
|
||||
--- a/src/llama-arch.cpp
|
||||
+++ b/src/llama-arch.cpp
|
||||
@@ -85,6 +85,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
|
|
@ -42,15 +42,15 @@ index 8571a2e02..b6bde25d5 100644
|
|||
{ LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
|
||||
{ LLM_ARCH_PLM, "plm" },
|
||||
{ LLM_ARCH_BAILINGMOE, "bailingmoe" },
|
||||
@@ -204,6 +205,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
|
||||
@@ -206,6 +207,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||
{ LLM_KV_ATTENTION_OUTPUT_SCALE, "%s.attention.output_scale" },
|
||||
{ LLM_KV_ATTENTION_TEMPERATURE_LENGTH, "%s.attention.temperature_length" },
|
||||
{ LLM_KV_ATTENTION_TEMPERATURE_SCALE, "%s.attention.temperature_scale" },
|
||||
+ { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
|
||||
{ LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
|
||||
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
|
||||
|
||||
@@ -2023,6 +2025,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
@@ -2025,6 +2027,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
|
||||
},
|
||||
},
|
||||
|
|
@ -75,7 +75,7 @@ index 8571a2e02..b6bde25d5 100644
|
|||
{
|
||||
LLM_ARCH_WAVTOKENIZER_DEC,
|
||||
{
|
||||
@@ -2681,6 +2701,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
|
||||
@@ -2710,6 +2730,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
|
||||
{LLM_TENSOR_LAUREL_POST_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
// this tensor is loaded for T5, but never used
|
||||
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
|
||||
|
|
@ -84,7 +84,7 @@ index 8571a2e02..b6bde25d5 100644
|
|||
{LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
{LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
diff --git a/src/llama-arch.h b/src/llama-arch.h
|
||||
index 150646478..3936a4687 100644
|
||||
index e11318002..ec9e3a6df 100644
|
||||
--- a/src/llama-arch.h
|
||||
+++ b/src/llama-arch.h
|
||||
@@ -89,6 +89,7 @@ enum llm_arch {
|
||||
|
|
@ -95,15 +95,15 @@ index 150646478..3936a4687 100644
|
|||
LLM_ARCH_WAVTOKENIZER_DEC,
|
||||
LLM_ARCH_PLM,
|
||||
LLM_ARCH_BAILINGMOE,
|
||||
@@ -208,6 +209,7 @@ enum llm_kv {
|
||||
LLM_KV_ATTENTION_SCALE,
|
||||
@@ -210,6 +211,7 @@ enum llm_kv {
|
||||
LLM_KV_ATTENTION_OUTPUT_SCALE,
|
||||
LLM_KV_ATTENTION_TEMPERATURE_LENGTH,
|
||||
LLM_KV_ATTENTION_TEMPERATURE_SCALE,
|
||||
+ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
|
||||
LLM_KV_ATTENTION_KEY_LENGTH_MLA,
|
||||
LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
|
||||
|
||||
@@ -459,6 +461,7 @@ enum llm_tensor {
|
||||
@@ -462,6 +464,7 @@ enum llm_tensor {
|
||||
LLM_TENSOR_ENC_OUTPUT_NORM,
|
||||
LLM_TENSOR_CLS,
|
||||
LLM_TENSOR_CLS_OUT,
|
||||
|
|
@ -131,7 +131,7 @@ index 8cdbaf69f..41127bf91 100644
|
|||
if (il < n_layer) {
|
||||
return swa_layers[il];
|
||||
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
|
||||
index c3a53be79..2ffe7dd30 100644
|
||||
index 6eff334a5..a778fc3cf 100644
|
||||
--- a/src/llama-hparams.h
|
||||
+++ b/src/llama-hparams.h
|
||||
@@ -64,6 +64,8 @@ struct llama_hparams {
|
||||
|
|
@ -167,10 +167,10 @@ index aa3a65f87..ee303bd58 100644
|
|||
llama_model_loader::llama_model_loader(
|
||||
const std::string & fname,
|
||||
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
|
||||
index c2a545531..4468de2f9 100644
|
||||
index 04fccc979..3c503b424 100644
|
||||
--- a/src/llama-model.cpp
|
||||
+++ b/src/llama-model.cpp
|
||||
@@ -1961,6 +1961,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
@@ -1975,6 +1975,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
|
|
@ -192,7 +192,7 @@ index c2a545531..4468de2f9 100644
|
|||
case LLM_ARCH_WAVTOKENIZER_DEC:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
@@ -5350,6 +5365,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
@@ -5401,6 +5416,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
|
||||
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
||||
|
||||
|
|
@ -227,7 +227,7 @@ index c2a545531..4468de2f9 100644
|
|||
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
||||
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
||||
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
||||
@@ -7425,6 +7468,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
|
||||
@@ -7480,6 +7523,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
|
||||
{
|
||||
llm = std::make_unique<llm_build_chameleon>(*this, params);
|
||||
} break;
|
||||
|
|
@ -238,7 +238,7 @@ index c2a545531..4468de2f9 100644
|
|||
case LLM_ARCH_WAVTOKENIZER_DEC:
|
||||
{
|
||||
llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params);
|
||||
@@ -7684,6 +7731,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
||||
@@ -7743,6 +7790,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
||||
case LLM_ARCH_GRANITE_MOE:
|
||||
case LLM_ARCH_GRANITE_HYBRID:
|
||||
case LLM_ARCH_CHAMELEON:
|
||||
|
|
@ -268,10 +268,10 @@ index f8342cf2c..cbf4e1bfa 100644
|
|||
|
||||
struct llama_layer_convnext convnext;
|
||||
diff --git a/src/models/models.h b/src/models/models.h
|
||||
index 7ba225b47..71fea796d 100644
|
||||
index 6494f5450..e0aec822c 100644
|
||||
--- a/src/models/models.h
|
||||
+++ b/src/models/models.h
|
||||
@@ -510,6 +510,11 @@ struct llm_build_smollm3 : public llm_graph_context {
|
||||
@@ -515,6 +515,11 @@ struct llm_build_smollm3 : public llm_graph_context {
|
||||
llm_build_smollm3(const llama_model & model, const llm_graph_params & params);
|
||||
};
|
||||
|
||||
|
|
|
|||
|
|
@ -12,7 +12,7 @@ regex
|
|||
2 files changed, 22 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
|
||||
index b9f0631f4..1525283d7 100644
|
||||
index 8246a0a14..dfba7778b 100644
|
||||
--- a/src/llama-vocab.cpp
|
||||
+++ b/src/llama-vocab.cpp
|
||||
@@ -299,7 +299,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
|
||||
|
|
@ -25,7 +25,7 @@ index b9f0631f4..1525283d7 100644
|
|||
"\\s+$",
|
||||
"[一-龥ࠀ-一가-]+",
|
||||
diff --git a/src/unicode.cpp b/src/unicode.cpp
|
||||
index 77ba4fc46..040518e1e 100644
|
||||
index bb44edfad..13ced055f 100644
|
||||
--- a/src/unicode.cpp
|
||||
+++ b/src/unicode.cpp
|
||||
@@ -2,6 +2,11 @@
|
||||
|
|
|
|||
|
|
@ -8,7 +8,7 @@ Subject: [PATCH] maintain ordering for rules for grammar
|
|||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp
|
||||
index c8421e1e8..cb659915d 100644
|
||||
index c3b4e5d9d..6be552826 100644
|
||||
--- a/common/json-schema-to-grammar.cpp
|
||||
+++ b/common/json-schema-to-grammar.cpp
|
||||
@@ -310,7 +310,7 @@ private:
|
||||
|
|
|
|||
|
|
@ -11,10 +11,10 @@ with the fastest acceleration is loaded
|
|||
1 file changed, 13 insertions(+), 8 deletions(-)
|
||||
|
||||
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
|
||||
index e96b5c403..a55d9b280 100644
|
||||
index 4181a714a..079dba211 100644
|
||||
--- a/ggml/src/ggml-backend-reg.cpp
|
||||
+++ b/ggml/src/ggml-backend-reg.cpp
|
||||
@@ -179,7 +179,7 @@ struct ggml_backend_reg_entry {
|
||||
@@ -183,7 +183,7 @@ struct ggml_backend_reg_entry {
|
||||
|
||||
struct ggml_backend_registry {
|
||||
std::vector<ggml_backend_reg_entry> backends;
|
||||
|
|
@ -23,7 +23,7 @@ index e96b5c403..a55d9b280 100644
|
|||
|
||||
ggml_backend_registry() {
|
||||
#ifdef GGML_USE_CUDA
|
||||
@@ -230,7 +230,7 @@ struct ggml_backend_registry {
|
||||
@@ -237,7 +237,7 @@ struct ggml_backend_registry {
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -32,7 +32,7 @@ index e96b5c403..a55d9b280 100644
|
|||
if (!reg) {
|
||||
return;
|
||||
}
|
||||
@@ -241,15 +241,20 @@ struct ggml_backend_registry {
|
||||
@@ -248,15 +248,20 @@ struct ggml_backend_registry {
|
||||
#endif
|
||||
backends.push_back({ reg, std::move(handle) });
|
||||
for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
|
||||
|
|
@ -56,7 +56,7 @@ index e96b5c403..a55d9b280 100644
|
|||
}
|
||||
|
||||
ggml_backend_reg_t load_backend(const fs::path & path, bool silent) {
|
||||
@@ -293,7 +298,7 @@ struct ggml_backend_registry {
|
||||
@@ -300,7 +305,7 @@ struct ggml_backend_registry {
|
||||
|
||||
GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path_str(path).c_str());
|
||||
|
||||
|
|
@ -65,7 +65,7 @@ index e96b5c403..a55d9b280 100644
|
|||
|
||||
return reg;
|
||||
}
|
||||
@@ -316,7 +321,7 @@ struct ggml_backend_registry {
|
||||
@@ -323,7 +328,7 @@ struct ggml_backend_registry {
|
||||
// remove devices
|
||||
devices.erase(
|
||||
std::remove_if(devices.begin(), devices.end(),
|
||||
|
|
@ -74,7 +74,7 @@ index e96b5c403..a55d9b280 100644
|
|||
devices.end());
|
||||
|
||||
// remove backend
|
||||
@@ -374,7 +379,7 @@ size_t ggml_backend_dev_count() {
|
||||
@@ -381,7 +386,7 @@ size_t ggml_backend_dev_count() {
|
||||
|
||||
ggml_backend_dev_t ggml_backend_dev_get(size_t index) {
|
||||
GGML_ASSERT(index < ggml_backend_dev_count());
|
||||
|
|
|
|||
|
|
@ -8,10 +8,10 @@ Subject: [PATCH] add phony target ggml-cpu for all cpu variants
|
|||
1 file changed, 2 insertions(+)
|
||||
|
||||
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
|
||||
index d93664b8b..800f98b65 100644
|
||||
index 4c04c3300..f4747f262 100644
|
||||
--- a/ggml/src/CMakeLists.txt
|
||||
+++ b/ggml/src/CMakeLists.txt
|
||||
@@ -349,6 +349,7 @@ function(ggml_add_cpu_backend_variant tag_name)
|
||||
@@ -345,6 +345,7 @@ function(ggml_add_cpu_backend_variant tag_name)
|
||||
endif()
|
||||
|
||||
ggml_add_cpu_backend_variant_impl(${tag_name})
|
||||
|
|
@ -19,7 +19,7 @@ index d93664b8b..800f98b65 100644
|
|||
endfunction()
|
||||
|
||||
ggml_add_backend(CPU)
|
||||
@@ -359,6 +360,7 @@ if (GGML_CPU_ALL_VARIANTS)
|
||||
@@ -355,6 +356,7 @@ if (GGML_CPU_ALL_VARIANTS)
|
||||
elseif (GGML_CPU_ARM_ARCH)
|
||||
message(FATAL_ERROR "Cannot use both GGML_CPU_ARM_ARCH and GGML_CPU_ALL_VARIANTS")
|
||||
endif()
|
||||
|
|
|
|||
|
|
@ -9,10 +9,10 @@ disable amx as it reduces performance on some systems
|
|||
1 file changed, 4 deletions(-)
|
||||
|
||||
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
|
||||
index 800f98b65..6d493a4ff 100644
|
||||
index f4747f262..d55aed348 100644
|
||||
--- a/ggml/src/CMakeLists.txt
|
||||
+++ b/ggml/src/CMakeLists.txt
|
||||
@@ -369,10 +369,6 @@ if (GGML_CPU_ALL_VARIANTS)
|
||||
@@ -365,10 +365,6 @@ if (GGML_CPU_ALL_VARIANTS)
|
||||
ggml_add_cpu_backend_variant(skylakex SSE42 AVX F16C AVX2 BMI2 FMA AVX512)
|
||||
ggml_add_cpu_backend_variant(icelake SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
|
||||
ggml_add_cpu_backend_variant(alderlake SSE42 AVX F16C AVX2 BMI2 FMA AVX_VNNI)
|
||||
|
|
|
|||
|
|
@ -25,7 +25,7 @@ index 79ee20206..3efb22f01 100644
|
|||
// get ith C string from array with given key_id
|
||||
GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int64_t key_id, size_t i);
|
||||
diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp
|
||||
index 8cc4ef1cf..d950dbdf5 100644
|
||||
index b165d8bdc..f91d4faba 100644
|
||||
--- a/ggml/src/gguf.cpp
|
||||
+++ b/ggml/src/gguf.cpp
|
||||
@@ -805,10 +805,14 @@ enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int64_t key_id
|
||||
|
|
@ -53,7 +53,7 @@ index 8cc4ef1cf..d950dbdf5 100644
|
|||
}
|
||||
|
||||
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
|
||||
index 1525283d7..ea450c361 100644
|
||||
index dfba7778b..f72f321b9 100644
|
||||
--- a/src/llama-vocab.cpp
|
||||
+++ b/src/llama-vocab.cpp
|
||||
@@ -1781,9 +1781,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
|
|
|
|||
|
|
@ -8,7 +8,7 @@ Subject: [PATCH] ollama debug tensor
|
|||
1 file changed, 6 insertions(+)
|
||||
|
||||
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
|
||||
index 3247af8bb..5be08d6f4 100644
|
||||
index b468b115a..bb65985b4 100644
|
||||
--- a/ggml/src/ggml-cpu/ggml-cpu.c
|
||||
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
|
||||
@@ -15,6 +15,8 @@
|
||||
|
|
@ -20,7 +20,7 @@ index 3247af8bb..5be08d6f4 100644
|
|||
#if defined(_MSC_VER) || defined(__MINGW32__)
|
||||
#include <malloc.h> // using malloc.h with MSC/MINGW
|
||||
#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
|
||||
@@ -2922,6 +2924,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
||||
@@ -2928,6 +2930,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
||||
|
||||
ggml_compute_forward(¶ms, node);
|
||||
|
||||
|
|
|
|||
|
|
@ -4,16 +4,16 @@ Date: Mon, 21 Apr 2025 13:30:31 -0700
|
|||
Subject: [PATCH] add ollama vocab for grammar support
|
||||
|
||||
---
|
||||
src/llama-grammar.cpp | 49 ++++++++++++++++++++++++++++++++++++------
|
||||
src/llama-grammar.cpp | 48 ++++++++++++++++++++++++++++++++++++------
|
||||
src/llama-grammar.h | 14 ++++++++++++
|
||||
src/llama-sampling.cpp | 6 +++---
|
||||
3 files changed, 59 insertions(+), 10 deletions(-)
|
||||
3 files changed, 58 insertions(+), 10 deletions(-)
|
||||
|
||||
diff --git a/src/llama-grammar.cpp b/src/llama-grammar.cpp
|
||||
index b3c5eb571..a7307c47f 100644
|
||||
index 75d5d750c..a0299d181 100644
|
||||
--- a/src/llama-grammar.cpp
|
||||
+++ b/src/llama-grammar.cpp
|
||||
@@ -915,6 +915,7 @@ llama_grammar_candidates llama_grammar_reject_candidates_for_stack(
|
||||
@@ -1041,6 +1041,7 @@ llama_grammar_candidates llama_grammar_reject_candidates_for_stack(
|
||||
|
||||
struct llama_grammar * llama_grammar_init_impl(
|
||||
const struct llama_vocab * vocab,
|
||||
|
|
@ -21,15 +21,15 @@ index b3c5eb571..a7307c47f 100644
|
|||
const llama_grammar_element ** rules,
|
||||
size_t n_rules,
|
||||
size_t start_rule_index) {
|
||||
@@ -970,6 +971,7 @@ struct llama_grammar * llama_grammar_init_impl(
|
||||
@@ -1096,6 +1097,7 @@ struct llama_grammar * llama_grammar_init_impl(
|
||||
// then the pointers would be invalidated when the local vec_rules goes out of scope.
|
||||
return new llama_grammar {
|
||||
vocab,
|
||||
+ ollama_vocab,
|
||||
std::move(vec_rules),
|
||||
std::move(stacks),
|
||||
/* .partial_utf8 = */ {},
|
||||
@@ -983,6 +985,7 @@ struct llama_grammar * llama_grammar_init_impl(
|
||||
/* .partial_utf8 = */ {},
|
||||
@@ -1110,6 +1112,7 @@ struct llama_grammar * llama_grammar_init_impl(
|
||||
|
||||
struct llama_grammar * llama_grammar_init_impl(
|
||||
const struct llama_vocab * vocab,
|
||||
|
|
@ -37,15 +37,15 @@ index b3c5eb571..a7307c47f 100644
|
|||
const char * grammar_str,
|
||||
const char * grammar_root,
|
||||
bool lazy,
|
||||
@@ -1075,6 +1078,7 @@ struct llama_grammar * llama_grammar_init_impl(
|
||||
@@ -1202,6 +1205,7 @@ struct llama_grammar * llama_grammar_init_impl(
|
||||
// then the pointers would be invalidated when the local vec_rules goes out of scope.
|
||||
return new llama_grammar {
|
||||
vocab,
|
||||
+ ollama_vocab,
|
||||
std::move(vec_rules),
|
||||
std::move(stacks),
|
||||
/* .partial_utf8 = */ {},
|
||||
@@ -1097,6 +1101,7 @@ void llama_grammar_free_impl(struct llama_grammar * grammar) {
|
||||
/* .partial_utf8 = */ {},
|
||||
@@ -1225,6 +1229,7 @@ void llama_grammar_free_impl(struct llama_grammar * grammar) {
|
||||
struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & grammar) {
|
||||
auto * result = new llama_grammar {
|
||||
grammar.vocab,
|
||||
|
|
@ -53,7 +53,7 @@ index b3c5eb571..a7307c47f 100644
|
|||
grammar.rules,
|
||||
grammar.stacks,
|
||||
grammar.partial_utf8,
|
||||
@@ -1124,7 +1129,6 @@ struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & gra
|
||||
@@ -1253,7 +1258,6 @@ struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & gra
|
||||
}
|
||||
|
||||
void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_data_array * cur_p) {
|
||||
|
|
@ -61,7 +61,7 @@ index b3c5eb571..a7307c47f 100644
|
|||
|
||||
if (grammar.awaiting_trigger) {
|
||||
return;
|
||||
@@ -1146,9 +1150,13 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
|
||||
@@ -1275,9 +1279,13 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
|
||||
|
||||
for (size_t i = 0; i < cur_p->size; ++i) {
|
||||
const llama_token id = cur_p->data[i].id;
|
||||
|
|
@ -77,7 +77,7 @@ index b3c5eb571..a7307c47f 100644
|
|||
if (!allow_eog) {
|
||||
cur_p->data[i].logit = -INFINITY;
|
||||
}
|
||||
@@ -1167,9 +1175,10 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
|
||||
@@ -1296,9 +1304,10 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
|
||||
}
|
||||
|
||||
void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token) {
|
||||
|
|
@ -90,7 +90,7 @@ index b3c5eb571..a7307c47f 100644
|
|||
|
||||
if (grammar.awaiting_trigger) {
|
||||
if (std::find(grammar.trigger_tokens.begin(), grammar.trigger_tokens.end(), token) != grammar.trigger_tokens.end()) {
|
||||
@@ -1209,13 +1218,14 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
|
||||
@@ -1353,13 +1362,14 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -106,12 +106,11 @@ index b3c5eb571..a7307c47f 100644
|
|||
+ GGML_ABORT("grammar error: end of grammar token received but grammar stack is not empty");
|
||||
}
|
||||
|
||||
llama_grammar_accept_str(grammar, piece);
|
||||
@@ -1235,3 +1245,28 @@ void llama_grammar_accept_str(struct llama_grammar & grammar, const std::string
|
||||
throw std::runtime_error("Unexpected empty grammar stack after accepting piece: " + piece);
|
||||
llama_grammar_accept_token(grammar, token, piece);
|
||||
@@ -1435,3 +1445,27 @@ void llama_grammar_accept_token(struct llama_grammar & grammar, llama_token toke
|
||||
}
|
||||
}
|
||||
+
|
||||
|
||||
+
|
||||
+const std::string & ollama_vocab::token_to_piece(const uint32_t token) const {
|
||||
+ try {
|
||||
|
|
@ -137,7 +136,7 @@ index b3c5eb571..a7307c47f 100644
|
|||
+ }
|
||||
+}
|
||||
diff --git a/src/llama-grammar.h b/src/llama-grammar.h
|
||||
index f8c291de9..2a3a62db3 100644
|
||||
index a4c978ac1..5c0da4049 100644
|
||||
--- a/src/llama-grammar.h
|
||||
+++ b/src/llama-grammar.h
|
||||
@@ -6,8 +6,19 @@
|
||||
|
|
@ -160,15 +159,15 @@ index f8c291de9..2a3a62db3 100644
|
|||
|
||||
// grammar element type
|
||||
enum llama_gretype {
|
||||
@@ -114,6 +125,7 @@ struct llama_grammar_trigger_pattern {
|
||||
struct llama_grammar {
|
||||
@@ -127,6 +138,7 @@ struct llama_grammar {
|
||||
|
||||
// note: allow null vocab for testing (not great)
|
||||
const llama_vocab * vocab;
|
||||
+ const ollama_vocab * o_vocab;
|
||||
|
||||
const llama_grammar_rules rules; // TODO: shared ptr
|
||||
llama_grammar_stacks stacks;
|
||||
@@ -141,12 +153,14 @@ struct llama_grammar {
|
||||
@@ -155,12 +167,14 @@ struct llama_grammar {
|
||||
// note: needed for tests (not great)
|
||||
struct llama_grammar * llama_grammar_init_impl(
|
||||
const struct llama_vocab * vocab,
|
||||
|
|
|
|||
|
|
@ -4,18 +4,18 @@ Date: Thu, 1 May 2025 13:45:12 -0700
|
|||
Subject: [PATCH] add argsort and cuda copy for i32
|
||||
|
||||
---
|
||||
ggml/src/ggml-cpu/ops.cpp | 43 ++++++++++
|
||||
ggml/src/ggml-cuda/argsort.cu | 122 ++++++++++++++++++++++++---
|
||||
ggml/src/ggml-cuda/cpy-utils.cuh | 6 ++
|
||||
ggml/src/ggml-cuda/cpy.cu | 40 +++++++++
|
||||
ggml/src/ggml-metal/ggml-metal.metal | 69 +++++++++++++++
|
||||
5 files changed, 268 insertions(+), 12 deletions(-)
|
||||
ggml/src/ggml-cpu/ops.cpp | 43 ++++++
|
||||
ggml/src/ggml-cuda/argsort.cu | 122 +++++++++++++--
|
||||
ggml/src/ggml-cuda/cpy-utils.cuh | 6 +
|
||||
ggml/src/ggml-cuda/cpy.cu | 40 +++++
|
||||
ggml/src/ggml-metal/ggml-metal.metal | 215 +++++++++++++++++++++++++++
|
||||
5 files changed, 414 insertions(+), 12 deletions(-)
|
||||
|
||||
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
|
||||
index 2745fc54e..40666bab6 100644
|
||||
index 303278397..7d1733adb 100644
|
||||
--- a/ggml/src/ggml-cpu/ops.cpp
|
||||
+++ b/ggml/src/ggml-cpu/ops.cpp
|
||||
@@ -7846,6 +7846,45 @@ static void ggml_compute_forward_argsort_f32(
|
||||
@@ -7932,6 +7932,45 @@ static void ggml_compute_forward_argsort_f32(
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -61,7 +61,7 @@ index 2745fc54e..40666bab6 100644
|
|||
void ggml_compute_forward_argsort(
|
||||
const ggml_compute_params * params,
|
||||
ggml_tensor * dst) {
|
||||
@@ -7857,6 +7896,10 @@ void ggml_compute_forward_argsort(
|
||||
@@ -7943,6 +7982,10 @@ void ggml_compute_forward_argsort(
|
||||
{
|
||||
ggml_compute_forward_argsort_f32(params, dst);
|
||||
} break;
|
||||
|
|
@ -292,10 +292,10 @@ index c4ceb4fc5..0e53ecc39 100644
|
|||
if (can_be_transposed) {
|
||||
ggml_cpy_scalar_cuda<nv_bfloat16, nv_bfloat16, true>
|
||||
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
|
||||
index 73b45c762..aed013a9d 100644
|
||||
index 51bcbae30..236838e9e 100644
|
||||
--- a/ggml/src/ggml-metal/ggml-metal.metal
|
||||
+++ b/ggml/src/ggml-metal/ggml-metal.metal
|
||||
@@ -4721,8 +4721,77 @@ kernel void kernel_argsort_f32_i32(
|
||||
@@ -4954,8 +4954,77 @@ kernel void kernel_argsort_f32_i32(
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -373,3 +373,158 @@ index 73b45c762..aed013a9d 100644
|
|||
|
||||
typedef void (argsort_merge_t)(
|
||||
constant ggml_metal_kargs_argsort_merge & args,
|
||||
@@ -5110,8 +5179,154 @@ kernel void kernel_argsort_merge_f32_i32(
|
||||
}
|
||||
}
|
||||
|
||||
+template<ggml_sort_order order>
|
||||
+kernel void kernel_argsort_merge_i32_i32(
|
||||
+ constant ggml_metal_kargs_argsort_merge & args,
|
||||
+ device const char * src0,
|
||||
+ device const int32_t * tmp,
|
||||
+ device int32_t * dst,
|
||||
+ uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
+ ushort3 tpitg[[thread_position_in_threadgroup]],
|
||||
+ ushort3 ntg[[threads_per_threadgroup]]) {
|
||||
+
|
||||
+ const int im = tgpig[0] / args.ne01;
|
||||
+ const int i01 = tgpig[0] % args.ne01;
|
||||
+ const int i02 = tgpig[1];
|
||||
+ const int i03 = tgpig[2];
|
||||
+
|
||||
+ const int start = im * (2 * args.len);
|
||||
+
|
||||
+ const int len0 = MIN(args.len, MAX(0, args.ne0 - (int)(start)));
|
||||
+ const int len1 = MIN(args.len, MAX(0, args.ne0 - (int)(start + args.len)));
|
||||
+
|
||||
+ const int total = len0 + len1;
|
||||
+
|
||||
+ device const int32_t * tmp0 = tmp + start
|
||||
+ + i01*args.ne0
|
||||
+ + i02*args.ne0*args.ne01
|
||||
+ + i03*args.ne0*args.ne01*args.ne02;
|
||||
+
|
||||
+ device const int32_t * tmp1 = tmp0 + args.len;
|
||||
+
|
||||
+ dst += start
|
||||
+ + i01*args.top_k
|
||||
+ + i02*args.top_k*args.ne01
|
||||
+ + i03*args.top_k*args.ne01*args.ne02;
|
||||
+
|
||||
+ device const int32_t * src0_row = (device const int32_t *)(src0
|
||||
+ + args.nb01*i01
|
||||
+ + args.nb02*i02
|
||||
+ + args.nb03*i03);
|
||||
+
|
||||
+ if (total == 0) {
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
+ const int chunk = (total + ntg.x - 1) / ntg.x;
|
||||
+
|
||||
+ const int k0 = tpitg.x * chunk;
|
||||
+ const int k1 = MIN(MIN(k0 + chunk, total), args.top_k);
|
||||
+
|
||||
+ if (k0 >= args.top_k) {
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
+ if (k0 >= total) {
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
+ int low = k0 > len1 ? k0 - len1 : 0;
|
||||
+ int high = MIN(k0, len0);
|
||||
+
|
||||
+ // binary-search partition (i, j) such that i + j = k
|
||||
+ while (low < high) {
|
||||
+ const int mid = (low + high) >> 1;
|
||||
+
|
||||
+ const int32_t idx0 = tmp0[mid];
|
||||
+ const int32_t idx1 = tmp1[k0 - mid - 1];
|
||||
+
|
||||
+ const int32_t val0 = src0_row[idx0];
|
||||
+ const int32_t val1 = src0_row[idx1];
|
||||
+
|
||||
+ bool take_left;
|
||||
+ if (order == GGML_SORT_ORDER_ASC) {
|
||||
+ take_left = (val0 <= val1);
|
||||
+ } else {
|
||||
+ take_left = (val0 >= val1);
|
||||
+ }
|
||||
+
|
||||
+ if (take_left) {
|
||||
+ low = mid + 1;
|
||||
+ } else {
|
||||
+ high = mid;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ int i = low;
|
||||
+ int j = k0 - i;
|
||||
+
|
||||
+ // keep the merge fronts into registers
|
||||
+ int32_t idx0 = 0;
|
||||
+ int32_t val0 = 0.0f;
|
||||
+ if (i < len0) {
|
||||
+ idx0 = tmp0[i];
|
||||
+ val0 = src0_row[idx0];
|
||||
+ }
|
||||
+
|
||||
+ int32_t idx1 = 0;
|
||||
+ int32_t val1 = 0.0f;
|
||||
+ if (j < len1) {
|
||||
+ idx1 = tmp1[j];
|
||||
+ val1 = src0_row[idx1];
|
||||
+ }
|
||||
+
|
||||
+ for (int k = k0; k < k1; ++k) {
|
||||
+ int32_t out_idx;
|
||||
+
|
||||
+ if (i >= len0) {
|
||||
+ while (k < k1) {
|
||||
+ dst[k++] = tmp1[j++];
|
||||
+ }
|
||||
+ break;
|
||||
+ } else if (j >= len1) {
|
||||
+ while (k < k1) {
|
||||
+ dst[k++] = tmp0[i++];
|
||||
+ }
|
||||
+ break;
|
||||
+ } else {
|
||||
+ bool take_left;
|
||||
+
|
||||
+ if (order == GGML_SORT_ORDER_ASC) {
|
||||
+ take_left = (val0 <= val1);
|
||||
+ } else {
|
||||
+ take_left = (val0 >= val1);
|
||||
+ }
|
||||
+
|
||||
+ if (take_left) {
|
||||
+ out_idx = idx0;
|
||||
+ ++i;
|
||||
+ if (i < len0) {
|
||||
+ idx0 = tmp0[i];
|
||||
+ val0 = src0_row[idx0];
|
||||
+ }
|
||||
+ } else {
|
||||
+ out_idx = idx1;
|
||||
+ ++j;
|
||||
+ if (j < len1) {
|
||||
+ idx1 = tmp1[j];
|
||||
+ val1 = src0_row[idx1];
|
||||
+ }
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ dst[k] = out_idx;
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
template [[host_name("kernel_argsort_merge_f32_i32_asc")]] kernel argsort_merge_t kernel_argsort_merge_f32_i32<GGML_SORT_ORDER_ASC>;
|
||||
template [[host_name("kernel_argsort_merge_f32_i32_desc")]] kernel argsort_merge_t kernel_argsort_merge_f32_i32<GGML_SORT_ORDER_DESC>;
|
||||
+template [[host_name("kernel_argsort_merge_i32_i32_asc")]] kernel argsort_merge_t kernel_argsort_merge_i32_i32<GGML_SORT_ORDER_ASC>;
|
||||
+template [[host_name("kernel_argsort_merge_i32_i32_desc")]] kernel argsort_merge_t kernel_argsort_merge_i32_i32<GGML_SORT_ORDER_DESC>;
|
||||
|
||||
kernel void kernel_leaky_relu_f32(
|
||||
constant ggml_metal_kargs_leaky_relu & args,
|
||||
|
|
|
|||
|
|
@ -35,10 +35,10 @@ index f1b740785..c54ff98bf 100644
|
|||
GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
|
||||
GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
|
||||
diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
|
||||
index 218222ece..06ee502ab 100644
|
||||
index a5995fdc2..dbfd8b5b2 100644
|
||||
--- a/ggml/src/ggml-alloc.c
|
||||
+++ b/ggml/src/ggml-alloc.c
|
||||
@@ -493,6 +493,7 @@ struct node_alloc {
|
||||
@@ -494,6 +494,7 @@ struct node_alloc {
|
||||
struct ggml_gallocr {
|
||||
ggml_backend_buffer_type_t * bufts; // [n_buffers]
|
||||
struct vbuffer ** buffers; // [n_buffers]
|
||||
|
|
@ -46,7 +46,7 @@ index 218222ece..06ee502ab 100644
|
|||
struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
|
||||
int n_buffers;
|
||||
|
||||
@@ -516,6 +517,9 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
|
||||
@@ -517,6 +518,9 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
|
||||
galloc->buffers = calloc(n_bufs, sizeof(struct vbuffer *));
|
||||
GGML_ASSERT(galloc->buffers != NULL);
|
||||
|
||||
|
|
@ -56,7 +56,7 @@ index 218222ece..06ee502ab 100644
|
|||
galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
|
||||
GGML_ASSERT(galloc->buf_tallocs != NULL);
|
||||
|
||||
@@ -583,6 +587,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
|
||||
@@ -584,6 +588,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
|
||||
ggml_hash_set_free(&galloc->hash_set);
|
||||
free(galloc->hash_values);
|
||||
free(galloc->bufts);
|
||||
|
|
@ -64,7 +64,7 @@ index 218222ece..06ee502ab 100644
|
|||
free(galloc->buffers);
|
||||
free(galloc->buf_tallocs);
|
||||
free(galloc->node_allocs);
|
||||
@@ -898,6 +903,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
||||
@@ -899,6 +904,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -73,7 +73,7 @@ index 218222ece..06ee502ab 100644
|
|||
// reallocate buffers if needed
|
||||
for (int i = 0; i < galloc->n_buffers; i++) {
|
||||
// if the buffer type is used multiple times, we reuse the same buffer
|
||||
@@ -932,14 +939,19 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
||||
@@ -933,14 +940,19 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
||||
#endif
|
||||
ggml_vbuffer_free(galloc->buffers[i]);
|
||||
galloc->buffers[i] = ggml_vbuffer_alloc(galloc->bufts[i], galloc->buf_tallocs[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
|
||||
|
|
@ -96,7 +96,7 @@ index 218222ece..06ee502ab 100644
|
|||
}
|
||||
|
||||
bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
|
||||
@@ -1094,6 +1106,22 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
|
||||
@@ -1095,6 +1107,22 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
|
||||
return ggml_vbuffer_size(galloc->buffers[buffer_id]);
|
||||
}
|
||||
|
||||
|
|
@ -120,10 +120,10 @@ index 218222ece..06ee502ab 100644
|
|||
|
||||
static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) {
|
||||
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
|
||||
index 4882541c8..ff41c7712 100644
|
||||
index afde2f0b7..dbf8486a0 100644
|
||||
--- a/ggml/src/ggml-backend.cpp
|
||||
+++ b/ggml/src/ggml-backend.cpp
|
||||
@@ -1813,6 +1813,13 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe
|
||||
@@ -1840,6 +1840,13 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe
|
||||
return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -22,10 +22,10 @@ index c54ff98bf..229bf387b 100644
|
|||
size_t memory_total;
|
||||
// device type
|
||||
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
index 8f3b1c173..e803f4af6 100644
|
||||
index 5145c1e88..f641c1016 100644
|
||||
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
@@ -185,6 +185,51 @@ static int ggml_cuda_parse_id(char devName[]) {
|
||||
@@ -189,6 +189,51 @@ static int ggml_cuda_parse_id(char devName[]) {
|
||||
}
|
||||
#endif // defined(GGML_USE_HIP)
|
||||
|
||||
|
|
@ -77,7 +77,7 @@ index 8f3b1c173..e803f4af6 100644
|
|||
static ggml_cuda_device_info ggml_cuda_init() {
|
||||
ggml_cuda_device_info info = {};
|
||||
|
||||
@@ -251,22 +296,24 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
||||
@@ -255,22 +300,24 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
||||
info.devices[id].cc += prop.minor * 0x10;
|
||||
}
|
||||
}
|
||||
|
|
@ -108,7 +108,7 @@ index 8f3b1c173..e803f4af6 100644
|
|||
std::string device_name(prop.name);
|
||||
if (device_name == "NVIDIA GeForce MX450") {
|
||||
turing_devices_without_mma.push_back({ id, device_name });
|
||||
@@ -4048,6 +4095,7 @@ struct ggml_backend_cuda_device_context {
|
||||
@@ -4110,6 +4157,7 @@ struct ggml_backend_cuda_device_context {
|
||||
std::string name;
|
||||
std::string description;
|
||||
std::string pci_bus_id;
|
||||
|
|
@ -116,7 +116,7 @@ index 8f3b1c173..e803f4af6 100644
|
|||
};
|
||||
|
||||
static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
|
||||
@@ -4136,6 +4184,11 @@ static bool ggml_backend_cuda_get_available_uma_memory(long * available_memory_k
|
||||
@@ -4198,6 +4246,11 @@ static bool ggml_backend_cuda_get_available_uma_memory(long * available_memory_k
|
||||
}
|
||||
#endif // defined(__linux__)
|
||||
|
||||
|
|
@ -128,7 +128,7 @@ index 8f3b1c173..e803f4af6 100644
|
|||
static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
|
||||
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
|
||||
ggml_cuda_set_device(ctx->device);
|
||||
@@ -4176,6 +4229,7 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
|
||||
@@ -4238,6 +4291,7 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
|
||||
|
||||
props->name = ggml_backend_cuda_device_get_name(dev);
|
||||
props->description = ggml_backend_cuda_device_get_description(dev);
|
||||
|
|
@ -136,7 +136,7 @@ index 8f3b1c173..e803f4af6 100644
|
|||
props->type = ggml_backend_cuda_device_get_type(dev);
|
||||
props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
|
||||
ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
|
||||
@@ -4767,6 +4821,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
|
||||
@@ -4833,6 +4887,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
|
||||
cudaDeviceProp prop;
|
||||
CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
|
||||
dev_ctx->description = prop.name;
|
||||
|
|
|
|||
|
|
@ -10,7 +10,7 @@ Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
|
|||
2 files changed, 13 insertions(+)
|
||||
|
||||
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
|
||||
index dfad9cd79..9858de630 100644
|
||||
index d06fa42e6..0f5712e21 100644
|
||||
--- a/tools/mtmd/mtmd.cpp
|
||||
+++ b/tools/mtmd/mtmd.cpp
|
||||
@@ -87,6 +87,16 @@ enum mtmd_slice_tmpl {
|
||||
|
|
@ -31,7 +31,7 @@ index dfad9cd79..9858de630 100644
|
|||
return "<__media__>";
|
||||
}
|
||||
diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h
|
||||
index 015119be8..8d3fa5d34 100644
|
||||
index b3df24c29..a6a1af3b8 100644
|
||||
--- a/tools/mtmd/mtmd.h
|
||||
+++ b/tools/mtmd/mtmd.h
|
||||
@@ -75,6 +75,9 @@ typedef struct mtmd_input_chunk mtmd_input_chunk;
|
||||
|
|
|
|||
|
|
@ -8,10 +8,10 @@ Subject: [PATCH] no power throttling win32 with gnuc
|
|||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
|
||||
index 5be08d6f4..7a0df30c3 100644
|
||||
index bb65985b4..47089a62e 100644
|
||||
--- a/ggml/src/ggml-cpu/ggml-cpu.c
|
||||
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
|
||||
@@ -2463,7 +2463,7 @@ static bool ggml_thread_apply_priority(int32_t prio) {
|
||||
@@ -2464,7 +2464,7 @@ static bool ggml_thread_apply_priority(int32_t prio) {
|
||||
// Newer Windows 11 versions aggresively park (offline) CPU cores and often place
|
||||
// all our threads onto the first 4 cores which results in terrible performance with
|
||||
// n_threads > 4
|
||||
|
|
|
|||
|
|
@ -58,7 +58,7 @@ index 6792ba986..0f5b03cef 100644
|
|||
// (optional) event synchronization
|
||||
// record an event on this stream
|
||||
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
|
||||
index ff41c7712..f511e8d76 100644
|
||||
index dbf8486a0..312ca873c 100644
|
||||
--- a/ggml/src/ggml-backend.cpp
|
||||
+++ b/ggml/src/ggml-backend.cpp
|
||||
@@ -348,14 +348,14 @@ enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_ba
|
||||
|
|
@ -86,9 +86,9 @@ index ff41c7712..f511e8d76 100644
|
|||
+ int batch_size; // a hint on the batch size to optimize processing, -1 to use heuristics
|
||||
+
|
||||
int debug;
|
||||
};
|
||||
|
||||
@@ -814,7 +816,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
|
||||
// used for debugging graph reallocations [GGML_SCHED_DEBUG_REALLOC]
|
||||
@@ -820,7 +822,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
|
||||
if (tensor->op != GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
|
||||
int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
|
||||
// check if a backend with higher prio wants to offload the op
|
||||
|
|
@ -97,7 +97,7 @@ index ff41c7712..f511e8d76 100644
|
|||
for (int b = 0; b < src_backend_id; b++) {
|
||||
if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) {
|
||||
SET_CAUSE(tensor, "1.off");
|
||||
@@ -1556,7 +1558,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
||||
@@ -1572,7 +1574,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
||||
}
|
||||
|
||||
if (!sched->callback_eval) {
|
||||
|
|
@ -106,7 +106,7 @@ index ff41c7712..f511e8d76 100644
|
|||
if (ec != GGML_STATUS_SUCCESS) {
|
||||
return ec;
|
||||
}
|
||||
@@ -1578,7 +1580,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
||||
@@ -1594,7 +1596,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
||||
|
||||
struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);
|
||||
|
||||
|
|
@ -115,7 +115,7 @@ index ff41c7712..f511e8d76 100644
|
|||
if (ec != GGML_STATUS_SUCCESS) {
|
||||
return ec;
|
||||
}
|
||||
@@ -1657,6 +1659,7 @@ ggml_backend_sched_t ggml_backend_sched_new(
|
||||
@@ -1684,6 +1686,7 @@ ggml_backend_sched_t ggml_backend_sched_new(
|
||||
|
||||
sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
|
||||
sched->op_offload = op_offload;
|
||||
|
|
@ -123,7 +123,7 @@ index ff41c7712..f511e8d76 100644
|
|||
|
||||
ggml_backend_sched_reset(sched);
|
||||
|
||||
@@ -1688,6 +1691,10 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
|
||||
@@ -1715,6 +1718,10 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
|
||||
free(sched);
|
||||
}
|
||||
|
||||
|
|
@ -178,10 +178,10 @@ index 3191faaa4..32f14c811 100644
|
|||
|
||||
static const struct ggml_backend_i ggml_backend_cpu_i = {
|
||||
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
index e803f4af6..78fb2d8b3 100644
|
||||
index f641c1016..17062697b 100644
|
||||
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
@@ -2885,7 +2885,7 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
|
||||
@@ -2901,7 +2901,7 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
|
||||
|
||||
#ifdef USE_CUDA_GRAPH
|
||||
static bool check_node_graph_compatibility(ggml_cgraph * cgraph,
|
||||
|
|
@ -190,7 +190,7 @@ index e803f4af6..78fb2d8b3 100644
|
|||
|
||||
// Loop over nodes in GGML graph to obtain info needed for CUDA graph
|
||||
|
||||
@@ -2918,24 +2918,34 @@ static bool check_node_graph_compatibility(ggml_cgraph * cgraph,
|
||||
@@ -2934,24 +2934,34 @@ static bool check_node_graph_compatibility(ggml_cgraph * cgraph,
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@ -241,7 +241,7 @@ index e803f4af6..78fb2d8b3 100644
|
|||
}
|
||||
|
||||
if (!use_cuda_graph) {
|
||||
@@ -3679,7 +3689,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
|
||||
@@ -3742,7 +3752,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -250,7 +250,7 @@ index e803f4af6..78fb2d8b3 100644
|
|||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
||||
|
||||
ggml_cuda_set_device(cuda_ctx->device);
|
||||
@@ -3717,7 +3727,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
|
||||
@@ -3780,7 +3790,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
|
||||
if (use_cuda_graph) {
|
||||
cuda_graph_update_required = is_cuda_graph_update_required(cuda_ctx, cgraph);
|
||||
|
||||
|
|
@ -278,10 +278,10 @@ index 8fc1c2fb5..ba95b4acc 100644
|
|||
|
||||
static void ggml_backend_metal_graph_optimize(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
||||
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
index 83cdec29e..a36c6560c 100644
|
||||
index c801d2fd2..b2c0d0cee 100644
|
||||
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
@@ -13103,7 +13103,7 @@ static uint32_t ggml_vk_fuse_multi_add(ggml_backend_vk_context * ctx, const stru
|
||||
@@ -13006,7 +13006,7 @@ static uint32_t ggml_vk_fuse_multi_add(ggml_backend_vk_context * ctx, const stru
|
||||
return num_adds;
|
||||
}
|
||||
|
||||
|
|
@ -290,7 +290,7 @@ index 83cdec29e..a36c6560c 100644
|
|||
VK_LOG_DEBUG("ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)");
|
||||
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
|
||||
|
||||
@@ -13320,6 +13320,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
||||
@@ -13241,6 +13241,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
||||
return GGML_STATUS_SUCCESS;
|
||||
|
||||
UNUSED(backend);
|
||||
|
|
|
|||
|
|
@ -10,10 +10,10 @@ must be recreated with no-alloc set to false before loading data.
|
|||
---
|
||||
ggml/include/ggml-backend.h | 1 +
|
||||
ggml/src/ggml-backend-impl.h | 16 +++
|
||||
ggml/src/ggml-backend.cpp | 72 ++++++++++-
|
||||
ggml/src/ggml-cuda/common.cuh | 58 ++++++++-
|
||||
ggml/src/ggml-cuda/ggml-cuda.cu | 218 ++++++++++++++++++++++++++------
|
||||
5 files changed, 321 insertions(+), 44 deletions(-)
|
||||
ggml/src/ggml-backend.cpp | 72 +++++++++-
|
||||
ggml/src/ggml-cuda/common.cuh | 62 ++++++++-
|
||||
ggml/src/ggml-cuda/ggml-cuda.cu | 224 ++++++++++++++++++++++++++------
|
||||
5 files changed, 331 insertions(+), 44 deletions(-)
|
||||
|
||||
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
|
||||
index 2763f2bd6..b3b5b356a 100644
|
||||
|
|
@ -75,7 +75,7 @@ index 0f5b03cef..7bdf9d81f 100644
|
|||
|
||||
struct ggml_backend {
|
||||
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
|
||||
index f511e8d76..74b7f070c 100644
|
||||
index 312ca873c..4092dfe8a 100644
|
||||
--- a/ggml/src/ggml-backend.cpp
|
||||
+++ b/ggml/src/ggml-backend.cpp
|
||||
@@ -41,6 +41,19 @@ ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t
|
||||
|
|
@ -121,10 +121,10 @@ index f511e8d76..74b7f070c 100644
|
|||
void * base = buffer->iface.get_base(buffer);
|
||||
|
||||
GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
|
||||
@@ -725,6 +745,12 @@ struct ggml_backend_sched {
|
||||
int batch_size; // a hint on the batch size to optimize processing, -1 to use heuristics
|
||||
|
||||
int debug;
|
||||
@@ -731,6 +751,12 @@ struct ggml_backend_sched {
|
||||
int debug_realloc;
|
||||
int debug_graph_size;
|
||||
int debug_prev_graph_size;
|
||||
+
|
||||
+ // allocate buffers on attached ggml_backend_buffer_type_t's and during reservation
|
||||
+ // if false, dummy buffers are used for faster memory sizing calculations
|
||||
|
|
@ -134,7 +134,7 @@ index f511e8d76..74b7f070c 100644
|
|||
};
|
||||
|
||||
#define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
|
||||
@@ -1614,6 +1640,17 @@ ggml_backend_sched_t ggml_backend_sched_new(
|
||||
@@ -1630,6 +1656,17 @@ ggml_backend_sched_t ggml_backend_sched_new(
|
||||
size_t graph_size,
|
||||
bool parallel,
|
||||
bool op_offload) {
|
||||
|
|
@ -152,7 +152,7 @@ index f511e8d76..74b7f070c 100644
|
|||
GGML_ASSERT(n_backends > 0);
|
||||
GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
|
||||
GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU);
|
||||
@@ -1655,11 +1692,14 @@ ggml_backend_sched_t ggml_backend_sched_new(
|
||||
@@ -1682,11 +1719,14 @@ ggml_backend_sched_t ggml_backend_sched_new(
|
||||
sched->events[b][c] = ggml_backend_event_new(backends[b]->device);
|
||||
}
|
||||
}
|
||||
|
|
@ -167,7 +167,7 @@ index f511e8d76..74b7f070c 100644
|
|||
|
||||
ggml_backend_sched_reset(sched);
|
||||
|
||||
@@ -1674,6 +1714,10 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
|
||||
@@ -1701,6 +1741,10 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
|
||||
for (int c = 0; c < sched->n_copies; c++) {
|
||||
ggml_backend_event_free(sched->events[b][c]);
|
||||
}
|
||||
|
|
@ -178,7 +178,7 @@ index f511e8d76..74b7f070c 100644
|
|||
}
|
||||
ggml_gallocr_free(sched->galloc);
|
||||
ggml_free(sched->ctx);
|
||||
@@ -1719,6 +1763,24 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
|
||||
@@ -1746,6 +1790,24 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
|
||||
return false;
|
||||
}
|
||||
|
||||
|
|
@ -203,7 +203,7 @@ index f511e8d76..74b7f070c 100644
|
|||
ggml_backend_sched_reset(sched);
|
||||
|
||||
return true;
|
||||
@@ -1824,7 +1886,13 @@ size_t ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched,
|
||||
@@ -1851,7 +1913,13 @@ size_t ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched,
|
||||
int backend_index = ggml_backend_sched_backend_id(sched, backend);
|
||||
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
|
||||
|
||||
|
|
@ -219,7 +219,7 @@ index f511e8d76..74b7f070c 100644
|
|||
|
||||
void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
|
||||
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
|
||||
index 611341deb..c3f8ca914 100644
|
||||
index c4529f5d9..8b0fb5d42 100644
|
||||
--- a/ggml/src/ggml-cuda/common.cuh
|
||||
+++ b/ggml/src/ggml-cuda/common.cuh
|
||||
@@ -37,6 +37,41 @@
|
||||
|
|
@ -264,7 +264,7 @@ index 611341deb..c3f8ca914 100644
|
|||
#define STRINGIZE_IMPL(...) #__VA_ARGS__
|
||||
#define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)
|
||||
|
||||
@@ -891,6 +926,9 @@ struct ggml_cuda_pool {
|
||||
@@ -938,6 +973,9 @@ struct ggml_cuda_pool {
|
||||
|
||||
virtual void * alloc(size_t size, size_t * actual_size) = 0;
|
||||
virtual void free(void * ptr, size_t size) = 0;
|
||||
|
|
@ -274,7 +274,7 @@ index 611341deb..c3f8ca914 100644
|
|||
};
|
||||
|
||||
template<typename T>
|
||||
@@ -1179,11 +1217,11 @@ struct ggml_backend_cuda_context {
|
||||
@@ -1229,11 +1267,15 @@ struct ggml_backend_cuda_context {
|
||||
// pool
|
||||
std::unique_ptr<ggml_cuda_pool> pools[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS];
|
||||
|
||||
|
|
@ -284,11 +284,15 @@ index 611341deb..c3f8ca914 100644
|
|||
ggml_cuda_pool & pool(int device) {
|
||||
if (pools[device][curr_stream_no] == nullptr) {
|
||||
- pools[device][curr_stream_no] = new_pool_for_device(device, curr_stream_no);
|
||||
+ pools[device][curr_stream_no] = new_pool_for_device(device, curr_stream_no, true);
|
||||
+ bool alloc = true;
|
||||
+ if (pools[device][0] != nullptr) {
|
||||
+ alloc = pools[device][0]->alloc_memory();
|
||||
+ }
|
||||
+ pools[device][curr_stream_no] = new_pool_for_device(device, curr_stream_no, alloc);
|
||||
}
|
||||
return *pools[device][curr_stream_no];
|
||||
}
|
||||
@@ -1191,6 +1229,22 @@ struct ggml_backend_cuda_context {
|
||||
@@ -1241,6 +1283,22 @@ struct ggml_backend_cuda_context {
|
||||
ggml_cuda_pool & pool() {
|
||||
return pool(device);
|
||||
}
|
||||
|
|
@ -301,21 +305,21 @@ index 611341deb..c3f8ca914 100644
|
|||
+ }
|
||||
+ }
|
||||
+
|
||||
+ size_t pool_get_alloc_size() {
|
||||
+ if (pools[device][curr_stream_no] == nullptr) {
|
||||
+ size_t pool_get_alloc_size(int stream_no) {
|
||||
+ if (pools[device][stream_no] == nullptr) {
|
||||
+ return 0;
|
||||
+ }
|
||||
+
|
||||
+ return pools[device][curr_stream_no]->alloc_size();
|
||||
+ return pools[device][stream_no]->alloc_size();
|
||||
+ }
|
||||
};
|
||||
|
||||
struct ggml_cuda_mm_fusion_args_host {
|
||||
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
index 78fb2d8b3..fe0da71ca 100644
|
||||
index 17062697b..ede1d089a 100644
|
||||
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
@@ -361,6 +361,8 @@ const ggml_cuda_device_info & ggml_cuda_info() {
|
||||
@@ -365,6 +365,8 @@ const ggml_cuda_device_info & ggml_cuda_info() {
|
||||
|
||||
// #define DEBUG_CUDA_MALLOC
|
||||
|
||||
|
|
@ -324,7 +328,7 @@ index 78fb2d8b3..fe0da71ca 100644
|
|||
// buffer pool for cuda (legacy)
|
||||
struct ggml_cuda_pool_leg : public ggml_cuda_pool {
|
||||
static const int MAX_BUFFERS = 256;
|
||||
@@ -373,9 +375,12 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
|
||||
@@ -377,9 +379,12 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
|
||||
|
||||
ggml_cuda_buffer buffer_pool[MAX_BUFFERS] = {};
|
||||
size_t pool_size = 0;
|
||||
|
|
@ -339,7 +343,7 @@ index 78fb2d8b3..fe0da71ca 100644
|
|||
}
|
||||
|
||||
~ggml_cuda_pool_leg() {
|
||||
@@ -383,7 +388,9 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
|
||||
@@ -387,7 +392,9 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
|
||||
for (int i = 0; i < MAX_BUFFERS; ++i) {
|
||||
ggml_cuda_buffer & b = buffer_pool[i];
|
||||
if (b.ptr != nullptr) {
|
||||
|
|
@ -350,7 +354,7 @@ index 78fb2d8b3..fe0da71ca 100644
|
|||
pool_size -= b.size;
|
||||
}
|
||||
}
|
||||
@@ -431,8 +438,15 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
|
||||
@@ -435,8 +442,15 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
|
||||
void * ptr;
|
||||
size_t look_ahead_size = (size_t) (1.05 * size);
|
||||
look_ahead_size = 256 * ((look_ahead_size + 255)/256);
|
||||
|
|
@ -368,7 +372,7 @@ index 78fb2d8b3..fe0da71ca 100644
|
|||
*actual_size = look_ahead_size;
|
||||
pool_size += look_ahead_size;
|
||||
#ifdef DEBUG_CUDA_MALLOC
|
||||
@@ -452,10 +466,20 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
|
||||
@@ -456,10 +470,20 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
|
||||
}
|
||||
}
|
||||
GGML_LOG_DEBUG(GGML_CUDA_NAME " buffer pool full, increase MAX_CUDA_BUFFERS\n");
|
||||
|
|
@ -391,7 +395,7 @@ index 78fb2d8b3..fe0da71ca 100644
|
|||
};
|
||||
|
||||
// pool with virtual memory
|
||||
@@ -467,18 +491,24 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
|
||||
@@ -471,18 +495,24 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
|
||||
CUdeviceptr pool_addr = 0;
|
||||
size_t pool_used = 0;
|
||||
size_t pool_size = 0;
|
||||
|
|
@ -419,7 +423,7 @@ index 78fb2d8b3..fe0da71ca 100644
|
|||
#if defined(GGML_USE_HIP)
|
||||
// Workaround for https://github.com/ROCm/ROCR-Runtime/issues/285
|
||||
for (std::pair<CUdeviceptr, size_t> & mapping : mappings) {
|
||||
@@ -505,35 +535,49 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
|
||||
@@ -509,35 +539,49 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
|
||||
|
||||
GGML_ASSERT(pool_size + reserve_size <= CUDA_POOL_VMM_MAX_SIZE);
|
||||
|
||||
|
|
@ -495,7 +499,7 @@ index 78fb2d8b3..fe0da71ca 100644
|
|||
|
||||
// add to the pool
|
||||
pool_size += reserve_size;
|
||||
@@ -566,17 +610,27 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
|
||||
@@ -570,17 +614,27 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
|
||||
// all deallocations must be in reverse order of the allocations
|
||||
GGML_ASSERT(ptr == (void *) ((char *)(pool_addr) + pool_used));
|
||||
}
|
||||
|
|
@ -526,7 +530,7 @@ index 78fb2d8b3..fe0da71ca 100644
|
|||
}
|
||||
|
||||
// destroying a cuBLAS handle while a graph is being captured in a different thread can result in a CUDA error
|
||||
@@ -760,11 +814,20 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac
|
||||
@@ -764,11 +818,20 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac
|
||||
}
|
||||
|
||||
static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
||||
|
|
@ -548,7 +552,7 @@ index 78fb2d8b3..fe0da71ca 100644
|
|||
static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
|
||||
size_t size = ggml_nbytes(tensor);
|
||||
int64_t ne0 = tensor->ne[0];
|
||||
@@ -788,6 +851,7 @@ static const ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface
|
||||
@@ -792,6 +855,7 @@ static const ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface
|
||||
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
||||
/* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
|
||||
/* .is_host = */ NULL,
|
||||
|
|
@ -556,7 +560,7 @@ index 78fb2d8b3..fe0da71ca 100644
|
|||
};
|
||||
|
||||
ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
|
||||
@@ -3258,6 +3322,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
|
||||
@@ -3274,6 +3338,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
|
||||
|
||||
static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
|
||||
bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) {
|
||||
|
|
@ -564,7 +568,7 @@ index 78fb2d8b3..fe0da71ca 100644
|
|||
// flag used to determine whether it is an integrated_gpu
|
||||
const bool integrated = ggml_cuda_info().devices[cuda_ctx->device].integrated;
|
||||
|
||||
@@ -3347,6 +3412,10 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
|
||||
@@ -3410,6 +3475,10 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
|
||||
continue;
|
||||
}
|
||||
|
||||
|
|
@ -575,7 +579,7 @@ index 78fb2d8b3..fe0da71ca 100644
|
|||
|
||||
// start of fusion operations
|
||||
static bool disable_fusion = (getenv("GGML_CUDA_DISABLE_FUSION") != nullptr);
|
||||
@@ -3691,6 +3760,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
|
||||
@@ -3754,6 +3823,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
|
||||
|
||||
static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, int batch_size) {
|
||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
||||
|
|
@ -583,7 +587,7 @@ index 78fb2d8b3..fe0da71ca 100644
|
|||
|
||||
ggml_cuda_set_device(cuda_ctx->device);
|
||||
|
||||
@@ -3766,6 +3836,71 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
|
||||
@@ -3829,6 +3899,77 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
|
||||
return GGML_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
|
|
@ -644,18 +648,24 @@ index 78fb2d8b3..fe0da71ca 100644
|
|||
+
|
||||
+static size_t ggml_backend_cuda_buffer_size(ggml_backend_t backend) {
|
||||
+ ggml_backend_cuda_context * ctx = (ggml_backend_cuda_context *)backend->context;
|
||||
+ return ctx->pool_get_alloc_size();
|
||||
+ size_t allocs = 0;
|
||||
+ for (int i = 0; i < GGML_CUDA_MAX_STREAMS; i++) {
|
||||
+ allocs += ctx->pool_get_alloc_size(i);
|
||||
+ }
|
||||
+ return allocs;
|
||||
+}
|
||||
+
|
||||
+static void ggml_backend_cuda_reset(ggml_backend_t backend) {
|
||||
+ ggml_backend_cuda_context * ctx = (ggml_backend_cuda_context *)backend->context;
|
||||
+ ctx->pools[ctx->device][ctx->curr_stream_no] = NULL;
|
||||
+ for (int i = 0; i < GGML_CUDA_MAX_STREAMS; i++) {
|
||||
+ ctx->pools[ctx->device][i] = NULL;
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
static void ggml_backend_cuda_event_record(ggml_backend_t backend, ggml_backend_event_t event) {
|
||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
||||
|
||||
@@ -4035,6 +4170,9 @@ static const ggml_backend_i ggml_backend_cuda_interface = {
|
||||
@@ -4097,6 +4238,9 @@ static const ggml_backend_i ggml_backend_cuda_interface = {
|
||||
/* .event_record = */ ggml_backend_cuda_event_record,
|
||||
/* .event_wait = */ ggml_backend_cuda_event_wait,
|
||||
/* .graph_optimize = */ ggml_backend_cuda_graph_optimize,
|
||||
|
|
|
|||
|
|
@ -8,7 +8,7 @@ Subject: [PATCH] decode: disable output_all
|
|||
1 file changed, 1 insertion(+), 2 deletions(-)
|
||||
|
||||
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
|
||||
index e04f0fc4f..1359c614b 100644
|
||||
index 417140071..87f407f99 100644
|
||||
--- a/src/llama-context.cpp
|
||||
+++ b/src/llama-context.cpp
|
||||
@@ -999,8 +999,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
|
||||
|
|
|
|||
|
|
@ -43,7 +43,7 @@ index 7bdf9d81f..21b35ac5c 100644
|
|||
|
||||
struct ggml_backend_device {
|
||||
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
|
||||
index 74b7f070c..8d2cc167f 100644
|
||||
index 4092dfe8a..a1a19fe51 100644
|
||||
--- a/ggml/src/ggml-backend.cpp
|
||||
+++ b/ggml/src/ggml-backend.cpp
|
||||
@@ -526,6 +526,14 @@ ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * par
|
||||
|
|
@ -62,10 +62,10 @@ index 74b7f070c..8d2cc167f 100644
|
|||
GGML_ASSERT(device);
|
||||
return device->iface.get_buffer_type(device);
|
||||
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
index fe0da71ca..0787e443c 100644
|
||||
index ede1d089a..ec63cadab 100644
|
||||
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
@@ -109,6 +109,11 @@ int ggml_cuda_get_device() {
|
||||
@@ -113,6 +113,11 @@ int ggml_cuda_get_device() {
|
||||
return id;
|
||||
}
|
||||
|
||||
|
|
@ -77,7 +77,7 @@ index fe0da71ca..0787e443c 100644
|
|||
static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
|
||||
ggml_cuda_set_device(device);
|
||||
cudaError_t err;
|
||||
@@ -4380,7 +4385,10 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
|
||||
@@ -4448,7 +4453,10 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
|
||||
props->id = ggml_backend_cuda_device_get_id(dev);
|
||||
props->type = ggml_backend_cuda_device_get_type(dev);
|
||||
props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
|
||||
|
|
@ -89,7 +89,7 @@ index fe0da71ca..0787e443c 100644
|
|||
|
||||
bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
|
||||
#ifdef GGML_CUDA_NO_PEER_COPY
|
||||
@@ -4835,6 +4843,11 @@ static void ggml_backend_cuda_device_event_synchronize(ggml_backend_dev_t dev, g
|
||||
@@ -4907,6 +4915,11 @@ static void ggml_backend_cuda_device_event_synchronize(ggml_backend_dev_t dev, g
|
||||
CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context));
|
||||
}
|
||||
|
||||
|
|
@ -101,7 +101,7 @@ index fe0da71ca..0787e443c 100644
|
|||
static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
|
||||
/* .get_name = */ ggml_backend_cuda_device_get_name,
|
||||
/* .get_description = */ ggml_backend_cuda_device_get_description,
|
||||
@@ -4851,6 +4864,7 @@ static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
|
||||
@@ -4923,6 +4936,7 @@ static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
|
||||
/* .event_new = */ ggml_backend_cuda_device_event_new,
|
||||
/* .event_free = */ ggml_backend_cuda_device_event_free,
|
||||
/* .event_synchronize = */ ggml_backend_cuda_device_event_synchronize,
|
||||
|
|
|
|||
|
|
@ -45,10 +45,10 @@ index 69223c488..6510e0cba 100644
|
|||
|
||||
GGML_API const char * ggml_backend_dev_name(ggml_backend_dev_t device);
|
||||
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
|
||||
index 6d493a4ff..ac8f38464 100644
|
||||
index d55aed348..99ae293cc 100644
|
||||
--- a/ggml/src/CMakeLists.txt
|
||||
+++ b/ggml/src/CMakeLists.txt
|
||||
@@ -209,6 +209,8 @@ add_library(ggml-base
|
||||
@@ -205,6 +205,8 @@ add_library(ggml-base
|
||||
ggml-threading.h
|
||||
ggml-quants.c
|
||||
ggml-quants.h
|
||||
|
|
@ -58,10 +58,10 @@ index 6d493a4ff..ac8f38464 100644
|
|||
|
||||
set_target_properties(ggml-base PROPERTIES
|
||||
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
index 0787e443c..736d47c07 100644
|
||||
index ec63cadab..cd71902df 100644
|
||||
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
@@ -263,6 +263,16 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
||||
@@ -267,6 +267,16 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
||||
for (int id = 0; id < info.device_count; ++id) {
|
||||
int device_vmm = 0;
|
||||
|
||||
|
|
@ -78,7 +78,7 @@ index 0787e443c..736d47c07 100644
|
|||
#if defined(GGML_USE_VMM)
|
||||
CUdevice device;
|
||||
CU_CHECK(cuDeviceGet(&device, id));
|
||||
@@ -316,6 +326,11 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
||||
@@ -320,6 +330,11 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
||||
#else
|
||||
info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
|
||||
info.devices[id].cc = 100*prop.major + 10*prop.minor;
|
||||
|
|
@ -90,7 +90,7 @@ index 0787e443c..736d47c07 100644
|
|||
GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n",
|
||||
id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
|
||||
ggml_cuda_parse_uuid(prop, id).c_str());
|
||||
@@ -4249,6 +4264,11 @@ struct ggml_backend_cuda_device_context {
|
||||
@@ -4317,6 +4332,11 @@ struct ggml_backend_cuda_device_context {
|
||||
std::string description;
|
||||
std::string pci_bus_id;
|
||||
std::string id;
|
||||
|
|
@ -102,7 +102,7 @@ index 0787e443c..736d47c07 100644
|
|||
};
|
||||
|
||||
static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
|
||||
@@ -4345,6 +4365,28 @@ static const char * ggml_backend_cuda_device_get_id(ggml_backend_dev_t dev) {
|
||||
@@ -4413,6 +4433,28 @@ static const char * ggml_backend_cuda_device_get_id(ggml_backend_dev_t dev) {
|
||||
static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
|
||||
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
|
||||
ggml_cuda_set_device(ctx->device);
|
||||
|
|
@ -131,7 +131,7 @@ index 0787e443c..736d47c07 100644
|
|||
CUDA_CHECK(cudaMemGetInfo(free, total));
|
||||
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/17368
|
||||
@@ -4377,6 +4419,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
|
||||
@@ -4445,6 +4487,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
|
||||
return GGML_BACKEND_DEVICE_TYPE_GPU;
|
||||
}
|
||||
|
||||
|
|
@ -139,7 +139,7 @@ index 0787e443c..736d47c07 100644
|
|||
static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
|
||||
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
|
||||
|
||||
@@ -4390,6 +4433,19 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
|
||||
@@ -4458,6 +4501,19 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
|
||||
// If you need the memory data, call ggml_backend_dev_memory() explicitly.
|
||||
props->memory_total = props->memory_free = 0;
|
||||
|
||||
|
|
@ -159,7 +159,7 @@ index 0787e443c..736d47c07 100644
|
|||
bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
|
||||
#ifdef GGML_CUDA_NO_PEER_COPY
|
||||
bool events = false;
|
||||
@@ -4974,6 +5030,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
|
||||
@@ -5046,6 +5102,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
|
||||
std::lock_guard<std::mutex> lock(mutex);
|
||||
if (!initialized) {
|
||||
ggml_backend_cuda_reg_context * ctx = new ggml_backend_cuda_reg_context;
|
||||
|
|
@ -167,7 +167,7 @@ index 0787e443c..736d47c07 100644
|
|||
|
||||
for (int i = 0; i < ggml_cuda_info().device_count; i++) {
|
||||
ggml_backend_cuda_device_context * dev_ctx = new ggml_backend_cuda_device_context;
|
||||
@@ -4989,6 +5046,14 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
|
||||
@@ -5061,6 +5118,14 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
|
||||
snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID);
|
||||
dev_ctx->pci_bus_id = pci_bus_id;
|
||||
|
||||
|
|
@ -243,7 +243,7 @@ index ba95b4acc..f6f8f7a10 100644
|
|||
/* .async = */ true,
|
||||
/* .host_buffer = */ false,
|
||||
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
index a36c6560c..a234eda2e 100644
|
||||
index b2c0d0cee..d9f4d34f5 100644
|
||||
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
@@ -236,6 +236,7 @@ class vk_memory_logger;
|
||||
|
|
@ -254,7 +254,7 @@ index a36c6560c..a234eda2e 100644
|
|||
|
||||
static constexpr uint32_t mul_mat_vec_max_cols = 8;
|
||||
static constexpr uint32_t p021_max_gqa_ratio = 8;
|
||||
@@ -12353,6 +12354,29 @@ static void ggml_vk_get_device_description(int device, char * description, size_
|
||||
@@ -12256,6 +12257,29 @@ static void ggml_vk_get_device_description(int device, char * description, size_
|
||||
snprintf(description, description_size, "%s", props.deviceName.data());
|
||||
}
|
||||
|
||||
|
|
@ -284,7 +284,7 @@ index a36c6560c..a234eda2e 100644
|
|||
// backend interface
|
||||
|
||||
#define UNUSED GGML_UNUSED
|
||||
@@ -13614,15 +13638,72 @@ void ggml_backend_vk_get_device_description(int device, char * description, size
|
||||
@@ -13535,15 +13559,72 @@ void ggml_backend_vk_get_device_description(int device, char * description, size
|
||||
ggml_vk_get_device_description(dev_idx, description, description_size);
|
||||
}
|
||||
|
||||
|
|
@ -361,7 +361,7 @@ index a36c6560c..a234eda2e 100644
|
|||
|
||||
if (membudget_supported) {
|
||||
memprops.pNext = &budgetprops;
|
||||
@@ -13674,8 +13755,13 @@ static std::string ggml_backend_vk_get_device_pci_id(int device_idx) {
|
||||
@@ -13595,8 +13676,13 @@ static std::string ggml_backend_vk_get_device_pci_id(int device_idx) {
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -376,7 +376,7 @@ index a36c6560c..a234eda2e 100644
|
|||
}
|
||||
|
||||
vk::PhysicalDeviceProperties2 props = {};
|
||||
@@ -13692,19 +13778,24 @@ static std::string ggml_backend_vk_get_device_pci_id(int device_idx) {
|
||||
@@ -13613,19 +13699,24 @@ static std::string ggml_backend_vk_get_device_pci_id(int device_idx) {
|
||||
|
||||
char pci_bus_id[16] = {};
|
||||
snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.%x", pci_domain, pci_bus, pci_device, pci_function);
|
||||
|
|
@ -410,7 +410,7 @@ index a36c6560c..a234eda2e 100644
|
|||
|
||||
static const char * ggml_backend_vk_device_get_name(ggml_backend_dev_t dev) {
|
||||
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
|
||||
@@ -13716,9 +13807,14 @@ static const char * ggml_backend_vk_device_get_description(ggml_backend_dev_t de
|
||||
@@ -13637,9 +13728,14 @@ static const char * ggml_backend_vk_device_get_description(ggml_backend_dev_t de
|
||||
return ctx->description.c_str();
|
||||
}
|
||||
|
||||
|
|
@ -426,7 +426,7 @@ index a36c6560c..a234eda2e 100644
|
|||
}
|
||||
|
||||
static ggml_backend_buffer_type_t ggml_backend_vk_device_get_buffer_type(ggml_backend_dev_t dev) {
|
||||
@@ -13742,8 +13838,9 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml
|
||||
@@ -13663,8 +13759,9 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml
|
||||
|
||||
props->name = ggml_backend_vk_device_get_name(dev);
|
||||
props->description = ggml_backend_vk_device_get_description(dev);
|
||||
|
|
@ -437,7 +437,7 @@ index a36c6560c..a234eda2e 100644
|
|||
ggml_backend_vk_device_get_memory(dev, &props->memory_free, &props->memory_total);
|
||||
props->caps = {
|
||||
/* .async = */ false,
|
||||
@@ -13751,6 +13848,13 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml
|
||||
@@ -13672,6 +13769,13 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml
|
||||
/* .buffer_from_host_ptr = */ false,
|
||||
/* .events = */ false,
|
||||
};
|
||||
|
|
@ -451,7 +451,7 @@ index a36c6560c..a234eda2e 100644
|
|||
}
|
||||
|
||||
static ggml_backend_t ggml_backend_vk_device_init(ggml_backend_dev_t dev, const char * params) {
|
||||
@@ -14319,6 +14423,8 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
|
||||
@@ -14236,6 +14340,8 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
|
||||
static std::mutex mutex;
|
||||
std::lock_guard<std::mutex> lock(mutex);
|
||||
if (!initialized) {
|
||||
|
|
@ -460,7 +460,7 @@ index a36c6560c..a234eda2e 100644
|
|||
for (int i = 0; i < ggml_backend_vk_get_device_count(); i++) {
|
||||
ggml_backend_vk_device_context * ctx = new ggml_backend_vk_device_context;
|
||||
char desc[256];
|
||||
@@ -14327,12 +14433,41 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
|
||||
@@ -14244,12 +14350,41 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
|
||||
ctx->name = GGML_VK_NAME + std::to_string(i);
|
||||
ctx->description = desc;
|
||||
ctx->is_integrated_gpu = ggml_backend_vk_get_device_type(i) == vk::PhysicalDeviceType::eIntegratedGpu;
|
||||
|
|
|
|||
|
|
@ -8,10 +8,10 @@ Subject: [PATCH] report LoadLibrary failures
|
|||
1 file changed, 12 insertions(+)
|
||||
|
||||
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
|
||||
index a55d9b280..ec6f7f1e9 100644
|
||||
index 079dba211..2474e0ed6 100644
|
||||
--- a/ggml/src/ggml-backend-reg.cpp
|
||||
+++ b/ggml/src/ggml-backend-reg.cpp
|
||||
@@ -122,6 +122,18 @@ static dl_handle * dl_load_library(const fs::path & path) {
|
||||
@@ -126,6 +126,18 @@ static dl_handle * dl_load_library(const fs::path & path) {
|
||||
SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
|
||||
|
||||
HMODULE handle = LoadLibraryW(path.wstring().c_str());
|
||||
|
|
|
|||
|
|
@ -13,7 +13,7 @@ interleaved version used for qwen3vl
|
|||
4 files changed, 16 insertions(+), 16 deletions(-)
|
||||
|
||||
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
|
||||
index 40666bab6..3155cb4bb 100644
|
||||
index 7d1733adb..f4aae5332 100644
|
||||
--- a/ggml/src/ggml-cpu/ops.cpp
|
||||
+++ b/ggml/src/ggml-cpu/ops.cpp
|
||||
@@ -5599,14 +5599,14 @@ static void ggml_mrope_cache_init(
|
||||
|
|
@ -59,10 +59,10 @@ index 88ed79111..71ca60214 100644
|
|||
} else {
|
||||
if (sector < sections.v[0]) {
|
||||
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
|
||||
index aed013a9d..a489de435 100644
|
||||
index 236838e9e..c98d269d1 100644
|
||||
--- a/ggml/src/ggml-metal/ggml-metal.metal
|
||||
+++ b/ggml/src/ggml-metal/ggml-metal.metal
|
||||
@@ -4009,14 +4009,14 @@ kernel void kernel_rope_multi(
|
||||
@@ -4242,14 +4242,14 @@ kernel void kernel_rope_multi(
|
||||
|
||||
float theta_base;
|
||||
if (FC_rope_is_imrope) {
|
||||
|
|
|
|||
|
|
@ -12,10 +12,10 @@ Subject: [PATCH] Add memory detection using DXGI + PDH
|
|||
create mode 100644 ggml/src/mem_dxgi_pdh.cpp
|
||||
|
||||
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
|
||||
index ac8f38464..faa1beed2 100644
|
||||
index 99ae293cc..9a134b7af 100644
|
||||
--- a/ggml/src/CMakeLists.txt
|
||||
+++ b/ggml/src/CMakeLists.txt
|
||||
@@ -211,6 +211,7 @@ add_library(ggml-base
|
||||
@@ -207,6 +207,7 @@ add_library(ggml-base
|
||||
ggml-quants.h
|
||||
mem_hip.cpp
|
||||
mem_nvml.cpp
|
||||
|
|
@ -38,7 +38,7 @@ index 1c07e767a..0da3e065b 100644
|
|||
#ifdef __cplusplus
|
||||
}
|
||||
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
index a234eda2e..c98f98c73 100644
|
||||
index d9f4d34f5..8a83427fb 100644
|
||||
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
|
||||
@@ -74,6 +74,7 @@ DispatchLoaderDynamic & ggml_vk_default_dispatcher();
|
||||
|
|
@ -49,7 +49,7 @@ index a234eda2e..c98f98c73 100644
|
|||
|
||||
typedef struct VkPhysicalDeviceShaderBfloat16FeaturesKHR {
|
||||
VkStructureType sType;
|
||||
@@ -13655,6 +13656,7 @@ struct ggml_backend_vk_device_context {
|
||||
@@ -13576,6 +13577,7 @@ struct ggml_backend_vk_device_context {
|
||||
std::string pci_id;
|
||||
std::string id;
|
||||
std::string uuid;
|
||||
|
|
@ -57,7 +57,7 @@ index a234eda2e..c98f98c73 100644
|
|||
int major;
|
||||
int minor;
|
||||
int driver_major;
|
||||
@@ -13673,6 +13675,20 @@ void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size
|
||||
@@ -13594,6 +13596,20 @@ void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size
|
||||
|
||||
vk::PhysicalDeviceProperties2 props2;
|
||||
vkdev.getProperties2(&props2);
|
||||
|
|
@ -78,7 +78,7 @@ index a234eda2e..c98f98c73 100644
|
|||
|
||||
if (!is_integrated_gpu)
|
||||
{
|
||||
@@ -13704,7 +13720,6 @@ void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size
|
||||
@@ -13625,7 +13641,6 @@ void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size
|
||||
}
|
||||
// else fallback to memory budget if supported
|
||||
|
||||
|
|
@ -86,7 +86,7 @@ index a234eda2e..c98f98c73 100644
|
|||
if (membudget_supported) {
|
||||
memprops.pNext = &budgetprops;
|
||||
}
|
||||
@@ -14440,7 +14455,6 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
|
||||
@@ -14357,7 +14372,6 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
|
||||
/* .reg = */ reg,
|
||||
/* .context = */ ctx,
|
||||
});
|
||||
|
|
@ -94,7 +94,7 @@ index a234eda2e..c98f98c73 100644
|
|||
// Gather additional information about the device
|
||||
int dev_idx = vk_instance.device_indices[i];
|
||||
vk::PhysicalDeviceProperties props1;
|
||||
@@ -14463,6 +14477,14 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
|
||||
@@ -14380,6 +14394,14 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
|
||||
}
|
||||
}
|
||||
ctx->uuid = oss.str();
|
||||
|
|
|
|||
|
|
@ -10,10 +10,10 @@ fallback to cpu
|
|||
1 file changed, 3 insertions(+)
|
||||
|
||||
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
index 736d47c07..7350f6758 100644
|
||||
index cd71902df..d69d62193 100644
|
||||
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
@@ -4564,6 +4564,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
@@ -4632,6 +4632,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
if (b->type == GGML_TYPE_F16 && a->type != GGML_TYPE_F16) {
|
||||
return false;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -8,10 +8,10 @@ Subject: [PATCH] win: exit instead of abort
|
|||
1 file changed, 6 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
|
||||
index b99345a2e..1c9e0bc05 100644
|
||||
index 530ff7b95..fc0196eb7 100644
|
||||
--- a/ggml/src/ggml.c
|
||||
+++ b/ggml/src/ggml.c
|
||||
@@ -229,8 +229,13 @@ void ggml_abort(const char * file, int line, const char * fmt, ...) {
|
||||
@@ -250,8 +250,13 @@ void ggml_abort(const char * file, int line, const char * fmt, ...) {
|
||||
fprintf(stderr, "%s\n", message);
|
||||
ggml_print_backtrace();
|
||||
}
|
||||
|
|
|
|||
|
|
@ -9,10 +9,10 @@ Rever to prior logic of assuming an empty projector type is mlp
|
|||
1 file changed, 4 insertions(+)
|
||||
|
||||
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
|
||||
index f4c4d2c48..3334ff25b 100644
|
||||
index 6be1470ad..2a325c726 100644
|
||||
--- a/tools/mtmd/clip.cpp
|
||||
+++ b/tools/mtmd/clip.cpp
|
||||
@@ -2648,6 +2648,10 @@ struct clip_model_loader {
|
||||
@@ -2649,6 +2649,10 @@ struct clip_model_loader {
|
||||
if (proj_type.empty()) {
|
||||
if (modality == CLIP_MODALITY_VISION) {
|
||||
get_string(KEY_VISION_PROJ_TYPE, proj_type, false);
|
||||
|
|
|
|||
|
|
@ -0,0 +1,586 @@
|
|||
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
||||
From: Daniel Bevenius <daniel.bevenius@gmail.com>
|
||||
Date: Mon, 15 Dec 2025 15:13:49 +0100
|
||||
Subject: [PATCH] llama : add support for NVIDIA Nemotron Nano 3
|
||||
|
||||
This commit adds support for the NVIDIA Nemotron Nano 3 model, enabling
|
||||
the conversion and running of this model.
|
||||
|
||||
fix indentation in llama-graph.cpp
|
||||
|
||||
fix indentation and move ffn_inp
|
||||
|
||||
convert : fix modify_tensors in NemotronHModel to call super()
|
||||
|
||||
fix pyright error
|
||||
|
||||
fix flake8 errors
|
||||
---
|
||||
convert_hf_to_gguf.py | 116 +++++++++++++++++++++++++++++++--
|
||||
gguf-py/gguf/constants.py | 29 +++++++++
|
||||
gguf-py/gguf/tensor_mapping.py | 9 ++-
|
||||
src/llama-arch.cpp | 35 ++++++++++
|
||||
src/llama-arch.h | 1 +
|
||||
src/llama-graph.cpp | 10 +++
|
||||
src/llama-model.cpp | 50 +++++++++++---
|
||||
src/llama-model.h | 1 +
|
||||
src/models/nemotron-h.cpp | 41 ++++++++++--
|
||||
9 files changed, 269 insertions(+), 23 deletions(-)
|
||||
|
||||
diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
|
||||
index 867bc9053..57ec2faac 100755
|
||||
--- a/convert_hf_to_gguf.py
|
||||
+++ b/convert_hf_to_gguf.py
|
||||
@@ -8601,8 +8601,18 @@ class GraniteHybridModel(Mamba2Model, GraniteMoeModel):
|
||||
class NemotronHModel(GraniteHybridModel):
|
||||
"""Hybrid mamba2/attention model from NVIDIA"""
|
||||
model_arch = gguf.MODEL_ARCH.NEMOTRON_H
|
||||
+ is_moe: bool = False
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
+ # We have to determine the correct model architecture (MoE vs non-MoE) before
|
||||
+ # calling the parent __init__. This is because the parent constructor
|
||||
+ # uses self.model_arch to build the tensor name map, and all MoE-specific
|
||||
+ # mappings would be missed if it were called with the default non-MoE arch.
|
||||
+ hparams = ModelBase.load_hparams(args[0], self.is_mistral_format)
|
||||
+ if "num_experts_per_tok" in hparams:
|
||||
+ self.model_arch = gguf.MODEL_ARCH.NEMOTRON_H_MOE
|
||||
+ self.is_moe = True
|
||||
+
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
# Save the top-level head_dim for later
|
||||
@@ -8614,9 +8624,11 @@ class NemotronHModel(GraniteHybridModel):
|
||||
|
||||
# Update the ssm / attn / mlp layers
|
||||
# M: Mamba2, *: Attention, -: MLP
|
||||
+ # MoE:
|
||||
+ # M: Mamba2, *: Attention, E: Expert
|
||||
hybrid_override_pattern = self.hparams["hybrid_override_pattern"]
|
||||
self._ssm_layers = [i for i, val in enumerate(hybrid_override_pattern) if val == "M"]
|
||||
- self._mlp_layers = [i for i, val in enumerate(hybrid_override_pattern) if val == "-"]
|
||||
+ self._mlp_layers = [i for i, val in enumerate(hybrid_override_pattern) if val == ("E" if self.is_moe else "-")]
|
||||
|
||||
def get_attn_layers(self):
|
||||
hybrid_override_pattern = self.hparams["hybrid_override_pattern"]
|
||||
@@ -8632,10 +8644,28 @@ class NemotronHModel(GraniteHybridModel):
|
||||
# Set feed_forward_length
|
||||
# NOTE: This will trigger an override warning. This is preferrable to
|
||||
# duplicating all the parent logic
|
||||
- n_ff = self.find_hparam(["intermediate_size", "n_inner", "hidden_dim"])
|
||||
- self.gguf_writer.add_feed_forward_length([
|
||||
- n_ff if i in self._mlp_layers else 0 for i in range(self.block_count)
|
||||
- ])
|
||||
+ if not self.is_moe:
|
||||
+ n_ff = self.find_hparam(["intermediate_size", "n_inner", "hidden_dim"])
|
||||
+ self.gguf_writer.add_feed_forward_length([
|
||||
+ n_ff if i in self._mlp_layers else 0 for i in range(self.block_count)
|
||||
+ ])
|
||||
+ else:
|
||||
+ moe_intermediate_size = self.hparams["moe_intermediate_size"]
|
||||
+ self.gguf_writer.add_feed_forward_length([
|
||||
+ moe_intermediate_size if i in self._mlp_layers else 0 for i in range(self.block_count)
|
||||
+ ])
|
||||
+ self.gguf_writer.add_expert_used_count(self.hparams["num_experts_per_tok"])
|
||||
+ self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"])
|
||||
+ self.gguf_writer.add_expert_shared_feed_forward_length(self.hparams["moe_shared_expert_intermediate_size"])
|
||||
+ self.gguf_writer.add_expert_count(self.hparams["n_routed_experts"])
|
||||
+ self.gguf_writer.add_expert_shared_count(self.hparams["n_shared_experts"])
|
||||
+ self.gguf_writer.add_expert_weights_norm(self.hparams["norm_topk_prob"])
|
||||
+ self.gguf_writer.add_expert_weights_scale(self.hparams["routed_scaling_factor"])
|
||||
+ self.gguf_writer.add_expert_group_count(self.hparams["n_group"])
|
||||
+
|
||||
+ # number of experts used per token (top-k)
|
||||
+ if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
|
||||
+ self.gguf_writer.add_expert_used_count(n_experts_used)
|
||||
|
||||
def set_vocab(self):
|
||||
super().set_vocab()
|
||||
@@ -8643,7 +8673,81 @@ class NemotronHModel(GraniteHybridModel):
|
||||
# The tokenizer _does_ add a BOS token (via post_processor type
|
||||
# TemplateProcessing) but does not set add_bos_token to true in the
|
||||
# config, so we need to explicitly override it here.
|
||||
- self.gguf_writer.add_add_bos_token(True)
|
||||
+ if not self.is_moe:
|
||||
+ self.gguf_writer.add_add_bos_token(True)
|
||||
+
|
||||
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
+ if self.is_moe and bid is not None:
|
||||
+ if name.endswith("mixer.gate.e_score_correction_bias"):
|
||||
+ new_name = name.replace("e_score_correction_bias", "e_score_correction_bias.bias")
|
||||
+ mapped_name = self.map_tensor_name(new_name)
|
||||
+ return [(mapped_name, data_torch)]
|
||||
+
|
||||
+ if name.endswith("mixer.dt_bias"):
|
||||
+ new_name = name.replace("dt_bias", "dt.bias")
|
||||
+ mapped_name = self.map_tensor_name(new_name)
|
||||
+ return [(mapped_name, data_torch)]
|
||||
+
|
||||
+ if name.endswith("mixer.conv1d.weight"):
|
||||
+ squeezed_data = data_torch.squeeze()
|
||||
+ mapped_name = self.map_tensor_name(name)
|
||||
+ return [(mapped_name, squeezed_data)]
|
||||
+
|
||||
+ if name.endswith("mixer.A_log"):
|
||||
+ transformed_data = -torch.exp(data_torch)
|
||||
+ reshaped_data = transformed_data.squeeze().reshape(-1, 1)
|
||||
+ mapped_name = self.map_tensor_name(name)
|
||||
+ return [(mapped_name, reshaped_data)]
|
||||
+
|
||||
+ if name.endswith("mixer.D"):
|
||||
+ reshaped_data = data_torch.squeeze().reshape(-1, 1)
|
||||
+ mapped_name = self.map_tensor_name(name)
|
||||
+ return [(mapped_name, reshaped_data)]
|
||||
+
|
||||
+ if name.endswith("mixer.norm.weight"):
|
||||
+ reshaped_data = data_torch.reshape(8, 512)
|
||||
+ mapped_name = self.map_tensor_name(name)
|
||||
+ return [(mapped_name, reshaped_data)]
|
||||
+
|
||||
+ if name.find("mixer.experts") != -1:
|
||||
+ n_experts = self.hparams["n_routed_experts"]
|
||||
+ assert bid is not None
|
||||
+
|
||||
+ if self._experts is None:
|
||||
+ self._experts = [{} for _ in range(self.block_count)]
|
||||
+
|
||||
+ self._experts[bid][name] = data_torch
|
||||
+
|
||||
+ if len(self._experts[bid]) >= n_experts * 2:
|
||||
+ # merge the experts into a single tensor
|
||||
+ tensors: list[tuple[str, Tensor]] = []
|
||||
+ for w_name in ["down_proj", "up_proj"]:
|
||||
+ datas: list[Tensor] = []
|
||||
+
|
||||
+ for xid in range(n_experts):
|
||||
+ ename = f"backbone.layers.{bid}.mixer.experts.{xid}.{w_name}.weight"
|
||||
+ datas.append(self._experts[bid][ename])
|
||||
+ del self._experts[bid][ename]
|
||||
+
|
||||
+ data_torch = torch.stack(datas, dim=0)
|
||||
+ merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
|
||||
+ new_name = self.map_tensor_name(merged_name)
|
||||
+ tensors.append((new_name, data_torch))
|
||||
+
|
||||
+ return tensors
|
||||
+ else:
|
||||
+ return []
|
||||
+
|
||||
+ return super().modify_tensors(data_torch, name, bid)
|
||||
+
|
||||
+ def prepare_tensors(self):
|
||||
+ super().prepare_tensors()
|
||||
+
|
||||
+ if self._experts is not None:
|
||||
+ # flatten `list[dict[str, Tensor]]` into `list[str]`
|
||||
+ experts = [k for d in self._experts for k in d.keys()]
|
||||
+ if len(experts) > 0:
|
||||
+ raise ValueError(f"Unprocessed experts: {experts}")
|
||||
|
||||
|
||||
@ModelBase.register("BailingMoeForCausalLM")
|
||||
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
|
||||
index 2b8489c59..1852428b4 100644
|
||||
--- a/gguf-py/gguf/constants.py
|
||||
+++ b/gguf-py/gguf/constants.py
|
||||
@@ -413,6 +413,7 @@ class MODEL_ARCH(IntEnum):
|
||||
JAIS = auto()
|
||||
NEMOTRON = auto()
|
||||
NEMOTRON_H = auto()
|
||||
+ NEMOTRON_H_MOE = auto()
|
||||
EXAONE = auto()
|
||||
EXAONE4 = auto()
|
||||
GRANITE = auto()
|
||||
@@ -786,6 +787,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||
MODEL_ARCH.JAIS: "jais",
|
||||
MODEL_ARCH.NEMOTRON: "nemotron",
|
||||
MODEL_ARCH.NEMOTRON_H: "nemotron_h",
|
||||
+ MODEL_ARCH.NEMOTRON_H_MOE: "nemotron_h_moe",
|
||||
MODEL_ARCH.EXAONE: "exaone",
|
||||
MODEL_ARCH.EXAONE4: "exaone4",
|
||||
MODEL_ARCH.GRANITE: "granite",
|
||||
@@ -2529,6 +2531,33 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
],
|
||||
+ MODEL_ARCH.NEMOTRON_H_MOE: [
|
||||
+ MODEL_TENSOR.TOKEN_EMBD,
|
||||
+ MODEL_TENSOR.OUTPUT_NORM,
|
||||
+ MODEL_TENSOR.OUTPUT,
|
||||
+ MODEL_TENSOR.ATTN_NORM,
|
||||
+ MODEL_TENSOR.SSM_IN,
|
||||
+ MODEL_TENSOR.SSM_CONV1D,
|
||||
+ MODEL_TENSOR.SSM_DT,
|
||||
+ MODEL_TENSOR.SSM_A,
|
||||
+ MODEL_TENSOR.SSM_D,
|
||||
+ MODEL_TENSOR.SSM_NORM,
|
||||
+ MODEL_TENSOR.SSM_OUT,
|
||||
+ MODEL_TENSOR.ATTN_Q,
|
||||
+ MODEL_TENSOR.ATTN_K,
|
||||
+ MODEL_TENSOR.ATTN_V,
|
||||
+ MODEL_TENSOR.ATTN_OUT,
|
||||
+ MODEL_TENSOR.FFN_DOWN,
|
||||
+ MODEL_TENSOR.FFN_UP,
|
||||
+ # experts
|
||||
+ MODEL_TENSOR.FFN_GATE_INP,
|
||||
+ MODEL_TENSOR.FFN_UP_EXP,
|
||||
+ MODEL_TENSOR.FFN_DOWN_EXP,
|
||||
+ # shared expert
|
||||
+ MODEL_TENSOR.FFN_DOWN_SHEXP,
|
||||
+ MODEL_TENSOR.FFN_UP_SHEXP,
|
||||
+ MODEL_TENSOR.FFN_EXP_PROBS_B,
|
||||
+ ],
|
||||
MODEL_ARCH.EXAONE: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
|
||||
index d9c87da19..7a3c7c5e0 100644
|
||||
--- a/gguf-py/gguf/tensor_mapping.py
|
||||
+++ b/gguf-py/gguf/tensor_mapping.py
|
||||
@@ -377,6 +377,7 @@ class TensorNameMap:
|
||||
"model.layers.{bid}.feed_forward.gate", # lfm2moe
|
||||
"model.layers.{bid}.mlp.router.gate", # afmoe
|
||||
"layers.{bid}.gate", # mistral-large
|
||||
+ "backbone.layers.{bid}.mixer.gate", # nemotron-h-moe
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_GATE_INP_SHEXP: (
|
||||
@@ -390,6 +391,7 @@ class TensorNameMap:
|
||||
"model.layers.{bid}.mlp.expert_bias", # afmoe
|
||||
"model.layers.{bid}.feed_forward.expert_bias", # lfm2moe
|
||||
"model.layers.{bid}.block_sparse_moe.e_score_correction", # minimax-m2
|
||||
+ "backbone.layers.{bid}.mixer.gate.e_score_correction_bias" # nemotron-h-moe
|
||||
),
|
||||
|
||||
# Feed-forward up
|
||||
@@ -438,7 +440,7 @@ class TensorNameMap:
|
||||
"layers.{bid}.feed_forward.experts.w3", # mixtral (merged)
|
||||
"transformer.decoder_layer.{bid}.moe.linear_v", # Grok (merged)
|
||||
"transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx
|
||||
- "model.layers.{bid}.mlp.experts.up_proj", # qwen2moe olmoe (merged) ernie4.5-moe
|
||||
+ "model.layers.{bid}.mlp.experts.up_proj", # qwen2moe olmoe (merged) ernie4.5-moe, nemotron-h-moe (merged)
|
||||
"model.layers.{bid}.block_sparse_moe.experts.w3", # phimoe (merged)
|
||||
"model.layers.{bid}.feed_forward.experts.up_proj", # llama4
|
||||
"encoder.layers.{bid}.mlp.experts.mlp.w1", # nomic-bert-moe
|
||||
@@ -452,6 +454,7 @@ class TensorNameMap:
|
||||
"model.layers.{bid}.feed_forward.down_proj",
|
||||
"model.layers.{bid}.mlp.shared_mlp.up_proj", # hunyuan
|
||||
"layers.{bid}.shared_experts.w3", # mistral-large
|
||||
+ "backbone.layers.{bid}.mixer.shared_experts.up_proj", # nemotron-h-moe
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_UP_CHEXP: (
|
||||
@@ -546,7 +549,7 @@ class TensorNameMap:
|
||||
"layers.{bid}.feed_forward.experts.w2", # mixtral (merged)
|
||||
"transformer.decoder_layer.{bid}.moe.linear_1", # Grok (merged)
|
||||
"transformer.blocks.{bid}.ffn.experts.mlp.w2", # dbrx
|
||||
- "model.layers.{bid}.mlp.experts.down_proj", # qwen2moe olmoe (merged) ernie4.5-moe
|
||||
+ "model.layers.{bid}.mlp.experts.down_proj", # qwen2moe olmoe (merged) ernie4.5-moe nemotron-h-moe (merged)
|
||||
"model.layers.{bid}.block_sparse_moe.output_linear", # granitemoe
|
||||
"model.layers.{bid}.block_sparse_moe.experts.w2", # phimoe (merged)
|
||||
"model.layers.{bid}.feed_forward.experts.down_proj", # llama4
|
||||
@@ -561,6 +564,7 @@ class TensorNameMap:
|
||||
"model.layers.{bid}.shared_mlp.output_linear", # granitemoe
|
||||
"model.layers.{bid}.mlp.shared_mlp.down_proj", # hunyuan
|
||||
"layers.{bid}.shared_experts.w2", # mistral-large
|
||||
+ "backbone.layers.{bid}.mixer.shared_experts.down_proj", # nemotron-h-moe
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_DOWN_CHEXP: (
|
||||
@@ -704,6 +708,7 @@ class TensorNameMap:
|
||||
"model.layers.{bid}.mamba.dt_proj", # jamba falcon-h1 granite-hybrid
|
||||
"model.layers.layers.{bid}.mixer.dt_proj", # plamo2
|
||||
"model.layers.{bid}.linear_attn.dt_proj", # qwen3next
|
||||
+ "backbone.layers.{bid}.mixer.dt", # nemotron-h-moe
|
||||
),
|
||||
|
||||
MODEL_TENSOR.SSM_DT_NORM: (
|
||||
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
|
||||
index a5fe4f66c..ac8b5e033 100644
|
||||
--- a/src/llama-arch.cpp
|
||||
+++ b/src/llama-arch.cpp
|
||||
@@ -75,6 +75,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_JAIS, "jais" },
|
||||
{ LLM_ARCH_NEMOTRON, "nemotron" },
|
||||
{ LLM_ARCH_NEMOTRON_H, "nemotron_h" },
|
||||
+ { LLM_ARCH_NEMOTRON_H_MOE, "nemotron_h_moe" },
|
||||
{ LLM_ARCH_EXAONE, "exaone" },
|
||||
{ LLM_ARCH_EXAONE4, "exaone4" },
|
||||
{ LLM_ARCH_RWKV6, "rwkv6" },
|
||||
@@ -1765,6 +1766,39 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
},
|
||||
},
|
||||
+ {
|
||||
+ LLM_ARCH_NEMOTRON_H_MOE,
|
||||
+ {
|
||||
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
+ { LLM_TENSOR_OUTPUT, "output" },
|
||||
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
+ // mamba(2) ssm layers
|
||||
+ { LLM_TENSOR_SSM_IN, "blk.%d.ssm_in" },
|
||||
+ { LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" },
|
||||
+ { LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" },
|
||||
+ { LLM_TENSOR_SSM_A, "blk.%d.ssm_a" },
|
||||
+ { LLM_TENSOR_SSM_D, "blk.%d.ssm_d" },
|
||||
+ { LLM_TENSOR_SSM_NORM, "blk.%d.ssm_norm" },
|
||||
+ { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
|
||||
+ // attention layers
|
||||
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
+ // dense FFN
|
||||
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
+ // MoE FFN (for MoE layers)
|
||||
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
|
||||
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
||||
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
|
||||
+ { LLM_TENSOR_FFN_EXP_PROBS_B,"blk.%d.exp_probs_b" },
|
||||
+ // MoE shared expert layer
|
||||
+ { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
|
||||
+ { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
|
||||
+ },
|
||||
+ },
|
||||
{
|
||||
LLM_ARCH_EXAONE,
|
||||
{
|
||||
@@ -2838,6 +2872,7 @@ bool llm_arch_is_hybrid(const llm_arch & arch) {
|
||||
case LLM_ARCH_LFM2:
|
||||
case LLM_ARCH_LFM2MOE:
|
||||
case LLM_ARCH_NEMOTRON_H:
|
||||
+ case LLM_ARCH_NEMOTRON_H_MOE:
|
||||
case LLM_ARCH_QWEN3NEXT:
|
||||
return true;
|
||||
default:
|
||||
diff --git a/src/llama-arch.h b/src/llama-arch.h
|
||||
index ec9e3a6df..61d73786c 100644
|
||||
--- a/src/llama-arch.h
|
||||
+++ b/src/llama-arch.h
|
||||
@@ -79,6 +79,7 @@ enum llm_arch {
|
||||
LLM_ARCH_JAIS,
|
||||
LLM_ARCH_NEMOTRON,
|
||||
LLM_ARCH_NEMOTRON_H,
|
||||
+ LLM_ARCH_NEMOTRON_H_MOE,
|
||||
LLM_ARCH_EXAONE,
|
||||
LLM_ARCH_EXAONE4,
|
||||
LLM_ARCH_RWKV6,
|
||||
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
|
||||
index 43620df78..763202d87 100644
|
||||
--- a/src/llama-graph.cpp
|
||||
+++ b/src/llama-graph.cpp
|
||||
@@ -1089,6 +1089,16 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
|
||||
cur = ggml_relu(ctx0, cur);
|
||||
cb(cur, "ffn_moe_relu", il);
|
||||
} break;
|
||||
+ case LLM_FFN_RELU_SQR:
|
||||
+ if (gate_exps) {
|
||||
+ // TODO: add support for gated squared relu
|
||||
+ GGML_ABORT("fatal error: gated squared relu not implemented");
|
||||
+ } else {
|
||||
+ cur = ggml_relu(ctx0, cur);
|
||||
+ cur = ggml_sqr(ctx0, cur);
|
||||
+ cb(cur, "ffn_moe_relu_sqr", il);
|
||||
+ }
|
||||
+ break;
|
||||
default:
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
|
||||
index 3c503b424..94dee78c3 100644
|
||||
--- a/src/llama-model.cpp
|
||||
+++ b/src/llama-model.cpp
|
||||
@@ -120,6 +120,8 @@ const char * llm_type_name(llm_type type) {
|
||||
case LLM_TYPE_16B_A1B: return "16B.A1B";
|
||||
case LLM_TYPE_21B_A3B: return "21B.A3B";
|
||||
case LLM_TYPE_30B_A3B: return "30B.A3B";
|
||||
+ case LLM_TYPE_31B_A3_5B: return "31B.A3.5B";
|
||||
+ case LLM_TYPE_80B_A3B: return "80B.A3B";
|
||||
case LLM_TYPE_100B_A6B: return "100B.A6B";
|
||||
case LLM_TYPE_106B_A12B: return "106B.A12B";
|
||||
case LLM_TYPE_230B_A10B: return "230B.A10B";
|
||||
@@ -1788,6 +1790,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_NEMOTRON_H:
|
||||
+ case LLM_ARCH_NEMOTRON_H_MOE:
|
||||
{
|
||||
ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
|
||||
ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
|
||||
@@ -1803,7 +1806,14 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
|
||||
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
|
||||
+ ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
|
||||
+ ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared, false);
|
||||
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
|
||||
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
|
||||
+
|
||||
switch (hparams.n_layer) {
|
||||
+ case 52: type = LLM_TYPE_31B_A3_5B; break; // Nemotron-H_MOE 31B
|
||||
case 56: type = LLM_TYPE_9B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
@@ -5175,6 +5185,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_NEMOTRON_H:
|
||||
+ case LLM_ARCH_NEMOTRON_H_MOE:
|
||||
{
|
||||
// mamba2 Mixer SSM params
|
||||
// NOTE: int64_t for tensor dimensions
|
||||
@@ -5185,6 +5196,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
const int64_t n_group = hparams.ssm_n_group;
|
||||
const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_ssm_head;
|
||||
|
||||
+ const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
|
||||
+ const int64_t n_ff_shexp = hparams.n_ff_shexp;
|
||||
+
|
||||
// embeddings
|
||||
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
||||
|
||||
@@ -5234,12 +5248,26 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_k_gqa_i}, TENSOR_NOT_REQUIRED);
|
||||
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_v_gqa_i}, TENSOR_NOT_REQUIRED);
|
||||
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
||||
- } else {
|
||||
- // mlp layers
|
||||
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { hparams.n_ff(i), n_embd}, 0);
|
||||
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, hparams.n_ff(i)}, 0);
|
||||
- layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
||||
- layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {hparams.n_ff(i)}, TENSOR_NOT_REQUIRED);
|
||||
+ } else {
|
||||
+ if (n_expert != 0) {
|
||||
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert}, 0);
|
||||
+ layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert }, 0);
|
||||
+
|
||||
+ // MoE branch
|
||||
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
|
||||
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
|
||||
+
|
||||
+ // Shared expert branch
|
||||
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
|
||||
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp}, 0);
|
||||
+
|
||||
+ } else {
|
||||
+ // mlp layers
|
||||
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { hparams.n_ff(i), n_embd}, 0);
|
||||
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, hparams.n_ff(i)}, 0);
|
||||
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
||||
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {hparams.n_ff(i)}, TENSOR_NOT_REQUIRED);
|
||||
+ }
|
||||
}
|
||||
}
|
||||
} break;
|
||||
@@ -6870,7 +6898,8 @@ void llama_model::print_info() const {
|
||||
arch == LLM_ARCH_PLAMO2 ||
|
||||
arch == LLM_ARCH_GRANITE_HYBRID ||
|
||||
arch == LLM_ARCH_QWEN3NEXT ||
|
||||
- arch == LLM_ARCH_NEMOTRON_H) {
|
||||
+ arch == LLM_ARCH_NEMOTRON_H ||
|
||||
+ arch == LLM_ARCH_NEMOTRON_H_MOE) {
|
||||
LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
|
||||
LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
|
||||
LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
|
||||
@@ -6926,7 +6955,8 @@ void llama_model::print_info() const {
|
||||
if (arch == LLM_ARCH_MINICPM ||
|
||||
arch == LLM_ARCH_GRANITE ||
|
||||
arch == LLM_ARCH_GRANITE_MOE ||
|
||||
- arch == LLM_ARCH_GRANITE_HYBRID) {
|
||||
+ arch == LLM_ARCH_GRANITE_HYBRID ||
|
||||
+ arch == LLM_ARCH_NEMOTRON_H_MOE) {
|
||||
LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
|
||||
LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
|
||||
LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
|
||||
@@ -7107,7 +7137,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
|
||||
if (arch == LLM_ARCH_FALCON_H1) {
|
||||
filter_attn = [&](int32_t) { return true; };
|
||||
filter_recr = [&](int32_t) { return true; };
|
||||
- } else if (arch == LLM_ARCH_NEMOTRON_H) {
|
||||
+ } else if (arch == LLM_ARCH_NEMOTRON_H || arch == LLM_ARCH_NEMOTRON_H_MOE) {
|
||||
filter_attn = [&](int32_t il) {
|
||||
return !hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
|
||||
};
|
||||
@@ -7478,6 +7508,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
|
||||
llm = std::make_unique<llm_build_nemotron>(*this, params);
|
||||
} break;
|
||||
case LLM_ARCH_NEMOTRON_H:
|
||||
+ case LLM_ARCH_NEMOTRON_H_MOE:
|
||||
{
|
||||
llm = std::make_unique<llm_build_nemotron_h>(*this, params);
|
||||
} break;
|
||||
@@ -7765,6 +7796,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
||||
case LLM_ARCH_ARWKV7:
|
||||
case LLM_ARCH_WAVTOKENIZER_DEC:
|
||||
case LLM_ARCH_NEMOTRON_H:
|
||||
+ case LLM_ARCH_NEMOTRON_H_MOE:
|
||||
return LLAMA_ROPE_TYPE_NONE;
|
||||
|
||||
// use what we call a normal RoPE, operating on pairs of consecutive head values
|
||||
diff --git a/src/llama-model.h b/src/llama-model.h
|
||||
index cbf4e1bfa..b378b23ec 100644
|
||||
--- a/src/llama-model.h
|
||||
+++ b/src/llama-model.h
|
||||
@@ -114,6 +114,7 @@ enum llm_type {
|
||||
LLM_TYPE_16B_A1B,
|
||||
LLM_TYPE_21B_A3B, // Ernie MoE small
|
||||
LLM_TYPE_30B_A3B,
|
||||
+ LLM_TYPE_31B_A3_5B,
|
||||
LLM_TYPE_80B_A3B, // Qwen3 Next
|
||||
LLM_TYPE_100B_A6B,
|
||||
LLM_TYPE_106B_A12B, // GLM-4.5-Air
|
||||
diff --git a/src/models/nemotron-h.cpp b/src/models/nemotron-h.cpp
|
||||
index 541434888..eb135e63f 100644
|
||||
--- a/src/models/nemotron-h.cpp
|
||||
+++ b/src/models/nemotron-h.cpp
|
||||
@@ -107,12 +107,41 @@ ggml_tensor * llm_build_nemotron_h::build_attention_layer(ggml_tensor *
|
||||
}
|
||||
|
||||
ggml_tensor * llm_build_nemotron_h::build_ffn_layer(ggml_tensor * cur, const llama_model & model, const int il) {
|
||||
- cur = build_ffn(cur,
|
||||
- model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
|
||||
- NULL, NULL, NULL,
|
||||
- model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
|
||||
- NULL, LLM_FFN_RELU_SQR, LLM_FFN_PAR, il);
|
||||
- cb(cur, "ffn_out", il);
|
||||
+ if (model.layers[il].ffn_gate_inp == nullptr) {
|
||||
+ cur = build_ffn(cur,
|
||||
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
|
||||
+ NULL, NULL, NULL,
|
||||
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
|
||||
+ NULL,
|
||||
+ LLM_FFN_RELU_SQR, LLM_FFN_PAR, il);
|
||||
+ cb(cur, "ffn_out", il);
|
||||
+ } else {
|
||||
+ ggml_tensor * ffn_inp = cur;
|
||||
+ ggml_tensor * moe_out =
|
||||
+ build_moe_ffn(ffn_inp,
|
||||
+ model.layers[il].ffn_gate_inp,
|
||||
+ model.layers[il].ffn_up_exps,
|
||||
+ nullptr, // no gate
|
||||
+ model.layers[il].ffn_down_exps,
|
||||
+ model.layers[il].ffn_exp_probs_b,
|
||||
+ n_expert, n_expert_used,
|
||||
+ LLM_FFN_RELU_SQR, hparams.expert_weights_norm,
|
||||
+ true, hparams.expert_weights_scale,
|
||||
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID,
|
||||
+ il);
|
||||
+ cb(moe_out, "ffn_moe_out", il);
|
||||
+
|
||||
+ ggml_tensor * ffn_shexp = build_ffn(ffn_inp,
|
||||
+ model.layers[il].ffn_up_shexp, NULL, NULL,
|
||||
+ NULL /* no gate */ , NULL, NULL,
|
||||
+ model.layers[il].ffn_down_shexp, NULL, NULL,
|
||||
+ NULL,
|
||||
+ LLM_FFN_RELU_SQR, LLM_FFN_PAR, il);
|
||||
+ cb(ffn_shexp, "ffn_shexp", il);
|
||||
+
|
||||
+ cur = ggml_add(ctx0, moe_out, ffn_shexp);
|
||||
+ cb(cur, "ffn_out", il);
|
||||
+ }
|
||||
|
||||
cur = build_cvec(cur, il);
|
||||
cb(cur, "l_out", il);
|
||||
100
llm/server.go
100
llm/server.go
|
|
@ -69,7 +69,7 @@ type LlamaServer interface {
|
|||
Ping(ctx context.Context) error
|
||||
WaitUntilRunning(ctx context.Context) error
|
||||
Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error
|
||||
Embedding(ctx context.Context, input string) ([]float32, error)
|
||||
Embedding(ctx context.Context, input string) ([]float32, int, error)
|
||||
Tokenize(ctx context.Context, content string) ([]int, error)
|
||||
Detokenize(ctx context.Context, tokens []int) (string, error)
|
||||
Close() error
|
||||
|
|
@ -188,6 +188,11 @@ func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath st
|
|||
if len(projectors) > 0 && llamaModel != nil {
|
||||
loadRequest.ProjectorPath = projectors[0]
|
||||
}
|
||||
// Determine if the user has forced FA on or off
|
||||
faUserSet := false
|
||||
if envconfig.FlashAttention(true) == envconfig.FlashAttention(false) {
|
||||
faUserSet = true
|
||||
}
|
||||
|
||||
fa := envconfig.FlashAttention(f.FlashAttention())
|
||||
|
||||
|
|
@ -205,19 +210,51 @@ func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath st
|
|||
|
||||
kvct := strings.ToLower(envconfig.KvCacheType())
|
||||
|
||||
if fa {
|
||||
slog.Info("enabling flash attention")
|
||||
loadRequest.FlashAttention = true
|
||||
|
||||
// Flash Attention also supports kv cache quantization
|
||||
// Enable if the requested and kv cache type is supported by the model
|
||||
if f.SupportsKVCacheType(kvct) {
|
||||
loadRequest.KvCacheType = kvct
|
||||
} else {
|
||||
slog.Warn("kv cache type not supported by model", "type", kvct)
|
||||
if textProcessor == nil {
|
||||
flashAttention := ml.FlashAttentionAuto
|
||||
if faUserSet {
|
||||
if fa {
|
||||
flashAttention = ml.FlashAttentionEnabled
|
||||
} else {
|
||||
flashAttention = ml.FlashAttentionDisabled
|
||||
}
|
||||
}
|
||||
|
||||
if kvct != "" {
|
||||
if f.KVCacheTypeIsQuantized(kvct) {
|
||||
if flashAttention != ml.FlashAttentionEnabled {
|
||||
slog.Warn("OLLAMA_FLASH_ATTENTION must be enabled to use a quantized OLLAMA_KV_CACHE_TYPE", "type", kvct)
|
||||
loadRequest.KvCacheType = ""
|
||||
} else if f.SupportsKVCacheType(kvct) {
|
||||
loadRequest.KvCacheType = kvct
|
||||
} else {
|
||||
slog.Warn("unsupported OLLAMA_KV_CACHE_TYPE", "type", kvct)
|
||||
}
|
||||
} else {
|
||||
if f.SupportsKVCacheType(kvct) {
|
||||
loadRequest.KvCacheType = kvct
|
||||
} else {
|
||||
slog.Warn("unsupported OLLAMA_KV_CACHE_TYPE", "type", kvct)
|
||||
}
|
||||
}
|
||||
}
|
||||
loadRequest.FlashAttention = flashAttention
|
||||
} else {
|
||||
// For Ollama engine, use our SupportsFlashAttention logic
|
||||
if fa {
|
||||
slog.Info("enabling flash attention")
|
||||
loadRequest.FlashAttention = ml.FlashAttentionEnabled
|
||||
|
||||
// Flash Attention also supports kv cache quantization
|
||||
// Enable if the requested and kv cache type is supported by the model
|
||||
if f.SupportsKVCacheType(kvct) {
|
||||
loadRequest.KvCacheType = kvct
|
||||
} else {
|
||||
slog.Warn("kv cache type not supported by model", "type", kvct)
|
||||
}
|
||||
} else if kvct != "" && kvct != "f16" {
|
||||
slog.Warn("quantized kv cache requested but flash attention disabled", "type", kvct)
|
||||
}
|
||||
} else if kvct != "" && kvct != "f16" {
|
||||
slog.Warn("quantized kv cache requested but flash attention disabled", "type", kvct)
|
||||
}
|
||||
|
||||
gpuLibs := ml.LibraryPaths(gpus)
|
||||
|
|
@ -435,7 +472,7 @@ type LoadRequest struct {
|
|||
LoraPath []string
|
||||
Parallel int
|
||||
BatchSize int
|
||||
FlashAttention bool
|
||||
FlashAttention ml.FlashAttentionType
|
||||
KvSize int
|
||||
KvCacheType string
|
||||
NumThreads int
|
||||
|
|
@ -474,6 +511,13 @@ func (s *llamaServer) Load(ctx context.Context, systemInfo ml.SystemInfo, system
|
|||
s.mem.GPUs[i].Cache = make([]uint64, s.totalLayers)
|
||||
}
|
||||
|
||||
// Check if embedding model and adjust batch size accordingly
|
||||
_, isEmbedding := s.ggml.KV()[fmt.Sprintf("%s.pooling_type", s.ggml.KV().Architecture())]
|
||||
if isEmbedding && s.loadRequest.BatchSize < s.options.NumCtx {
|
||||
s.loadRequest.BatchSize = s.options.NumCtx
|
||||
slog.Info("embedding model detected, setting batch size to context length", "batch_size", s.loadRequest.BatchSize)
|
||||
}
|
||||
|
||||
kv, graphPartialOffload, graphFullOffload := s.ggml.GraphSize(uint64(s.options.NumCtx), uint64(s.loadRequest.BatchSize),
|
||||
s.loadRequest.Parallel, s.loadRequest.KvCacheType, s.loadRequest.FlashAttention)
|
||||
|
||||
|
|
@ -1629,10 +1673,11 @@ type EmbeddingRequest struct {
|
|||
}
|
||||
|
||||
type EmbeddingResponse struct {
|
||||
Embedding []float32 `json:"embedding"`
|
||||
Embedding []float32 `json:"embedding"`
|
||||
PromptEvalCount int `json:"prompt_eval_count"`
|
||||
}
|
||||
|
||||
func (s *llmServer) Embedding(ctx context.Context, input string) ([]float32, error) {
|
||||
func (s *llmServer) Embedding(ctx context.Context, input string) ([]float32, int, error) {
|
||||
logutil.Trace("embedding request", "input", input)
|
||||
|
||||
if err := s.sem.Acquire(ctx, 1); err != nil {
|
||||
|
|
@ -1641,51 +1686,54 @@ func (s *llmServer) Embedding(ctx context.Context, input string) ([]float32, err
|
|||
} else {
|
||||
slog.Error("Failed to acquire semaphore", "error", err)
|
||||
}
|
||||
return nil, err
|
||||
return nil, 0, err
|
||||
}
|
||||
defer s.sem.Release(1)
|
||||
|
||||
// Make sure the server is ready
|
||||
status, err := s.getServerStatusRetry(ctx)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
return nil, 0, err
|
||||
} else if status != ServerStatusReady {
|
||||
return nil, fmt.Errorf("unexpected server status: %s", status)
|
||||
return nil, 0, fmt.Errorf("unexpected server status: %s", status)
|
||||
}
|
||||
|
||||
data, err := json.Marshal(EmbeddingRequest{Content: input})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error marshaling embed data: %w", err)
|
||||
return nil, 0, fmt.Errorf("error marshaling embed data: %w", err)
|
||||
}
|
||||
|
||||
r, err := http.NewRequestWithContext(ctx, http.MethodPost, fmt.Sprintf("http://127.0.0.1:%d/embedding", s.port), bytes.NewBuffer(data))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error creating embed request: %w", err)
|
||||
return nil, 0, fmt.Errorf("error creating embed request: %w", err)
|
||||
}
|
||||
r.Header.Set("Content-Type", "application/json")
|
||||
|
||||
resp, err := http.DefaultClient.Do(r)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("do embedding request: %w", err)
|
||||
return nil, 0, fmt.Errorf("do embedding request: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error reading embed response: %w", err)
|
||||
return nil, 0, fmt.Errorf("error reading embed response: %w", err)
|
||||
}
|
||||
|
||||
if resp.StatusCode >= 400 {
|
||||
log.Printf("llm embedding error: %s", body)
|
||||
return nil, fmt.Errorf("%s", body)
|
||||
return nil, 0, api.StatusError{
|
||||
StatusCode: resp.StatusCode,
|
||||
ErrorMessage: string(body),
|
||||
}
|
||||
}
|
||||
|
||||
var e EmbeddingResponse
|
||||
if err := json.Unmarshal(body, &e); err != nil {
|
||||
return nil, fmt.Errorf("unmarshal tokenize response: %w", err)
|
||||
return nil, 0, fmt.Errorf("unmarshal tokenize response: %w", err)
|
||||
}
|
||||
|
||||
return e.Embedding, nil
|
||||
return e.Embedding, e.PromptEvalCount, nil
|
||||
}
|
||||
|
||||
func (s *llamaServer) Tokenize(ctx context.Context, content string) ([]int, error) {
|
||||
|
|
|
|||
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue