mirror of https://github.com/ollama/ollama
Compare commits
52 Commits
v0.13.2-rc...main
| Author | SHA1 | Date |
|---|---|---|
| | 49a9c9ba6a | |
| | 1c094038bc | |
| | a013693f80 | |
| | f6a016f49d | |
| | 45c4739374 | |
| | 2dd029de12 | |
| | 903b1fc97f | |
| | 89eb795293 | |
| | 7e3ea813c1 | |
| | 7b95087b9d | |
| | 971d62595a | |
| | ffbe8e076d | |
| | 2c639431b1 | |
| | aacd1cb394 | |
| | e3731fb160 | |
| | 8dbc9e7b68 | |
| | abe67acf8a | |
| | 4ff8a691bc | |
| | 1b308e1d2a | |
| | bd6c1d6b49 | |
| | 3af5d3b738 | |
| | 7730895158 | |
| | de9ecfd01c | |
| | 95fdd8d619 | |
| | 9f7822851c | |
| | 9b2035d194 | |
| | 93d45d7a04 | |
| | 709f842457 | |
| | 2dfb74410d | |
| | 1eb5e75972 | |
| | 3475d915cb | |
| | 48e78e9be1 | |
| | a838421ea3 | |
| | 1c4e85b4df | |
| | dac4f17fea | |
| | 56b8fb024c | |
| | b95693056c | |
| | c34fc64688 | |
| | 7cf6f18c1f | |
| | bbbb6b2a01 | |
| | 76f88caf43 | |
| | 2bccf8c624 | |
| | 0c5e5f6630 | |
| | d475d1f081 | |
| | d2f334c1f7 | |
| | 603ceefaa6 | |
| | e082d60a24 | |
| | 5dae738067 | |
| | 0c78723174 | |
| | 5a41d69b2a | |
| | c146a138e3 | |
| | 31b8c6a214 | |
@@ -16,13 +16,15 @@ jobs:
     outputs:
       GOFLAGS: ${{ steps.goflags.outputs.GOFLAGS }}
       VERSION: ${{ steps.goflags.outputs.VERSION }}
+      vendorsha: ${{ steps.changes.outputs.vendorsha }}
     steps:
       - uses: actions/checkout@v4
       - name: Set environment
         id: goflags
         run: |
-          echo GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${GITHUB_REF_NAME#v}\" \"-X=github.com/ollama/ollama/server.mode=release\"'" >>$GITHUB_OUTPUT
+          echo GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${GITHUB_REF_NAME#v}\" \"-X=github.com/ollama/ollama/server.mode=release\"'" | tee -a $GITHUB_OUTPUT
-          echo VERSION="${GITHUB_REF_NAME#v}" >>$GITHUB_OUTPUT
+          echo VERSION="${GITHUB_REF_NAME#v}" | tee -a $GITHUB_OUTPUT
+          echo vendorsha=$(make -f Makefile.sync print-base) | tee -a $GITHUB_OUTPUT

   darwin-build:
     runs-on: macos-14-xlarge

@@ -53,6 +55,9 @@ jobs:
       - uses: actions/setup-go@v5
         with:
           go-version-file: go.mod
+          cache-dependency-path: |
+            go.sum
+            Makefile.sync
       - run: |
           ./scripts/build_darwin.sh
       - name: Log build results

@@ -185,7 +190,7 @@ jobs:
       - uses: actions/cache@v4
         with:
           path: ${{ github.workspace }}\.ccache
-          key: ccache-${{ matrix.os }}-${{ matrix.arch }}-${{ matrix.preset }}
+          key: ccache-${{ matrix.os }}-${{ matrix.arch }}-${{ matrix.preset }}-${{ needs.setup-environment.outputs.vendorsha }}
       - name: Build target "${{ matrix.preset }}"
         run: |
           Import-Module 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'

@@ -249,6 +254,9 @@ jobs:
       - uses: actions/setup-go@v5
         with:
           go-version-file: go.mod
+          cache-dependency-path: |
+            go.sum
+            Makefile.sync
       - name: Verify gcc is actually clang
         run: |
           $ErrorActionPreference='Continue'

@@ -302,6 +310,9 @@ jobs:
       - uses: actions/setup-go@v5
         with:
           go-version-file: go.mod
+          cache-dependency-path: |
+            go.sum
+            Makefile.sync
       - uses: actions/download-artifact@v4
         with:
           pattern: depends-windows*
@@ -22,6 +22,7 @@ jobs:
     runs-on: ubuntu-latest
     outputs:
       changed: ${{ steps.changes.outputs.changed }}
+      vendorsha: ${{ steps.changes.outputs.vendorsha }}
     steps:
       - uses: actions/checkout@v4
         with:

@@ -37,6 +38,7 @@ jobs:
           }

           echo changed=$(changed 'llama/llama.cpp/**/*' 'ml/backend/ggml/ggml/**/*') | tee -a $GITHUB_OUTPUT
+          echo vendorsha=$(make -f Makefile.sync print-base) | tee -a $GITHUB_OUTPUT

   linux:
     needs: [changes]

@@ -83,7 +85,7 @@ jobs:
       - uses: actions/cache@v4
         with:
           path: /github/home/.cache/ccache
-          key: ccache-${{ runner.os }}-${{ runner.arch }}-${{ matrix.preset }}
+          key: ccache-${{ runner.os }}-${{ runner.arch }}-${{ matrix.preset }}-${{ needs.changes.outputs.vendorsha }}
       - run: |
           cmake --preset ${{ matrix.preset }} ${{ matrix.flags }}
           cmake --build --preset ${{ matrix.preset }} --parallel

@@ -178,7 +180,7 @@ jobs:
       - uses: actions/cache@v4
         with:
           path: ${{ github.workspace }}\.ccache
-          key: ccache-${{ runner.os }}-${{ runner.arch }}-${{ matrix.preset }}
+          key: ccache-${{ runner.os }}-${{ runner.arch }}-${{ matrix.preset }}-${{ needs.changes.outputs.vendorsha }}
       - run: |
           Import-Module 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
           Enter-VsDevShell -VsInstallPath 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise' -SkipAutomaticLocation -DevCmdArguments '-arch=x64 -no_logo'

@@ -206,6 +208,9 @@ jobs:
       - uses: actions/setup-go@v5
         with:
           go-version-file: 'go.mod'
+          cache-dependency-path: |
+            go.sum
+            Makefile.sync
       - uses: actions/setup-node@v4
         with:
           node-version: '20'
@@ -54,6 +54,13 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cp

 add_compile_definitions(NDEBUG GGML_VERSION=0x0 GGML_COMMIT=0x0)

+# Define GGML version variables for shared library SOVERSION
+# These are required by ggml/src/CMakeLists.txt for proper library versioning
+set(GGML_VERSION_MAJOR 0)
+set(GGML_VERSION_MINOR 0)
+set(GGML_VERSION_PATCH 0)
+set(GGML_VERSION "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")
+
 set(GGML_CPU ON)
 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src)
 set_property(TARGET ggml PROPERTY EXCLUDE_FROM_ALL TRUE)
@@ -1,6 +1,6 @@
 UPSTREAM=https://github.com/ggml-org/llama.cpp.git
 WORKDIR=llama/vendor
-FETCH_HEAD=7f8ef50cce40e3e7e4526a3696cb45658190e69a
+FETCH_HEAD=ec98e2002

 .PHONY: help
 help:

@@ -57,7 +57,7 @@ checkout: $(WORKDIR)
 $(WORKDIR):
 	git clone $(UPSTREAM) $(WORKDIR)

-.PHONE: format-patches
+.PHONY: format-patches
 format-patches: llama/patches
 	git -C $(WORKDIR) format-patch \
 		--no-signature \

@@ -66,7 +66,11 @@ format-patches: llama/patches
 		-o $(realpath $<) \
 		$(FETCH_HEAD)

-.PHONE: clean
+.PHONY: clean
 clean: checkout
 	@git -C $(WORKDIR) am --abort || true
 	$(RM) llama/patches/.*.patched
+
+.PHONY: print-base
+print-base:
+	@echo $(FETCH_HEAD)
@@ -555,7 +555,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Parakeet](https://github.com/parakeet-nest/parakeet) is a GoLang library, made to simplify the development of small generative AI applications with Ollama.
 - [Haverscript](https://github.com/andygill/haverscript) with [examples](https://github.com/andygill/haverscript/tree/main/examples)
 - [Ollama for Swift](https://github.com/mattt/ollama-swift)
-- [Swollama for Swift](https://github.com/marcusziade/Swollama) with [DocC](https://marcusziade.github.io/Swollama/documentation/swollama/)
+- [Swollama for Swift](https://github.com/guitaripod/Swollama) with [DocC](https://guitaripod.github.io/Swollama/documentation/swollama)
 - [GoLamify](https://github.com/prasad89/golamify)
 - [Ollama for Haskell](https://github.com/tusharad/ollama-haskell)
 - [multi-llm-ts](https://github.com/nbonamy/multi-llm-ts) (A Typescript/JavaScript library allowing access to different LLM in a unified API)
@@ -347,7 +347,7 @@ type CreateProgressFunc func(ProgressResponse) error
 // Create creates a model from a [Modelfile]. fn is a progress function that
 // behaves similarly to other methods (see [Client.Pull]).
 //
-// [Modelfile]: https://github.com/ollama/ollama/blob/main/docs/modelfile.md
+// [Modelfile]: https://github.com/ollama/ollama/blob/main/docs/modelfile.mdx
 func (c *Client) Create(ctx context.Context, req *CreateRequest, fn CreateProgressFunc) error {
 	return c.stream(ctx, http.MethodPost, "/api/create", req, func(bts []byte) error {
 		var resp ProgressResponse
@@ -288,6 +288,7 @@ type ToolProperty struct {
 	Items       any    `json:"items,omitempty"`
 	Description string `json:"description,omitempty"`
 	Enum        []any  `json:"enum,omitempty"`
+	Properties  map[string]ToolProperty `json:"properties,omitempty"`
 }

 // ToTypeScriptType converts a ToolProperty to a TypeScript type string
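For context on the new `Properties` field: the sketch below (not part of the diff) shows how a nested tool property can be constructed and marshalled using only the `api` package types that appear in the hunk above. The `main` wrapper and the example field values are illustrative assumptions.

```go
package main

import (
	"encoding/json"
	"fmt"

	"github.com/ollama/ollama/api"
)

func main() {
	// Illustrative only: a tool property with one level of nesting,
	// using the Properties field added in the hunk above.
	prop := api.ToolProperty{
		Type:        api.PropertyType{"object"},
		Description: "Location details",
		Properties: map[string]api.ToolProperty{
			"address": {Type: api.PropertyType{"string"}, Description: "Street address"},
			"city":    {Type: api.PropertyType{"string"}, Description: "City name"},
		},
	}

	out, err := json.Marshal(prop)
	if err != nil {
		panic(err)
	}
	// The nested "properties" object appears in the marshalled JSON schema.
	fmt.Println(string(out))
}
```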
@@ -504,6 +504,107 @@ func TestThinking_UnmarshalJSON(t *testing.T) {
 	}
 }

+func TestToolPropertyNestedProperties(t *testing.T) {
+	tests := []struct {
+		name     string
+		input    string
+		expected ToolProperty
+	}{
+		{
+			name: "nested object properties",
+			input: `{
+				"type": "object",
+				"description": "Location details",
+				"properties": {
+					"address": {
+						"type": "string",
+						"description": "Street address"
+					},
+					"city": {
+						"type": "string",
+						"description": "City name"
+					}
+				}
+			}`,
+			expected: ToolProperty{
+				Type:        PropertyType{"object"},
+				Description: "Location details",
+				Properties: map[string]ToolProperty{
+					"address": {
+						Type:        PropertyType{"string"},
+						Description: "Street address",
+					},
+					"city": {
+						Type:        PropertyType{"string"},
+						Description: "City name",
+					},
+				},
+			},
+		},
+		{
+			name: "deeply nested properties",
+			input: `{
+				"type": "object",
+				"description": "Event",
+				"properties": {
+					"location": {
+						"type": "object",
+						"description": "Location",
+						"properties": {
+							"coordinates": {
+								"type": "object",
+								"description": "GPS coordinates",
+								"properties": {
+									"lat": {"type": "number", "description": "Latitude"},
+									"lng": {"type": "number", "description": "Longitude"}
+								}
+							}
+						}
+					}
+				}
+			}`,
+			expected: ToolProperty{
+				Type:        PropertyType{"object"},
+				Description: "Event",
+				Properties: map[string]ToolProperty{
+					"location": {
+						Type:        PropertyType{"object"},
+						Description: "Location",
+						Properties: map[string]ToolProperty{
+							"coordinates": {
+								Type:        PropertyType{"object"},
+								Description: "GPS coordinates",
+								Properties: map[string]ToolProperty{
+									"lat": {Type: PropertyType{"number"}, Description: "Latitude"},
+									"lng": {Type: PropertyType{"number"}, Description: "Longitude"},
+								},
+							},
+						},
+					},
+				},
+			},
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			var prop ToolProperty
+			err := json.Unmarshal([]byte(tt.input), &prop)
+			require.NoError(t, err)
+			assert.Equal(t, tt.expected, prop)
+
+			// Round-trip test: marshal and unmarshal again
+			data, err := json.Marshal(prop)
+			require.NoError(t, err)
+
+			var prop2 ToolProperty
+			err = json.Unmarshal(data, &prop2)
+			require.NoError(t, err)
+			assert.Equal(t, tt.expected, prop2)
+		})
+	}
+}
+
 func TestToolFunctionParameters_String(t *testing.T) {
 	tests := []struct {
 		name string
@@ -273,10 +273,6 @@ func main() {
 		Handler: uiServer.Handler(),
 	}

-	if _, err := uiServer.UserData(ctx); err != nil {
-		slog.Warn("failed to load user data", "error", err)
-	}
-
 	// Start the UI server
 	slog.Info("starting ui server", "port", port)
 	go func() {

@@ -320,6 +316,17 @@ func main() {
 		slog.Debug("no URL scheme request to handle")
 	}

+	go func() {
+		slog.Debug("waiting for ollama server to be ready")
+		if err := ui.WaitForServer(ctx, 10*time.Second); err != nil {
+			slog.Warn("ollama server not ready, continuing anyway", "error", err)
+		}
+
+		if _, err := uiServer.UserData(ctx); err != nil {
+			slog.Warn("failed to load user data", "error", err)
+		}
+	}()
+
 	osRun(cancel, hasCompletedFirstRun, startHidden)

 	slog.Info("shutting down desktop server")

@@ -361,7 +368,7 @@ func checkUserLoggedIn(uiServerPort int) bool {
 		return false
 	}

-	resp, err := http.Get(fmt.Sprintf("http://127.0.0.1:%d/api/v1/me", uiServerPort))
+	resp, err := http.Post(fmt.Sprintf("http://127.0.0.1:%d/api/me", uiServerPort), "application/json", nil)
 	if err != nil {
 		slog.Debug("failed to call local auth endpoint", "error", err)
 		return false
@@ -191,13 +191,6 @@ func LaunchNewApp() {
 	C.launchApp(appName)
 }

-// Send a request to the main app thread to load a UI page
-func sendUIRequestMessage(path string) {
-	p := C.CString(path)
-	defer C.free(unsafe.Pointer(p))
-	C.uiRequest(p)
-}
-
 func registerLaunchAgent(hasCompletedFirstRun bool) {
 	// Remove any stale Login Item registrations
 	C.unregisterSelfFromLoginItem()

@@ -263,11 +263,6 @@ func createLoginShortcut() error {
 	return nil
 }

-// Send a request to the main app thread to load a UI page
-func sendUIRequestMessage(path string) {
-	wintray.SendUIRequestMessage(path)
-}
-
 func LaunchNewApp() {
 }

@@ -169,7 +169,11 @@ DlgResult fileDlg(FileDlgParams* params) {
   }

   NSArray* urls = [panel URLs];
-  if(self->params->allowMultiple && [urls count] >= 1) {
+  if([urls count] == 0) {
+    return DLG_CANCEL;
+  }
+
+  if(self->params->allowMultiple) {
     // For multiple files, we need to return all paths separated by null bytes
     char* bufPtr = self->params->buf;
     int remainingBuf = self->params->nbuf;

@@ -200,6 +204,12 @@ DlgResult fileDlg(FileDlgParams* params) {
       bufPtr += pathLen + 1;
     }
     *bufPtr = '\0'; // Final null terminator
+  } else {
+    // Single file/directory selection - write path to buffer
+    NSURL* url = [urls firstObject];
+    if(![url getFileSystemRepresentation:self->params->buf maxLength:self->params->nbuf]) {
+      return DLG_URLFAIL;
+    }
   }

   return DLG_OK;
@@ -15,7 +15,7 @@ const multiFileBufferSize = w32.MAX_PATH * 10
 type WinDlgError int

 func (e WinDlgError) Error() string {
-	return fmt.Sprintf("CommDlgExtendedError: %#x", e)
+	return fmt.Sprintf("CommDlgExtendedError: %#x", int(e))
 }

 func err() error {
@@ -224,9 +224,7 @@ func (s *Server) cmd(ctx context.Context) (*exec.Cmd, error) {
 		if _, err := os.Stat(settings.Models); err == nil {
 			env["OLLAMA_MODELS"] = settings.Models
 		} else {
-			slog.Warn("models path not accessible, clearing models setting", "path", settings.Models, "err", err)
-			settings.Models = ""
-			s.store.SetSettings(settings)
+			slog.Warn("models path not accessible, using default", "path", settings.Models, "err", err)
 		}
 	}
 	if settings.ContextLength > 0 {
@@ -469,26 +469,24 @@ export class HealthResponse {
 }
 export class User {
   id: string;
-  name: string;
   email: string;
-  avatarURL: string;
-  plan: string;
-  bio: string;
-  firstName: string;
-  lastName: string;
-  overThreshold: boolean;
+  name: string;
+  bio?: string;
+  avatarurl?: string;
+  firstname?: string;
+  lastname?: string;
+  plan?: string;

   constructor(source: any = {}) {
     if ('string' === typeof source) source = JSON.parse(source);
     this.id = source["id"];
-    this.name = source["name"];
     this.email = source["email"];
-    this.avatarURL = source["avatarURL"];
-    this.plan = source["plan"];
+    this.name = source["name"];
     this.bio = source["bio"];
-    this.firstName = source["firstName"];
-    this.lastName = source["lastName"];
-    this.overThreshold = source["overThreshold"];
+    this.avatarurl = source["avatarurl"];
+    this.firstname = source["firstname"];
+    this.lastname = source["lastname"];
+    this.plan = source["plan"];
   }
 }
 export class Attachment {
@@ -15,7 +15,7 @@ import {
 import { parseJsonlFromResponse } from "./util/jsonl-parsing";
 import { ollamaClient as ollama } from "./lib/ollama-client";
 import type { ModelResponse } from "ollama/browser";
-import { API_BASE } from "./lib/config";
+import { API_BASE, OLLAMA_DOT_COM } from "./lib/config";

 // Extend Model class with utility methods
 declare module "@/gotypes" {

@@ -27,7 +27,6 @@ declare module "@/gotypes" {
 Model.prototype.isCloud = function (): boolean {
   return this.model.endsWith("cloud");
 };
-
 // Helper function to convert Uint8Array to base64
 function uint8ArrayToBase64(uint8Array: Uint8Array): string {
   const chunkSize = 0x8000; // 32KB chunks to avoid stack overflow

@@ -42,9 +41,8 @@ function uint8ArrayToBase64(uint8Array: Uint8Array): string {
 }

 export async function fetchUser(): Promise<User | null> {
-  try {
-    const response = await fetch(`${API_BASE}/api/v1/me`, {
-      method: "GET",
+  const response = await fetch(`${API_BASE}/api/me`, {
+    method: "POST",
     headers: {
       "Content-Type": "application/json",
     },

@@ -52,34 +50,41 @@ export async function fetchUser(): Promise<User | null> {

   if (response.ok) {
     const userData: User = await response.json();
+
+    if (userData.avatarurl && !userData.avatarurl.startsWith("http")) {
+      userData.avatarurl = `${OLLAMA_DOT_COM}${userData.avatarurl}`;
+    }
+
     return userData;
   }

-    return null;
-  } catch (error) {
-    console.error("Error fetching user:", error);
+  if (response.status === 401 || response.status === 403) {
     return null;
   }
+
+  throw new Error(`Failed to fetch user: ${response.status}`);
 }

 export async function fetchConnectUrl(): Promise<string> {
-  const response = await fetch(`${API_BASE}/api/v1/connect`, {
-    method: "GET",
+  const response = await fetch(`${API_BASE}/api/me`, {
+    method: "POST",
     headers: {
       "Content-Type": "application/json",
     },
   });

-  if (!response.ok) {
-    throw new Error("Failed to fetch connect URL");
+  if (response.status === 401) {
+    const data = await response.json();
+    if (data.signin_url) {
+      return data.signin_url;
+    }
   }

-  const data = await response.json();
-  return data.connect_url;
+  throw new Error("Failed to fetch connect URL");
 }

 export async function disconnectUser(): Promise<void> {
-  const response = await fetch(`${API_BASE}/api/v1/disconnect`, {
+  const response = await fetch(`${API_BASE}/api/signout`, {
     method: "POST",
     headers: {
       "Content-Type": "application/json",

@@ -389,7 +394,8 @@ export async function getInferenceCompute(): Promise<InferenceCompute[]> {

 export async function fetchHealth(): Promise<boolean> {
   try {
-    const response = await fetch(`${API_BASE}/api/v1/health`, {
+    // Use the /api/version endpoint as a health check
+    const response = await fetch(`${API_BASE}/api/version`, {
       method: "GET",
       headers: {
         "Content-Type": "application/json",

@@ -398,7 +404,8 @@ export async function fetchHealth(): Promise<boolean> {

     if (response.ok) {
       const data = await response.json();
-      return data.healthy || false;
+      // If we get a version back, the server is healthy
+      return !!data.version;
     }

     return false;
@@ -299,9 +299,9 @@ export default function Settings() {
           </Button>
         </div>
       </div>
-      {user?.avatarURL && (
+      {user?.avatarurl && (
         <img
-          src={user.avatarURL}
+          src={user.avatarurl}
           alt={user?.name}
           className="h-10 w-10 rounded-full bg-neutral-200 dark:bg-neutral-700 flex-shrink-0"
           onError={(e) => {
@@ -50,6 +50,9 @@ export default function Thinking({
   // Position content to show bottom when collapsed
   useEffect(() => {
     if (isCollapsed && contentRef.current && wrapperRef.current) {
+      requestAnimationFrame(() => {
+        if (!contentRef.current || !wrapperRef.current) return;
+
       const contentHeight = contentRef.current.scrollHeight;
       const wrapperHeight = wrapperRef.current.clientHeight;
       if (contentHeight > wrapperHeight) {

@@ -57,14 +60,23 @@ export default function Thinking({
         contentRef.current.style.transform = `translateY(${translateY}px)`;
         setHasOverflow(true);
       } else {
+        contentRef.current.style.transform = "translateY(0)";
         setHasOverflow(false);
       }
+      });
     } else if (contentRef.current) {
       contentRef.current.style.transform = "translateY(0)";
       setHasOverflow(false);
     }
   }, [thinking, isCollapsed]);

+  useEffect(() => {
+    if (activelyThinking && wrapperRef.current && !isCollapsed) {
+      // When expanded and actively thinking, scroll to bottom
+      wrapperRef.current.scrollTop = wrapperRef.current.scrollHeight;
+    }
+  }, [thinking, activelyThinking, isCollapsed]);
+
   const handleToggle = () => {
     setIsCollapsed(!isCollapsed);
     setHasUserInteracted(true);
@@ -7,6 +7,7 @@ import { createQueryBatcher } from "./useQueryBatcher";
 import { useRefetchModels } from "./useModels";
 import { useStreamingContext } from "@/contexts/StreamingContext";
 import { useSettings } from "./useSettings";
+import { getModelCapabilities } from "@/api";

 export const useChats = () => {
   return useQuery({

@@ -606,6 +607,24 @@ export const useSendMessage = (chatId: string) => {
             queryClient.setQueryData(["staleModels"], newStaleMap);

             queryClient.invalidateQueries({ queryKey: ["models"] });
+
+            // Fetch fresh capabilities for the downloaded model
+            getModelCapabilities(selectedModel.model)
+              .then((capabilities) => {
+                queryClient.setQueryData(
+                  ["modelCapabilities", selectedModel.model],
+                  capabilities,
+                );
+              })
+              .catch((error) => {
+                console.error(
+                  "Failed to fetch capabilities after download:",
+                  error,
+                );
+                queryClient.invalidateQueries({
+                  queryKey: ["modelCapabilities", selectedModel.model],
+                });
+              });
           }
           break;
         }
@@ -1,114 +0,0 @@
-import { useMutation, useQueryClient } from "@tanstack/react-query";
-import { useState } from "react";
-import { pullModel } from "@/api";
-import { useSelectedModel } from "./useSelectedModel";
-import { useSettings } from "./useSettings";
-
-interface DownloadProgress {
-  status: string;
-  digest?: string;
-  total?: number;
-  completed?: number;
-  done?: boolean;
-}
-
-export function useDownloadModel(chatId?: string) {
-  const queryClient = useQueryClient();
-  const { selectedModel } = useSelectedModel(chatId);
-  const { setSettings } = useSettings();
-  const [downloadProgress, setDownloadProgress] =
-    useState<DownloadProgress | null>(null);
-  const [abortController, setAbortController] =
-    useState<AbortController | null>(null);
-  const [downloadingChatIds, setDownloadingChatIds] = useState<Set<string>>(
-    new Set(),
-  );
-
-  const mutation = useMutation({
-    mutationFn: async (modelName: string) => {
-      const controller = new AbortController();
-      setAbortController(controller);
-      setDownloadProgress({ status: "Starting download..." });
-      if (chatId) {
-        setDownloadingChatIds((prev) => new Set(prev).add(chatId));
-      }
-
-      try {
-        for await (const progress of pullModel(modelName, controller.signal)) {
-          setDownloadProgress(progress);
-
-          if (progress.status === "success") {
-            // Update selected model to indicate it's now available locally
-            if (selectedModel && selectedModel.model === modelName) {
-              setSettings({ SelectedModel: modelName });
-            }
-            // Invalidate models query to refresh the list
-            await queryClient.invalidateQueries({ queryKey: ["models"] });
-            break;
-          }
-        }
-      } finally {
-        setAbortController(null);
-        if (chatId) {
-          setDownloadingChatIds((prev) => {
-            const newSet = new Set(prev);
-            newSet.delete(chatId);
-            return newSet;
-          });
-        }
-      }
-    },
-    onSuccess: () => {
-      setDownloadProgress(null);
-      if (chatId) {
-        setDownloadingChatIds((prev) => {
-          const newSet = new Set(prev);
-          newSet.delete(chatId);
-          return newSet;
-        });
-      }
-    },
-    onError: (error: Error) => {
-      const status =
-        error.name === "AbortError" ? "Download cancelled" : "Download failed";
-      setDownloadProgress({ status, done: true });
-
-      // Clear error message after delay
-      const delay = error.name === "AbortError" ? 1500 : 3000;
-      setTimeout(() => {
-        setDownloadProgress(null);
-        if (chatId) {
-          setDownloadingChatIds((prev) => {
-            const newSet = new Set(prev);
-            newSet.delete(chatId);
-            return newSet;
-          });
-        }
-      }, delay);
-    },
-  });
-
-  const cancelDownload = () => {
-    if (abortController) {
-      abortController.abort();
-      setAbortController(null);
-      if (chatId) {
-        setDownloadingChatIds((prev) => {
-          const newSet = new Set(prev);
-          newSet.delete(chatId);
-          return newSet;
-        });
-      }
-    }
-  };
-
-  return {
-    downloadModel: mutation.mutate,
-    isDownloading:
-      mutation.isPending && chatId ? downloadingChatIds.has(chatId) : false,
-    downloadProgress:
-      chatId && downloadingChatIds.has(chatId) ? downloadProgress : null,
-    error: mutation.error,
-    cancelDownload,
-  };
-}
@@ -1,29 +1,20 @@
 import { useQuery, useMutation, useQueryClient } from "@tanstack/react-query";
-import { useEffect, useState } from "react";
 import { fetchUser, fetchConnectUrl, disconnectUser } from "@/api";

 export function useUser() {
   const queryClient = useQueryClient();
-  const [initialDataLoaded, setInitialDataLoaded] = useState(false);
-
-  // Wait for initial data to be loaded
-  useEffect(() => {
-    const initialPromise = window.__initialUserDataPromise;
-    if (initialPromise) {
-      initialPromise.finally(() => {
-        setInitialDataLoaded(true);
-      });
-    } else {
-      setInitialDataLoaded(true);
-    }
-  }, []);

   const userQuery = useQuery({
     queryKey: ["user"],
-    queryFn: () => fetchUser(),
+    queryFn: async () => {
+      const result = await fetchUser();
+      return result;
+    },
     staleTime: 5 * 60 * 1000, // Consider data stale after 5 minutes
     gcTime: 10 * 60 * 1000, // Keep in cache for 10 minutes
-    initialData: null, // Start with null to prevent flashing
+    retry: 10,
+    retryDelay: (attemptIndex) => Math.min(500 * attemptIndex, 2000),
+    refetchOnMount: true, // Always fetch when component mounts
   });

   // Mutation to refresh user data

@@ -49,14 +40,15 @@ export function useUser() {
     },
   });

+  const isLoading = userQuery.isLoading || userQuery.isFetching;
+  const isAuthenticated = Boolean(userQuery.data?.name);
+
   return {
     user: userQuery.data,
-    isLoading:
-      !initialDataLoaded ||
-      (userQuery.isLoading && userQuery.data === undefined), // Show loading until initial data is loaded
+    isLoading,
     isError: userQuery.isError,
     error: userQuery.error,
-    isAuthenticated: Boolean(userQuery.data?.name),
+    isAuthenticated,
     refreshUser: refreshUser.mutate,
     isRefreshing: refreshUser.isPending,
     refetchUser: userQuery.refetch,
@@ -8,3 +8,6 @@ export const API_BASE = import.meta.env.DEV ? DEV_API_URL : "";
 export const OLLAMA_HOST = import.meta.env.DEV
   ? DEV_API_URL
   : window.location.origin;
+
+export const OLLAMA_DOT_COM =
+  import.meta.env.VITE_OLLAMA_DOT_COM_URL || "https://ollama.com";
@@ -5,13 +5,6 @@ import { QueryClient, QueryClientProvider } from "@tanstack/react-query";
 import { routeTree } from "./routeTree.gen";
 import { fetchUser } from "./api";
 import { StreamingProvider } from "./contexts/StreamingContext";
-import { User } from "@/gotypes";
-
-declare global {
-  interface Window {
-    __initialUserDataPromise?: Promise<User | null>;
-  }
-}

 const queryClient = new QueryClient({
   defaultOptions: {

@@ -24,27 +17,11 @@ const queryClient = new QueryClient({
   },
 });

-// Track initial user data fetch
-let initialUserDataPromise: Promise<User | null> | null = null;
-
-// Initialize user data on app startup
-const initializeUserData = async () => {
-  try {
-    const userData = await fetchUser();
+fetchUser().then((userData) => {
+  if (userData) {
     queryClient.setQueryData(["user"], userData);
-    return userData;
-  } catch (error) {
-    console.error("Error initializing user data:", error);
-    queryClient.setQueryData(["user"], null);
-    return null;
   }
-};
-
-// Start initialization immediately and track the promise
-initialUserDataPromise = initializeUserData();
-
-// Export the promise so hooks can await it
-window.__initialUserDataPromise = initialUserDataPromise;
+});

 const router = createRouter({
   routeTree,
@@ -102,14 +102,13 @@ type HealthResponse struct {

 type User struct {
 	ID        string `json:"id"`
-	Name          string `json:"name"`
 	Email     string `json:"email"`
-	AvatarURL     string `json:"avatarURL"`
-	Plan          string `json:"plan"`
-	Bio           string `json:"bio"`
-	FirstName     string `json:"firstName"`
-	LastName      string `json:"lastName"`
-	OverThreshold bool   `json:"overThreshold"`
+	Name      string `json:"name"`
+	Bio       string `json:"bio,omitempty"`
+	AvatarURL string `json:"avatarurl,omitempty"`
+	FirstName string `json:"firstname,omitempty"`
+	LastName  string `json:"lastname,omitempty"`
+	Plan      string `json:"plan,omitempty"`
 }

 type Attachment struct {
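To make the renamed JSON keys concrete, here is a small self-contained sketch (not from the diff) that redeclares a struct with the same tags and prints the wire format; the field values are made up, and the local type exists only for illustration.

```go
package main

import (
	"encoding/json"
	"fmt"
)

// user mirrors the JSON tags of the updated User struct above; it is
// redeclared locally here only to show the lowercase wire format.
type user struct {
	ID        string `json:"id"`
	Email     string `json:"email"`
	Name      string `json:"name"`
	Bio       string `json:"bio,omitempty"`
	AvatarURL string `json:"avatarurl,omitempty"`
	FirstName string `json:"firstname,omitempty"`
	LastName  string `json:"lastname,omitempty"`
	Plan      string `json:"plan,omitempty"`
}

func main() {
	b, err := json.MarshalIndent(user{ID: "u_123", Email: "dev@example.com", Name: "dev"}, "", "  ")
	if err != nil {
		panic(err)
	}
	// Optional fields left empty (bio, avatarurl, firstname, lastname, plan)
	// are dropped from the output because of omitempty.
	fmt.Println(string(b))
}
```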
app/ui/ui.go (223)

@@ -12,18 +12,17 @@ import (
 	"log/slog"
 	"net/http"
 	"net/http/httputil"
-	"net/url"
 	"os"
 	"runtime"
 	"runtime/debug"
 	"slices"
 	"strconv"
 	"strings"
+	"sync"
 	"time"

 	"github.com/google/uuid"
 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/app/auth"
 	"github.com/ollama/ollama/app/server"
 	"github.com/ollama/ollama/app/store"
 	"github.com/ollama/ollama/app/tools"

@@ -118,40 +117,66 @@ func (s *Server) log() *slog.Logger {

 // ollamaProxy creates a reverse proxy handler to the Ollama server
 func (s *Server) ollamaProxy() http.Handler {
-	ollamaHost := os.Getenv("OLLAMA_HOST")
-	if ollamaHost == "" {
-		ollamaHost = "http://127.0.0.1:11434"
-	}
-
-	if !strings.HasPrefix(ollamaHost, "http://") && !strings.HasPrefix(ollamaHost, "https://") {
-		ollamaHost = "http://" + ollamaHost
-	}
-
-	target, err := url.Parse(ollamaHost)
-	if err != nil {
-		s.log().Error("failed to parse OLLAMA_HOST", "error", err, "host", ollamaHost)
-		return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-			http.Error(w, "failed to configure proxy", http.StatusInternalServerError)
-		})
-	}
-
-	s.log().Info("configuring ollama proxy", "target", target.String())
-
-	proxy := httputil.NewSingleHostReverseProxy(target)
-
-	originalDirector := proxy.Director
-	proxy.Director = func(req *http.Request) {
-		originalDirector(req)
-		req.Host = target.Host
-		s.log().Debug("proxying request", "method", req.Method, "path", req.URL.Path, "target", target.Host)
-	}
-
-	proxy.ErrorHandler = func(w http.ResponseWriter, r *http.Request, err error) {
-		s.log().Error("proxy error", "error", err, "path", r.URL.Path, "target", target.String())
-		http.Error(w, "proxy error: "+err.Error(), http.StatusBadGateway)
-	}
-
-	return proxy
+	var (
+		proxy   http.Handler
+		proxyMu sync.Mutex
+	)
+
+	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		proxyMu.Lock()
+		p := proxy
+		proxyMu.Unlock()
+
+		if p == nil {
+			proxyMu.Lock()
+			if proxy == nil {
+				var err error
+				for i := range 2 {
+					if i > 0 {
+						s.log().Warn("ollama server not ready, retrying", "attempt", i+1)
+						time.Sleep(1 * time.Second)
+					}
+
+					err = WaitForServer(context.Background(), 10*time.Second)
+					if err == nil {
+						break
+					}
+				}
+
+				if err != nil {
+					proxyMu.Unlock()
+					s.log().Error("ollama server not ready after retries", "error", err)
+					http.Error(w, "Ollama server is not ready", http.StatusServiceUnavailable)
+					return
+				}
+
+				target := envconfig.Host()
+				s.log().Info("configuring ollama proxy", "target", target.String())
+
+				newProxy := httputil.NewSingleHostReverseProxy(target)
+
+				originalDirector := newProxy.Director
+				newProxy.Director = func(req *http.Request) {
+					originalDirector(req)
+					req.Host = target.Host
+					s.log().Debug("proxying request", "method", req.Method, "path", req.URL.Path, "target", target.Host)
+				}
+
+				newProxy.ErrorHandler = func(w http.ResponseWriter, r *http.Request, err error) {
+					s.log().Error("proxy error", "error", err, "path", r.URL.Path, "target", target.String())
+					http.Error(w, "proxy error: "+err.Error(), http.StatusBadGateway)
+				}
+
+				proxy = newProxy
+				p = newProxy
+			} else {
+				p = proxy
+			}
+			proxyMu.Unlock()
+		}
+
+		p.ServeHTTP(w, r)
+	})
 }

 type errHandlerFunc func(http.ResponseWriter, *http.Request) error
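The rewritten ollamaProxy above builds the reverse proxy lazily on the first request, guarded by a mutex so that a failed attempt can be retried later. As a rough illustration of the underlying pattern only (not the PR's code), a sync.Once variant is sketched below; note that sync.Once would not allow retrying a failed initialization, which is one reason the change uses an explicit mutex and a readiness wait instead. The target URL and wiring in main are hypothetical.

```go
package main

import (
	"log"
	"net/http"
	"net/http/httputil"
	"net/url"
	"sync"
)

// lazyProxy defers constructing the reverse proxy until the first request.
// This is a simplified sketch of the idea; the real change also waits for
// the Ollama server to come up and retries if it is not ready yet.
func lazyProxy(target *url.URL) http.Handler {
	var (
		once  sync.Once
		proxy *httputil.ReverseProxy
	)
	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		once.Do(func() {
			proxy = httputil.NewSingleHostReverseProxy(target)
		})
		proxy.ServeHTTP(w, r)
	})
}

func main() {
	// Hypothetical wiring for demonstration; the desktop app mounts its
	// proxy on specific /api routes rather than a catch-all prefix.
	target, err := url.Parse("http://127.0.0.1:11434")
	if err != nil {
		log.Fatal(err)
	}
	http.Handle("/api/", lazyProxy(target))
	log.Fatal(http.ListenAndServe("127.0.0.1:8081", nil))
}
```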
@@ -264,11 +289,10 @@ func (s *Server) Handler() http.Handler {
 	ollamaProxy := s.ollamaProxy()
 	mux.Handle("GET /api/tags", ollamaProxy)
 	mux.Handle("POST /api/show", ollamaProxy)
-	mux.Handle("GET /api/v1/me", handle(s.me))
-	mux.Handle("POST /api/v1/disconnect", handle(s.disconnect))
-	mux.Handle("GET /api/v1/connect", handle(s.connectURL))
-	mux.Handle("GET /api/v1/health", handle(s.health))
+	mux.Handle("GET /api/version", ollamaProxy)
+	mux.Handle("HEAD /api/version", ollamaProxy)
+	mux.Handle("POST /api/me", ollamaProxy)
+	mux.Handle("POST /api/signout", ollamaProxy)

 	// React app - catch all non-API routes and serve the React app
 	mux.Handle("GET /", s.appHandler())

@@ -338,7 +362,7 @@ func (s *Server) doSelfSigned(ctx context.Context, method, path string) (*http.R
 }

 // UserData fetches user data from ollama.com API for the current ollama key
-func (s *Server) UserData(ctx context.Context) (*responses.User, error) {
+func (s *Server) UserData(ctx context.Context) (*api.UserResponse, error) {
 	resp, err := s.doSelfSigned(ctx, http.MethodPost, "/api/me")
 	if err != nil {
 		return nil, fmt.Errorf("failed to call ollama.com/api/me: %w", err)

@@ -349,7 +373,7 @@ func (s *Server) UserData(ctx context.Context) (*responses.User, error) {
 		return nil, fmt.Errorf("unexpected status code: %d", resp.StatusCode)
 	}

-	var user responses.User
+	var user api.UserResponse
 	if err := json.NewDecoder(resp.Body).Decode(&user); err != nil {
 		return nil, fmt.Errorf("failed to parse user response: %w", err)
 	}

@@ -368,29 +392,27 @@ func (s *Server) UserData(ctx context.Context) (*responses.User, error) {
 	return &user, nil
 }

-func waitForServer(ctx context.Context) error {
-	timeout := time.Now().Add(10 * time.Second)
-	// TODO: this avoids an error on first load of the app
-	// however we should either show a loading state or
-	// wait for the Ollama server to be ready before redirecting
-	for {
+// WaitForServer waits for the Ollama server to be ready
+func WaitForServer(ctx context.Context, timeout time.Duration) error {
+	deadline := time.Now().Add(timeout)
+	for time.Now().Before(deadline) {
 		c, err := api.ClientFromEnvironment()
 		if err != nil {
 			return err
 		}
 		if _, err := c.Version(ctx); err == nil {
-			break
-		}
-		if time.Now().After(timeout) {
-			return fmt.Errorf("timeout waiting for Ollama server to be ready")
+			slog.Debug("ollama server is ready")
+			return nil
 		}
 		time.Sleep(10 * time.Millisecond)
 	}
-	return nil
+	return errors.New("timeout waiting for Ollama server to be ready")
 }

 func (s *Server) createChat(w http.ResponseWriter, r *http.Request) error {
-	waitForServer(r.Context())
+	if err := WaitForServer(r.Context(), 10*time.Second); err != nil {
+		return err
+	}

 	id, err := uuid.NewV7()
 	if err != nil {

@@ -1438,129 +1460,6 @@ func (s *Server) settings(w http.ResponseWriter, r *http.Request) error {
 	})
 }

-func (s *Server) me(w http.ResponseWriter, r *http.Request) error {
-	if r.Method != http.MethodGet {
-		http.Error(w, "Method Not Allowed", http.StatusMethodNotAllowed)
-		return nil
-	}
-
-	user, err := s.UserData(r.Context())
-	if err != nil {
-		// If fetching from API fails, try to return cached user data if available
-		if cachedUser, cacheErr := s.Store.User(); cacheErr == nil && cachedUser != nil {
-			s.log().Info("API request failed, returning cached user data", "error", err)
-			responseUser := &responses.User{
-				Name:  cachedUser.Name,
-				Email: cachedUser.Email,
-				Plan:  cachedUser.Plan,
-			}
-			w.Header().Set("Content-Type", "application/json")
-			w.WriteHeader(http.StatusOK)
-			return json.NewEncoder(w).Encode(responseUser)
-		}
-
-		s.log().Error("failed to get user data", "error", err)
-		w.WriteHeader(http.StatusInternalServerError)
-		return json.NewEncoder(w).Encode(responses.Error{
-			Error: "failed to get user data",
-		})
-	}
-
-	w.Header().Set("Content-Type", "application/json")
-	w.WriteHeader(http.StatusOK)
-	return json.NewEncoder(w).Encode(user)
-}
-
-func (s *Server) disconnect(w http.ResponseWriter, r *http.Request) error {
-	if r.Method != http.MethodPost {
-		http.Error(w, "Method Not Allowed", http.StatusMethodNotAllowed)
-		return nil
-	}
-
-	if err := s.Store.ClearUser(); err != nil {
-		s.log().Warn("failed to clear cached user data", "error", err)
-	}
-
-	// Get the SSH public key to encode for the delete request
-	pubKey, err := ollamaAuth.GetPublicKey()
-	if err != nil {
-		s.log().Error("failed to get public key", "error", err)
-		w.WriteHeader(http.StatusInternalServerError)
-		return json.NewEncoder(w).Encode(responses.Error{
-			Error: "failed to get public key",
-		})
-	}
-
-	// Encode the key using base64 URL encoding
-	encodedKey := base64.RawURLEncoding.EncodeToString([]byte(pubKey))
-
-	// Call the /api/user/keys/{encodedKey} endpoint with DELETE
-	resp, err := s.doSelfSigned(r.Context(), http.MethodDelete, fmt.Sprintf("/api/user/keys/%s", encodedKey))
-	if err != nil {
-		s.log().Error("failed to call ollama.com/api/user/keys", "error", err)
-		w.WriteHeader(http.StatusInternalServerError)
-		return json.NewEncoder(w).Encode(responses.Error{
-			Error: "failed to disconnect from ollama.com",
-		})
-	}
-	defer resp.Body.Close()
-
-	if resp.StatusCode != http.StatusOK {
-		s.log().Error("disconnect request failed", "status", resp.StatusCode)
-		w.WriteHeader(http.StatusInternalServerError)
-		return json.NewEncoder(w).Encode(responses.Error{
-			Error: "failed to disconnect from ollama.com",
-		})
-	}
-
-	w.Header().Set("Content-Type", "application/json")
-	w.WriteHeader(http.StatusOK)
-	return json.NewEncoder(w).Encode(map[string]string{"status": "disconnected"})
-}
-
-func (s *Server) connectURL(w http.ResponseWriter, r *http.Request) error {
-	if r.Method != http.MethodGet {
-		http.Error(w, "Method Not Allowed", http.StatusMethodNotAllowed)
-		return nil
-	}
-
-	connectURL, err := auth.BuildConnectURL(OllamaDotCom)
-	if err != nil {
-		s.log().Error("failed to build connect URL", "error", err)
-		w.WriteHeader(http.StatusInternalServerError)
-		return json.NewEncoder(w).Encode(responses.Error{
-			Error: "failed to build connect URL",
-		})
-	}
-
-	w.Header().Set("Content-Type", "application/json")
-	w.WriteHeader(http.StatusOK)
-	return json.NewEncoder(w).Encode(map[string]string{
-		"connect_url": connectURL,
-	})
-}
-
-func (s *Server) health(w http.ResponseWriter, r *http.Request) error {
-	if r.Method != http.MethodGet {
-		http.Error(w, "Method Not Allowed", http.StatusMethodNotAllowed)
-		return nil
-	}
-
-	healthy := false
-	c, err := api.ClientFromEnvironment()
-	if err == nil {
-		if _, err := c.Version(r.Context()); err == nil {
-			healthy = true
-		}
-	}
-
-	w.Header().Set("Content-Type", "application/json")
-	w.WriteHeader(http.StatusOK)
-	return json.NewEncoder(w).Encode(responses.HealthResponse{
-		Healthy: healthy,
-	})
-}
-
 func (s *Server) getInferenceCompute(w http.ResponseWriter, r *http.Request) error {
 	ctx, cancel := context.WithTimeout(r.Context(), 500*time.Millisecond)
 	defer cancel()
|
|
@@ -158,16 +158,16 @@ func (t *winTray) wndProc(hWnd windows.Handle, message uint32, wParam, lParam ui
    case uint32(UI_REQUEST_MSG_ID):
        // Requests for the UI must always come from the main event thread
        l := int(wParam)
-       path := unsafe.String((*byte)(unsafe.Pointer(lParam)), l)
+       path := unsafe.String((*byte)(unsafe.Pointer(lParam)), l) //nolint:govet,gosec
        t.app.UIRun(path)
    case WM_COPYDATA:
        // Handle URL scheme requests from other instances
        if lParam != 0 {
-           cds := (*COPYDATASTRUCT)(unsafe.Pointer(lParam))
+           cds := (*COPYDATASTRUCT)(unsafe.Pointer(lParam)) //nolint:govet,gosec
            if cds.DwData == 1 { // Our identifier for URL scheme messages
                // Convert the data back to string
                data := make([]byte, cds.CbData)
-               copy(data, (*[1 << 30]byte)(unsafe.Pointer(cds.LpData))[:cds.CbData:cds.CbData])
+               copy(data, (*[1 << 30]byte)(unsafe.Pointer(cds.LpData))[:cds.CbData:cds.CbData]) //nolint:govet,gosec
                urlScheme := string(data)
                handleURLSchemeRequest(urlScheme)
                lResult = 1 // Return non-zero to indicate success
@@ -15,7 +15,7 @@ A Go-based command-line tool for benchmarking Ollama models with configurable pa

```
go build -o ollama-bench bench.go
-./bench -model gpt-oss:20b -epochs 6 -format csv
+./ollama-bench -model gpt-oss:20b -epochs 6 -format csv
```

Using Go Run (without building)
@@ -29,31 +29,32 @@ go run bench.go -model gpt-oss:20b -epochs 3
### Basic Example

```
-./bench -model gemma3 -epochs 6
+./ollama-bench -model gemma3 -epochs 6
```

### Benchmark Multiple Models

```
-./bench -model gemma3,gemma3n -epochs 6 -max-tokens 100 -p "Write me a short story" | tee gemma.bench
+./ollama-bench -model gemma3,gemma3n -epochs 6 -max-tokens 100 -p "Write me a short story" | tee gemma.bench
benchstat -col /name gemma.bench
```

### With Image Prompt

```
-./bench -model qwen3-vl -image photo.jpg -epochs 6 -max-tokens 100 -p "Describe this image"
+./ollama-bench -model qwen3-vl -image photo.jpg -epochs 6 -max-tokens 100 -p "Describe this image"
```

### Advanced Example

```
-./bench -model llama3 -epochs 10 -temperature 0.7 -max-tokens 500 -seed 42 -format csv -output results.csv
+./ollama-bench -model llama3 -epochs 10 -temperature 0.7 -max-tokens 500 -seed 42 -format csv -output results.csv
```

## Command Line Options

| Option | Description | Default |
+|----------|-------------|---------|
| -model | Comma-separated list of models to benchmark | (required) |
| -epochs | Number of iterations per model | 1 |
| -max-tokens | Maximum tokens for model response | 0 (unlimited) |
@@ -182,6 +182,8 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
        conv = &llama4Model{}
    case "Mistral3ForConditionalGeneration":
        conv = &mistral3Model{}
+   case "Ministral3ForCausalLM":
+       conv = &mistral3CausalModel{}
    case "MixtralForCausalLM":
        conv = &mixtralModel{}
    case "GemmaForCausalLM":

@@ -200,8 +202,12 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
        conv = &qwen25VLModel{}
    case "Qwen3VLForConditionalGeneration", "Qwen3VLMoeForConditionalGeneration":
        conv = &qwen3VLModel{}
+   case "Olmo3ForCausalLM":
+       conv = &olmoModel{}
    case "BertModel":
        conv = &bertModel{}
+   case "NomicBertModel", "NomicBertMoEModel":
+       conv = &nomicbertModel{}
    case "CohereForCausalLM":
        conv = &commandrModel{}
    case "GptOssForCausalLM":
@@ -2,6 +2,7 @@ package convert

import (
    "cmp"
+   "slices"

    "github.com/ollama/ollama/fs/ggml"
)

@@ -33,9 +34,19 @@ type gemma3Model struct {
    HeadDim           uint32  `json:"head_dim"`
    FinalLogitSoftcap float32 `json:"final_logit_softcapping"`
    RopeLocalTheta    float32 `json:"rope_local_base_freq"`
-   RopeGlobalTheta   float32 `json:"rope_global_base_freq"`
+   RopeTheta         float32 `json:"rope_theta"`
    SlidingWindow     uint32  `json:"sliding_window"`
+   SlidingWindowPattern *uint32  `json:"sliding_window_pattern"`
+   LayerTypes           []string `json:"layer_types"`
    MultiModalTokensPerImage uint32 `json:"mm_tokens_per_image"`
+   RopeScaling *struct {
+       Type                          string  `json:"rope_type"`
+       Factor                        float32 `json:"factor"`
+       OriginalMaxPositionEmbeddings uint32  `json:"original_max_position_embeddings"`
+       ExtrapolationFactor           float32 `json:"extrapolation_factor"`
+       BetaFast                      float32 `json:"beta_fast"`
+       BetaSlow                      float32 `json:"beta_slow"`
+   } `json:"rope_scaling"`
}

const (

@@ -81,9 +92,38 @@ func (p *gemma3Model) KV(t *Tokenizer) ggml.KV {
        kv["gemma3.attention.key_length"] = p.HeadDim
        kv["gemma3.attention.value_length"] = p.HeadDim
        kv["gemma3.attention.sliding_window"] = p.SlidingWindow
-       kv["gemma3.final_logit_softcapping"] = cmp.Or(p.FinalLogitSoftcap, 30)

+       // The sliding window pattern is either provided as the sliding_window_pattern
+       // key (an int) or as the layer_types key (a list of strings).
+       if p.SlidingWindowPattern != nil || len(p.LayerTypes) > 0 {
+           kv["gemma3.attention.sliding_window_pattern"] = slices.Collect(func(yield func(bool) bool) {
+               for i := range numBlocks {
+                   var isLocal bool
+                   if len(p.LayerTypes) > 0 && int(i) < len(p.LayerTypes) {
+                       isLocal = p.LayerTypes[i] == "sliding_attention"
+                   } else if p.SlidingWindowPattern != nil && *p.SlidingWindowPattern > 0 {
+                       isLocal = (i+1)%*p.SlidingWindowPattern != 0
+                   }
+                   if !yield(isLocal) {
+                       break
+                   }
+               }
+           })
+       }
+       if p.FinalLogitSoftcap > 0 {
+           kv["gemma3.final_logit_softcapping"] = p.FinalLogitSoftcap
+       }
        kv["gemma3.rope.local.freq_base"] = cmp.Or(p.RopeLocalTheta, 10000.0)
-       kv["gemma3.rope.global.freq_base"] = cmp.Or(p.RopeGlobalTheta, 1000000.0)
+       kv["gemma3.rope.freq_base"] = cmp.Or(p.RopeTheta, 1000000.0)
+       if p.RopeScaling != nil && p.RopeScaling.Type == "yarn" && p.RopeScaling.Factor > 0 {
+           kv["gemma3.rope.scaling.type"] = "yarn"
+           kv["gemma3.rope.scaling.factor"] = p.RopeScaling.Factor
+           kv["gemma3.rope.scaling.original_context_length"] = p.RopeScaling.OriginalMaxPositionEmbeddings
+           kv["gemma3.rope.scaling.extrapolation_factor"] = cmp.Or(p.RopeScaling.ExtrapolationFactor, float32(1.0))
+           kv["gemma3.rope.scaling.beta_fast"] = cmp.Or(p.RopeScaling.BetaFast, float32(64.0))
+           kv["gemma3.rope.scaling.beta_slow"] = cmp.Or(p.RopeScaling.BetaSlow, float32(1.0))
+       }

        kv["gemma3.embedding_length"] = p.HiddenSize
        kv["gemma3.feed_forward_length"] = p.IntermediateSize
    default:
@@ -33,10 +33,12 @@ type mistral3Model struct {
            BetaFast float32 `json:"beta_fast"`
            BetaSlow float32 `json:"beta_slow"`
            Factor   float32 `json:"factor"`
-           ScalingBeta float32 `json:"llama_4_scaling_beta"`
+           Llama4ScalingBeta *float32 `json:"llama_4_scaling_beta"`
            OrigMaxPositionEmbeddings uint32 `json:"original_max_position_embeddings"`
            RopeType  string  `json:"rope_type"`
            RopeTheta float32 `json:"rope_theta"`
+           Mscale       *float32 `json:"mscale"`
+           MscaleAllDim *float32 `json:"mscale_all_dim"`
        } `json:"rope_parameters"`
    } `json:"text_config"`
    VisionModel struct {

@@ -50,6 +52,9 @@ type mistral3Model struct {
        HeadDim   uint32  `json:"head_dim"`
        HiddenAct string  `json:"hidden_act"`
        RopeTheta float32 `json:"rope_theta"`
+       RopeParameters struct {
+           RopeTheta float32 `json:"rope_theta"`
+       } `json:"rope_parameters"`
    } `json:"vision_config"`
    MultiModalProjectorBias bool   `json:"multimodal_projector_bias"`
    ProjectorHiddenAct      string `json:"projector_hidden_act"`

@@ -72,10 +77,22 @@ func (p *mistral3Model) KV(t *Tokenizer) ggml.KV {
    kv["mistral3.attention.value_length"] = p.TextModel.HeadDim
    kv["mistral3.rope.dimension_count"] = cmp.Or(p.TextModel.HeadDim, p.TextModel.HiddenSize/p.TextModel.NumAttentionHeads)
    kv["mistral3.rope.freq_base"] = cmp.Or(p.TextModel.RopeTheta, p.TextModel.RopeParameters.RopeTheta)
+   kv["mistral3.rope.scaling.factor"] = p.TextModel.RopeParameters.Factor
+   kv["mistral3.rope.scaling.type"] = p.TextModel.RopeParameters.RopeType
+   kv["mistral3.rope.scaling.beta_fast"] = p.TextModel.RopeParameters.BetaFast
+   kv["mistral3.rope.scaling.beta_slow"] = p.TextModel.RopeParameters.BetaSlow
+
+   if p.TextModel.RopeParameters.Mscale != nil {
+       kv["mistral3.rope.scaling.mscale"] = *p.TextModel.RopeParameters.Mscale
+   }
+   if p.TextModel.RopeParameters.MscaleAllDim != nil {
+       kv["mistral3.rope.scaling.mscale_all_dim"] = *p.TextModel.RopeParameters.MscaleAllDim
+   }
    if p.TextModel.RopeParameters.OrigMaxPositionEmbeddings > 0 {
        kv["mistral3.rope.scaling.original_context_length"] = p.TextModel.RopeParameters.OrigMaxPositionEmbeddings
-       kv["mistral3.rope.scaling_beta"] = p.TextModel.RopeParameters.ScalingBeta
    }
+   if p.TextModel.RopeParameters.Llama4ScalingBeta != nil {
+       kv["mistral3.rope.scaling_beta"] = *p.TextModel.RopeParameters.Llama4ScalingBeta
+   }

    // Vision configuration

@@ -88,7 +105,7 @@ func (p *mistral3Model) KV(t *Tokenizer) ggml.KV {
    kv["mistral3.vision.patch_size"] = p.VisionModel.PatchSize
    kv["mistral3.vision.num_channels"] = p.VisionModel.NumChannels
    // kv["mistral3.vision.attention.layer_norm_epsilon"] = 1e-05 // Default value
-   kv["mistral3.vision.rope.freq_base"] = p.VisionModel.RopeTheta
+   kv["mistral3.vision.rope.freq_base"] = cmp.Or(p.VisionModel.RopeTheta, p.VisionModel.RopeParameters.RopeTheta)

    // Multimodal configuration
    kv["mistral3.image_token_index"] = p.ImageTokenIndex
@@ -0,0 +1,181 @@
package convert

import (
    "cmp"
    "fmt"
    "strings"

    "github.com/pdevine/tensor"
    "github.com/pdevine/tensor/native"

    "github.com/ollama/ollama/fs/ggml"
)

type mistral3CausalModel struct {
    ModelParameters

    NumHiddenLayers       uint32  `json:"num_hidden_layers"`
    MaxPositionEmbeddings uint32  `json:"max_position_embeddings"`
    HiddenSize            uint32  `json:"hidden_size"`
    IntermediateSize      uint32  `json:"intermediate_size"`
    NumAttentionHeads     uint32  `json:"num_attention_heads"`
    NumKeyValueHeads      uint32  `json:"num_key_value_heads"`
    RopeTheta             float32 `json:"rope_theta"`
    RMSNormEPS            float32 `json:"rms_norm_eps"`
    HeadDim               uint32  `json:"head_dim"`
    SlidingWindow         *uint32 `json:"sliding_window"`
    HiddenAct             string  `json:"hidden_act"`
    VocabSize             uint32  `json:"vocab_size"`
    RopeParameters        struct {
        BetaFast                  float32  `json:"beta_fast"`
        BetaSlow                  float32  `json:"beta_slow"`
        Factor                    float32  `json:"factor"`
        Llama4ScalingBeta         *float32 `json:"llama_4_scaling_beta"`
        OrigMaxPositionEmbeddings uint32   `json:"original_max_position_embeddings"`
        RopeType                  string   `json:"rope_type"`
        RopeTheta                 float32  `json:"rope_theta"`
        Mscale                    *float32 `json:"mscale"`
        MscaleAllDim              *float32 `json:"mscale_all_dim"`
    } `json:"rope_parameters"`
}

func (p *mistral3CausalModel) KV(t *Tokenizer) ggml.KV {
    kv := p.ModelParameters.KV(t)
    kv["general.architecture"] = "mistral3"
    kv["mistral3.vocab_size"] = p.VocabSize

    // Text configuration
    kv["mistral3.block_count"] = p.NumHiddenLayers
    kv["mistral3.context_length"] = p.MaxPositionEmbeddings
    kv["mistral3.embedding_length"] = p.HiddenSize
    kv["mistral3.feed_forward_length"] = p.IntermediateSize
    kv["mistral3.attention.head_count"] = p.NumAttentionHeads
    kv["mistral3.attention.head_count_kv"] = p.NumKeyValueHeads
    kv["mistral3.attention.layer_norm_rms_epsilon"] = p.RMSNormEPS
    kv["mistral3.attention.key_length"] = p.HeadDim
    kv["mistral3.attention.value_length"] = p.HeadDim
    kv["mistral3.rope.dimension_count"] = cmp.Or(p.HeadDim, p.HiddenSize/p.NumAttentionHeads)
    kv["mistral3.rope.freq_base"] = cmp.Or(p.RopeTheta, p.RopeParameters.RopeTheta)
    kv["mistral3.rope.scaling.factor"] = p.RopeParameters.Factor
    kv["mistral3.rope.scaling.type"] = p.RopeParameters.RopeType
    kv["mistral3.rope.scaling.beta_fast"] = p.RopeParameters.BetaFast
    kv["mistral3.rope.scaling.beta_slow"] = p.RopeParameters.BetaSlow

    if p.RopeParameters.Mscale != nil {
        kv["mistral3.rope.scaling.mscale"] = *p.RopeParameters.Mscale
    }

    if p.RopeParameters.MscaleAllDim != nil {
        kv["mistral3.rope.scaling.mscale_all_dim"] = *p.RopeParameters.MscaleAllDim
    }

    if p.RopeParameters.OrigMaxPositionEmbeddings > 0 {
        kv["mistral3.rope.scaling.original_context_length"] = p.RopeParameters.OrigMaxPositionEmbeddings
        kv["mistral3.rope.scaling_beta"] = *p.RopeParameters.Llama4ScalingBeta
    }

    if p.RopeParameters.Llama4ScalingBeta != nil {
        kv["mistral3.rope.scaling_beta"] = *p.RopeParameters.Llama4ScalingBeta
    }

    return kv
}

func (p *mistral3CausalModel) Tensors(ts []Tensor) []*ggml.Tensor {
    var out []*ggml.Tensor

    for _, t := range ts {
        if !strings.HasPrefix(t.Name(), "v.") {
            if strings.HasSuffix(t.Name(), ".attn_q.weight") ||
                strings.HasSuffix(t.Name(), ".attn_k.weight") {
                t.SetRepacker(p.repack)
            }
        }

        out = append(out, &ggml.Tensor{
            Name:     t.Name(),
            Kind:     t.Kind(),
            Shape:    t.Shape(),
            WriterTo: t,
        })
    }

    return out
}

func (p *mistral3CausalModel) Replacements() []string {
    return []string{
        "model.norm", "output_norm",
        "model.", "",
        "layers", "blk",
        "transformer.layers", "blk",
        "vision_tower", "v",
        "ln_pre", "encoder_norm",
        "input_layernorm", "attn_norm",
        "post_attention_layernorm", "ffn_norm",
        "embed_tokens", "token_embd",
        "self_attn.q_proj", "attn_q",
        "self_attn.k_proj", "attn_k",
        "self_attn.v_proj", "attn_v",
        "self_attn.o_proj", "attn_output",
        "mlp.down_proj", "ffn_down",
        "mlp.gate_proj", "ffn_gate",
        "mlp.up_proj", "ffn_up",
        "attention.q_proj", "attn_q",
        "attention.k_proj", "attn_k",
        "attention.v_proj", "attn_v",
        "attention.o_proj", "attn_output",
        "attention_norm", "attn_norm",
        "feed_forward.gate_proj", "ffn_gate",
        "feed_forward.down_proj", "ffn_down",
        "feed_forward.up_proj", "ffn_up",
        "multi_modal_projector", "mm",
        "ffn_norm", "ffn_norm",
        "lm_head", "output",
    }
}

func (p *mistral3CausalModel) repack(name string, data []float32, shape []uint64) ([]float32, error) {
    var dims []int
    for _, dim := range shape {
        dims = append(dims, int(dim))
    }

    var heads uint32
    if strings.HasSuffix(name, ".attn_q.weight") {
        heads = p.NumAttentionHeads
    } else if strings.HasSuffix(name, ".attn_k.weight") {
        heads = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads)
    } else {
        return nil, fmt.Errorf("unknown tensor for repack: %s", name)
    }

    n := tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
    if err := n.Reshape(append([]int{int(heads), 2, dims[0] / int(heads) / 2}, dims[1:]...)...); err != nil {
        return nil, err
    }

    if err := n.T(0, 2, 1, 3); err != nil {
        return nil, err
    }

    if err := n.Reshape(dims...); err != nil {
        return nil, err
    }

    if err := n.Transpose(); err != nil {
        return nil, err
    }

    ts, err := native.SelectF32(n, 1)
    if err != nil {
        return nil, err
    }

    var f32s []float32
    for _, t := range ts {
        f32s = append(f32s, t...)
    }

    return f32s, nil
}
@@ -0,0 +1,213 @@
package convert

import (
    "cmp"
    "encoding/json"
    "io/fs"
    "path/filepath"
    "slices"
    "strings"

    "github.com/ollama/ollama/fs/ggml"
)

type nomicbertModel struct {
    ModelParameters
    NLayers               uint32  `json:"n_layers"`
    NumHiddenLayers       uint32  `json:"num_hidden_layers"`
    MaxPositionEmbeddings uint32  `json:"max_position_embeddings"`
    HiddenSize            uint32  `json:"hidden_size"`
    IntermediateSize      uint32  `json:"intermediate_size"`
    NumAttentionHeads     uint32  `json:"num_attention_heads"`
    NumKeyValueHeads      uint32  `json:"num_key_value_heads"`
    LayerNormEPS          float32 `json:"layer_norm_eps"`
    LayerNormEpsilon      float32 `json:"layer_norm_epsilon"`
    RopeFreqBase          float32 `json:"rope_theta"`
    normalizeEmbeddings   bool
    PoolingType           uint32

    // MoE parameters (only present in v2 models)
    NumExperts      uint32 `json:"num_local_experts"`
    NumExpertsUsed  uint32 `json:"num_experts_per_tok"`
    MoEEveryNLayers uint32 `json:"moe_every_n_layers"`
}

var (
    _ ModelConverter = (*nomicbertModel)(nil)
    _ moreParser     = (*nomicbertModel)(nil)
)

func (p *nomicbertModel) parseMore(fsys fs.FS) error {
    bts, err := fs.ReadFile(fsys, "modules.json")
    if err != nil {
        return err
    }

    var modules []struct {
        Type string `json:"type"`
        Path string `json:"path"`
    }

    if err := json.Unmarshal(bts, &modules); err != nil {
        return err
    }

    var pooling string
    for _, m := range modules {
        switch m.Type {
        case "sentence_transformers.models.Pooling":
            pooling = m.Path
        case "sentence_transformers.models.Normalize":
            p.normalizeEmbeddings = true
        }
    }

    if pooling != "" {
        bts, err := fs.ReadFile(fsys, filepath.Join(pooling, "config.json"))
        if err != nil {
            return err
        }

        var pc struct {
            PoolingModeCLSToken   bool `json:"pooling_mode_cls_token"`
            PoolingModeMeanTokens bool `json:"pooling_mode_mean_tokens"`
        }

        if err := json.Unmarshal(bts, &pc); err != nil {
            return err
        }

        if pc.PoolingModeMeanTokens {
            p.PoolingType = 1
        } else if pc.PoolingModeCLSToken {
            p.PoolingType = 2
        }
    }

    return nil
}

func (p *nomicbertModel) KV(t *Tokenizer) ggml.KV {
    kv := p.ModelParameters.KV(t)

    // Determine architecture based on MoE parameters (following qwen3 pattern)
    arch := "nomic-bert"
    if p.MoEEveryNLayers > 0 {
        arch += "-moe"
    }

    kv["general.architecture"] = arch
    kv["attention.causal"] = false
    kv["pooling_type"] = p.PoolingType
    kv["normalize_embeddings"] = p.normalizeEmbeddings

    kv["block_count"] = cmp.Or(p.NLayers, p.NumHiddenLayers)

    if contextLength := p.MaxPositionEmbeddings; contextLength > 0 {
        kv["context_length"] = contextLength
    }

    if embeddingLength := p.HiddenSize; embeddingLength > 0 {
        kv["embedding_length"] = p.HiddenSize
    }

    if feedForwardLength := p.IntermediateSize; feedForwardLength > 0 {
        kv["feed_forward_length"] = p.IntermediateSize
    }

    if headCount := p.NumAttentionHeads; headCount > 0 {
        kv["attention.head_count"] = p.NumAttentionHeads
    }

    if kvHeadCount := p.NumKeyValueHeads; kvHeadCount > 0 {
        kv["attention.head_count_kv"] = p.NumKeyValueHeads
    }

    if layerNormEpsilon := cmp.Or(p.LayerNormEPS, p.LayerNormEpsilon); layerNormEpsilon > 0 {
        kv["attention.layer_norm_epsilon"] = layerNormEpsilon
    }

    if p.RopeFreqBase > 0 {
        kv["rope.freq_base"] = p.RopeFreqBase
    }

    // MoE specific parameters (only if MoE is enabled)
    if p.NumExperts > 0 {
        kv["expert_count"] = p.NumExperts
    }

    if p.NumExpertsUsed > 0 {
        kv["expert_used_count"] = p.NumExpertsUsed
    }

    if p.MoEEveryNLayers > 0 {
        kv["moe_every_n_layers"] = p.MoEEveryNLayers
    }

    kv["tokenizer.ggml.model"] = "bert"
    kv["tokenizer.ggml.token_type_count"] = uint32(2)

    // convert to phantom space tokens
    for i, e := range t.Tokens {
        switch {
        case strings.HasPrefix(e, "[") && strings.HasSuffix(e, "]"):
            // noop - keep special tokens as-is
        case strings.HasPrefix(e, "##"):
            t.Tokens[i] = e[2:]
        default:
            t.Tokens[i] = "\u2581" + e
        }
    }

    kv["tokenizer.ggml.tokens"] = t.Tokens

    return kv
}

func (p *nomicbertModel) Tensors(ts []Tensor) []*ggml.Tensor {
    out := make([]*ggml.Tensor, 0, len(ts))
    for _, t := range ts {
        if slices.Contains([]string{
            "embeddings.position_ids",
            "pooler.dense.weight",
            "pooler.dense.bias",
        }, t.Name()) {
            continue
        }

        out = append(out, &ggml.Tensor{
            Name:     t.Name(),
            Kind:     t.Kind(),
            Shape:    t.Shape(),
            WriterTo: t,
        })
    }

    return out
}

func (nomicbertModel) Replacements() []string {
    return []string{
        "encoder.layer", "blk",
        "encoder.layers", "blk",
        "embeddings.word_embeddings", "token_embd",
        "embeddings.token_type_embeddings", "token_types",
        "embeddings.LayerNorm", "token_embd_norm",

        "attention.self.qkv", "attn_qkv",

        "attention.output.dense", "attn_output",
        "attention.output.LayerNorm", "attn_output_norm",

        "mlp.up", "ffn_up",
        "mlp.down", "ffn_down",

        "mlp.router", "ffn_gate_inp",
        "mlp.experts.up", "ffn_up_exps",
        "mlp.experts.down", "ffn_down_exps",

        "intermediate.dense", "ffn_up",
        "output.dense", "ffn_down",
        "output.LayerNorm", "layer_output_norm",
    }
}
@@ -0,0 +1,117 @@
package convert

import (
    "cmp"

    "github.com/ollama/ollama/fs/ggml"
)

type ropeScaling struct {
    Factor                    float32 `json:"factor"`
    OriginalMaxPositionEmbeds uint32  `json:"original_max_position_embeddings"`
    AttentionFactor           float32 `json:"attention_factor"`
    BetaFast                  float32 `json:"beta_fast"`
    BetaSlow                  float32 `json:"beta_slow"`
    RopeType                  string  `json:"rope_type"`
    ExtrapolationFactor       float32 `json:"extrapolation_factor"`
}

type olmoModel struct {
    ModelParameters

    HiddenSize            uint32       `json:"hidden_size"`
    NumHiddenLayers       uint32       `json:"num_hidden_layers"`
    IntermediateSize      uint32       `json:"intermediate_size"`
    NumAttentionHeads     uint32       `json:"num_attention_heads"`
    NumKeyValueHeads      uint32       `json:"num_key_value_heads"`
    MaxPositionEmbeddings uint32       `json:"max_position_embeddings"`
    RMSNormEPS            float32      `json:"rms_norm_eps"`
    RopeTheta             float32      `json:"rope_theta"`
    RopeScaling           *ropeScaling `json:"rope_scaling"`
    SlidingWindow         uint32       `json:"sliding_window"`
    LayerTypes            []string     `json:"layer_types"`
}

var _ ModelConverter = (*olmoModel)(nil)

func (p *olmoModel) KV(t *Tokenizer) ggml.KV {
    kv := p.ModelParameters.KV(t)
    kv["general.architecture"] = "olmo3"
    kv["olmo3.block_count"] = p.NumHiddenLayers
    kv["olmo3.context_length"] = p.MaxPositionEmbeddings
    kv["olmo3.embedding_length"] = p.HiddenSize
    kv["olmo3.feed_forward_length"] = p.IntermediateSize
    kv["olmo3.attention.head_count"] = p.NumAttentionHeads
    kv["olmo3.attention.head_count_kv"] = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads)

    if p.RopeTheta > 0 {
        kv["olmo3.rope.freq_base"] = p.RopeTheta
    }

    if p.RopeScaling != nil {
        if p.RopeScaling.Factor > 0 {
            kv["olmo3.rope.scaling.factor"] = p.RopeScaling.Factor
        }
        if p.RopeScaling.OriginalMaxPositionEmbeds > 0 {
            kv["olmo3.rope.scaling.original_context_length"] = p.RopeScaling.OriginalMaxPositionEmbeds
        }
        if p.RopeScaling.AttentionFactor > 0 {
            kv["olmo3.rope.scaling.attn_factor"] = p.RopeScaling.AttentionFactor
        }
        if p.RopeScaling.RopeType != "" {
            kv["olmo3.rope.scaling.type"] = p.RopeScaling.RopeType
        }
    }

    if p.RMSNormEPS > 0 {
        kv["olmo3.attention.layer_norm_rms_epsilon"] = p.RMSNormEPS
    }

    if p.SlidingWindow > 0 {
        kv["olmo3.attention.sliding_window"] = p.SlidingWindow
    }

    if len(p.LayerTypes) > 0 {
        slidingPattern := make([]bool, len(p.LayerTypes))
        for i, layerType := range p.LayerTypes {
            slidingPattern[i] = (layerType == "sliding_attention")
        }
        kv["olmo3.attention.sliding_window_pattern"] = slidingPattern
    }

    return kv
}

func (p *olmoModel) Tensors(ts []Tensor) []*ggml.Tensor {
    out := make([]*ggml.Tensor, 0, len(ts))
    for _, t := range ts {
        out = append(out, &ggml.Tensor{
            Name:     t.Name(),
            Kind:     t.Kind(),
            Shape:    t.Shape(),
            WriterTo: t,
        })
    }

    return out
}

func (p *olmoModel) Replacements() []string {
    return []string{
        "lm_head", "output",
        "model.embed_tokens", "token_embd",
        "model.layers", "blk",
        "model.norm", "output_norm",
        "self_attn.q_proj", "attn_q",
        "self_attn.k_proj", "attn_k",
        "self_attn.v_proj", "attn_v",
        "self_attn.o_proj", "attn_output",
        "self_attn.q_norm", "attn_q_norm",
        "self_attn.k_norm", "attn_k_norm",
        "post_attention_layernorm", "post_attention_norm",
        "post_feedforward_layernorm", "post_ffw_norm",
        "mlp.gate_proj", "ffn_gate",
        "mlp.down_proj", "ffn_down",
        "mlp.up_proj", "ffn_up",
    }
}
docs/api.md (10 changed lines)
@@ -50,7 +50,7 @@ Generate a response for a given prompt with a provided model. This is a streamin
 Advanced parameters (optional):

 - `format`: the format to return a response in. Format can be `json` or a JSON schema
-- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
+- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.mdx#valid-parameters-and-values) such as `temperature`
 - `system`: system message to (overrides what is defined in the `Modelfile`)
 - `template`: the prompt template to use (overrides what is defined in the `Modelfile`)
 - `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects
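For context, these generate options correspond one-to-one to fields on the request type in the repository's Go `api` package. The following is a minimal sketch, not part of the diff above, assuming the current `api.ClientFromEnvironment` and `Client.Generate` signatures; the model name `gemma3` and the temperature value are only examples:

```go
package main

import (
	"context"
	"fmt"

	"github.com/ollama/ollama/api"
)

func main() {
	// Reads OLLAMA_HOST, the same helper the desktop server's health check uses.
	client, err := api.ClientFromEnvironment()
	if err != nil {
		panic(err)
	}

	stream := false
	req := &api.GenerateRequest{
		Model:  "gemma3", // example model name; any locally pulled model works
		Prompt: "Why is the sky blue?",
		Stream: &stream, // maps to the "stream" parameter above
		// "options" from the HTTP API become a map here
		Options: map[string]any{"temperature": 0.7},
	}

	// With Stream set to false the callback fires once with the complete response.
	if err := client.Generate(context.Background(), req, func(resp api.GenerateResponse) error {
		fmt.Println(resp.Response)
		return nil
	}); err != nil {
		panic(err)
	}
}
```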
@@ -507,7 +507,7 @@ The `message` object has the following fields:
 Advanced parameters (optional):

 - `format`: the format to return a response in. Format can be `json` or a JSON schema.
-- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
+- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.mdx#valid-parameters-and-values) such as `temperature`
 - `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects
 - `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)
@@ -1189,7 +1189,7 @@ If you are creating a model from a safetensors directory or from a GGUF file, yo
 - `template`: (optional) the prompt template for the model
 - `license`: (optional) a string or list of strings containing the license or licenses for the model
 - `system`: (optional) a string containing the system prompt for the model
-- `parameters`: (optional) a dictionary of parameters for the model (see [Modelfile](./modelfile.md#valid-parameters-and-values) for a list of parameters)
+- `parameters`: (optional) a dictionary of parameters for the model (see [Modelfile](./modelfile.mdx#valid-parameters-and-values) for a list of parameters)
 - `messages`: (optional) a list of message objects used to create a conversation
 - `stream`: (optional) if `false` the response will be returned as a single response object, rather than a stream of objects
 - `quantize` (optional): quantize a non-quantized (e.g. float16) model
@@ -1698,7 +1698,7 @@ Generate embeddings from a model
 Advanced parameters:

 - `truncate`: truncates the end of each input to fit within context length. Returns error if `false` and context length is exceeded. Defaults to `true`
-- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
+- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.mdx#valid-parameters-and-values) such as `temperature`
 - `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)
 - `dimensions`: number of dimensions for the embedding
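These are the same embedding parameters exercised by the integration test changes further down (`api.EmbedRequest` with `Truncate` and `Options`). A minimal sketch using the Go client, assuming the current `Client.Embed` signature; `all-minilm` is simply the model those tests use:

```go
package main

import (
	"context"
	"fmt"

	"github.com/ollama/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment()
	if err != nil {
		panic(err)
	}

	truncate := true
	resp, err := client.Embed(context.Background(), &api.EmbedRequest{
		Model:    "all-minilm",
		Input:    "why is the sky blue?",
		Truncate: &truncate, // set to false to get an error instead of silent truncation
		Options:  map[string]any{"num_ctx": 2048},
	})
	if err != nil {
		panic(err)
	}

	// Embeddings is a batch: one vector per input string.
	fmt.Println(len(resp.Embeddings), len(resp.Embeddings[0]), resp.PromptEvalCount)
}
```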
@@ -1817,7 +1817,7 @@ Generate embeddings from a model

 Advanced parameters:

-- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
+- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.mdx#valid-parameters-and-values) such as `temperature`
 - `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)

 ### Examples
File diff suppressed because one or more lines are too long
@@ -0,0 +1,46 @@
# extract-examples

Extracts code examples from MDX files to a temp directory so you can run them.

## Usage

```shell
go run docs/tools/extract-examples/main.go <mdx-file>
```

## Example

```shell
go run docs/tools/extract-examples/main.go docs/api/openai-compatibility.mdx
```

Output:

```
Extracting code examples to: /var/folders/vq/wfm2g6k917d3ldzpjdxc8ph00000gn/T/mdx-examples-3271754368

 - 01_basic.py
 - 01_basic.js
 - 01_basic.sh
 - 02_responses.py
 - 02_responses.js
 - 02_responses.sh
 - 03_vision.py
 - 03_vision.js
 - 03_vision.sh

Extracted 9 file(s) to /var/folders/vq/wfm2g6k917d3ldzpjdxc8ph00000gn/T/mdx-examples-3271754368

To run examples:

 cd /var/folders/vq/wfm2g6k917d3ldzpjdxc8ph00000gn/T/mdx-examples-3271754368
 npm install # for JS examples

then run individual files with `node file.js`, `python file.py`, `bash file.sh`
```

## How it works

- Parses MDX files looking for fenced code blocks with filenames (e.g., ` ```python basic.py `)
- Groups examples by their `<CodeGroup>` and prefixes filenames with `01_`, `02_`, etc.
- Writes all extracted files to a temp directory
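To see what the extractor actually treats as a "fenced code block with a filename", the detection can be checked in isolation. This small sketch reuses the exact regular expression from `main.go` below; the sample input line is hypothetical:

```go
package main

import (
	"fmt"
	"regexp"
)

func main() {
	// Same pattern extract-examples uses to find fenced blocks that carry a filename.
	codeBlockStart := regexp.MustCompile("^```([a-zA-Z0-9_-]+)\\s+([^\\s]+)$")

	if m := codeBlockStart.FindStringSubmatch("```python basic.py"); m != nil {
		// m[1] is the language tag, m[2] the filename the block is written to.
		fmt.Println(m[1], m[2]) // prints: python basic.py
	}
}
```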
@@ -0,0 +1,137 @@
package main

import (
    "bufio"
    "fmt"
    "os"
    "path/filepath"
    "regexp"
    "strings"
)

func main() {
    if len(os.Args) < 2 {
        fmt.Fprintln(os.Stderr, "Usage: go run extract-examples.go <mdx-file>")
        os.Exit(1)
    }

    mdxFile := os.Args[1]

    f, err := os.Open(mdxFile)
    if err != nil {
        fmt.Fprintf(os.Stderr, "Error: %v\n", err)
        os.Exit(1)
    }
    defer f.Close()

    // Create temp directory
    tempDir, err := os.MkdirTemp("", "mdx-examples-*")
    if err != nil {
        fmt.Fprintf(os.Stderr, "Error creating temp dir: %v\n", err)
        os.Exit(1)
    }

    fmt.Printf("Extracting code examples to: %s\n\n", tempDir)

    // Patterns
    codeBlockStart := regexp.MustCompile("^```([a-zA-Z0-9_-]+)\\s+([^\\s]+)$")
    codeGroupStart := regexp.MustCompile("^<CodeGroup")
    codeGroupEnd := regexp.MustCompile("^</CodeGroup>")

    scanner := bufio.NewScanner(f)
    inCodeBlock := false
    inCodeGroup := false
    var currentFile string
    var content strings.Builder
    count := 0
    codeGroupNum := 0

    for scanner.Scan() {
        line := scanner.Text()

        // Track CodeGroup boundaries
        if codeGroupStart.MatchString(line) {
            inCodeGroup = true
            codeGroupNum++
            continue
        }
        if codeGroupEnd.MatchString(line) {
            inCodeGroup = false
            continue
        }

        if inCodeBlock {
            if line == "```" {
                // End of code block - write file
                if currentFile != "" {
                    outPath := filepath.Join(tempDir, currentFile)
                    if err := os.WriteFile(outPath, []byte(content.String()), 0o644); err != nil {
                        fmt.Fprintf(os.Stderr, "Error writing %s: %v\n", currentFile, err)
                    } else {
                        fmt.Printf(" - %s\n", currentFile)
                        count++
                    }
                }
                inCodeBlock = false
                currentFile = ""
                content.Reset()
            } else {
                content.WriteString(line)
                content.WriteString("\n")
            }
        } else {
            if matches := codeBlockStart.FindStringSubmatch(line); matches != nil {
                inCodeBlock = true
                filename := matches[2]
                // Prefix with CodeGroup number if inside a CodeGroup
                if inCodeGroup {
                    currentFile = fmt.Sprintf("%02d_%s", codeGroupNum, filename)
                } else {
                    currentFile = filename
                }
                content.Reset()
            }
        }
    }

    if err := scanner.Err(); err != nil {
        fmt.Fprintf(os.Stderr, "Error reading file: %v\n", err)
        os.Exit(1)
    }

    // Write package.json for JavaScript dependencies
    packageJSON := `{
  "name": "mdx-examples",
  "type": "module",
  "dependencies": {
    "openai": "^4",
    "ollama": "^0.5"
  }
}
`
    if err := os.WriteFile(filepath.Join(tempDir, "package.json"), []byte(packageJSON), 0o644); err != nil {
        fmt.Fprintf(os.Stderr, "Error writing package.json: %v\n", err)
    }

    // Write pyproject.toml for Python dependencies
    pyprojectTOML := `[project]
name = "mdx-examples"
version = "0.0.0"
dependencies = [
  "openai",
  "ollama",
]
`
    if err := os.WriteFile(filepath.Join(tempDir, "pyproject.toml"), []byte(pyprojectTOML), 0o644); err != nil {
        fmt.Fprintf(os.Stderr, "Error writing pyproject.toml: %v\n", err)
    }

    fmt.Printf("\n")
    fmt.Printf("Extracted %d file(s) to %s\n", count, tempDir)
    fmt.Printf("\n")
    fmt.Printf("To run examples:\n")
    fmt.Printf("\n")
    fmt.Printf(" cd %s\n npm install # for JS examples\n", tempDir)
    fmt.Printf("\n")
    fmt.Printf("then run individual files with `node file.js`, `python file.py`, `bash file.sh`\n")
}
@@ -13,6 +13,7 @@ import (

    "github.com/ollama/ollama/format"
    "github.com/ollama/ollama/fs/util/bufioutil"
+   "github.com/ollama/ollama/ml"
)

type GGML struct {

@@ -240,18 +241,20 @@ func (kv KV) Bools(key string, defaultValue ...[]bool) []bool {

func (kv KV) OllamaEngineRequired() bool {
    return slices.Contains([]string{
+       "bert",
+       "deepseek2",
+       "deepseekocr",
        "gemma3",
        "gemma3n",
        "gptoss", "gpt-oss",
        "llama4",
        "mistral3",
        "mllama",
+       "nomic-bert",
+       "olmo3",
        "qwen25vl",
        "qwen3", "qwen3moe",
        "qwen3vl", "qwen3vlmoe",
-       "deepseekocr",
-       "deepseek2",
-       "nomic-bert",
    }, kv.Architecture())
}

@@ -550,7 +553,7 @@ func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, error) {
    }, nil
}

-func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string, useFlashAttention bool) (kv []uint64, partialOffload, fullOffload uint64) {
+func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string, useFlashAttention ml.FlashAttentionType) (kv []uint64, partialOffload, fullOffload uint64) {
    context *= uint64(numParallel)

    embedding := f.KV().EmbeddingLength()

@@ -791,7 +794,7 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
    }

    partialOffload = 2 * f.KV().HeadCountMax() / cmp.Or(f.KV().HeadCountKVMin(), 1) * kvTotal / 6
-   if useFlashAttention {
+   if useFlashAttention == ml.FlashAttentionEnabled {
        // rough estimate of graph size with flash attention on
        partialOffload = (4*uint64(numParallel) + context>>10 + 110) * format.MebiByte
    }

@@ -809,6 +812,14 @@ func (f GGML) SupportsKVCacheType(cacheType string) bool {
    return slices.Contains([]string{"q8_0", "q4_0"}, cacheType)
}

+// KVCacheTypeIsQuantized checks if the requested cache type is a quantized type
+func (f GGML) KVCacheTypeIsQuantized(cacheType string) bool {
+   if cacheType == "" || cacheType == "f16" || cacheType == "f32" || cacheType == "bf16" {
+       return false
+   }
+   return true
+}
+
// SupportsFlashAttention checks if the model supports flash attention
func (f GGML) SupportsFlashAttention() bool {
    _, isEmbedding := f.KV()[fmt.Sprintf("%s.pooling_type", f.KV().Architecture())]

@@ -829,9 +840,11 @@ func (f GGML) SupportsFlashAttention() bool {
// FlashAttention checks if the model should enable flash attention
func (f GGML) FlashAttention() bool {
    return slices.Contains([]string{
+       "bert",
        "gemma3",
        "gptoss", "gpt-oss",
        "mistral3",
+       "olmo3",
        "qwen3", "qwen3moe",
        "qwen3vl", "qwen3vlmoe",
    }, f.KV().String("general.architecture"))
@@ -597,6 +597,10 @@ func ggufWriteKV(ws io.WriteSeeker, arch, k string, v any) error {

    var err error
    switch v := v.(type) {
+   case int32:
+       err = writeGGUF(ws, ggufTypeInt32, v)
+   case int64:
+       err = writeGGUF(ws, ggufTypeInt64, v)
    case uint32, FileType:
        err = writeGGUF(ws, ggufTypeUint32, v)
    case uint64:

@@ -611,6 +615,10 @@ func ggufWriteKV(ws io.WriteSeeker, arch, k string, v any) error {
        err = writeGGUFArray(ws, ggufTypeInt32, v)
    case *array[int32]:
        err = writeGGUFArray(ws, ggufTypeInt32, v.values)
+   case []int64:
+       err = writeGGUFArray(ws, ggufTypeInt64, v)
+   case *array[int64]:
+       err = writeGGUFArray(ws, ggufTypeInt64, v.values)
    case []uint32:
        err = writeGGUFArray(ws, ggufTypeUint32, v)
    case *array[uint32]:
@@ -42,6 +42,10 @@ func TestWriteGGUF(t *testing.T) {
        "general.architecture": "test",
        "general.alignment":    uint32(16),
        "test.key":             "value",
+       "test.int32_key":       int32(-42),
+       "test.int64_key":       int64(-9223372036854775808),
+       "test.int32_array":     []int32{-1, 0, 1, 2147483647, -2147483648},
+       "test.int64_array":     []int64{-1, 0, 1, 9223372036854775807, -9223372036854775808},
        "attention.key":        "value2",
        "tokenizer.key":        "value3",
        "adapter.key":          "value4",

@@ -55,7 +59,7 @@ func TestWriteGGUF(t *testing.T) {
    }
    defer r.Close()

-   ff, err := Decode(r, 0)
+   ff, err := Decode(r, -1)
    if err != nil {
        t.Fatal(err)
    }

@@ -65,15 +69,19 @@ func TestWriteGGUF(t *testing.T) {
        "general.alignment":       uint32(16),
        "general.parameter_count": uint64(54),
        "test.key":                "value",
+       "test.int32_key":          int32(-42),
+       "test.int64_key":          int64(-9223372036854775808),
+       "test.int32_array":        &array[int32]{size: 5, values: []int32{-1, 0, 1, 2147483647, -2147483648}},
+       "test.int64_array":        &array[int64]{size: 5, values: []int64{-1, 0, 1, 9223372036854775807, -9223372036854775808}},
        "test.attention.key":      "value2",
        "tokenizer.key":           "value3",
        "adapter.key":             "value4",
-   }, ff.KV()); diff != "" {
+   }, ff.KV(), cmp.AllowUnexported(array[int32]{}, array[int64]{})); diff != "" {
        t.Errorf("Mismatch (-want +got):\n%s", diff)
    }

    if diff := cmp.Diff(Tensors{
-       Offset: 800,
+       Offset: 992,
        items: []*Tensor{
            {Name: "blk.0.attn_k.weight", Offset: 0, Shape: []uint64{2, 3}},
            {Name: "blk.0.attn_norm.weight", Offset: 32, Shape: []uint64{2, 3}},
@@ -4,7 +4,9 @@ package integration

import (
    "context"
+   "errors"
    "math"
+   "strings"
    "testing"
    "time"

@@ -204,8 +206,8 @@ func TestAllMiniLMEmbed(t *testing.T) {
        t.Fatalf("expected %v, got %v (similarity: %f)", expected[0:5], res.Embeddings[0][0:5], sim)
    }

-   if res.PromptEvalCount != 6 {
-       t.Fatalf("expected 6 prompt tokens, got %d", res.PromptEvalCount)
+   if res.PromptEvalCount != 8 {
+       t.Fatalf("expected 8 prompt tokens, got %d", res.PromptEvalCount)
    }
}

@@ -251,8 +253,8 @@ func TestAllMiniLMBatchEmbed(t *testing.T) {
        t.Fatalf("expected %v, got %v (similarity: %f)", expected[1][0:5], res.Embeddings[1][0:5], sim)
    }

-   if res.PromptEvalCount != 12 {
-       t.Fatalf("expected 12 prompt tokens, got %d", res.PromptEvalCount)
+   if res.PromptEvalCount != 16 {
+       t.Fatalf("expected 16 prompt tokens, got %d", res.PromptEvalCount)
    }
}

@@ -275,7 +277,7 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
    cases := []struct {
        name    string
        request api.EmbedRequest
-       check   func(*api.EmbedResponse, error)
+       check   func(*testing.T, *api.EmbedResponse, error)
    }{
        {
            name: "target truncation",

@@ -283,7 +285,7 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
                Model: "all-minilm",
                Input: "why",
            },
-           check: func(got *api.EmbedResponse, err error) {
+           check: func(t *testing.T, got *api.EmbedResponse, err error) {
                if err != nil {
                    t.Fatal(err)
                }

@@ -300,10 +302,11 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
                Input:   "why is the sky blue?",
                Options: map[string]any{"num_ctx": 3},
            },
-           check: func(got *api.EmbedResponse, err error) {
+           check: func(t *testing.T, got *api.EmbedResponse, err error) {
                if err != nil {
                    t.Fatal(err)
                }
+               t.Logf("PromptEvalCount: want=%d got=%d", want.PromptEvalCount, got.PromptEvalCount)
                if diff := cmp.Diff(want.Embeddings[0], got.Embeddings[0]); diff != "" {
                    t.Errorf("embedding mismatch (-want +got):\n%s", diff)
                }

@@ -317,10 +320,11 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
                Truncate: &truncTrue,
                Options:  map[string]any{"num_ctx": 3},
            },
-           check: func(got *api.EmbedResponse, err error) {
+           check: func(t *testing.T, got *api.EmbedResponse, err error) {
                if err != nil {
                    t.Fatal(err)
                }
+               t.Logf("PromptEvalCount: want=%d got=%d", want.PromptEvalCount, got.PromptEvalCount)
                if diff := cmp.Diff(want.Embeddings[0], got.Embeddings[0]); diff != "" {
                    t.Errorf("embedding mismatch (-want +got):\n%s", diff)
                }

@@ -334,21 +338,21 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
                Truncate: &truncFalse,
                Options:  map[string]any{"num_ctx": 3},
            },
-           check: func(res *api.EmbedResponse, err error) {
-               if err.Error() != "input exceeds maximum context length" {
+           check: func(t *testing.T, res *api.EmbedResponse, err error) {
+               if err.Error() != "the input length exceeds the context length" {
                    t.Fatalf("expected truncation error, got: %v", err)
                }
            },
        },
        {
-           name: "input after truncate error",
+           name: "input after truncate error with context length of 1",
            request: api.EmbedRequest{
                Model:    "all-minilm",
                Input:    "why is the sky blue?",
                Truncate: &truncTrue,
                Options:  map[string]any{"num_ctx": 1},
            },
-           check: func(res *api.EmbedResponse, err error) {
+           check: func(t *testing.T, res *api.EmbedResponse, err error) {
                if err.Error() != "input after truncation exceeds maximum context length" {
                    t.Fatalf("expected truncation error, got: %v", err)
                }

@@ -362,7 +366,7 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
                Truncate: &truncTrue,
                Options:  map[string]any{"num_ctx": 0},
            },
-           check: func(res *api.EmbedResponse, err error) {
+           check: func(t *testing.T, res *api.EmbedResponse, err error) {
                if err.Error() != "input after truncation exceeds maximum context length" {
                    t.Fatalf("expected truncation error, got: %v", err)
                }

@@ -375,7 +379,7 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
                Input:   "why is the sky blue? Why is the sky blue? hi there my",
                Options: map[string]any{"num_ctx": 16},
            },
-           check: func(res *api.EmbedResponse, err error) {
+           check: func(t *testing.T, res *api.EmbedResponse, err error) {
                if err != nil {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
|
|
@ -385,7 +389,8 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
|
||||||
|
|
||||||
for _, req := range cases {
|
for _, req := range cases {
|
||||||
t.Run(req.name, func(t *testing.T) {
|
t.Run(req.name, func(t *testing.T) {
|
||||||
req.check(embedTestHelper(ctx, client, t, req.request))
|
resp, err := embedTestHelper(ctx, client, t, req.request)
|
||||||
|
req.check(t, resp, err)
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -409,3 +414,230 @@ func embedTestHelper(ctx context.Context, client *api.Client, t *testing.T, req
|
||||||
|
|
||||||
return client.Embed(ctx, &req)
|
return client.Embed(ctx, &req)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestEmbedTruncation(t *testing.T) {
|
||||||
|
// Use test deadline if set, otherwise default to 2 minutes
|
||||||
|
timeout := 2 * time.Minute
|
||||||
|
if deadline, ok := t.Deadline(); ok {
|
||||||
|
timeout = time.Until(deadline) - 10*time.Second // Reserve 10s buffer
|
||||||
|
}
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), timeout)
|
||||||
|
defer cancel()
|
||||||
|
client, _, cleanup := InitServerConnection(ctx, t)
|
||||||
|
defer cleanup()
|
||||||
|
|
||||||
|
for _, model := range libraryEmbedModels {
|
||||||
|
model := model
|
||||||
|
t.Run(model, func(t *testing.T) {
|
||||||
|
// Check if we're running out of time (reserve 20s for current model)
|
||||||
|
if deadline, ok := t.Deadline(); ok && time.Until(deadline) < 20*time.Second {
|
||||||
|
t.Skip("skipping remaining tests to avoid timeout")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Give each model its own budget to account for first-time pulls/loads
|
||||||
|
mctx, mcancel := context.WithTimeout(ctx, 3*time.Minute)
|
||||||
|
defer mcancel()
|
||||||
|
|
||||||
|
t.Run("truncation batch", func(t *testing.T) {
|
||||||
|
truncTrue := true
|
||||||
|
req := api.EmbedRequest{
|
||||||
|
Model: model,
|
||||||
|
Input: []string{"short", strings.Repeat("long ", 100), "medium text"},
|
||||||
|
Truncate: &truncTrue,
|
||||||
|
Options: map[string]any{"num_ctx": 30},
|
||||||
|
}
|
||||||
|
|
||||||
|
res, err := embedTestHelper(mctx, client, t, req)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(res.Embeddings) != 3 {
|
||||||
|
t.Fatalf("expected 3 embeddings, got %d", len(res.Embeddings))
|
||||||
|
}
|
||||||
|
|
||||||
|
if res.PromptEvalCount > 90 {
|
||||||
|
t.Fatalf("expected tokens <= 90 (3 × 30 max), got %d", res.PromptEvalCount)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("runner token count accuracy", func(t *testing.T) {
|
||||||
|
baseline := api.EmbedRequest{Model: model, Input: "test"}
|
||||||
|
baseRes, err := embedTestHelper(mctx, client, t, baseline)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
batch := api.EmbedRequest{
|
||||||
|
Model: model,
|
||||||
|
Input: []string{"test", "test", "test"},
|
||||||
|
}
|
||||||
|
batchRes, err := embedTestHelper(mctx, client, t, batch)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
expectedCount := baseRes.PromptEvalCount * 3
|
||||||
|
if batchRes.PromptEvalCount < expectedCount-2 || batchRes.PromptEvalCount > expectedCount+2 {
|
||||||
|
t.Fatalf("expected ~%d tokens (3 × %d), got %d",
|
||||||
|
expectedCount, baseRes.PromptEvalCount, batchRes.PromptEvalCount)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestEmbedLargeInput tests that embedding models can handle large inputs that would exceed typical batch sizes.
|
||||||
|
func TestEmbedLargeInput(t *testing.T) {
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute)
|
||||||
|
defer cancel()
|
||||||
|
client, _, cleanup := InitServerConnection(ctx, t)
|
||||||
|
defer cleanup()
|
||||||
|
|
||||||
|
for _, model := range libraryEmbedModels {
|
||||||
|
model := model
|
||||||
|
t.Run(model, func(t *testing.T) {
|
||||||
|
mctx, mcancel := context.WithTimeout(ctx, 2*time.Minute)
|
||||||
|
defer mcancel()
|
||||||
|
|
||||||
|
// Test with progressively larger inputs
|
||||||
|
testCases := []struct {
|
||||||
|
name string
|
||||||
|
inputWords int
|
||||||
|
}{
|
||||||
|
{"medium_input_256_words", 256},
|
||||||
|
{"large_input_512_words", 512},
|
||||||
|
{"very_large_input_800_words", 800},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tc := range testCases {
|
||||||
|
t.Run(tc.name, func(t *testing.T) {
|
||||||
|
words := make([]string, tc.inputWords)
|
||||||
|
for i := range words {
|
||||||
|
words[i] = "word"
|
||||||
|
}
|
||||||
|
input := strings.Join(words, " ")
|
||||||
|
|
||||||
|
req := api.EmbedRequest{
|
||||||
|
Model: model,
|
||||||
|
Input: input,
|
||||||
|
KeepAlive: &api.Duration{Duration: 30 * time.Second},
|
||||||
|
}
|
||||||
|
|
||||||
|
res, err := embedTestHelper(mctx, client, t, req)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("embedding failed for %d words: %v", tc.inputWords, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(res.Embeddings) != 1 {
|
||||||
|
t.Fatalf("expected 1 embedding, got %d", len(res.Embeddings))
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(res.Embeddings[0]) == 0 {
|
||||||
|
t.Fatal("expected non-empty embedding")
|
||||||
|
}
|
||||||
|
|
||||||
|
t.Logf("Successfully embedded %d words (%d tokens)", tc.inputWords, res.PromptEvalCount)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestEmbedStatusCode tests that errors from the embedding endpoint
|
||||||
|
// properly preserve their HTTP status codes when returned to the client.
|
||||||
|
// This test specifically checks the error handling path in EmbedHandler
|
||||||
|
// where api.StatusError errors should maintain their original status code.
|
||||||
|
func TestEmbedStatusCode(t *testing.T) {
|
||||||
|
// Use test deadline if set, otherwise default to 2 minutes
|
||||||
|
timeout := 2 * time.Minute
|
||||||
|
if deadline, ok := t.Deadline(); ok {
|
||||||
|
timeout = time.Until(deadline) - 10*time.Second // Reserve 10s buffer
|
||||||
|
}
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), timeout)
|
||||||
|
defer cancel()
|
||||||
|
client, _, cleanup := InitServerConnection(ctx, t)
|
||||||
|
defer cleanup()
|
||||||
|
|
||||||
|
for _, model := range libraryEmbedModels {
|
||||||
|
model := model
|
||||||
|
t.Run(model, func(t *testing.T) {
|
||||||
|
// Check if we're running out of time (reserve 20s for current model)
|
||||||
|
if deadline, ok := t.Deadline(); ok && time.Until(deadline) < 20*time.Second {
|
||||||
|
t.Skip("skipping remaining tests to avoid timeout")
|
||||||
|
}
|
||||||
|
|
||||||
|
mctx, mcancel := context.WithTimeout(ctx, 3*time.Minute)
|
||||||
|
defer mcancel()
|
||||||
|
|
||||||
|
// Pull the model if needed
|
||||||
|
if err := PullIfMissing(mctx, client, model); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
t.Run("truncation error status code", func(t *testing.T) {
|
||||||
|
truncFalse := false
|
||||||
|
longInput := strings.Repeat("word ", 100)
|
||||||
|
|
||||||
|
req := api.EmbedRequest{
|
||||||
|
Model: model,
|
||||||
|
Input: longInput,
|
||||||
|
Truncate: &truncFalse,
|
||||||
|
Options: map[string]any{"num_ctx": 10},
|
||||||
|
}
|
||||||
|
|
||||||
|
_, err := embedTestHelper(mctx, client, t, req)
|
||||||
|
if err == nil {
|
||||||
|
t.Fatal("expected error when truncate=false with long input")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check that it's a StatusError with the correct status code
|
||||||
|
var statusErr api.StatusError
|
||||||
|
if !errors.As(err, &statusErr) {
|
||||||
|
t.Fatalf("expected api.StatusError, got %T: %v", err, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// The error should be a 4xx client error (likely 400 Bad Request)
|
||||||
|
// not a 500 Internal Server Error
|
||||||
|
if statusErr.StatusCode < 400 || statusErr.StatusCode >= 500 {
|
||||||
|
t.Errorf("expected 4xx status code, got %d", statusErr.StatusCode)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Verify the error message is meaningful
|
||||||
|
if !strings.Contains(err.Error(), "context length") {
|
||||||
|
t.Errorf("expected error message to mention context length, got: %v", err)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("batch truncation error status code", func(t *testing.T) {
|
||||||
|
truncFalse := false
|
||||||
|
req := api.EmbedRequest{
|
||||||
|
Model: model,
|
||||||
|
Input: []string{
|
||||||
|
"short input",
|
||||||
|
strings.Repeat("very long input ", 100),
|
||||||
|
"another short input",
|
||||||
|
},
|
||||||
|
Truncate: &truncFalse,
|
||||||
|
Options: map[string]any{"num_ctx": 10},
|
||||||
|
}
|
||||||
|
|
||||||
|
_, err := embedTestHelper(mctx, client, t, req)
|
||||||
|
if err == nil {
|
||||||
|
t.Fatal("expected error when one input exceeds context with truncate=false")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check that it's a StatusError with the correct status code
|
||||||
|
var statusErr api.StatusError
|
||||||
|
if !errors.As(err, &statusErr) {
|
||||||
|
t.Fatalf("expected api.StatusError, got %T: %v", err, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// The error should be a 4xx client error, not a 500 Internal Server Error
|
||||||
|
if statusErr.StatusCode < 400 || statusErr.StatusCode >= 500 {
|
||||||
|
t.Errorf("expected 4xx status code, got %d", statusErr.StatusCode)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
||||||
|
|
@ -140,10 +140,6 @@ func (c *Causal) Init(backend ml.Backend, dtype ml.DType, maxSequences, capacity
|
||||||
c.config.CachePadding = 1
|
c.config.CachePadding = 1
|
||||||
}
|
}
|
||||||
|
|
||||||
if c.config.MaskBatchPadding == 0 {
|
|
||||||
c.config.MaskBatchPadding = 1
|
|
||||||
}
|
|
||||||
|
|
||||||
if c.config.MaskDType == ml.DTypeOther {
|
if c.config.MaskDType == ml.DTypeOther {
|
||||||
c.config.MaskDType = ml.DTypeF32
|
c.config.MaskDType = ml.DTypeF32
|
||||||
}
|
}
|
||||||
|
|
@ -364,15 +360,12 @@ func roundUp(length, pad int) int {
|
||||||
// token in the history should apply. This is based on both the sequence and causality (the
|
// token in the history should apply. This is based on both the sequence and causality (the
|
||||||
// position of the history is not ahead of the token in the batch).
|
// position of the history is not ahead of the token in the batch).
|
||||||
func (c *Causal) buildMask(ctx ml.Context) ml.Tensor {
|
func (c *Causal) buildMask(ctx ml.Context) ml.Tensor {
|
||||||
// Align and pad the two dimensions as required by the backend
|
|
||||||
batchSize := roundUp(c.curBatchSize, c.config.MaskBatchPadding)
|
|
||||||
|
|
||||||
c.curCellRange.min = roundDown(c.curCellRange.min, c.config.CachePadding)
|
c.curCellRange.min = roundDown(c.curCellRange.min, c.config.CachePadding)
|
||||||
c.curCellRange.max = roundUp(c.curCellRange.max+1, c.config.CachePadding) - 1
|
c.curCellRange.max = roundUp(c.curCellRange.max+1, c.config.CachePadding) - 1
|
||||||
|
|
||||||
length := c.curCellRange.max - c.curCellRange.min + 1
|
length := c.curCellRange.max - c.curCellRange.min + 1
|
||||||
|
|
||||||
mask := make([]float32, batchSize*length)
|
mask := make([]float32, c.curBatchSize*length)
|
||||||
|
|
||||||
for i := range c.curBatchSize {
|
for i := range c.curBatchSize {
|
||||||
enabled := !slices.Contains(c.opts.Except, i)
|
enabled := !slices.Contains(c.opts.Except, i)
|
||||||
|
|
@ -386,13 +379,7 @@ func (c *Causal) buildMask(ctx ml.Context) ml.Tensor {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Mask out any padding tokens we added. For padding that we added to the cache history, this
|
maskTensor := ctx.Input().FromFloats(mask, length, c.curBatchSize)
|
||||||
// has already been masked out because the sequence doesn't match.
|
|
||||||
for i := c.curBatchSize * length; i < len(mask); i++ {
|
|
||||||
mask[i] = float32(math.Inf(-1))
|
|
||||||
}
|
|
||||||
|
|
||||||
maskTensor := ctx.Input().FromFloats(mask, length, batchSize)
|
|
||||||
|
|
||||||
if c.config.MaskDType != ml.DTypeF32 {
|
if c.config.MaskDType != ml.DTypeF32 {
|
||||||
maskTensor = maskTensor.Cast(ctx, c.config.MaskDType)
|
maskTensor = maskTensor.Cast(ctx, c.config.MaskDType)
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,4 @@
|
||||||
int LLAMA_BUILD_NUMBER = 0;
|
int LLAMA_BUILD_NUMBER = 0;
|
||||||
char const *LLAMA_COMMIT = "7f8ef50cce40e3e7e4526a3696cb45658190e69a";
|
char const *LLAMA_COMMIT = "ec98e2002";
|
||||||
char const *LLAMA_COMPILER = "";
|
char const *LLAMA_COMPILER = "";
|
||||||
char const *LLAMA_BUILD_TARGET = "";
|
char const *LLAMA_BUILD_TARGET = "";
|
||||||
|
|
|
||||||
|
|
@ -17,6 +17,9 @@ include /tools/mtmd/clip.cpp
|
||||||
include /tools/mtmd/mtmd.cpp
|
include /tools/mtmd/mtmd.cpp
|
||||||
include /tools/mtmd/mtmd-audio.cpp
|
include /tools/mtmd/mtmd-audio.cpp
|
||||||
include /tools/mtmd/mtmd-helper.cpp
|
include /tools/mtmd/mtmd-helper.cpp
|
||||||
|
include /tools/mtmd/models/
|
||||||
|
include /tools/mtmd/models/*.h
|
||||||
|
include /tools/mtmd/models/*.cpp
|
||||||
include /src/
|
include /src/
|
||||||
include /src/llama.*
|
include /src/llama.*
|
||||||
include /src/llama-*.*
|
include /src/llama-*.*
|
||||||
|
|
|
||||||
|
|
@ -694,7 +694,7 @@ bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_over
|
||||||
|
|
||||||
// Validate if a filename is safe to use
|
// Validate if a filename is safe to use
|
||||||
// To validate a full path, split the path by the OS-specific path separator, and validate each part with this function
|
// To validate a full path, split the path by the OS-specific path separator, and validate each part with this function
|
||||||
bool fs_validate_filename(const std::string & filename) {
|
bool fs_validate_filename(const std::string & filename, bool allow_subdirs) {
|
||||||
if (!filename.length()) {
|
if (!filename.length()) {
|
||||||
// Empty filename invalid
|
// Empty filename invalid
|
||||||
return false;
|
return false;
|
||||||
|
|
@ -754,10 +754,14 @@ bool fs_validate_filename(const std::string & filename) {
|
||||||
|| (c >= 0xD800 && c <= 0xDFFF) // UTF-16 surrogate pairs
|
|| (c >= 0xD800 && c <= 0xDFFF) // UTF-16 surrogate pairs
|
||||||
|| c == 0xFFFD // Replacement Character (UTF-8)
|
|| c == 0xFFFD // Replacement Character (UTF-8)
|
||||||
|| c == 0xFEFF // Byte Order Mark (BOM)
|
|| c == 0xFEFF // Byte Order Mark (BOM)
|
||||||
|| c == '/' || c == '\\' || c == ':' || c == '*' // Illegal characters
|
|| c == ':' || c == '*' // Illegal characters
|
||||||
|| c == '?' || c == '"' || c == '<' || c == '>' || c == '|') {
|
|| c == '?' || c == '"' || c == '<' || c == '>' || c == '|') {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
if (!allow_subdirs && (c == '/' || c == '\\')) {
|
||||||
|
// Subdirectories not allowed, reject path separators
|
||||||
|
return false;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Reject any leading or trailing ' ', or any trailing '.', these are stripped on Windows and will cause a different filename
|
// Reject any leading or trailing ' ', or any trailing '.', these are stripped on Windows and will cause a different filename
|
||||||
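
The `allow_subdirs` parameter added above (defaulting to `false` in the header change further down) lets callers accept relative paths containing separators while still rejecting the other illegal characters. A minimal usage sketch, assuming `common.h` from this change is on the include path; the file names and the expected results noted in comments are illustrative, not verified output:

```cpp
// Sketch: exercising fs_validate_filename() with and without allow_subdirs.
#include "common.h"
#include <cstdio>

int main() {
    // Plain file names: path separators are still rejected by default.
    bool a = fs_validate_filename("model.gguf");               // expected: true
    bool b = fs_validate_filename("presets/model.gguf");       // expected: false (separators rejected)

    // Opting in to relative sub-paths, e.g. for nested preset or cache layouts.
    bool c = fs_validate_filename("presets/model.gguf", true); // expected: true
    bool d = fs_validate_filename("bad*name.gguf", true);      // expected: false ('*' is always illegal)

    printf("%d %d %d %d\n", a, b, c, d);
    return 0;
}
```
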
|
|
@ -782,11 +786,29 @@ bool fs_validate_filename(const std::string & filename) {
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
|
|
||||||
|
|
||||||
|
#ifdef _WIN32
|
||||||
|
static std::wstring utf8_to_wstring(const std::string & str) {
|
||||||
|
if (str.empty()) {
|
||||||
|
return std::wstring();
|
||||||
|
}
|
||||||
|
|
||||||
|
int size = MultiByteToWideChar(CP_UTF8, 0, str.c_str(), (int)str.size(), NULL, 0);
|
||||||
|
|
||||||
|
if (size <= 0) {
|
||||||
|
return std::wstring();
|
||||||
|
}
|
||||||
|
|
||||||
|
std::wstring wstr(size, 0);
|
||||||
|
MultiByteToWideChar(CP_UTF8, 0, str.c_str(), (int)str.size(), &wstr[0], size);
|
||||||
|
|
||||||
|
return wstr;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
// returns true if successful, false otherwise
|
// returns true if successful, false otherwise
|
||||||
bool fs_create_directory_with_parents(const std::string & path) {
|
bool fs_create_directory_with_parents(const std::string & path) {
|
||||||
#ifdef _WIN32
|
#ifdef _WIN32
|
||||||
std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
|
std::wstring wpath = utf8_to_wstring(path);
|
||||||
std::wstring wpath = converter.from_bytes(path);
|
|
||||||
|
|
||||||
// if the path already exists, check whether it's a directory
|
// if the path already exists, check whether it's a directory
|
||||||
const DWORD attributes = GetFileAttributesW(wpath.c_str());
|
const DWORD attributes = GetFileAttributesW(wpath.c_str());
|
||||||
|
|
@ -859,6 +881,11 @@ bool fs_create_directory_with_parents(const std::string & path) {
|
||||||
#endif // _WIN32
|
#endif // _WIN32
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool fs_is_directory(const std::string & path) {
|
||||||
|
std::filesystem::path dir(path);
|
||||||
|
return std::filesystem::exists(dir) && std::filesystem::is_directory(dir);
|
||||||
|
}
|
||||||
|
|
||||||
std::string fs_get_cache_directory() {
|
std::string fs_get_cache_directory() {
|
||||||
std::string cache_directory = "";
|
std::string cache_directory = "";
|
||||||
auto ensure_trailing_slash = [](std::string p) {
|
auto ensure_trailing_slash = [](std::string p) {
|
||||||
|
|
@ -893,6 +920,8 @@ std::string fs_get_cache_directory() {
|
||||||
cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
|
cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
|
||||||
#elif defined(_WIN32)
|
#elif defined(_WIN32)
|
||||||
cache_directory = std::getenv("LOCALAPPDATA");
|
cache_directory = std::getenv("LOCALAPPDATA");
|
||||||
|
#elif defined(__EMSCRIPTEN__)
|
||||||
|
GGML_ABORT("not implemented on this platform");
|
||||||
#else
|
#else
|
||||||
# error Unknown architecture
|
# error Unknown architecture
|
||||||
#endif
|
#endif
|
||||||
|
|
@ -912,7 +941,7 @@ std::string fs_get_cache_file(const std::string & filename) {
|
||||||
return cache_directory + filename;
|
return cache_directory + filename;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<common_file_info> fs_list_files(const std::string & path) {
|
std::vector<common_file_info> fs_list(const std::string & path, bool include_directories) {
|
||||||
std::vector<common_file_info> files;
|
std::vector<common_file_info> files;
|
||||||
if (path.empty()) return files;
|
if (path.empty()) return files;
|
||||||
|
|
||||||
|
|
@ -929,12 +958,20 @@ std::vector<common_file_info> fs_list_files(const std::string & path) {
|
||||||
common_file_info info;
|
common_file_info info;
|
||||||
info.path = p.string();
|
info.path = p.string();
|
||||||
info.name = p.filename().string();
|
info.name = p.filename().string();
|
||||||
|
info.is_dir = false;
|
||||||
try {
|
try {
|
||||||
info.size = static_cast<size_t>(std::filesystem::file_size(p));
|
info.size = static_cast<size_t>(std::filesystem::file_size(p));
|
||||||
} catch (const std::filesystem::filesystem_error &) {
|
} catch (const std::filesystem::filesystem_error &) {
|
||||||
info.size = 0;
|
info.size = 0;
|
||||||
}
|
}
|
||||||
files.push_back(std::move(info));
|
files.push_back(std::move(info));
|
||||||
|
} else if (include_directories && std::filesystem::is_directory(p)) {
|
||||||
|
common_file_info info;
|
||||||
|
info.path = p.string();
|
||||||
|
info.name = p.filename().string();
|
||||||
|
info.size = 0; // Directories have no size
|
||||||
|
info.is_dir = true;
|
||||||
|
files.push_back(std::move(info));
|
||||||
}
|
}
|
||||||
} catch (const std::filesystem::filesystem_error &) {
|
} catch (const std::filesystem::filesystem_error &) {
|
||||||
// skip entries we cannot inspect
|
// skip entries we cannot inspect
|
||||||
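
With `fs_list_files` renamed to `fs_list` and the new `include_directories` flag (plus the `is_dir` field on `common_file_info`, see the header change below), callers can enumerate sub-directories alongside regular files. A short sketch, again assuming `common.h` from this change; the directory path is hypothetical:

```cpp
// Sketch: listing a models directory with the renamed fs_list().
#include "common.h"
#include <cstdio>
#include <string>

static void print_dir(const std::string & dir) {
    // include_directories = true also surfaces sub-directories (reported with size 0)
    for (const common_file_info & info : fs_list(dir, /*include_directories=*/true)) {
        printf("%s %s (%zu bytes)\n",
               info.is_dir ? "[dir] " : "[file]",
               info.name.c_str(), info.size);
    }
}
```
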
|
|
@ -945,36 +982,71 @@ std::vector<common_file_info> fs_list_files(const std::string & path) {
|
||||||
return files;
|
return files;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// TTY utils
|
||||||
|
//
|
||||||
|
|
||||||
|
bool tty_can_use_colors() {
|
||||||
|
// Check NO_COLOR environment variable (https://no-color.org/)
|
||||||
|
if (const char * no_color = std::getenv("NO_COLOR")) {
|
||||||
|
if (no_color[0] != '\0') {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check TERM environment variable
|
||||||
|
if (const char * term = std::getenv("TERM")) {
|
||||||
|
if (std::strcmp(term, "dumb") == 0) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check if stdout and stderr are connected to a terminal
|
||||||
|
// We check both because log messages can go to either
|
||||||
|
bool stdout_is_tty = isatty(fileno(stdout));
|
||||||
|
bool stderr_is_tty = isatty(fileno(stderr));
|
||||||
|
|
||||||
|
return stdout_is_tty || stderr_is_tty;
|
||||||
|
}
|
||||||
|
|
||||||
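
The new `tty_can_use_colors()` helper centralizes the `NO_COLOR`, `TERM`, and `isatty` checks shown above. A plausible way a tool could gate ANSI output on it; the escape sequences are only an illustration and are not part of this change:

```cpp
// Sketch: enable ANSI colors only when the new helper says the terminal supports them.
#include "common.h"
#include <cstdio>

int main() {
    const bool use_colors = tty_can_use_colors();
    const char * red   = use_colors ? "\033[31m" : "";
    const char * reset = use_colors ? "\033[0m"  : "";
    fprintf(stderr, "%serror:%s something went wrong\n", red, reset);
    return 0;
}
```
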
//
|
//
|
||||||
// Model utils
|
// Model utils
|
||||||
//
|
//
|
||||||
|
|
||||||
static inline void common_init_sampler_from_model(
|
// TODO: move to common/sampling
|
||||||
|
static void common_init_sampler_from_model(
|
||||||
const llama_model * model,
|
const llama_model * model,
|
||||||
common_params_sampling & sparams) {
|
common_params_sampling & sparams) {
|
||||||
|
|
||||||
const uint64_t config = sparams.user_sampling_config;
|
const uint64_t config = sparams.user_sampling_config;
|
||||||
|
|
||||||
auto get_int32 = [&](const char * key, int32_t & dst, uint64_t user_config) {
|
auto get_int32 = [&](const char * key, int32_t & dst, uint64_t user_config) {
|
||||||
if (config & user_config) return;
|
if (config & user_config) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
char buf[64] = {0};
|
char buf[64] = {0};
|
||||||
if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) {
|
if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) {
|
||||||
char * end = nullptr;
|
char * end = nullptr;
|
||||||
int32_t v = strtol(buf, &end, 10);
|
int32_t v = strtol(buf, &end, 10);
|
||||||
if (end && end != buf) dst = v;
|
if (end && end != buf) {
|
||||||
|
dst = v;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
auto get_float = [&](const char * key, float & dst, uint64_t user_config) {
|
auto get_float = [&](const char * key, float & dst, uint64_t user_config) {
|
||||||
if (config & user_config) return;
|
if (config & user_config) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
char buf[128] = {0};
|
char buf[128] = {0};
|
||||||
if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) {
|
if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) {
|
||||||
char * end = nullptr;
|
char * end = nullptr;
|
||||||
float v = strtof(buf, &end);
|
float v = strtof(buf, &end);
|
||||||
if (end && end != buf) dst = v;
|
if (end && end != buf) {
|
||||||
|
dst = v;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
@ -1002,31 +1074,125 @@ static inline void common_init_sampler_from_model(
|
||||||
get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA), sparams.mirostat_eta, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA);
|
get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA), sparams.mirostat_eta, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA);
|
||||||
}
|
}
|
||||||
|
|
||||||
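
The `get_int32`/`get_float` helpers above read optional sampling defaults from GGUF metadata via `llama_model_meta_val_str`, skipping any key the user already set on the command line. A minimal sketch of reading a single metadata value the same way, assuming `model` has already been loaded; the sampling-specific `LLAMA_MODEL_META_KEY_*` constants introduced by this change are omitted and a generic key is used instead:

```cpp
// Sketch: reading one metadata string from a loaded model.
#include "llama.h"
#include <cstdio>

static void print_model_name(const llama_model * model) {
    char buf[128] = {0};
    // returns the value length on success, a negative value if the key is missing
    if (llama_model_meta_val_str(model, "general.name", buf, sizeof(buf)) > 0) {
        printf("model name: %s\n", buf);
    }
}
```
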
struct common_init_result common_init_from_params(common_params & params) {
|
struct common_init_result::impl {
|
||||||
common_init_result iparams;
|
impl() = default;
|
||||||
|
~impl() = default;
|
||||||
|
|
||||||
|
llama_model_ptr model;
|
||||||
|
llama_context_ptr context;
|
||||||
|
|
||||||
|
std::vector<llama_adapter_lora_ptr> lora;
|
||||||
|
|
||||||
|
std::vector<common_sampler_ptr> samplers;
|
||||||
|
};
|
||||||
|
|
||||||
|
common_init_result::common_init_result(common_params & params) :
|
||||||
|
pimpl(new impl{}) {
|
||||||
auto mparams = common_model_params_to_llama(params);
|
auto mparams = common_model_params_to_llama(params);
|
||||||
|
auto cparams = common_context_params_to_llama(params);
|
||||||
|
|
||||||
|
if (params.fit_params) {
|
||||||
|
LOG_INF("%s: fitting params to device memory, to report bugs during this step use -fit off (or --verbose if you can't)\n", __func__);
|
||||||
|
llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
|
||||||
|
params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target, params.fit_params_min_ctx,
|
||||||
|
params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
|
||||||
|
}
|
||||||
|
|
||||||
llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
|
llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
|
||||||
if (model == NULL) {
|
if (model == NULL) {
|
||||||
LOG_ERR("%s: failed to load model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
|
return;
|
||||||
__func__, params.model.path.c_str());
|
|
||||||
return iparams;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
common_init_sampler_from_model(model, params.sampling);
|
pimpl->model.reset(model);
|
||||||
|
|
||||||
const llama_vocab * vocab = llama_model_get_vocab(model);
|
const llama_vocab * vocab = llama_model_get_vocab(model);
|
||||||
|
|
||||||
auto cparams = common_context_params_to_llama(params);
|
// updates params.sampling
|
||||||
|
// TODO: fix naming
|
||||||
|
common_init_sampler_from_model(model, params.sampling);
|
||||||
|
|
||||||
|
if (params.sampling.ignore_eos && llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
|
||||||
|
LOG_WRN("%s: warning: vocab does not have an EOS token, ignoring --ignore-eos\n", __func__);
|
||||||
|
params.sampling.ignore_eos = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// initialize once
|
||||||
|
for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
|
||||||
|
if (llama_vocab_is_eog(vocab, i)) {
|
||||||
|
LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(vocab, i).c_str(), -INFINITY);
|
||||||
|
params.sampling.logit_bias_eog.push_back({i, -INFINITY});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (params.sampling.ignore_eos) {
|
||||||
|
// add EOG biases to the active set of logit biases
|
||||||
|
params.sampling.logit_bias.insert(
|
||||||
|
params.sampling.logit_bias.end(),
|
||||||
|
params.sampling.logit_bias_eog.begin(), params.sampling.logit_bias_eog.end());
|
||||||
|
}
|
||||||
|
|
||||||
|
//if (params.sampling.penalty_last_n == -1) {
|
||||||
|
// LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
|
||||||
|
// params.sampling.penalty_last_n = llama_n_ctx(lctx);
|
||||||
|
//}
|
||||||
|
|
||||||
|
//if (params.sampling.dry_penalty_last_n == -1) {
|
||||||
|
// LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
|
||||||
|
// params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
|
||||||
|
//}
|
||||||
|
|
||||||
|
pimpl->samplers.resize(cparams.n_seq_max);
|
||||||
|
|
||||||
|
for (int i = 0; i < (int) cparams.n_seq_max; ++i) {
|
||||||
|
pimpl->samplers[i].reset(common_sampler_init(model, params.sampling));
|
||||||
|
}
|
||||||
|
|
||||||
llama_context * lctx = llama_init_from_model(model, cparams);
|
llama_context * lctx = llama_init_from_model(model, cparams);
|
||||||
if (lctx == NULL) {
|
if (lctx == NULL) {
|
||||||
LOG_ERR("%s: failed to create context with model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
|
LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
|
||||||
__func__, params.model.path.c_str());
|
return;
|
||||||
llama_model_free(model);
|
|
||||||
return iparams;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pimpl->context.reset(lctx);
|
||||||
|
}
|
||||||
|
|
||||||
|
llama_model * common_init_result::model() {
|
||||||
|
return pimpl->model.get();
|
||||||
|
}
|
||||||
|
|
||||||
|
llama_context * common_init_result::context() {
|
||||||
|
return pimpl->context.get();
|
||||||
|
}
|
||||||
|
|
||||||
|
common_sampler * common_init_result::sampler(llama_seq_id seq_id) {
|
||||||
|
return pimpl->samplers[seq_id].get();
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<llama_adapter_lora_ptr> & common_init_result::lora() {
|
||||||
|
return pimpl->lora;
|
||||||
|
}
|
||||||
|
|
||||||
|
void common_init_result::free_context() {
|
||||||
|
pimpl->context.reset();
|
||||||
|
}
|
||||||
|
|
||||||
|
common_init_result_ptr common_init_from_params(common_params & params) {
|
||||||
|
common_init_result_ptr res(new common_init_result(params));
|
||||||
|
|
||||||
|
llama_model * model = res->model();
|
||||||
|
if (model == NULL) {
|
||||||
|
LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.path.c_str());
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
|
llama_context * lctx = res->context();
|
||||||
|
if (lctx == NULL) {
|
||||||
|
LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
|
const llama_vocab * vocab = llama_model_get_vocab(model);
|
||||||
|
|
||||||
if (params.ctx_shift && !llama_memory_can_shift(llama_get_memory(lctx))) {
|
if (params.ctx_shift && !llama_memory_can_shift(llama_get_memory(lctx))) {
|
||||||
LOG_WRN("%s: KV cache shifting is not supported for this context, disabling KV cache shifting\n", __func__);
|
LOG_WRN("%s: KV cache shifting is not supported for this context, disabling KV cache shifting\n", __func__);
|
||||||
params.ctx_shift = false;
|
params.ctx_shift = false;
|
||||||
|
|
@ -1038,10 +1204,7 @@ struct common_init_result common_init_from_params(common_params & params) {
|
||||||
|
|
||||||
const auto cvec = common_control_vector_load(params.control_vectors);
|
const auto cvec = common_control_vector_load(params.control_vectors);
|
||||||
if (cvec.n_embd == -1) {
|
if (cvec.n_embd == -1) {
|
||||||
llama_free(lctx);
|
return res;
|
||||||
llama_model_free(model);
|
|
||||||
|
|
||||||
return iparams;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
int err = llama_apply_adapter_cvec(
|
int err = llama_apply_adapter_cvec(
|
||||||
|
|
@ -1052,10 +1215,7 @@ struct common_init_result common_init_from_params(common_params & params) {
|
||||||
params.control_vector_layer_start,
|
params.control_vector_layer_start,
|
||||||
params.control_vector_layer_end);
|
params.control_vector_layer_end);
|
||||||
if (err) {
|
if (err) {
|
||||||
llama_free(lctx);
|
return res;
|
||||||
llama_model_free(model);
|
|
||||||
|
|
||||||
return iparams;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -1079,10 +1239,7 @@ struct common_init_result common_init_from_params(common_params & params) {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!ok) {
|
if (!ok) {
|
||||||
llama_free(lctx);
|
return res;
|
||||||
llama_model_free(model);
|
|
||||||
|
|
||||||
return iparams;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -1092,9 +1249,7 @@ struct common_init_result common_init_from_params(common_params & params) {
|
||||||
lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
|
lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
|
||||||
if (lora == nullptr) {
|
if (lora == nullptr) {
|
||||||
LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
|
LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
|
||||||
llama_free(lctx);
|
return res;
|
||||||
llama_model_free(model);
|
|
||||||
return iparams;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
char buf[1024];
|
char buf[1024];
|
||||||
|
|
@ -1103,43 +1258,13 @@ struct common_init_result common_init_from_params(common_params & params) {
|
||||||
la.task_name = buf;
|
la.task_name = buf;
|
||||||
llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf));
|
llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf));
|
||||||
la.prompt_prefix = buf;
|
la.prompt_prefix = buf;
|
||||||
iparams.lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
|
res->lora().emplace_back(std::move(lora)); // copy to list of loaded adapters
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!params.lora_init_without_apply) {
|
if (!params.lora_init_without_apply) {
|
||||||
common_set_adapter_lora(lctx, params.lora_adapters);
|
common_set_adapter_lora(lctx, params.lora_adapters);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.sampling.ignore_eos && llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
|
|
||||||
LOG_WRN("%s: warning: vocab does not have an EOS token, ignoring --ignore-eos\n", __func__);
|
|
||||||
params.sampling.ignore_eos = false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// initialize once
|
|
||||||
for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
|
|
||||||
if (llama_vocab_is_eog(vocab, i)) {
|
|
||||||
LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
|
|
||||||
params.sampling.logit_bias_eog.push_back({i, -INFINITY});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (params.sampling.ignore_eos) {
|
|
||||||
// add EOG biases to the active set of logit biases
|
|
||||||
params.sampling.logit_bias.insert(
|
|
||||||
params.sampling.logit_bias.end(),
|
|
||||||
params.sampling.logit_bias_eog.begin(), params.sampling.logit_bias_eog.end());
|
|
||||||
}
|
|
||||||
|
|
||||||
if (params.sampling.penalty_last_n == -1) {
|
|
||||||
LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
|
|
||||||
params.sampling.penalty_last_n = llama_n_ctx(lctx);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (params.sampling.dry_penalty_last_n == -1) {
|
|
||||||
LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
|
|
||||||
params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (params.warmup) {
|
if (params.warmup) {
|
||||||
LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
|
LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
|
||||||
|
|
||||||
|
|
@ -1178,12 +1303,11 @@ struct common_init_result common_init_from_params(common_params & params) {
|
||||||
llama_set_warmup(lctx, false);
|
llama_set_warmup(lctx, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
iparams.model.reset(model);
|
return res;
|
||||||
iparams.context.reset(lctx);
|
|
||||||
|
|
||||||
return iparams;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
common_init_result::~common_init_result() = default;
|
||||||
|
|
||||||
std::string get_model_endpoint() {
|
std::string get_model_endpoint() {
|
||||||
const char * model_endpoint_env = getenv("MODEL_ENDPOINT");
|
const char * model_endpoint_env = getenv("MODEL_ENDPOINT");
|
||||||
// We still respect the use of environment-variable "HF_ENDPOINT" for backward-compatibility.
|
// We still respect the use of environment-variable "HF_ENDPOINT" for backward-compatibility.
|
||||||
|
|
@ -1192,7 +1316,9 @@ std::string get_model_endpoint() {
|
||||||
std::string model_endpoint = "https://huggingface.co/";
|
std::string model_endpoint = "https://huggingface.co/";
|
||||||
if (endpoint_env) {
|
if (endpoint_env) {
|
||||||
model_endpoint = endpoint_env;
|
model_endpoint = endpoint_env;
|
||||||
if (model_endpoint.back() != '/') model_endpoint += '/';
|
if (model_endpoint.back() != '/') {
|
||||||
|
model_endpoint += '/';
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return model_endpoint;
|
return model_endpoint;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -12,6 +12,10 @@
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <map>
|
#include <map>
|
||||||
|
|
||||||
|
#if defined(_WIN32) && !defined(_WIN32_WINNT)
|
||||||
|
#define _WIN32_WINNT 0x0A00
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef _WIN32
|
#ifdef _WIN32
|
||||||
#define DIRECTORY_SEPARATOR '\\'
|
#define DIRECTORY_SEPARATOR '\\'
|
||||||
#else
|
#else
|
||||||
|
|
@ -26,8 +30,6 @@
|
||||||
fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
|
fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
|
||||||
} while(0)
|
} while(0)
|
||||||
|
|
||||||
#define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
|
|
||||||
|
|
||||||
struct common_time_meas {
|
struct common_time_meas {
|
||||||
common_time_meas(int64_t & t_acc, bool disable = false);
|
common_time_meas(int64_t & t_acc, bool disable = false);
|
||||||
~common_time_meas();
|
~common_time_meas();
|
||||||
|
|
@ -80,7 +82,8 @@ int32_t cpu_get_num_math();
|
||||||
enum llama_example {
|
enum llama_example {
|
||||||
LLAMA_EXAMPLE_COMMON,
|
LLAMA_EXAMPLE_COMMON,
|
||||||
LLAMA_EXAMPLE_SPECULATIVE,
|
LLAMA_EXAMPLE_SPECULATIVE,
|
||||||
LLAMA_EXAMPLE_MAIN,
|
LLAMA_EXAMPLE_COMPLETION,
|
||||||
|
LLAMA_EXAMPLE_CLI,
|
||||||
LLAMA_EXAMPLE_EMBEDDING,
|
LLAMA_EXAMPLE_EMBEDDING,
|
||||||
LLAMA_EXAMPLE_PERPLEXITY,
|
LLAMA_EXAMPLE_PERPLEXITY,
|
||||||
LLAMA_EXAMPLE_RETRIEVAL,
|
LLAMA_EXAMPLE_RETRIEVAL,
|
||||||
|
|
@ -96,6 +99,7 @@ enum llama_example {
|
||||||
LLAMA_EXAMPLE_TTS,
|
LLAMA_EXAMPLE_TTS,
|
||||||
LLAMA_EXAMPLE_DIFFUSION,
|
LLAMA_EXAMPLE_DIFFUSION,
|
||||||
LLAMA_EXAMPLE_FINETUNE,
|
LLAMA_EXAMPLE_FINETUNE,
|
||||||
|
LLAMA_EXAMPLE_FIT_PARAMS,
|
||||||
|
|
||||||
LLAMA_EXAMPLE_COUNT,
|
LLAMA_EXAMPLE_COUNT,
|
||||||
};
|
};
|
||||||
|
|
@ -192,7 +196,6 @@ struct common_params_sampling {
|
||||||
|
|
||||||
std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY
|
std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY
|
||||||
|
|
||||||
|
|
||||||
std::vector<enum common_sampler_type> samplers = {
|
std::vector<enum common_sampler_type> samplers = {
|
||||||
COMMON_SAMPLER_TYPE_PENALTIES,
|
COMMON_SAMPLER_TYPE_PENALTIES,
|
||||||
COMMON_SAMPLER_TYPE_DRY,
|
COMMON_SAMPLER_TYPE_DRY,
|
||||||
|
|
@ -213,6 +216,10 @@ struct common_params_sampling {
|
||||||
std::vector<llama_logit_bias> logit_bias; // logit biases to apply
|
std::vector<llama_logit_bias> logit_bias; // logit biases to apply
|
||||||
std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens
|
std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens
|
||||||
|
|
||||||
|
bool has_logit_bias() const {
|
||||||
|
return !logit_bias.empty();
|
||||||
|
}
|
||||||
|
|
||||||
// print the parameters into a string
|
// print the parameters into a string
|
||||||
std::string print() const;
|
std::string print() const;
|
||||||
};
|
};
|
||||||
|
|
@ -223,6 +230,7 @@ struct common_params_model {
|
||||||
std::string hf_repo = ""; // HF repo // NOLINT
|
std::string hf_repo = ""; // HF repo // NOLINT
|
||||||
std::string hf_file = ""; // HF file // NOLINT
|
std::string hf_file = ""; // HF file // NOLINT
|
||||||
std::string docker_repo = ""; // Docker repo // NOLINT
|
std::string docker_repo = ""; // Docker repo // NOLINT
|
||||||
|
std::string name = ""; // in format <user>/<model>[:<tag>] (tag is optional) // NOLINT
|
||||||
};
|
};
|
||||||
|
|
||||||
struct common_params_speculative {
|
struct common_params_speculative {
|
||||||
|
|
@ -299,8 +307,8 @@ struct lr_opt {
|
||||||
struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata);
|
struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata);
|
||||||
|
|
||||||
struct common_params {
|
struct common_params {
|
||||||
int32_t n_predict = -1; // new tokens to predict
|
int32_t n_predict = -1; // max. number of new tokens to predict, -1 == no limit
|
||||||
int32_t n_ctx = 4096; // context size
|
int32_t n_ctx = 0; // context size, 0 == context the model was trained with
|
||||||
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
|
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
|
||||||
int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
|
int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
|
||||||
int32_t n_keep = 0; // number of tokens to keep from initial prompt
|
int32_t n_keep = 0; // number of tokens to keep from initial prompt
|
||||||
|
|
@ -324,6 +332,9 @@ struct common_params {
|
||||||
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
|
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
|
||||||
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
|
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
|
||||||
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
|
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
|
||||||
|
bool fit_params = true; // whether to fit unset model/context parameters to free device memory
|
||||||
|
size_t fit_params_target = 1024 * 1024*1024; // margin per device in bytes for fitting parameters to free memory
|
||||||
|
int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use
|
||||||
|
|
||||||
enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
|
enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
|
||||||
|
|
||||||
|
|
@ -369,7 +380,7 @@ struct common_params {
|
||||||
|
|
||||||
std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale
|
std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale
|
||||||
|
|
||||||
int32_t verbosity = 0;
|
int32_t verbosity = 3; // LOG_LEVEL_INFO
|
||||||
int32_t control_vector_layer_start = -1; // layer range for control vector
|
int32_t control_vector_layer_start = -1; // layer range for control vector
|
||||||
int32_t control_vector_layer_end = -1; // layer range for control vector
|
int32_t control_vector_layer_end = -1; // layer range for control vector
|
||||||
bool offline = false;
|
bool offline = false;
|
||||||
|
|
@ -403,6 +414,7 @@ struct common_params {
|
||||||
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
|
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
|
||||||
bool cont_batching = true; // insert new sequences for decoding on-the-fly
|
bool cont_batching = true; // insert new sequences for decoding on-the-fly
|
||||||
bool no_perf = false; // disable performance metrics
|
bool no_perf = false; // disable performance metrics
|
||||||
|
bool show_timings = true; // show timing information on CLI
|
||||||
bool ctx_shift = false; // context shift on infinite text generation
|
bool ctx_shift = false; // context shift on infinite text generation
|
||||||
bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
|
bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
|
||||||
bool kv_unified = false; // enable unified KV cache
|
bool kv_unified = false; // enable unified KV cache
|
||||||
|
|
@ -459,7 +471,7 @@ struct common_params {
|
||||||
std::string public_path = ""; // NOLINT
|
std::string public_path = ""; // NOLINT
|
||||||
std::string api_prefix = ""; // NOLINT
|
std::string api_prefix = ""; // NOLINT
|
||||||
std::string chat_template = ""; // NOLINT
|
std::string chat_template = ""; // NOLINT
|
||||||
bool use_jinja = false; // NOLINT
|
bool use_jinja = true; // NOLINT
|
||||||
bool enable_chat_template = true;
|
bool enable_chat_template = true;
|
||||||
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
|
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
|
||||||
int reasoning_budget = -1;
|
int reasoning_budget = -1;
|
||||||
|
|
@ -478,9 +490,16 @@ struct common_params {
|
||||||
bool endpoint_props = false; // only control POST requests, not GET
|
bool endpoint_props = false; // only control POST requests, not GET
|
||||||
bool endpoint_metrics = false;
|
bool endpoint_metrics = false;
|
||||||
|
|
||||||
|
// router server configs
|
||||||
|
std::string models_dir = ""; // directory containing models for the router server
|
||||||
|
std::string models_preset = ""; // directory containing model presets for the router server
|
||||||
|
int models_max = 4; // maximum number of models to load simultaneously
|
||||||
|
bool models_autoload = true; // automatically load models when requested via the router server
|
||||||
|
|
||||||
bool log_json = false;
|
bool log_json = false;
|
||||||
|
|
||||||
std::string slot_save_path;
|
std::string slot_save_path;
|
||||||
|
std::string media_path; // path to directory for loading media files
|
||||||
|
|
||||||
float slot_prompt_similarity = 0.1f;
|
float slot_prompt_similarity = 0.1f;
|
||||||
|
|
||||||
|
|
@ -631,8 +650,9 @@ std::string string_from(const struct llama_context * ctx, const struct llama_bat
|
||||||
// Filesystem utils
|
// Filesystem utils
|
||||||
//
|
//
|
||||||
|
|
||||||
bool fs_validate_filename(const std::string & filename);
|
bool fs_validate_filename(const std::string & filename, bool allow_subdirs = false);
|
||||||
bool fs_create_directory_with_parents(const std::string & path);
|
bool fs_create_directory_with_parents(const std::string & path);
|
||||||
|
bool fs_is_directory(const std::string & path);
|
||||||
|
|
||||||
std::string fs_get_cache_directory();
|
std::string fs_get_cache_directory();
|
||||||
std::string fs_get_cache_file(const std::string & filename);
|
std::string fs_get_cache_file(const std::string & filename);
|
||||||
|
|
@ -641,22 +661,44 @@ struct common_file_info {
|
||||||
std::string path;
|
std::string path;
|
||||||
std::string name;
|
std::string name;
|
||||||
size_t size = 0; // in bytes
|
size_t size = 0; // in bytes
|
||||||
|
bool is_dir = false;
|
||||||
};
|
};
|
||||||
std::vector<common_file_info> fs_list_files(const std::string & path);
|
std::vector<common_file_info> fs_list(const std::string & path, bool include_directories);
|
||||||
|
|
||||||
|
//
|
||||||
|
// TTY utils
|
||||||
|
//
|
||||||
|
|
||||||
|
// Auto-detect if colors can be enabled based on terminal and environment
|
||||||
|
bool tty_can_use_colors();
|
||||||
|
|
||||||
//
|
//
|
||||||
// Model utils
|
// Model utils
|
||||||
//
|
//
|
||||||
|
|
||||||
// note: defines object's lifetime
|
struct common_sampler;
|
||||||
struct common_init_result {
|
|
||||||
llama_model_ptr model;
|
|
||||||
llama_context_ptr context;
|
|
||||||
|
|
||||||
std::vector<llama_adapter_lora_ptr> lora;
|
// note: defines the lifetimes of the model, context, samplers, etc.
|
||||||
|
struct common_init_result {
|
||||||
|
common_init_result(common_params & params);
|
||||||
|
~common_init_result();
|
||||||
|
|
||||||
|
llama_model * model();
|
||||||
|
llama_context * context();
|
||||||
|
common_sampler * sampler(llama_seq_id seq_id);
|
||||||
|
|
||||||
|
std::vector<llama_adapter_lora_ptr> & lora();
|
||||||
|
|
||||||
|
void free_context();
|
||||||
|
|
||||||
|
private:
|
||||||
|
struct impl;
|
||||||
|
std::unique_ptr<impl> pimpl;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct common_init_result common_init_from_params(common_params & params);
|
using common_init_result_ptr = std::unique_ptr<common_init_result>;
|
||||||
|
|
||||||
|
common_init_result_ptr common_init_from_params(common_params & params);
|
||||||
|
|
||||||
struct llama_model_params common_model_params_to_llama ( common_params & params);
|
struct llama_model_params common_model_params_to_llama ( common_params & params);
|
||||||
struct llama_context_params common_context_params_to_llama(const common_params & params);
|
struct llama_context_params common_context_params_to_llama(const common_params & params);
|
||||||
|
|
|
||||||
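
Call sites that previously reached into `iparams.model` and `iparams.context` now go through the accessors of the pimpl-based `common_init_result`, and `common_init_from_params` returns a `common_init_result_ptr` that owns the model, context, per-sequence samplers, and LoRA adapters. A hedged sketch of the migrated usage, assuming a populated `common_params` and `common.h` from this change:

```cpp
// Sketch: loading a model through the reworked common_init_result API.
#include "common.h"

static bool load_and_inspect(common_params & params) {
    common_init_result_ptr init = common_init_from_params(params);

    llama_model   * model = init->model();
    llama_context * lctx  = init->context();
    if (model == nullptr || lctx == nullptr) {
        return false; // load or context creation failed; errors were already logged
    }

    // One sampler is created per sequence; fetch the one for sequence 0.
    common_sampler * smpl = init->sampler(0);
    (void) smpl;

    // The context can be dropped early while keeping the model and samplers alive.
    init->free_context();

    return true;
    // everything else is released when `init` goes out of scope
}
```
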
|
|
@ -305,8 +305,9 @@ static std::string format_literal(const std::string & literal) {
|
||||||
|
|
||||||
std::string gbnf_format_literal(const std::string & literal) { return format_literal(literal); }
|
std::string gbnf_format_literal(const std::string & literal) { return format_literal(literal); }
|
||||||
|
|
||||||
class SchemaConverter {
|
class common_schema_converter {
|
||||||
private:
|
private:
|
||||||
|
friend class common_schema_info;
|
||||||
friend std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options);
|
friend std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options);
|
||||||
std::function<json(const std::string &)> _fetch_json;
|
std::function<json(const std::string &)> _fetch_json;
|
||||||
bool _dotall;
|
bool _dotall;
|
||||||
|
|
@ -729,7 +730,7 @@ private:
|
||||||
}
|
}
|
||||||
|
|
||||||
public:
|
public:
|
||||||
SchemaConverter(
|
common_schema_converter(
|
||||||
const std::function<json(const std::string &)> & fetch_json,
|
const std::function<json(const std::string &)> & fetch_json,
|
||||||
bool dotall)
|
bool dotall)
|
||||||
: _fetch_json(fetch_json), _dotall(dotall)
|
: _fetch_json(fetch_json), _dotall(dotall)
|
||||||
|
|
@ -974,7 +975,7 @@ public:
|
||||||
|
|
||||||
void check_errors() {
|
void check_errors() {
|
||||||
if (!_errors.empty()) {
|
if (!_errors.empty()) {
|
||||||
throw std::runtime_error("JSON schema conversion failed:\n" + string_join(_errors, "\n"));
|
throw std::invalid_argument("JSON schema conversion failed:\n" + string_join(_errors, "\n"));
|
||||||
}
|
}
|
||||||
if (!_warnings.empty()) {
|
if (!_warnings.empty()) {
|
||||||
fprintf(stderr, "WARNING: JSON schema conversion was incomplete: %s\n", string_join(_warnings, "; ").c_str());
|
fprintf(stderr, "WARNING: JSON schema conversion was incomplete: %s\n", string_join(_warnings, "; ").c_str());
|
||||||
|
|
@ -990,6 +991,134 @@ public:
|
||||||
    }
};

+// common_schema_info implementation (pimpl)
+
+common_schema_info::common_schema_info()
+    : impl_(std::make_unique<common_schema_converter>(
+          [](const std::string &) { return json(); },
+          false)) {}
+
+common_schema_info::~common_schema_info() = default;
+
+common_schema_info::common_schema_info(common_schema_info &&) noexcept = default;
+common_schema_info & common_schema_info::operator=(common_schema_info &&) noexcept = default;
+
+void common_schema_info::resolve_refs(nlohmann::ordered_json & schema) {
+    impl_->resolve_refs(schema, "");
+}
+
+// Determines if a JSON schema can resolve to a string type through any path.
+// Some models emit raw string values rather than JSON-encoded strings for string parameters.
+// If any branch of the schema (via oneOf, anyOf, $ref, etc.) permits a string, this returns
+// true, allowing callers to handle the value as a raw string for simplicity.
+bool common_schema_info::resolves_to_string(const nlohmann::ordered_json & schema) {
+    std::unordered_set<std::string> visited_refs;
+
+    std::function<bool(const json &)> check = [&](const json & s) -> bool {
+        if (!s.is_object()) {
+            return false;
+        }
+
+        // Handle $ref
+        if (s.contains("$ref")) {
+            const std::string & ref = s["$ref"];
+            if (visited_refs.find(ref) != visited_refs.end()) {
+                // Circular reference, assume not a string to be safe
+                return false;
+            }
+            visited_refs.insert(ref);
+            auto it = impl_->_refs.find(ref);
+            if (it != impl_->_refs.end()) {
+                return check(it->second);
+            }
+            return false;
+        }
+
+        // Check type field
+        if (s.contains("type")) {
+            const json & schema_type = s["type"];
+            if (schema_type.is_string()) {
+                if (schema_type == "string") {
+                    return true;
+                }
+            } else if (schema_type.is_array()) {
+                // Type can be an array like ["string", "null"]
+                for (const auto & t : schema_type) {
+                    if (t == "string") {
+                        return true;
+                    }
+                }
+            }
+        }
+
+        // Check oneOf/anyOf - if any alternative can be a string
+        if (s.contains("oneOf")) {
+            for (const auto & alt : s["oneOf"]) {
+                if (check(alt)) {
+                    return true;
+                }
+            }
+        }
+        if (s.contains("anyOf")) {
+            for (const auto & alt : s["anyOf"]) {
+                if (check(alt)) {
+                    return true;
+                }
+            }
+        }
+
+        // Check allOf - all components must be compatible with string type
+        if (s.contains("allOf")) {
+            bool all_string = true;
+            for (const auto & component : s["allOf"]) {
+                if (!check(component)) {
+                    all_string = false;
+                    break;
+                }
+            }
+            if (all_string) {
+                return true;
+            }
+        }
+
+        // Check const - if the constant value is a string
+        if (s.contains("const")) {
+            if (s["const"].is_string()) {
+                return true;
+            }
+        }
+
+        // Check enum - if any enum value is a string
+        if (s.contains("enum")) {
+            for (const auto & val : s["enum"]) {
+                if (val.is_string()) {
+                    return true;
+                }
+            }
+        }
+
+        // String-specific keywords imply string type
+        if (s.contains("pattern") || s.contains("minLength") || s.contains("maxLength")) {
+            return true;
+        }
+
+        // Check format - many formats imply string
+        if (s.contains("format")) {
+            const std::string & fmt = s["format"];
+            if (fmt == "date" || fmt == "time" || fmt == "date-time" ||
+                fmt == "uri" || fmt == "email" || fmt == "hostname" ||
+                fmt == "ipv4" || fmt == "ipv6" || fmt == "uuid" ||
+                fmt.find("uuid") == 0) {
+                return true;
+            }
+        }
+
+        return false;
+    };
+
+    return check(schema);
+}
std::string json_schema_to_grammar(const json & schema, bool force_gbnf) {
#ifdef LLAMA_USE_LLGUIDANCE
    if (!force_gbnf) {

@@ -1006,7 +1135,7 @@ std::string json_schema_to_grammar(const json & schema, bool force_gbnf) {
}

std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options) {
-    SchemaConverter converter([&](const std::string &) { return json(); }, options.dotall);
+    common_schema_converter converter([&](const std::string &) { return json(); }, options.dotall);
    common_grammar_builder builder {
        /* .add_rule = */ [&](const std::string & name, const std::string & rule) {
            return converter._add_rule(name, rule);
@@ -3,11 +3,31 @@
#include <nlohmann/json_fwd.hpp>

#include <functional>
+#include <memory>
#include <string>

std::string json_schema_to_grammar(const nlohmann::ordered_json & schema,
    bool force_gbnf = false);

+class common_schema_converter;
+
+// Probes a JSON schema to extract information about its structure and type constraints.
+class common_schema_info {
+    std::unique_ptr<common_schema_converter> impl_;
+
+  public:
+    common_schema_info();
+    ~common_schema_info();
+
+    common_schema_info(const common_schema_info &) = delete;
+    common_schema_info & operator=(const common_schema_info &) = delete;
+    common_schema_info(common_schema_info &&) noexcept;
+    common_schema_info & operator=(common_schema_info &&) noexcept;
+
+    void resolve_refs(nlohmann::ordered_json & schema);
+    bool resolves_to_string(const nlohmann::ordered_json & schema);
+};

struct common_grammar_builder {
    std::function<std::string(const std::string &, const std::string &)> add_rule;
    std::function<std::string(const std::string &, const nlohmann::ordered_json &)> add_schema;
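For orientation, here is a minimal sketch (not part of the vendored diff) of how the `common_schema_info` helper declared above might be used when deciding whether a tool-call parameter can accept a raw string; the wrapper function name and the schema literal are illustrative only.

```cpp
// a minimal sketch, not part of the vendored diff; the schema literal and the
// wrapper function name are illustrative only
#include <nlohmann/json.hpp>

static bool parameter_accepts_raw_string(const std::string & schema_text) {
    nlohmann::ordered_json schema = nlohmann::ordered_json::parse(schema_text);

    common_schema_info info;        // declared in the header hunk above
    info.resolve_refs(schema);      // inline local $ref targets before probing
    return info.resolves_to_string(schema);
}

// parameter_accepts_raw_string(R"({"anyOf":[{"type":"string"},{"type":"integer"}]})") -> true
```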
@@ -1,3 +1,4 @@
+#include "common.h"
#include "log.h"

#include <chrono>

@@ -26,30 +27,6 @@ void common_log_set_verbosity_thold(int verbosity) {
    common_log_verbosity_thold = verbosity;
}

-// Auto-detect if colors should be enabled based on terminal and environment
-static bool common_log_should_use_colors_auto() {
-    // Check NO_COLOR environment variable (https://no-color.org/)
-    if (const char * no_color = std::getenv("NO_COLOR")) {
-        if (no_color[0] != '\0') {
-            return false;
-        }
-    }
-
-    // Check TERM environment variable
-    if (const char * term = std::getenv("TERM")) {
-        if (std::strcmp(term, "dumb") == 0) {
-            return false;
-        }
-    }
-
-    // Check if stdout and stderr are connected to a terminal
-    // We check both because log messages can go to either
-    bool stdout_is_tty = isatty(fileno(stdout));
-    bool stderr_is_tty = isatty(fileno(stderr));
-
-    return stdout_is_tty || stderr_is_tty;
-}
-
static int64_t t_us() {
    return std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
}
@@ -391,7 +368,7 @@ struct common_log * common_log_main() {
    static std::once_flag init_flag;
    std::call_once(init_flag, [&]() {
        // Set default to auto-detect colors
-        log.set_colors(common_log_should_use_colors_auto());
+        log.set_colors(tty_can_use_colors());
    });

    return &log;

@@ -422,7 +399,7 @@ void common_log_set_file(struct common_log * log, const char * file) {

void common_log_set_colors(struct common_log * log, log_colors colors) {
    if (colors == LOG_COLORS_AUTO) {
-        log->set_colors(common_log_should_use_colors_auto());
+        log->set_colors(tty_can_use_colors());
        return;
    }

@@ -443,8 +420,27 @@ void common_log_set_timestamps(struct common_log * log, bool timestamps) {
    log->set_timestamps(timestamps);
}

+void common_log_flush(struct common_log * log) {
+    log->pause();
+    log->resume();
+}
+
+static int common_get_verbosity(enum ggml_log_level level) {
+    switch (level) {
+        case GGML_LOG_LEVEL_DEBUG: return LOG_LEVEL_DEBUG;
+        case GGML_LOG_LEVEL_INFO:  return LOG_LEVEL_INFO;
+        case GGML_LOG_LEVEL_WARN:  return LOG_LEVEL_WARN;
+        case GGML_LOG_LEVEL_ERROR: return LOG_LEVEL_ERROR;
+        case GGML_LOG_LEVEL_CONT:  return LOG_LEVEL_INFO; // same as INFO
+        case GGML_LOG_LEVEL_NONE:
+        default:
+            return LOG_LEVEL_OUTPUT;
+    }
+}
+
void common_log_default_callback(enum ggml_log_level level, const char * text, void * /*user_data*/) {
-    if (LOG_DEFAULT_LLAMA <= common_log_verbosity_thold) {
+    auto verbosity = common_get_verbosity(level);
+    if (verbosity <= common_log_verbosity_thold) {
        common_log_add(common_log_main(), level, "%s", text);
    }
}
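As a minimal sketch of the resulting behaviour (assuming the helpers from this hunk are linked in; the chosen threshold is only an example), the default callback now filters llama's messages per level instead of all-or-nothing:

```cpp
// a minimal sketch, assuming the helpers from this hunk are linked in; the chosen
// threshold is just an example
static void quiet_llama_logs() {
    // with the new ordering (OUTPUT=0 < ERROR < WARN < INFO < DEBUG), a threshold of
    // LOG_LEVEL_WARN keeps warnings, errors and tool output, while the default
    // callback above now drops INFO and DEBUG messages coming from llama
    common_log_set_verbosity_thold(LOG_LEVEL_WARN);

    // flush anything still queued in the asynchronous logger
    common_log_flush(common_log_main());
}
```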
@@ -21,8 +21,14 @@
# define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
#endif

-#define LOG_DEFAULT_DEBUG 1
-#define LOG_DEFAULT_LLAMA 0
+#define LOG_LEVEL_DEBUG  4
+#define LOG_LEVEL_INFO   3
+#define LOG_LEVEL_WARN   2
+#define LOG_LEVEL_ERROR  1
+#define LOG_LEVEL_OUTPUT 0 // output data from tools
+
+#define LOG_DEFAULT_DEBUG LOG_LEVEL_DEBUG
+#define LOG_DEFAULT_LLAMA LOG_LEVEL_INFO

enum log_colors {
    LOG_COLORS_AUTO = -1,

@@ -67,16 +73,18 @@ void common_log_add(struct common_log * log, enum ggml_log_level level, const ch
//  0.00.090.578 I llm_load_tensors: offloading 32 repeating layers to GPU
//  0.00.090.579 I llm_load_tensors: offloading non-repeating layers to GPU
//
-// I - info    (stdout, V = 0)
-// W - warning (stderr, V = 0)
-// E - error   (stderr, V = 0)
// D - debug   (stderr, V = LOG_DEFAULT_DEBUG)
+// I - info    (stdout, V = LOG_DEFAULT_INFO)
+// W - warning (stderr, V = LOG_DEFAULT_WARN)
+// E - error   (stderr, V = LOG_DEFAULT_ERROR)
+// O - output  (stdout, V = LOG_DEFAULT_OUTPUT)
//

void common_log_set_file      (struct common_log * log, const char * file);       // not thread-safe
void common_log_set_colors    (struct common_log * log, log_colors colors);       // not thread-safe
void common_log_set_prefix    (struct common_log * log, bool prefix);             // whether to output prefix to each log
void common_log_set_timestamps(struct common_log * log, bool timestamps);         // whether to output timestamps in the prefix
+void common_log_flush         (struct common_log * log);                          // flush all pending log messages

// helper macros for logging
// use these to avoid computing log arguments if the verbosity of the log is higher than the threshold

@@ -95,14 +103,14 @@ void common_log_set_timestamps(struct common_log * log, bool timestamps); // w
        } \
    } while (0)

-#define LOG(...)             LOG_TMPL(GGML_LOG_LEVEL_NONE, 0,                __VA_ARGS__)
+#define LOG(...)             LOG_TMPL(GGML_LOG_LEVEL_NONE, LOG_LEVEL_OUTPUT, __VA_ARGS__)
#define LOGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_NONE, verbosity,        __VA_ARGS__)

-#define LOG_INF(...) LOG_TMPL(GGML_LOG_LEVEL_INFO,  0,                 __VA_ARGS__)
-#define LOG_WRN(...) LOG_TMPL(GGML_LOG_LEVEL_WARN,  0,                 __VA_ARGS__)
-#define LOG_ERR(...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, 0,                 __VA_ARGS__)
-#define LOG_DBG(...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, LOG_DEFAULT_DEBUG, __VA_ARGS__)
-#define LOG_CNT(...) LOG_TMPL(GGML_LOG_LEVEL_CONT,  0,                 __VA_ARGS__)
+#define LOG_DBG(...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, LOG_LEVEL_DEBUG, __VA_ARGS__)
+#define LOG_INF(...) LOG_TMPL(GGML_LOG_LEVEL_INFO,  LOG_LEVEL_INFO,  __VA_ARGS__)
+#define LOG_WRN(...) LOG_TMPL(GGML_LOG_LEVEL_WARN,  LOG_LEVEL_WARN,  __VA_ARGS__)
+#define LOG_ERR(...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, LOG_LEVEL_ERROR, __VA_ARGS__)
+#define LOG_CNT(...) LOG_TMPL(GGML_LOG_LEVEL_CONT,  LOG_LEVEL_INFO,  __VA_ARGS__) // same as INFO

#define LOG_INFV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_INFO, verbosity, __VA_ARGS__)
#define LOG_WRNV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_WARN, verbosity, __VA_ARGS__)
@@ -104,9 +104,10 @@ struct ring_buffer {
struct common_sampler {
    common_params_sampling params;

-    struct llama_sampler * grmr;
    struct llama_sampler * chain;

+    bool grammar;

    ring_buffer<llama_token> prev;

    std::vector<llama_token_data> cur;

@@ -116,7 +117,6 @@ struct common_sampler {
    void reset() {
        prev.clear();

-        llama_sampler_reset(grmr);
        llama_sampler_reset(chain);
    }

@@ -167,10 +167,15 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co

    lparams.no_perf = params.no_perf;

-    struct llama_sampler * grmr;
+    llama_sampler * chain = llama_sampler_chain_init(lparams);

+    bool grammar = false;
+    std::vector<llama_sampler *> samplers;

    if (params.grammar.compare(0, 11, "%llguidance") == 0) {
#ifdef LLAMA_USE_LLGUIDANCE
-        grmr = llama_sampler_init_llg(vocab, "lark", params.grammar.c_str());
+        samplers.push_back(llama_sampler_init_llg(vocab, "lark", params.grammar.c_str()));
+        grammar = true;
#else
        GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
#endif // LLAMA_USE_LLGUIDANCE

@@ -217,30 +222,23 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
            trigger_patterns_c.push_back(regex.c_str());
        }

-        grmr = params.grammar_lazy
-            ? llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
+        if (!params.grammar.empty()) {
+            if (params.grammar_lazy) {
+                samplers.push_back(
+                    llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
                        trigger_patterns_c.data(), trigger_patterns_c.size(),
-                trigger_tokens.data(), trigger_tokens.size())
-            : llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root");
-        if (!grmr) {
-            return nullptr;
+                        trigger_tokens.data(), trigger_tokens.size()));
+            } else {
+                samplers.push_back(llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root"));
+            }

+            grammar = true;
        }
    }

-    auto * result = new common_sampler {
-        /* .params = */ params,
-        /* .grmr   = */ grmr,
-        /* .chain  = */ llama_sampler_chain_init(lparams),
-        /* .prev   = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
-        /* .cur    = */ {},
-        /* .cur_p  = */ {},
-    };
-
-    llama_sampler_chain_add(result->chain,
-            llama_sampler_init_logit_bias(
-                llama_vocab_n_tokens(vocab),
-                params.logit_bias.size(),
-                params.logit_bias.data()));
+    if (params.has_logit_bias()) {
+        samplers.push_back(llama_sampler_init_logit_bias(llama_vocab_n_tokens(vocab), params.logit_bias.size(), params.logit_bias.data()));
+    }

    if (params.mirostat == 0) {
        for (const auto & cnstr : params.samplers) {

@@ -253,58 +251,70 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
                        c_breakers.push_back(str.c_str());
                    }

-                    llama_sampler_chain_add(result->chain, llama_sampler_init_dry    (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
+                    samplers.push_back(llama_sampler_init_dry    (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
                }
                break;
            case COMMON_SAMPLER_TYPE_TOP_K:
-                llama_sampler_chain_add(result->chain, llama_sampler_init_top_k      (params.top_k));
+                samplers.push_back(llama_sampler_init_top_k      (params.top_k));
                break;
            case COMMON_SAMPLER_TYPE_TOP_P:
-                llama_sampler_chain_add(result->chain, llama_sampler_init_top_p      (params.top_p, params.min_keep));
+                samplers.push_back(llama_sampler_init_top_p      (params.top_p, params.min_keep));
                break;
            case COMMON_SAMPLER_TYPE_TOP_N_SIGMA:
-                llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma (params.top_n_sigma));
+                samplers.push_back(llama_sampler_init_top_n_sigma(params.top_n_sigma));
                break;
            case COMMON_SAMPLER_TYPE_MIN_P:
-                llama_sampler_chain_add(result->chain, llama_sampler_init_min_p      (params.min_p, params.min_keep));
+                samplers.push_back(llama_sampler_init_min_p      (params.min_p, params.min_keep));
                break;
            case COMMON_SAMPLER_TYPE_XTC:
-                llama_sampler_chain_add(result->chain, llama_sampler_init_xtc        (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
+                samplers.push_back(llama_sampler_init_xtc        (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
                break;
            case COMMON_SAMPLER_TYPE_TYPICAL_P:
-                llama_sampler_chain_add(result->chain, llama_sampler_init_typical    (params.typ_p, params.min_keep));
+                samplers.push_back(llama_sampler_init_typical    (params.typ_p, params.min_keep));
                break;
            case COMMON_SAMPLER_TYPE_TEMPERATURE:
-                llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext   (params.temp, params.dynatemp_range, params.dynatemp_exponent));
+                samplers.push_back(llama_sampler_init_temp_ext   (params.temp, params.dynatemp_range, params.dynatemp_exponent));
                break;
            case COMMON_SAMPLER_TYPE_INFILL:
-                llama_sampler_chain_add(result->chain, llama_sampler_init_infill     (vocab));
+                samplers.push_back(llama_sampler_init_infill     (vocab));
                break;
            case COMMON_SAMPLER_TYPE_PENALTIES:
-                llama_sampler_chain_add(result->chain, llama_sampler_init_penalties  (params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
+                samplers.push_back(llama_sampler_init_penalties  (params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
                break;
            default:
                GGML_ASSERT(false && "unknown sampler type");
            }
        }
-        llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
+        samplers.push_back(llama_sampler_init_dist(params.seed));
    } else if (params.mirostat == 1) {
-        llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
-        llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_vocab_n_tokens(vocab), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
+        samplers.push_back(llama_sampler_init_temp(params.temp));
+        samplers.push_back(llama_sampler_init_mirostat(llama_vocab_n_tokens(vocab), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
    } else if (params.mirostat == 2) {
-        llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
-        llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
+        samplers.push_back(llama_sampler_init_temp(params.temp));
+        samplers.push_back(llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
    } else {
        GGML_ASSERT(false && "unknown mirostat version");
    }

+    for (auto * smpl : samplers) {
+        llama_sampler_chain_add(chain, smpl);
+    }
+
+    auto * result = new common_sampler {
+        /* .params  = */ params,
+        /* .chain   = */ chain,
+        /* .grammar = */ grammar,
+        /* .prev    = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
+        /* .cur     = */ {},
+        /* .cur_p   = */ {},
+    };

    return result;
}

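Since `common_sampler` now keeps everything, including the grammar, in a single `llama_sampler` chain, the assembled chain can be inspected directly. A minimal sketch (the model handle and sampling parameters are assumed to exist in the caller):

```cpp
// a minimal sketch, assuming `model` and default `common_params_sampling` exist;
// the printed sampler names depend on the configured samplers
common_params_sampling sparams;
common_sampler * smpl = common_sampler_init(model, sparams);
if (smpl) {
    // prints one "-> <sampler name>" entry per element of the single chain;
    // when a grammar is configured it is the first element
    LOG_INF("sampler chain: %s\n", common_sampler_print(smpl).c_str());
    common_sampler_free(smpl);
}
```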
void common_sampler_free(struct common_sampler * gsmpl) {
    if (gsmpl) {
-        llama_sampler_free(gsmpl->grmr);

        llama_sampler_free(gsmpl->chain);

        delete gsmpl;

@@ -314,11 +324,24 @@ void common_sampler_free(struct common_sampler * gsmpl) {
void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
    const auto tm = gsmpl->tm();

-    if (accept_grammar) {
-        llama_sampler_accept(gsmpl->grmr, token);
-    }
+    if (gsmpl->grammar) {
+        const int n_smpl = llama_sampler_chain_n(gsmpl->chain);

+        for (int i = 0; i < n_smpl; i++) {
+            auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
+
+            // the grammar sampler is always the first one
+            if (i == 0) {
+                if (accept_grammar) {
+                    llama_sampler_accept(smpl, token);
+                }
+            } else {
+                llama_sampler_accept(smpl, token);
+            }
+        }
+    } else {
        llama_sampler_accept(gsmpl->chain, token);
+    }

    gsmpl->prev.push_back(token);
}

@@ -330,8 +353,8 @@ void common_sampler_reset(struct common_sampler * gsmpl) {
struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
    return new common_sampler {
        /* .params  = */ gsmpl->params,
-        /* .grmr    = */ llama_sampler_clone(gsmpl->grmr),
        /* .chain   = */ llama_sampler_clone(gsmpl->chain),
+        /* .grammar = */ gsmpl->grammar,
        /* .prev    = */ gsmpl->prev,
        /* .cur     = */ gsmpl->cur,
        /* .cur_p   = */ gsmpl->cur_p,
@@ -383,58 +406,33 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
        }
    }
}

-llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
+struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl) {
+    return gsmpl->chain;
+}
+
+llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx) {
    llama_synchronize(ctx);

    // start measuring sampling time after the llama_context synchronization in order to not measure any ongoing async operations
    const auto tm = gsmpl->tm();

-    gsmpl->set_logits(ctx, idx);
+    llama_token id = LLAMA_TOKEN_NULL;

-    auto & grmr  = gsmpl->grmr;
    auto & chain = gsmpl->chain;
    auto & cur_p = gsmpl->cur_p; // initialized by set_logits

-    if (grammar_first) {
-        llama_sampler_apply(grmr, &cur_p);
-    }
+    gsmpl->set_logits(ctx, idx);

    llama_sampler_apply(chain, &cur_p);

    GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration");

-    const llama_token id = cur_p.data[cur_p.selected].id;
+    id = cur_p.data[cur_p.selected].id;

-    if (grammar_first) {
        return id;
-    }
-
-    // check if it the sampled token fits the grammar
-    {
-        llama_token_data single_token_data = { id, 1.0f, 0.0f };
-        llama_token_data_array single_token_data_array = { &single_token_data, 1, -1, false };
-
-        llama_sampler_apply(grmr, &single_token_data_array);
-
-        const bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
-        if (is_valid) {
-            return id;
-        }
-    }
-
-    // resampling:
-    // if the token is not valid, sample again, but first apply the grammar sampler and then the sampling chain
-    gsmpl->set_logits(ctx, idx);
-
-    llama_sampler_apply(grmr,  &cur_p);
-    llama_sampler_apply(chain, &cur_p);
-
-    GGML_ASSERT(cur_p.selected != -1 && "no selected token during re-sampling - check your sampling configuration");
-
-    return cur_p.data[cur_p.selected].id;
}

-std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first) {
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft) {
    GGML_ASSERT(idxs.size() == draft.size() + 1 && "idxs.size() must be draft.size() + 1");

    std::vector<llama_token> result;

@@ -442,7 +440,7 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample

    size_t i = 0;
    for (; i < draft.size(); i++) {
-        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
+        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i]);

        common_sampler_accept(gsmpl, id, true);

@@ -454,7 +452,7 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample
    }

    if (i == draft.size()) {
-        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
+        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i]);

        common_sampler_accept(gsmpl, id, true);

@@ -464,13 +462,13 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample
    return result;
}

-std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first) {
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft) {
    std::vector<int> idxs(draft.size() + 1);
    for (size_t i = 0; i < idxs.size(); ++i) {
        idxs[i] = i;
    }

-    return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft, grammar_first);
+    return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft);
}

uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {

@@ -515,7 +513,8 @@ std::string common_sampler_print(const struct common_sampler * gsmpl) {

    for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) {
        const auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
-        result += std::string("-> ") + llama_sampler_name(smpl) + " ";
+        result += std::string("-> ");
+        result += std::string(llama_sampler_name(smpl)) + " ";
    }

    return result;
@@ -48,6 +48,8 @@ struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl);
// arguments can be nullptr to skip printing
void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl);

+struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl);
+
// extended sampling implementation:
//
// - set logits

@@ -55,10 +57,7 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
// - check if the token fits the grammar (if any)
// - if not: resample by first applying the grammar constraints and then sampling again (slower path)
//
-// if grammar_first is true, the grammar is applied before the samplers (slower)
-// useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar
-//
-llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
+llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx);

// generalized version of common_sampler_sample
//

@@ -76,10 +75,10 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
//
// returns at least 1 token, up to idxs.size()
//
-std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first = false);
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft);

// assume idxs == [ 0, 1, 2, ..., draft.size() ]
-std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first = false);
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft);

uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);

@@ -107,3 +106,9 @@ std::vector<enum common_sampler_type> common_sampler_types_from_chars(const std:

llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab,
    const char * grammar_kind, const char * grammar_data);

+struct common_sampler_deleter {
+    void operator()(common_sampler * s) { common_sampler_free(s); }
+};
+
+typedef std::unique_ptr<common_sampler, common_sampler_deleter> common_sampler_ptr;
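A minimal sketch of the new RAII helper declared above (`model`, `sparams` and `ctx` are assumed to exist in the caller; `idx = -1` here just denotes "sample at the last position" for illustration):

```cpp
// a minimal sketch of common_sampler_ptr; ownership is released via the deleter
common_sampler_ptr smpl(common_sampler_init(model, sparams));
if (smpl) {
    const llama_token id = common_sampler_sample(smpl.get(), ctx, /*idx=*/-1);
    common_sampler_accept(smpl.get(), id, /*accept_grammar=*/true);
}   // common_sampler_free() runs automatically via common_sampler_deleter
```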
@@ -313,6 +313,7 @@ extern "C" {
        bool check_tensors;   // validate model tensor data
        bool use_extra_bufts; // use extra buffer types (used for weight repacking)
        bool no_host;         // bypass host buffer allowing extra buffers to be used
+        bool no_alloc;        // only load metadata and simulate memory allocations
    };

    // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations

@@ -466,10 +467,24 @@ extern "C" {
    // Frees all allocated memory
    LLAMA_API void llama_free(struct llama_context * ctx);

+    // fits mparams and cparams to free device memory (assumes system memory is unlimited)
+    // returns true if the parameters could be successfully modified to fit device memory
+    // this function is NOT thread safe because it modifies the global llama logger state
+    LLAMA_API bool llama_params_fit(
+            const char * path_model,
+            struct llama_model_params * mparams,
+            struct llama_context_params * cparams,
+            float * tensor_split,                                             // writable buffer for tensor split, needs at least llama_max_devices elements
+            struct llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements
+            size_t margin,                  // margin of memory to leave per device in bytes
+            uint32_t n_ctx_min,             // minimum context size to set when trying to reduce memory use
+            enum ggml_log_level log_level); // minimum log level to print during fitting, lower levels go to debug log
+
    LLAMA_API int64_t llama_time_us(void);

    LLAMA_API size_t llama_max_devices(void);
    LLAMA_API size_t llama_max_parallel_sequences(void);
+    LLAMA_API size_t llama_max_tensor_buft_overrides(void);

    LLAMA_API bool llama_supports_mmap (void);
    LLAMA_API bool llama_supports_mlock (void);

@@ -1354,6 +1369,8 @@ extern "C" {

    // Set callback for all future logging events.
    // If this is not called, or NULL is supplied, everything is output on stderr.
+    // The logger state is global so these functions are NOT thread safe.
+    LLAMA_API void llama_log_get(ggml_log_callback * log_callback, void ** user_data);
    LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data);

    //

(File diff suppressed because it is too large.)
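A minimal sketch of calling the new `llama_params_fit()` declared above; the model path, 512 MiB margin and minimum context size are illustrative values, not taken from the diff.

```cpp
// a minimal sketch, assuming llama.h from this diff; values are illustrative
#include "llama.h"
#include <vector>

static bool fit_to_device_memory(const char * path_model,
                                 llama_model_params & mparams,
                                 llama_context_params & cparams) {
    // writable scratch buffers, sized as the header comments require
    std::vector<float> tensor_split(llama_max_devices(), 0.0f);
    std::vector<llama_model_tensor_buft_override> overrides(llama_max_tensor_buft_overrides());

    return llama_params_fit(path_model, &mparams, &cparams,
                            tensor_split.data(), overrides.data(),
                            /*margin=*/512ull * 1024 * 1024, // leave ~512 MiB free per device
                            /*n_ctx_min=*/4096,
                            GGML_LOG_LEVEL_INFO);
}
```

On success, `mparams` and `cparams` have been adjusted (offload split, context size, buffer-type overrides) so that the model is expected to fit the free device memory.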
@@ -3,6 +3,7 @@
#include "ggml.h" // ggml_op

#include <string>
+#include <set>

//
// gguf constants (sync with gguf.py)

@@ -79,6 +80,7 @@ enum llm_arch {
    LLM_ARCH_JAIS,
    LLM_ARCH_NEMOTRON,
    LLM_ARCH_NEMOTRON_H,
+    LLM_ARCH_NEMOTRON_H_MOE,
    LLM_ARCH_EXAONE,
    LLM_ARCH_EXAONE4,
    LLM_ARCH_RWKV6,

@@ -116,6 +118,7 @@ enum llm_arch {
    LLM_ARCH_COGVLM,
    LLM_ARCH_RND1,
    LLM_ARCH_PANGU_EMBED,
+    LLM_ARCH_MISTRAL3,
    LLM_ARCH_UNKNOWN,
};

@@ -209,6 +212,7 @@ enum llm_kv {
    LLM_KV_ATTENTION_SCALE,
    LLM_KV_ATTENTION_OUTPUT_SCALE,
    LLM_KV_ATTENTION_TEMPERATURE_LENGTH,
+    LLM_KV_ATTENTION_TEMPERATURE_SCALE,
    LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
    LLM_KV_ATTENTION_KEY_LENGTH_MLA,
    LLM_KV_ATTENTION_VALUE_LENGTH_MLA,

@@ -315,6 +319,7 @@ enum llm_tensor {
    LLM_TENSOR_DENSE_3_OUT,
    LLM_TENSOR_OUTPUT,
    LLM_TENSOR_OUTPUT_NORM,
+    LLM_TENSOR_OUTPUT_NORM_LFM2, // fix for wrong tensor name
    LLM_TENSOR_ROPE_FREQS,
    LLM_TENSOR_ROPE_FACTORS_LONG,
    LLM_TENSOR_ROPE_FACTORS_SHORT,

@@ -379,6 +384,7 @@ enum llm_tensor {
    LLM_TENSOR_SSM_DT,
    LLM_TENSOR_SSM_DT_NORM,
    LLM_TENSOR_SSM_A,
+    LLM_TENSOR_SSM_A_NOSCAN, // qwen3next special case with MUL instead of SSM_SCAN
    LLM_TENSOR_SSM_B_NORM,
    LLM_TENSOR_SSM_C_NORM,
    LLM_TENSOR_SSM_D,

@@ -525,6 +531,10 @@ struct LLM_TN_IMPL {
    const int bid;
    const int xid;

+    const std::set<llm_tensor> model_tensors;
+
+    LLM_TN_IMPL(llm_arch arch, llm_tensor tensor, const char * suffix, int bid, int xid);
+
    std::string str() const;

    operator std::string() const {

@@ -546,11 +556,11 @@ struct LLM_TN {
    llm_arch arch;

    LLM_TN_IMPL operator()(llm_tensor tensor, const char * suffix, int bid = -1, int xid = -1) const {
-        return { arch, tensor, suffix, bid, xid };
+        return LLM_TN_IMPL(arch, tensor, suffix, bid, xid);
    }

    LLM_TN_IMPL operator()(llm_tensor tensor, int bid = -1, int xid = -1) const {
-        return { arch, tensor, nullptr, bid, xid };
+        return LLM_TN_IMPL(arch, tensor, nullptr, bid, xid);
    }
};

@@ -695,6 +695,8 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
    udata->seq_idx .resize(LLAMA_MAX_SEQ, -1);
    udata->output  .resize(n_tokens);

+    udata->seq_id_data.reserve(n_tokens);
+
    seq_set_t seq_set_unq;

    for (size_t i = 0; i < idxs.size(); ++i) {

@@ -716,11 +718,13 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
        }

        udata->n_seq_id[i] = batch.n_seq_id[idxs[i]];
-        udata->seq_id[i]   = batch.seq_id[idxs[i]];
        udata->output[i]   = batch.logits[idxs[i]];

        for (int s = 0; s < udata->n_seq_id[i]; ++s) {
-            seq_set_unq.set(udata->seq_id[i][s]);
+            const llama_seq_id seq_id = batch.seq_id[idxs[i]][s];

+            udata->seq_id_data.push_back(seq_id);
+            seq_set_unq.set(seq_id);
        }

        if (udata->output[i]) {

@@ -728,6 +732,12 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u
        }
    }

+    llama_seq_id * seq_id_ptr = udata->seq_id_data.data();
+    for (size_t i = 0; i < idxs.size(); ++i) {
+        udata->seq_id[i] = seq_id_ptr;
+        seq_id_ptr += udata->n_seq_id[i];
+    }
+
    for (uint32_t s = 0; s < n_seq_max; ++s) {
        if (seq_set_unq.test(s)) {
            udata->seq_idx[s] = udata->seq_id_unq.size();

@@ -56,13 +56,15 @@ struct llama_ubatch {
        std::vector<float>           embd;
        std::vector<llama_pos>       pos;
        std::vector<int32_t>         n_seq_id;
-        std::vector<llama_seq_id *> seq_id;
+        std::vector<llama_seq_id *> seq_id; // these point into the seq_id_data below
        std::vector<llama_seq_id>    seq_id_unq;
        std::vector<int32_t>         seq_idx;
        std::vector<int8_t>          output;

+        std::vector<llama_seq_id>    seq_id_data;
    };

-    // the llama_ubatch pointers above point to this data if set. otherwise - points to non-owning data
+    // the llama_ubatch pointers above point to this data if set. otherwise - point to external non-owning data
    std::shared_ptr<data_t> data;
};
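The two-pass structure in the hunk above is deliberate: the per-row `seq_id[i]` pointers are derived from `seq_id_data` only after the flat vector has stopped growing, because `push_back` may reallocate and would otherwise leave earlier pointers dangling. A minimal sketch of that invariant (field names reuse the diff's `udata`):

```cpp
// a minimal sketch of the pointer fix-up pass; only valid once seq_id_data is final
llama_seq_id * p = udata->seq_id_data.data();
for (size_t i = 0; i < udata->n_seq_id.size(); ++i) {
    udata->seq_id[i] = p;          // row i's sequence ids start here
    p += udata->n_seq_id[i];       // and span n_seq_id[i] entries
}
```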
@@ -9,6 +9,7 @@
#include "llama-model.h"

#include <cinttypes>
+#include <cmath>
#include <cstring>
#include <limits>
#include <stdexcept>

@@ -72,6 +73,43 @@ llama_context::llama_context(
        cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f;
    }

+    if (cparams.yarn_ext_factor != 0) {
+        static auto get_mscale = [](float scale, float mscale) {
+            return scale <= 1.0f ? 1.0f : (0.1f * mscale * logf(scale) + 1.0f);
+        };
+
+        const float factor = 1.0f / cparams.rope_freq_scale;
+
+        // ref: https://github.com/huggingface/transformers/blob/6d00f6b0a5679c36510f203e4226e36f517c3032/src/transformers/modeling_rope_utils.py#L336-L348
+        if (hparams.rope_yarn_log_mul != 0.0f) {
+            // note: here we assume `mscale == 1.0f`
+            // TODO: start reading the actual value of mscale and handle the case where it is not 1.0f
+            float mscale = 1.0f;
+            const float mscale_all_dims = hparams.rope_yarn_log_mul;
+
+            // [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
+            // special-case DEEPSEEK v2:
+            // https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite-Chat/blob/main/config.json#L42-L43
+            if (model.arch == LLM_ARCH_DEEPSEEK2 && mscale_all_dims != 1.0f) {
+                mscale = mscale_all_dims;
+            }
+
+            cparams.yarn_attn_factor = get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dims);
+
+            LLAMA_LOG_WARN("%s: setting new yarn_attn_factor = %.4f (mscale == %.1f, mscale_all_dim = %.1f)\n",
+                    __func__, cparams.yarn_attn_factor, mscale, mscale_all_dims);
+        } else {
+            cparams.yarn_attn_factor = get_mscale(factor, 1.0f);
+        }
+
+        // when YARN is applied with yarn_ext_factor != 0.0f, we need to cancel this factor:
+        // https://github.com/ggml-org/llama.cpp/blob/a81a569577cc38b32558958b048228150be63eae/ggml/src/ggml-cpu/ops.cpp#L5541-L5544
+        //
+        // ref: https://github.com/ggml-org/llama.cpp/discussions/7416
+        //      https://github.com/ggml-org/llama.cpp/pull/17945
+        cparams.yarn_attn_factor *= 1.0f / (1.0f + 0.1f * logf(factor));
+    }

    cparams.yarn_attn_factor *= hparams.rope_attn_factor;

    if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
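For reference, the net scaling introduced by the hunk above can be restated as a small sketch; `rope_yarn_log_mul` stands in for `mscale_all_dims`, `mscale` is assumed to be 1.0f, and the DeepSeek2 special case is omitted.

```cpp
#include <cmath>

// a minimal sketch restating the factor computed in the hunk above (DeepSeek2
// special case omitted); not part of the vendored diff
static float yarn_attn_factor_sketch(float rope_freq_scale, float rope_yarn_log_mul) {
    auto get_mscale = [](float scale, float m) {
        return scale <= 1.0f ? 1.0f : (0.1f * m * logf(scale) + 1.0f);
    };

    const float factor = 1.0f / rope_freq_scale;

    const float f = (rope_yarn_log_mul != 0.0f)
        ? get_mscale(factor, 1.0f) / get_mscale(factor, rope_yarn_log_mul)
        : get_mscale(factor, 1.0f);

    // cancel the 1 + 0.1*ln(factor) term that the YARN kernel applies internally
    return f / (1.0f + 0.1f * logf(factor));
}
```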
@@ -93,14 +131,6 @@ llama_context::llama_context(
    // with causal attention, the batch size is limited by the context size
    cparams.n_batch = cparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;

-    // the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask
-    // this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext)
-    // ref: https://github.com/ggerganov/llama.cpp/pull/5021
-    // TODO: this padding is not needed for the cache-less context so we should probably move it to llama_memory
-    if (cparams.n_batch < GGML_KQ_MASK_PAD) {
-        LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD);
-        cparams.n_batch = GGML_KQ_MASK_PAD;
-    }
    cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);

    cparams.op_offload = params.op_offload;

@@ -228,6 +258,7 @@ llama_context::llama_context(

    backend_buft.clear();
    backend_ptrs.clear();
+    backend_buf_exp_size.clear();

    for (auto & backend : backends) {
        auto * buft = ggml_backend_get_default_buffer_type(backend.get());

@@ -244,11 +275,15 @@ llama_context::llama_context(

        backend_buft.push_back(buft);
        backend_ptrs.push_back(backend.get());
+        backend_buf_exp_size.push_back(0);
    }

    LLAMA_LOG_DEBUG("%s: backend_ptrs.size() = %zu\n", __func__, backend_ptrs.size());

-    const size_t max_nodes = this->graph_max_nodes();
+    const uint32_t n_seqs   = cparams.n_seq_max;
+    const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
+
+    const size_t max_nodes = this->graph_max_nodes(n_tokens);

    LLAMA_LOG_DEBUG("%s: max_nodes = %zu\n", __func__, max_nodes);

@@ -300,9 +335,6 @@ llama_context::llama_context(

    cross.v_embd.clear();

-    const uint32_t n_seqs = cparams.n_seq_max;
-    const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
-
    // avoid reserving graphs with zero outputs - assume one output per sequence
    n_outputs = n_seqs;

@@ -359,7 +391,8 @@ llama_context::llama_context(

    // reserve pp (prompt processing) graph first so that buffers are only allocated once
    {
-        auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
+        auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get(),
+            model.hparams.no_alloc, model.hparams.no_alloc ? backend_buf_exp_size.data() : nullptr);
        if (!gf) {
            if (pipeline_parallel) {
                LLAMA_LOG_WARN("%s: compute buffer allocation failed, retrying without pipeline parallelism\n", __func__);

@@ -377,7 +410,7 @@ llama_context::llama_context(

    // reserve with tg (token generation) graph to get the number of splits and nodes
    {
-        auto * gf = graph_reserve(n_seqs, n_seqs, n_seqs, mctx.get());
+        auto * gf = graph_reserve(n_seqs, n_seqs, n_seqs, mctx.get(), model.hparams.no_alloc);
        if (!gf) {
            throw std::runtime_error("failed to allocate compute tg buffers");
        }

@@ -392,7 +425,7 @@ llama_context::llama_context(
    //
    // auto * gf = graph_reserve(n_tokens, 1, n_tokens, mctx.get());
    //
-    auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
+    auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get(), model.hparams.no_alloc);
    if (!gf) {
        throw std::runtime_error("failed to allocate compute pp buffers");
    }

@@ -401,11 +434,13 @@ llama_context::llama_context(
    for (size_t i = 0; i < backend_ptrs.size(); ++i) {
        ggml_backend_t backend = backend_ptrs[i];
        ggml_backend_buffer_type_t buft = backend_buft[i];
-        size_t size = ggml_backend_sched_get_buffer_size(sched.get(), backend);
-        if (size > 1) {
+        if (!model.hparams.no_alloc) {
+            backend_buf_exp_size[i] = ggml_backend_sched_get_buffer_size(sched.get(), backend);
+        }
+        if (backend_buf_exp_size[i] > 1) {
            LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
                ggml_backend_buft_name(buft),
-                size / 1024.0 / 1024.0);
+                backend_buf_exp_size[i] / 1024.0 / 1024.0);
        }
    }

@ -424,6 +459,23 @@ llama_context::llama_context(
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_context::~llama_context() {
|
llama_context::~llama_context() {
|
||||||
|
// FIXME this currently results in a use-after-free bug if the model is freed before the context
|
||||||
|
// if (!model.hparams.no_alloc) {
|
||||||
|
// for (size_t i = 0; i < backend_ptrs.size(); ++i) {
|
||||||
|
// ggml_backend_t backend = backend_ptrs[i];
|
||||||
|
// ggml_backend_buffer_type_t buft = backend_buft[i];
|
||||||
|
|
||||||
|
// const size_t size_exp = backend_buf_exp_size[i];
|
||||||
|
// const size_t size_act = ggml_backend_sched_get_buffer_size(sched.get(), backend);
|
||||||
|
// if (size_exp == size_act) {
|
||||||
|
// LLAMA_LOG_DEBUG("%s: %10s compute buffer size is %8.4f MiB, matches expectation of %8.4f MiB\n",
|
||||||
|
// __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
|
||||||
|
// } else {
|
||||||
|
// LLAMA_LOG_WARN("%s: %10s compute buffer size of %8.4f MiB, does not match expectation of %8.4f MiB\n",
|
||||||
|
// __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
// }
|
||||||
ggml_opt_free(opt_ctx);
|
ggml_opt_free(opt_ctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -1325,6 +1377,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
|
||||||
// This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark)
|
// This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark)
|
||||||
LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
|
LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
|
||||||
#endif
|
#endif
|
||||||
|
synchronize();
|
||||||
buf_output = nullptr;
|
buf_output = nullptr;
|
||||||
logits = nullptr;
|
logits = nullptr;
|
||||||
embd = nullptr;
|
embd = nullptr;
|
||||||
|
|
@ -1385,9 +1438,9 @@ void llama_context::output_reorder() {
|
||||||
// graph
|
// graph
|
||||||
//
|
//
|
||||||
|
|
||||||
uint32_t llama_context::graph_max_nodes() const {
|
uint32_t llama_context::graph_max_nodes(uint32_t n_tokens) const {
|
||||||
if (model.arch == LLM_ARCH_QWEN3NEXT) {
|
if (model.arch == LLM_ARCH_QWEN3NEXT) {
|
||||||
return std::max<uint32_t>(8192u, 32u*model.n_tensors());
|
return std::max<uint32_t>(n_tokens * 40, 32u * model.n_tensors());
|
||||||
}
|
}
|
||||||
return std::max<uint32_t>(1024u, 8u*model.n_tensors());
|
return std::max<uint32_t>(1024u, 8u*model.n_tensors());
|
||||||
}
|
}
|
||||||
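
Worked example (hypothetical numbers): with the new signature, a Qwen3-Next graph reserved for a 4096-token ubatch on a hypothetical 3,000-tensor model is capped at max(4096 * 40, 32 * 3000) = max(163,840, 96,000) = 163,840 nodes, whereas the old formula max(8192, 32 * n_tensors) would have capped it at 96,000 — the limit now grows with the ubatch size.
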
|
|
@ -1396,7 +1449,8 @@ llm_graph_result * llama_context::get_gf_res_reserve() const {
|
||||||
return static_cast<llm_graph_result *>(gf_res_reserve.get());
|
return static_cast<llm_graph_result *>(gf_res_reserve.get());
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only) {
|
ggml_cgraph * llama_context::graph_reserve(
|
||||||
|
uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only, size_t * sizes) {
|
||||||
LLAMA_LOG_DEBUG("%s: reserving a graph for ubatch with n_tokens = %4u, n_seqs = %2u, n_outputs = %4u\n", __func__, n_tokens, n_seqs, n_outputs);
|
LLAMA_LOG_DEBUG("%s: reserving a graph for ubatch with n_tokens = %4u, n_seqs = %2u, n_outputs = %4u\n", __func__, n_tokens, n_seqs, n_outputs);
|
||||||
GGML_ASSERT(n_outputs >= 1);
|
GGML_ASSERT(n_outputs >= 1);
|
||||||
|
|
||||||
|
|
@ -1433,8 +1487,13 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u
|
||||||
|
|
||||||
// initialize scheduler with the specified graph
|
// initialize scheduler with the specified graph
|
||||||
if (split_only) {
|
if (split_only) {
|
||||||
|
if (sizes) {
|
||||||
|
ggml_backend_sched_reserve_size(sched.get(), gf, sizes);
|
||||||
|
} else {
|
||||||
ggml_backend_sched_split_graph(sched.get(), gf);
|
ggml_backend_sched_split_graph(sched.get(), gf);
|
||||||
|
}
|
||||||
} else if (!ggml_backend_sched_reserve(sched.get(), gf)) {
|
} else if (!ggml_backend_sched_reserve(sched.get(), gf)) {
|
||||||
|
GGML_ASSERT(!sizes);
|
||||||
LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
|
LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
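
A minimal sketch of the call pattern enabled by the new `sizes` out-parameter (variable names are assumptions made for illustration; per the branch above it is only meaningful together with split_only, and ggml_backend_sched_reserve_size presumably fills in the per-backend buffer sizes that a real reserve would allocate):

    // measure expected compute-buffer sizes without allocating them
    std::vector<size_t> exp_sizes(backend_ptrs.size(), 0);
    ggml_cgraph * gf = graph_reserve(n_tokens, n_seqs, n_outputs, mctx, /*split_only =*/ true, exp_sizes.data());
    // exp_sizes[i] would then hold the expected compute-buffer size for backend_ptrs[i]
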
|
|
@ -2056,15 +2115,26 @@ void llama_context::perf_reset() {
|
||||||
|
|
||||||
std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> llama_context::memory_breakdown() const {
|
std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> llama_context::memory_breakdown() const {
|
||||||
std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> ret;
|
std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> ret;
|
||||||
for (const auto & buft_size : model.memory_breakdown()) {
|
for (const auto & [buft, size] : model.memory_breakdown()) {
|
||||||
ret[buft_size.first].model += buft_size.second;
|
ret[buft].model += size;
|
||||||
}
|
}
|
||||||
for (const auto & buft_size : memory->memory_breakdown()) {
|
if (memory) {
|
||||||
ret[buft_size.first].context += buft_size.second;
|
for (const auto & [buft, size] : memory->memory_breakdown()) {
|
||||||
|
ret[buft].context += size;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
if (model.hparams.no_alloc) {
|
||||||
|
for (size_t i = 0; i < backends.size(); ++i) {
|
||||||
|
ggml_backend_t backend = backends[i].get();
|
||||||
|
ggml_backend_buffer_type_t buft = ggml_backend_sched_get_buffer_type(sched.get(), backend);
|
||||||
|
ret[buft].compute += backend_buf_exp_size[i];
|
||||||
|
}
|
||||||
|
} else {
|
||||||
for (const auto & backend_ptr : backends) {
|
for (const auto & backend_ptr : backends) {
|
||||||
ggml_backend_t backend = backend_ptr.get();
|
ggml_backend_t backend = backend_ptr.get();
|
||||||
ret[ggml_backend_sched_get_buffer_type(sched.get(), backend)].compute += ggml_backend_sched_get_buffer_size(sched.get(), backend);
|
ggml_backend_buffer_type_t buft = ggml_backend_sched_get_buffer_type(sched.get(), backend);
|
||||||
|
ret[buft].compute += ggml_backend_sched_get_buffer_size(sched.get(), backend);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -26,6 +26,10 @@ struct llama_memory_breakdown_data {
|
||||||
size_t model = 0; // memory allocated for the model
|
size_t model = 0; // memory allocated for the model
|
||||||
size_t context = 0; // memory allocated for the context
|
size_t context = 0; // memory allocated for the context
|
||||||
size_t compute = 0; // memory allocated for temporary compute buffers
|
size_t compute = 0; // memory allocated for temporary compute buffers
|
||||||
|
|
||||||
|
size_t total() const {
|
||||||
|
return model + context + compute;
|
||||||
|
}
|
||||||
};
|
};
|
||||||
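
For illustration only, the new total() helper lets a caller collapse the per-buffer-type breakdown into a single figure (a sketch, assuming access to a llama_context instance named lctx):

    size_t total_bytes = 0;
    for (const auto & [buft, mb] : lctx.memory_breakdown()) {
        total_bytes += mb.total(); // model + context + compute for this buffer type
    }
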
|
|
||||||
struct llama_context {
|
struct llama_context {
|
||||||
|
|
@ -197,7 +201,7 @@ private:
|
||||||
//
|
//
|
||||||
|
|
||||||
public:
|
public:
|
||||||
uint32_t graph_max_nodes() const;
|
uint32_t graph_max_nodes(uint32_t n_tokens) const;
|
||||||
|
|
||||||
// can reuse the llm_graph_result instance of the context (for example to update a memory module)
|
// can reuse the llm_graph_result instance of the context (for example to update a memory module)
|
||||||
llm_graph_result * get_gf_res_reserve() const;
|
llm_graph_result * get_gf_res_reserve() const;
|
||||||
|
|
@ -206,7 +210,8 @@ public:
|
||||||
ggml_status graph_compute(ggml_cgraph * gf, bool batched);
|
ggml_status graph_compute(ggml_cgraph * gf, bool batched);
|
||||||
|
|
||||||
// reserve a graph with a dummy ubatch of the specified size
|
// reserve a graph with a dummy ubatch of the specified size
|
||||||
ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false);
|
ggml_cgraph * graph_reserve(
|
||||||
|
uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false, size_t * sizes = nullptr);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
llm_graph_params graph_params(
|
llm_graph_params graph_params(
|
||||||
|
|
@ -281,9 +286,10 @@ private:
|
||||||
|
|
||||||
std::vector<std::pair<ggml_backend_t, ggml_backend_set_n_threads_t>> set_n_threads_fns;
|
std::vector<std::pair<ggml_backend_t, ggml_backend_set_n_threads_t>> set_n_threads_fns;
|
||||||
|
|
||||||
// buffer types used for the compute buffer of each backend
|
// pointers and buffer types used for the compute buffer of each backend
|
||||||
std::vector<ggml_backend_t> backend_ptrs;
|
std::vector<ggml_backend_t> backend_ptrs;
|
||||||
std::vector<ggml_backend_buffer_type_t> backend_buft;
|
std::vector<ggml_backend_buffer_type_t> backend_buft;
|
||||||
|
std::vector<size_t> backend_buf_exp_size; // expected buffer sizes
|
||||||
|
|
||||||
llm_graph_result_ptr gf_res_prev;
|
llm_graph_result_ptr gf_res_prev;
|
||||||
llm_graph_result_ptr gf_res_reserve;
|
llm_graph_result_ptr gf_res_reserve;
|
||||||
|
|
|
||||||
|
|
@ -181,6 +181,52 @@ static std::pair<uint32_t, const char *> parse_char(const char * src) {
|
||||||
throw std::runtime_error("unexpected end of input");
|
throw std::runtime_error("unexpected end of input");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static std::pair<uint32_t, const char *> parse_token(const llama_vocab * vocab, const char * src) {
|
||||||
|
const char * pos = src;
|
||||||
|
if (*pos != '<') {
|
||||||
|
throw std::runtime_error(std::string("expecting '<' at ") + pos);
|
||||||
|
}
|
||||||
|
pos++;
|
||||||
|
|
||||||
|
// Parse <[id]>
|
||||||
|
if (*pos == '[') {
|
||||||
|
pos++;
|
||||||
|
const char * int_end = parse_int(pos);
|
||||||
|
uint32_t token_id = std::stoul(std::string(pos, int_end - pos));
|
||||||
|
pos = int_end;
|
||||||
|
if (*pos != ']') {
|
||||||
|
throw std::runtime_error(std::string("expecting ']' at ") + pos);
|
||||||
|
}
|
||||||
|
pos++;
|
||||||
|
if (*pos != '>') {
|
||||||
|
throw std::runtime_error(std::string("expecting '>' at ") + pos);
|
||||||
|
}
|
||||||
|
pos++;
|
||||||
|
return std::make_pair(token_id, pos);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (vocab == nullptr) {
|
||||||
|
throw std::runtime_error(std::string("no vocab to parse token at ") + src);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Parse <token> and tokenize to obtain the token id
|
||||||
|
while (*pos != 0 && *pos != '>') {
|
||||||
|
pos++;
|
||||||
|
}
|
||||||
|
if (*pos != '>') {
|
||||||
|
throw std::runtime_error(std::string("expecting '>' at ") + pos);
|
||||||
|
}
|
||||||
|
pos++;
|
||||||
|
|
||||||
|
llama_token tokens[2];
|
||||||
|
int32_t n_tokens = vocab->tokenize(src, static_cast<int32_t>(pos - src), tokens, 2, false, true);
|
||||||
|
if (n_tokens != 1) {
|
||||||
|
// must tokenize to exactly 1 token
|
||||||
|
throw std::runtime_error("invalid token '" + std::string(src, pos - src) + "'");
|
||||||
|
}
|
||||||
|
return std::make_pair(tokens[0], pos);
|
||||||
|
}
|
||||||
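
Putting the new parser together, a grammar string can now constrain individual token ids directly. A hedged illustration (the rule name follows the usual GBNF convention, the ids are made up, and the bare <...> form only works if the bracketed text tokenizes to exactly one token, as parse_token enforces):

    // matches token id 1234, then any single token other than id 5678,
    // then whatever single token the literal "<think>" tokenizes to
    const char * grammar_str = "root ::= <[1234]> !<[5678]> <think>";
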
|
|
||||||
static void print_grammar_char(FILE * file, uint32_t c) {
|
static void print_grammar_char(FILE * file, uint32_t c) {
|
||||||
if (0x20 <= c && c <= 0x7f) {
|
if (0x20 <= c && c <= 0x7f) {
|
||||||
fprintf(file, "%c", static_cast<char>(c));
|
fprintf(file, "%c", static_cast<char>(c));
|
||||||
|
|
@ -212,6 +258,8 @@ static void print_rule_binary(FILE * file, const llama_grammar_rule & rule) {
|
||||||
case LLAMA_GRETYPE_CHAR_RNG_UPPER: fprintf(file, "CHAR_RNG_UPPER"); break;
|
case LLAMA_GRETYPE_CHAR_RNG_UPPER: fprintf(file, "CHAR_RNG_UPPER"); break;
|
||||||
case LLAMA_GRETYPE_CHAR_ALT: fprintf(file, "CHAR_ALT"); break;
|
case LLAMA_GRETYPE_CHAR_ALT: fprintf(file, "CHAR_ALT"); break;
|
||||||
case LLAMA_GRETYPE_CHAR_ANY: fprintf(file, "CHAR_ANY"); break;
|
case LLAMA_GRETYPE_CHAR_ANY: fprintf(file, "CHAR_ANY"); break;
|
||||||
|
case LLAMA_GRETYPE_TOKEN: fprintf(file, "TOKEN"); break;
|
||||||
|
case LLAMA_GRETYPE_TOKEN_NOT: fprintf(file, "TOKEN_NOT"); break;
|
||||||
}
|
}
|
||||||
switch (elem.type) {
|
switch (elem.type) {
|
||||||
case LLAMA_GRETYPE_END:
|
case LLAMA_GRETYPE_END:
|
||||||
|
|
@ -228,6 +276,17 @@ static void print_rule_binary(FILE * file, const llama_grammar_rule & rule) {
|
||||||
print_grammar_char(file, elem.value);
|
print_grammar_char(file, elem.value);
|
||||||
fprintf(file, "\") ");
|
fprintf(file, "\") ");
|
||||||
break;
|
break;
|
||||||
|
case LLAMA_GRETYPE_TOKEN:
|
||||||
|
fprintf(file, "<[");
|
||||||
|
fprintf(file, "%u", elem.value);
|
||||||
|
fprintf(file, "]> ");
|
||||||
|
break;
|
||||||
|
case LLAMA_GRETYPE_TOKEN_NOT:
|
||||||
|
fprintf(file, "!");
|
||||||
|
fprintf(file, "<[");
|
||||||
|
fprintf(file, "%u", elem.value);
|
||||||
|
fprintf(file, "]> ");
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
fprintf(file, "\n");
|
fprintf(file, "\n");
|
||||||
|
|
@ -284,6 +343,17 @@ static void print_rule(
|
||||||
case LLAMA_GRETYPE_CHAR_ANY:
|
case LLAMA_GRETYPE_CHAR_ANY:
|
||||||
fprintf(file, ".");
|
fprintf(file, ".");
|
||||||
break;
|
break;
|
||||||
|
case LLAMA_GRETYPE_TOKEN:
|
||||||
|
fprintf(file, "<[");
|
||||||
|
fprintf(file, "%u", elem.value);
|
||||||
|
fprintf(file, "]> ");
|
||||||
|
break;
|
||||||
|
case LLAMA_GRETYPE_TOKEN_NOT:
|
||||||
|
fprintf(file, "!");
|
||||||
|
fprintf(file, "<[");
|
||||||
|
fprintf(file, "%u", elem.value);
|
||||||
|
fprintf(file, "]> ");
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
if (is_char_element(elem)) {
|
if (is_char_element(elem)) {
|
||||||
switch (rule[i + 1].type) {
|
switch (rule[i + 1].type) {
|
||||||
|
|
@ -444,6 +514,17 @@ const char * llama_grammar_parser::parse_sequence(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
pos = parse_space(pos + 1, is_nested);
|
pos = parse_space(pos + 1, is_nested);
|
||||||
|
} else if (*pos == '<' || *pos == '!') { // token
|
||||||
|
auto type = LLAMA_GRETYPE_TOKEN;
|
||||||
|
if (*pos == '!') { // token inverse
|
||||||
|
type = LLAMA_GRETYPE_TOKEN_NOT;
|
||||||
|
pos++;
|
||||||
|
}
|
||||||
|
auto token_pair = parse_token(vocab, pos);
|
||||||
|
const char * token_end = token_pair.second;
|
||||||
|
last_sym_start = rule.size();
|
||||||
|
rule.push_back({type, token_pair.first});
|
||||||
|
pos = parse_space(token_end, is_nested);
|
||||||
} else if (is_word_char(*pos)) { // rule reference
|
} else if (is_word_char(*pos)) { // rule reference
|
||||||
const char * name_end = parse_name(pos);
|
const char * name_end = parse_name(pos);
|
||||||
uint32_t ref_rule_id = get_symbol_id(pos, name_end - pos);
|
uint32_t ref_rule_id = get_symbol_id(pos, name_end - pos);
|
||||||
|
|
@ -691,6 +772,21 @@ static bool llama_grammar_match_partial_char(
|
||||||
return !is_positive_char;
|
return !is_positive_char;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// returns true iff token matches the rule at pos (regular or inverse)
|
||||||
|
// asserts that pos is pointing to a token element
|
||||||
|
static bool llama_grammar_match_token(
|
||||||
|
const llama_grammar_element * pos,
|
||||||
|
const llama_token token) {
|
||||||
|
GGML_ASSERT(pos->type == LLAMA_GRETYPE_TOKEN || pos->type == LLAMA_GRETYPE_TOKEN_NOT);
|
||||||
|
if (pos->type == LLAMA_GRETYPE_TOKEN) {
|
||||||
|
return pos->value == static_cast<uint32_t>(token);
|
||||||
|
}
|
||||||
|
if (pos->type == LLAMA_GRETYPE_TOKEN_NOT) {
|
||||||
|
return pos->value != static_cast<uint32_t>(token);
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
// transforms a grammar pushdown stack into N possible stacks, all ending
|
// transforms a grammar pushdown stack into N possible stacks, all ending
|
||||||
// at a character range (terminal element)
|
// at a character range (terminal element)
|
||||||
static void llama_grammar_advance_stack(
|
static void llama_grammar_advance_stack(
|
||||||
|
|
@ -738,6 +834,8 @@ static void llama_grammar_advance_stack(
|
||||||
case LLAMA_GRETYPE_CHAR:
|
case LLAMA_GRETYPE_CHAR:
|
||||||
case LLAMA_GRETYPE_CHAR_NOT:
|
case LLAMA_GRETYPE_CHAR_NOT:
|
||||||
case LLAMA_GRETYPE_CHAR_ANY:
|
case LLAMA_GRETYPE_CHAR_ANY:
|
||||||
|
case LLAMA_GRETYPE_TOKEN:
|
||||||
|
case LLAMA_GRETYPE_TOKEN_NOT:
|
||||||
if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
|
if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
|
||||||
// only add the stack if it's not a duplicate of one we already have
|
// only add the stack if it's not a duplicate of one we already have
|
||||||
new_stacks.emplace_back(stack);
|
new_stacks.emplace_back(stack);
|
||||||
|
|
@ -831,26 +929,38 @@ llama_grammar_stacks & llama_grammar_get_stacks(struct llama_grammar * grammar)
|
||||||
return grammar->stacks;
|
return grammar->stacks;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void llama_grammar_accept_chr(
|
||||||
|
struct llama_grammar & grammar,
|
||||||
|
const llama_grammar_stack & stack,
|
||||||
|
uint32_t chr,
|
||||||
|
llama_grammar_stacks & new_stacks) {
|
||||||
|
if (stack.empty()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
const llama_grammar_element * pos = stack.back();
|
||||||
|
|
||||||
|
// ignore if this turns into a token
|
||||||
|
if (pos->type == LLAMA_GRETYPE_TOKEN || pos->type == LLAMA_GRETYPE_TOKEN_NOT) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
auto match = llama_grammar_match_char(pos, chr);
|
||||||
|
if (match.first) {
|
||||||
|
llama_grammar_stack new_stack(stack.begin(), stack.end() - 1);
|
||||||
|
if (!llama_grammar_is_end_of_sequence(match.second)) {
|
||||||
|
new_stack.push_back(match.second);
|
||||||
|
}
|
||||||
|
llama_grammar_advance_stack(grammar.rules, new_stack, new_stacks);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void llama_grammar_accept(struct llama_grammar * grammar, uint32_t chr) {
|
void llama_grammar_accept(struct llama_grammar * grammar, uint32_t chr) {
|
||||||
llama_grammar_stacks stacks_new;
|
llama_grammar_stacks stacks_new;
|
||||||
stacks_new.reserve(grammar->stacks.size());
|
stacks_new.reserve(grammar->stacks.size());
|
||||||
|
|
||||||
for (const auto & stack : grammar->stacks) {
|
for (const auto & stack : grammar->stacks) {
|
||||||
if (stack.empty()) {
|
llama_grammar_accept_chr(*grammar, stack, chr, stacks_new);
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
auto match = llama_grammar_match_char(stack.back(), chr);
|
|
||||||
if (match.first) {
|
|
||||||
const llama_grammar_element * pos = match.second;
|
|
||||||
|
|
||||||
// update top of stack to next element, if any
|
|
||||||
llama_grammar_stack new_stack(stack.begin(), stack.end() - 1);
|
|
||||||
if (!llama_grammar_is_end_of_sequence(pos)) {
|
|
||||||
new_stack.push_back(pos);
|
|
||||||
}
|
|
||||||
llama_grammar_advance_stack(grammar->rules, new_stack, stacks_new);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
grammar->stacks = std::move(stacks_new);
|
grammar->stacks = std::move(stacks_new);
|
||||||
|
|
@ -875,6 +985,22 @@ llama_grammar_candidates llama_grammar_reject_candidates_for_stack(
|
||||||
|
|
||||||
const llama_grammar_element * stack_pos = stack.back();
|
const llama_grammar_element * stack_pos = stack.back();
|
||||||
|
|
||||||
|
// if the top of the stack is a token rule, then we only need to check the token id
|
||||||
|
if (stack_pos->type == LLAMA_GRETYPE_TOKEN || stack_pos->type == LLAMA_GRETYPE_TOKEN_NOT) {
|
||||||
|
for (const auto & tok : candidates) {
|
||||||
|
if (*tok.code_points == 0) {
|
||||||
|
// reached the end of a token consumed by char rules, reject iff it ended
|
||||||
|
// in a partial UTF-8 sequence
|
||||||
|
if (tok.partial_utf8.n_remain != 0) {
|
||||||
|
rejects.push_back(tok);
|
||||||
|
}
|
||||||
|
} else if (!llama_grammar_match_token(stack_pos, tok.id)) {
|
||||||
|
rejects.push_back(tok);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return rejects;
|
||||||
|
}
|
||||||
|
|
||||||
llama_grammar_candidates next_candidates;
|
llama_grammar_candidates next_candidates;
|
||||||
next_candidates.reserve(candidates.size());
|
next_candidates.reserve(candidates.size());
|
||||||
|
|
||||||
|
|
@ -887,7 +1013,7 @@ llama_grammar_candidates llama_grammar_reject_candidates_for_stack(
|
||||||
rejects.push_back(tok);
|
rejects.push_back(tok);
|
||||||
}
|
}
|
||||||
} else if (llama_grammar_match_char(stack_pos, *tok.code_points).first) {
|
} else if (llama_grammar_match_char(stack_pos, *tok.code_points).first) {
|
||||||
next_candidates.push_back({ tok.index, tok.code_points + 1, tok.partial_utf8 });
|
next_candidates.push_back({ tok.index, tok.code_points + 1, tok.partial_utf8, tok.id });
|
||||||
} else {
|
} else {
|
||||||
rejects.push_back(tok);
|
rejects.push_back(tok);
|
||||||
}
|
}
|
||||||
|
|
@ -905,7 +1031,7 @@ llama_grammar_candidates llama_grammar_reject_candidates_for_stack(
|
||||||
|
|
||||||
auto next_rejects = llama_grammar_reject_candidates(rules, next_stacks, next_candidates);
|
auto next_rejects = llama_grammar_reject_candidates(rules, next_stacks, next_candidates);
|
||||||
for (const auto & tok : next_rejects) {
|
for (const auto & tok : next_rejects) {
|
||||||
rejects.push_back({ tok.index, tok.code_points - 1, tok.partial_utf8 });
|
rejects.push_back({ tok.index, tok.code_points - 1, tok.partial_utf8, tok.id });
|
||||||
}
|
}
|
||||||
|
|
||||||
return rejects;
|
return rejects;
|
||||||
|
|
@ -975,9 +1101,10 @@ struct llama_grammar * llama_grammar_init_impl(
|
||||||
std::move(vec_rules),
|
std::move(vec_rules),
|
||||||
std::move(stacks),
|
std::move(stacks),
|
||||||
/* .partial_utf8 = */ {},
|
/* .partial_utf8 = */ {},
|
||||||
/* .lazy =*/ false,
|
/* .lazy = */ false,
|
||||||
/* .awaiting_trigger = */ false,
|
/* .awaiting_trigger = */ false,
|
||||||
/* .trigger_buffer = */ "",
|
/* .trigger_buffer = */ "",
|
||||||
|
/* .trigger_buffer_positions = */ {},
|
||||||
/* .trigger_tokens = */ {},
|
/* .trigger_tokens = */ {},
|
||||||
/* .trigger_patterns = */ {},
|
/* .trigger_patterns = */ {},
|
||||||
};
|
};
|
||||||
|
|
@ -993,7 +1120,7 @@ struct llama_grammar * llama_grammar_init_impl(
|
||||||
size_t num_trigger_patterns,
|
size_t num_trigger_patterns,
|
||||||
const llama_token * trigger_tokens,
|
const llama_token * trigger_tokens,
|
||||||
size_t num_trigger_tokens) {
|
size_t num_trigger_tokens) {
|
||||||
llama_grammar_parser parser;
|
llama_grammar_parser parser(vocab);
|
||||||
|
|
||||||
// if there is a grammar, parse it
|
// if there is a grammar, parse it
|
||||||
// rules will be empty (default) if there are parse errors
|
// rules will be empty (default) if there are parse errors
|
||||||
|
|
@ -1085,6 +1212,7 @@ struct llama_grammar * llama_grammar_init_impl(
|
||||||
/* .lazy = */ lazy,
|
/* .lazy = */ lazy,
|
||||||
/* .awaiting_trigger = */ lazy,
|
/* .awaiting_trigger = */ lazy,
|
||||||
/* .trigger_buffer = */ "",
|
/* .trigger_buffer = */ "",
|
||||||
|
/* .trigger_buffer_positions = */ {},
|
||||||
std::move(vec_trigger_tokens),
|
std::move(vec_trigger_tokens),
|
||||||
std::move(vec_trigger_patterns),
|
std::move(vec_trigger_patterns),
|
||||||
};
|
};
|
||||||
|
|
@ -1108,6 +1236,7 @@ struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & gra
|
||||||
grammar.lazy,
|
grammar.lazy,
|
||||||
grammar.awaiting_trigger,
|
grammar.awaiting_trigger,
|
||||||
grammar.trigger_buffer,
|
grammar.trigger_buffer,
|
||||||
|
grammar.trigger_buffer_positions,
|
||||||
grammar.trigger_tokens,
|
grammar.trigger_tokens,
|
||||||
grammar.trigger_patterns,
|
grammar.trigger_patterns,
|
||||||
};
|
};
|
||||||
|
|
@ -1164,7 +1293,7 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
|
||||||
cur_p->data[i].logit = -INFINITY;
|
cur_p->data[i].logit = -INFINITY;
|
||||||
} else {
|
} else {
|
||||||
candidates_decoded.push_back(decode_utf8(piece, grammar.partial_utf8));
|
candidates_decoded.push_back(decode_utf8(piece, grammar.partial_utf8));
|
||||||
candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second });
|
candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second, id });
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -1184,10 +1313,12 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
|
||||||
if (std::find(grammar.trigger_tokens.begin(), grammar.trigger_tokens.end(), token) != grammar.trigger_tokens.end()) {
|
if (std::find(grammar.trigger_tokens.begin(), grammar.trigger_tokens.end(), token) != grammar.trigger_tokens.end()) {
|
||||||
grammar.awaiting_trigger = false;
|
grammar.awaiting_trigger = false;
|
||||||
grammar.trigger_buffer.clear();
|
grammar.trigger_buffer.clear();
|
||||||
llama_grammar_accept_str(grammar, piece);
|
llama_grammar_accept_token(grammar, token, piece);
|
||||||
LLAMA_LOG_DEBUG("Grammar triggered on token %u (`%s`)", token, piece.c_str());
|
LLAMA_LOG_DEBUG("Grammar triggered on token %u (`%s`)", token, piece.c_str());
|
||||||
return;
|
return;
|
||||||
} else {
|
} else {
|
||||||
|
auto position = std::make_pair(grammar.trigger_buffer.size(), grammar.trigger_buffer.size() + piece.size());
|
||||||
|
grammar.trigger_buffer_positions.push_back(std::make_pair(token, position));
|
||||||
grammar.trigger_buffer += piece;
|
grammar.trigger_buffer += piece;
|
||||||
|
|
||||||
std::smatch match;
|
std::smatch match;
|
||||||
|
|
@ -1205,10 +1336,23 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
|
||||||
if (start == std::string::npos) {
|
if (start == std::string::npos) {
|
||||||
start = match.position(0);
|
start = match.position(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// replay tokens that overlap with [start, end)
|
||||||
|
for (const auto & [tok, tok_pos] : grammar.trigger_buffer_positions) {
|
||||||
|
auto [tok_start, tok_end] = tok_pos;
|
||||||
|
if (tok_end <= start) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t piece_start = (tok_start < start) ? start : tok_start; // allow for partial token pieces
|
||||||
|
size_t piece_len = tok_end - piece_start;
|
||||||
|
auto tok_piece = grammar.trigger_buffer.substr(piece_start, piece_len);
|
||||||
|
llama_grammar_accept_token(grammar, tok, tok_piece);
|
||||||
|
}
|
||||||
|
|
||||||
auto constrained_str = grammar.trigger_buffer.substr(start);
|
auto constrained_str = grammar.trigger_buffer.substr(start);
|
||||||
// std::string constrained_str(match[1].first, grammar.trigger_buffer.end());
|
|
||||||
grammar.trigger_buffer.clear();
|
grammar.trigger_buffer.clear();
|
||||||
llama_grammar_accept_str(grammar, constrained_str);
|
grammar.trigger_buffer_positions.clear();
|
||||||
LLAMA_LOG_DEBUG("Grammar triggered on regex: '%s'\n", constrained_str.c_str());
|
LLAMA_LOG_DEBUG("Grammar triggered on regex: '%s'\n", constrained_str.c_str());
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
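
A small walkthrough of the replay above, with made-up pieces:

    // Suppose the buffered output so far is "I'll call <tool", built from two sampled tokens:
    //   token 11 -> piece "I'll call "  -> positions [0, 10)
    //   token 22 -> piece "<tool"       -> positions [10, 15)
    // If the trigger pattern first matches at start = 10, token 11 is skipped (tok_end <= start)
    // and token 22 is replayed with its full piece "<tool"; if the match had started at 12,
    // only the overlapping substring "ool" of token 22's piece would be replayed.
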
|
|
@ -1228,7 +1372,7 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
|
||||||
GGML_ABORT("grammar error: end of grammar token received but grammar stack is not empty");
|
GGML_ABORT("grammar error: end of grammar token received but grammar stack is not empty");
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_grammar_accept_str(grammar, piece);
|
llama_grammar_accept_token(grammar, token, piece);
|
||||||
}
|
}
|
||||||
|
|
||||||
void llama_grammar_accept_str(struct llama_grammar & grammar, const std::string & piece) {
|
void llama_grammar_accept_str(struct llama_grammar & grammar, const std::string & piece) {
|
||||||
|
|
@ -1246,6 +1390,61 @@ void llama_grammar_accept_str(struct llama_grammar & grammar, const std::string
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void llama_grammar_accept_token(struct llama_grammar & grammar, llama_token token, const std::string & piece) {
|
||||||
|
// Note terminating 0 in decoded string
|
||||||
|
const auto decoded = decode_utf8(piece, grammar.partial_utf8);
|
||||||
|
const auto & code_points = decoded.first;
|
||||||
|
|
||||||
|
llama_grammar_stacks stacks_new;
|
||||||
|
stacks_new.reserve(grammar.stacks.size());
|
||||||
|
|
||||||
|
for (const auto & stack : grammar.stacks) {
|
||||||
|
if (stack.empty()) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
const llama_grammar_element * pos = stack.back();
|
||||||
|
|
||||||
|
if (pos->type == LLAMA_GRETYPE_TOKEN || pos->type == LLAMA_GRETYPE_TOKEN_NOT) {
|
||||||
|
if (llama_grammar_match_token(pos, token)) {
|
||||||
|
llama_grammar_stack new_stack(stack.begin(), stack.end() - 1);
|
||||||
|
if (!llama_grammar_is_end_of_sequence(pos + 1)) {
|
||||||
|
new_stack.push_back(pos + 1);
|
||||||
|
}
|
||||||
|
llama_grammar_advance_stack(grammar.rules, new_stack, stacks_new);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
llama_grammar_stacks current_stacks = {stack};
|
||||||
|
|
||||||
|
for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
|
||||||
|
llama_grammar_stacks next_stacks;
|
||||||
|
|
||||||
|
for (const auto & cur_stack : current_stacks) {
|
||||||
|
llama_grammar_accept_chr(grammar, cur_stack, *it, next_stacks);
|
||||||
|
}
|
||||||
|
|
||||||
|
current_stacks = std::move(next_stacks);
|
||||||
|
if (current_stacks.empty()) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (auto & surviving_stack : current_stacks) {
|
||||||
|
if (std::find(stacks_new.begin(), stacks_new.end(), surviving_stack) == stacks_new.end()) {
|
||||||
|
stacks_new.emplace_back(surviving_stack);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
grammar.stacks = std::move(stacks_new);
|
||||||
|
grammar.partial_utf8 = decoded.second;
|
||||||
|
|
||||||
|
if (grammar.stacks.empty()) {
|
||||||
|
throw std::runtime_error("Unexpected empty grammar stack after accepting piece: " + piece + " (" + std::to_string(token) + ")");
|
||||||
|
}
|
||||||
|
}
|
||||||
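
A usage sketch for the new entry point (the id and piece are hypothetical, and grammar is assumed to be an initialized llama_grammar reference; in practice the piece is the detokenized text of the sampled token):

    llama_token tok = 1234;        // hypothetical sampled token id
    std::string piece = "Hello";   // hypothetical piece for that token
    llama_grammar_accept_token(grammar, tok, piece);
    // token-id elements (<[...]>) are matched against tok, character elements against piece
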
|
|
||||||
|
|
||||||
const std::string & ollama_vocab::token_to_piece(const uint32_t token) const {
|
const std::string & ollama_vocab::token_to_piece(const uint32_t token) const {
|
||||||
try {
|
try {
|
||||||
|
|
|
||||||
|
|
@ -47,11 +47,17 @@ enum llama_gretype {
|
||||||
|
|
||||||
// any character (.)
|
// any character (.)
|
||||||
LLAMA_GRETYPE_CHAR_ANY = 7,
|
LLAMA_GRETYPE_CHAR_ANY = 7,
|
||||||
|
|
||||||
|
// terminal element: token (<[token-id]>)
|
||||||
|
LLAMA_GRETYPE_TOKEN = 8,
|
||||||
|
|
||||||
|
// inverse token (!<[token-id]>)
|
||||||
|
LLAMA_GRETYPE_TOKEN_NOT = 9,
|
||||||
};
|
};
|
||||||
|
|
||||||
typedef struct llama_grammar_element {
|
typedef struct llama_grammar_element {
|
||||||
enum llama_gretype type;
|
enum llama_gretype type;
|
||||||
uint32_t value; // Unicode code point or rule ID
|
uint32_t value; // Unicode code point, rule ID, or token ID
|
||||||
} llama_grammar_element;
|
} llama_grammar_element;
|
||||||
|
|
||||||
struct llama_partial_utf8 {
|
struct llama_partial_utf8 {
|
||||||
|
|
@ -63,6 +69,7 @@ struct llama_grammar_candidate {
|
||||||
size_t index;
|
size_t index;
|
||||||
const uint32_t * code_points;
|
const uint32_t * code_points;
|
||||||
llama_partial_utf8 partial_utf8;
|
llama_partial_utf8 partial_utf8;
|
||||||
|
llama_token id;
|
||||||
};
|
};
|
||||||
|
|
||||||
using llama_grammar_rule = std::vector< llama_grammar_element>;
|
using llama_grammar_rule = std::vector< llama_grammar_element>;
|
||||||
|
|
@ -88,10 +95,13 @@ std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_stack(
|
||||||
const llama_grammar_candidates & candidates);
|
const llama_grammar_candidates & candidates);
|
||||||
|
|
||||||
struct llama_grammar_parser {
|
struct llama_grammar_parser {
|
||||||
|
const llama_vocab * vocab;
|
||||||
std::map<std::string, uint32_t> symbol_ids;
|
std::map<std::string, uint32_t> symbol_ids;
|
||||||
|
|
||||||
llama_grammar_rules rules;
|
llama_grammar_rules rules;
|
||||||
|
|
||||||
|
llama_grammar_parser(const struct llama_vocab * vocab = nullptr) : vocab(vocab) {}
|
||||||
|
|
||||||
llama_grammar_stack c_rules() const;
|
llama_grammar_stack c_rules() const;
|
||||||
|
|
||||||
uint32_t get_symbol_id(const char * src, size_t len);
|
uint32_t get_symbol_id(const char * src, size_t len);
|
||||||
|
|
@ -123,6 +133,9 @@ struct llama_grammar_trigger_pattern {
|
||||||
};
|
};
|
||||||
|
|
||||||
struct llama_grammar {
|
struct llama_grammar {
|
||||||
|
// maintain a list of llama_tokens and their positions in the trigger_buffer
|
||||||
|
using token_pos = std::pair<llama_token, std::pair<size_t, size_t>>;
|
||||||
|
|
||||||
// note: allow null vocab for testing (not great)
|
// note: allow null vocab for testing (not great)
|
||||||
const llama_vocab * vocab;
|
const llama_vocab * vocab;
|
||||||
const ollama_vocab * o_vocab;
|
const ollama_vocab * o_vocab;
|
||||||
|
|
@ -139,6 +152,7 @@ struct llama_grammar {
|
||||||
bool lazy = false;
|
bool lazy = false;
|
||||||
bool awaiting_trigger = false; // Initialized to true for lazy grammars only
|
bool awaiting_trigger = false; // Initialized to true for lazy grammars only
|
||||||
std::string trigger_buffer; // Output buffered by lazy grammar. Will be cleared once trigger is found.
|
std::string trigger_buffer; // Output buffered by lazy grammar. Will be cleared once trigger is found.
|
||||||
|
std::vector<token_pos> trigger_buffer_positions; // Tokens buffered by lazy grammar. Used to replay when a trigger is found.
|
||||||
std::vector<llama_token> trigger_tokens; // Tokens that trigger a lazy grammar, or tokens to force printing of (even if special).
|
std::vector<llama_token> trigger_tokens; // Tokens that trigger a lazy grammar, or tokens to force printing of (even if special).
|
||||||
std::vector<llama_grammar_trigger_pattern>
|
std::vector<llama_grammar_trigger_pattern>
|
||||||
trigger_patterns; // Regular expressions that trigger a lazy grammar. Must be a full match of the entire generated
|
trigger_patterns; // Regular expressions that trigger a lazy grammar. Must be a full match of the entire generated
|
||||||
|
|
@ -185,3 +199,8 @@ void llama_grammar_accept_impl(
|
||||||
void llama_grammar_accept_str(
|
void llama_grammar_accept_str(
|
||||||
struct llama_grammar & grammar,
|
struct llama_grammar & grammar,
|
||||||
const std::string & piece);
|
const std::string & piece);
|
||||||
|
|
||||||
|
void llama_grammar_accept_token(
|
||||||
|
struct llama_grammar & grammar,
|
||||||
|
llama_token token,
|
||||||
|
const std::string & piece);
|
||||||
|
|
|
||||||
|
|
@ -71,11 +71,14 @@ void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) {
|
||||||
if (ubatch->pos && attn_scale) {
|
if (ubatch->pos && attn_scale) {
|
||||||
const int64_t n_tokens = ubatch->n_tokens;
|
const int64_t n_tokens = ubatch->n_tokens;
|
||||||
|
|
||||||
|
GGML_ASSERT(f_attn_temp_scale != 0.0f);
|
||||||
|
GGML_ASSERT(n_attn_temp_floor_scale != 0);
|
||||||
|
|
||||||
std::vector<float> attn_scale_data(n_tokens, 0.0f);
|
std::vector<float> attn_scale_data(n_tokens, 0.0f);
|
||||||
for (int i = 0; i < n_tokens; ++i) {
|
for (int i = 0; i < n_tokens; ++i) {
|
||||||
const float pos = ubatch->pos[i];
|
const float pos = ubatch->pos[i];
|
||||||
attn_scale_data[i] = std::log(
|
attn_scale_data[i] = std::log(
|
||||||
std::floor((pos + 1.0f) / n_attn_temp_floor_scale) + 1.0
|
std::floor((pos + f_attn_temp_offset) / n_attn_temp_floor_scale) + 1.0
|
||||||
) * f_attn_temp_scale + 1.0;
|
) * f_attn_temp_scale + 1.0;
|
||||||
}
|
}
|
||||||
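
Written out, the per-position scale computed above is (with p = position, s = f_attn_temp_scale, o = f_attn_temp_offset, n = n_attn_temp_floor_scale; previously o was hard-coded to 1):

    \mathrm{scale}(p) = 1 + s \cdot \log\!\left(\left\lfloor \frac{p + o}{n} \right\rfloor + 1\right)
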
|
|
||||||
|
|
@ -251,6 +254,24 @@ void llm_graph_input_rs::set_input(const llama_ubatch * ubatch) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool llm_graph_input_rs::can_reuse(const llm_graph_params & params) {
|
||||||
|
const auto * mctx = static_cast<const llama_memory_recurrent_context *>(params.mctx);
|
||||||
|
|
||||||
|
this->mctx = mctx;
|
||||||
|
|
||||||
|
bool res = true;
|
||||||
|
|
||||||
|
res &= s_copy->ne[0] == mctx->get_n_rs();
|
||||||
|
|
||||||
|
res &= s_copy_main->ne[0] == params.ubatch.n_seqs;
|
||||||
|
res &= s_copy_extra->ne[0] == mctx->get_n_rs() - params.ubatch.n_seqs;
|
||||||
|
|
||||||
|
res &= head == mctx->get_head();
|
||||||
|
res &= rs_z == mctx->get_rs_z();
|
||||||
|
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
void llm_graph_input_cross_embd::set_input(const llama_ubatch * ubatch) {
|
void llm_graph_input_cross_embd::set_input(const llama_ubatch * ubatch) {
|
||||||
GGML_UNUSED(ubatch);
|
GGML_UNUSED(ubatch);
|
||||||
|
|
||||||
|
|
@ -382,7 +403,7 @@ bool llm_graph_input_attn_kv::can_reuse(const llm_graph_params & params) {
|
||||||
//res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
|
//res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
|
||||||
|
|
||||||
res &= self_kq_mask->ne[0] == mctx->get_n_kv();
|
res &= self_kq_mask->ne[0] == mctx->get_n_kv();
|
||||||
res &= self_kq_mask->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);
|
res &= self_kq_mask->ne[1] == params.ubatch.n_tokens;
|
||||||
|
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
@ -413,10 +434,10 @@ bool llm_graph_input_attn_kv_iswa::can_reuse(const llm_graph_params & params) {
|
||||||
//res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
|
//res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
|
||||||
|
|
||||||
res &= self_kq_mask->ne[0] == mctx->get_base()->get_n_kv();
|
res &= self_kq_mask->ne[0] == mctx->get_base()->get_n_kv();
|
||||||
res &= self_kq_mask->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);
|
res &= self_kq_mask->ne[1] == params.ubatch.n_tokens;
|
||||||
|
|
||||||
res &= self_kq_mask_swa->ne[0] == mctx->get_swa()->get_n_kv();
|
res &= self_kq_mask_swa->ne[0] == mctx->get_swa()->get_n_kv();
|
||||||
res &= self_kq_mask_swa->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);
|
res &= self_kq_mask_swa->ne[1] == params.ubatch.n_tokens;
|
||||||
|
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
@ -449,7 +470,7 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
|
for (int i = n_tokens; i < n_tokens; ++i) {
|
||||||
for (int j = 0; j < n_enc; ++j) {
|
for (int j = 0; j < n_enc; ++j) {
|
||||||
data[h*(n_enc*n_tokens) + i*n_enc + j] = -INFINITY;
|
data[h*(n_enc*n_tokens) + i*n_enc + j] = -INFINITY;
|
||||||
}
|
}
|
||||||
|
|
@ -458,8 +479,46 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
|
||||||
}
|
}
|
||||||
|
|
||||||
void llm_graph_input_mem_hybrid::set_input(const llama_ubatch * ubatch) {
|
void llm_graph_input_mem_hybrid::set_input(const llama_ubatch * ubatch) {
|
||||||
inp_attn->set_input(ubatch);
|
mctx->get_attn()->set_input_k_idxs(inp_attn->self_k_idxs, ubatch);
|
||||||
inp_rs->set_input(ubatch);
|
mctx->get_attn()->set_input_v_idxs(inp_attn->self_v_idxs, ubatch);
|
||||||
|
|
||||||
|
mctx->get_attn()->set_input_kq_mask(inp_attn->self_kq_mask, ubatch, cparams.causal_attn);
|
||||||
|
|
||||||
|
const int64_t n_rs = mctx->get_recr()->get_n_rs();
|
||||||
|
|
||||||
|
if (inp_rs->s_copy) {
|
||||||
|
GGML_ASSERT(ggml_backend_buffer_is_host(inp_rs->s_copy->buffer));
|
||||||
|
int32_t * data = (int32_t *) inp_rs->s_copy->data;
|
||||||
|
|
||||||
|
// assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n
|
||||||
|
for (uint32_t i = 0; i < n_rs; ++i) {
|
||||||
|
data[i] = mctx->get_recr()->s_copy(i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
bool llm_graph_input_mem_hybrid::can_reuse(const llm_graph_params & params) {
|
||||||
|
const auto * mctx = static_cast<const llama_memory_hybrid_context *>(params.mctx);
|
||||||
|
|
||||||
|
this->mctx = mctx;
|
||||||
|
|
||||||
|
bool res = true;
|
||||||
|
|
||||||
|
res &= inp_attn->self_k_idxs->ne[0] == params.ubatch.n_tokens;
|
||||||
|
//res &= inp_attn->self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
|
||||||
|
|
||||||
|
res &= inp_attn->self_kq_mask->ne[0] == mctx->get_attn()->get_n_kv();
|
||||||
|
res &= inp_attn->self_kq_mask->ne[1] == params.ubatch.n_tokens;
|
||||||
|
|
||||||
|
res &= inp_rs->s_copy->ne[0] == mctx->get_recr()->get_n_rs();
|
||||||
|
|
||||||
|
res &= inp_rs->s_copy_main->ne[0] == params.ubatch.n_seqs;
|
||||||
|
res &= inp_rs->s_copy_extra->ne[0] == mctx->get_recr()->get_n_rs() - params.ubatch.n_seqs;
|
||||||
|
|
||||||
|
res &= inp_rs->head == mctx->get_recr()->get_head();
|
||||||
|
res &= inp_rs->rs_z == mctx->get_recr()->get_rs_z();
|
||||||
|
|
||||||
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
//
|
//
|
||||||
|
|
@ -810,9 +869,6 @@ ggml_tensor * llm_graph_context::build_ffn(
|
||||||
GGML_ABORT("fatal error");
|
GGML_ABORT("fatal error");
|
||||||
}
|
}
|
||||||
|
|
||||||
//expand here so that we can fuse ffn gate
|
|
||||||
ggml_build_forward_expand(gf, cur);
|
|
||||||
|
|
||||||
if (gate && type_gate == LLM_FFN_PAR) {
|
if (gate && type_gate == LLM_FFN_PAR) {
|
||||||
cur = ggml_mul(ctx0, cur, tmp);
|
cur = ggml_mul(ctx0, cur, tmp);
|
||||||
cb(cur, "ffn_gate_par", il);
|
cb(cur, "ffn_gate_par", il);
|
||||||
|
|
@ -973,7 +1029,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
|
||||||
|
|
||||||
// mask out the other groups
|
// mask out the other groups
|
||||||
selection_probs = ggml_get_rows(ctx0, selection_groups, expert_groups); // [n_exp_per_group, n_group_used, n_tokens]
|
selection_probs = ggml_get_rows(ctx0, selection_groups, expert_groups); // [n_exp_per_group, n_group_used, n_tokens]
|
||||||
selection_probs = ggml_set_rows(ctx0, ggml_scale_bias(ctx0, selection_groups, 0.0f, -INFINITY), selection_probs, expert_groups); // [n_exp_per_group, n_expert_groups, n_tokens]
|
selection_probs = ggml_set_rows(ctx0, ggml_fill(ctx0, selection_groups, -INFINITY), selection_probs, expert_groups); // [n_exp_per_group, n_expert_groups, n_tokens]
|
||||||
selection_probs = ggml_reshape_2d(ctx0, selection_probs, n_expert, n_tokens); // [n_expert, n_tokens]
|
selection_probs = ggml_reshape_2d(ctx0, selection_probs, n_expert, n_tokens); // [n_expert, n_tokens]
|
||||||
cb(selection_probs, "ffn_moe_probs_masked", il);
|
cb(selection_probs, "ffn_moe_probs_masked", il);
|
||||||
}
|
}
|
||||||
|
|
@ -1089,13 +1145,19 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
|
||||||
cur = ggml_relu(ctx0, cur);
|
cur = ggml_relu(ctx0, cur);
|
||||||
cb(cur, "ffn_moe_relu", il);
|
cb(cur, "ffn_moe_relu", il);
|
||||||
} break;
|
} break;
|
||||||
|
case LLM_FFN_RELU_SQR:
|
||||||
|
if (gate_exps) {
|
||||||
|
// TODO: add support for gated squared relu
|
||||||
|
GGML_ABORT("fatal error: gated squared relu not implemented");
|
||||||
|
} else {
|
||||||
|
cur = ggml_relu(ctx0, cur);
|
||||||
|
cur = ggml_sqr(ctx0, cur);
|
||||||
|
cb(cur, "ffn_moe_relu_sqr", il);
|
||||||
|
} break;
|
||||||
default:
|
default:
|
||||||
GGML_ABORT("fatal error");
|
GGML_ABORT("fatal error");
|
||||||
}
|
}
|
||||||
|
|
||||||
//expand here so that we can fuse ffn gate
|
|
||||||
ggml_build_forward_expand(gf, cur);
|
|
||||||
|
|
||||||
experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
|
experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
|
||||||
cb(experts, "ffn_moe_down", il);
|
cb(experts, "ffn_moe_down", il);
|
||||||
|
|
||||||
|
|
@ -1206,7 +1268,7 @@ ggml_tensor * llm_graph_context::build_inp_pos() const {
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_tensor * llm_graph_context::build_inp_attn_scale() const {
|
ggml_tensor * llm_graph_context::build_inp_attn_scale() const {
|
||||||
auto inp = std::make_unique<llm_graph_input_attn_temp>(hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale);
|
auto inp = std::make_unique<llm_graph_input_attn_temp>(hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale, hparams.f_attn_temp_offset);
|
||||||
|
|
||||||
auto & cur = inp->attn_scale;
|
auto & cur = inp->attn_scale;
|
||||||
|
|
||||||
|
|
@ -1473,13 +1535,13 @@ llm_graph_input_attn_no_cache * llm_graph_context::build_attn_inp_no_cache() con
|
||||||
auto inp = std::make_unique<llm_graph_input_attn_no_cache>(hparams, cparams);
|
auto inp = std::make_unique<llm_graph_input_attn_no_cache>(hparams, cparams);
|
||||||
|
|
||||||
// note: there is no KV cache, so the number of KV values is equal to the number of tokens in the batch
|
// note: there is no KV cache, so the number of KV values is equal to the number of tokens in the batch
|
||||||
inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
|
inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens, 1, 1);
|
||||||
ggml_set_input(inp->self_kq_mask);
|
ggml_set_input(inp->self_kq_mask);
|
||||||
|
|
||||||
inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
|
inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
|
||||||
|
|
||||||
if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
|
if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
|
||||||
inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
|
inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens, 1, 1);
|
||||||
ggml_set_input(inp->self_kq_mask_swa);
|
ggml_set_input(inp->self_kq_mask_swa);
|
||||||
|
|
||||||
inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
|
inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
|
||||||
|
|
@ -1561,7 +1623,7 @@ static std::unique_ptr<llm_graph_input_attn_kv> build_attn_inp_kv_impl(
|
||||||
inp->self_k_idxs = mctx_cur->build_input_k_idxs(ctx0, ubatch);
|
inp->self_k_idxs = mctx_cur->build_input_k_idxs(ctx0, ubatch);
|
||||||
inp->self_v_idxs = mctx_cur->build_input_v_idxs(ctx0, ubatch);
|
inp->self_v_idxs = mctx_cur->build_input_v_idxs(ctx0, ubatch);
|
||||||
|
|
||||||
inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens/n_stream, GGML_KQ_MASK_PAD), 1, n_stream);
|
inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
|
||||||
ggml_set_input(inp->self_kq_mask);
|
ggml_set_input(inp->self_kq_mask);
|
||||||
|
|
||||||
inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
|
inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
|
||||||
|
|
@ -1704,7 +1766,7 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {
|
||||||
|
|
||||||
const int32_t n_enc = !cross->v_embd.empty() ? cross->n_enc : hparams.n_ctx_train;
|
const int32_t n_enc = !cross->v_embd.empty() ? cross->n_enc : hparams.n_ctx_train;
|
||||||
|
|
||||||
inp->cross_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_enc, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
|
inp->cross_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_enc, n_tokens, 1, 1);
|
||||||
ggml_set_input(inp->cross_kq_mask);
|
ggml_set_input(inp->cross_kq_mask);
|
||||||
|
|
||||||
inp->cross_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->cross_kq_mask, GGML_TYPE_F16) : inp->cross_kq_mask;
|
inp->cross_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->cross_kq_mask, GGML_TYPE_F16) : inp->cross_kq_mask;
|
||||||
|
|
@ -1770,7 +1832,7 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const
|
||||||
inp->self_k_idxs = mctx_cur->get_base()->build_input_k_idxs(ctx0, ubatch);
|
inp->self_k_idxs = mctx_cur->get_base()->build_input_k_idxs(ctx0, ubatch);
|
||||||
inp->self_v_idxs = mctx_cur->get_base()->build_input_v_idxs(ctx0, ubatch);
|
inp->self_v_idxs = mctx_cur->get_base()->build_input_v_idxs(ctx0, ubatch);
|
||||||
|
|
||||||
inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens/n_stream, GGML_KQ_MASK_PAD), 1, n_stream);
|
inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
|
||||||
ggml_set_input(inp->self_kq_mask);
|
ggml_set_input(inp->self_kq_mask);
|
||||||
|
|
||||||
inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
|
inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
|
||||||
|
|
@ -1784,7 +1846,7 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const
|
||||||
inp->self_k_idxs_swa = mctx_cur->get_swa()->build_input_k_idxs(ctx0, ubatch);
|
inp->self_k_idxs_swa = mctx_cur->get_swa()->build_input_k_idxs(ctx0, ubatch);
|
||||||
inp->self_v_idxs_swa = mctx_cur->get_swa()->build_input_v_idxs(ctx0, ubatch);
|
inp->self_v_idxs_swa = mctx_cur->get_swa()->build_input_v_idxs(ctx0, ubatch);
|
||||||
|
|
||||||
inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens/n_stream, GGML_KQ_MASK_PAD), 1, n_stream);
|
inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
|
||||||
ggml_set_input(inp->self_kq_mask_swa);
|
ggml_set_input(inp->self_kq_mask_swa);
|
||||||
|
|
||||||
inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
|
inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
|
||||||
|
|
@ -1844,6 +1906,9 @@ static std::unique_ptr<llm_graph_input_rs> build_rs_inp_impl(
|
||||||
inp->s_copy_main = ggml_view_1d(ctx0, inp->s_copy, n_seqs, 0);
|
inp->s_copy_main = ggml_view_1d(ctx0, inp->s_copy, n_seqs, 0);
|
||||||
inp->s_copy_extra = ggml_view_1d(ctx0, inp->s_copy, n_rs - n_seqs, n_seqs * inp->s_copy->nb[0]);
|
inp->s_copy_extra = ggml_view_1d(ctx0, inp->s_copy, n_rs - n_seqs, n_seqs * inp->s_copy->nb[0]);
|
||||||
|
|
||||||
|
inp->head = mctx_cur->get_head();
|
||||||
|
inp->rs_z = mctx_cur->get_rs_z();
|
||||||
|
|
||||||
return inp;
|
return inp;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -1912,10 +1977,10 @@ ggml_tensor * llm_graph_context::build_rwkv_token_shift_store(
|
||||||
llm_graph_input_mem_hybrid * llm_graph_context::build_inp_mem_hybrid() const {
|
llm_graph_input_mem_hybrid * llm_graph_context::build_inp_mem_hybrid() const {
|
||||||
const auto * mctx_cur = static_cast<const llama_memory_hybrid_context *>(mctx);
|
const auto * mctx_cur = static_cast<const llama_memory_hybrid_context *>(mctx);
|
||||||
|
|
||||||
auto inp_rs = build_rs_inp_impl(ctx0, ubatch, mctx_cur->get_recr());
|
auto inp_rs = build_rs_inp_impl (ctx0, ubatch, mctx_cur->get_recr());
|
||||||
auto inp_attn = build_attn_inp_kv_impl(ctx0, ubatch, hparams, cparams, mctx_cur->get_attn());
|
auto inp_attn = build_attn_inp_kv_impl(ctx0, ubatch, hparams, cparams, mctx_cur->get_attn());
|
||||||
|
|
||||||
auto inp = std::make_unique<llm_graph_input_mem_hybrid>(std::move(inp_attn), std::move(inp_rs), mctx_cur);
|
auto inp = std::make_unique<llm_graph_input_mem_hybrid>(cparams, std::move(inp_attn), std::move(inp_rs), mctx_cur);
|
||||||
|
|
||||||
return (llm_graph_input_mem_hybrid *) res->add_input(std::move(inp));
|
return (llm_graph_input_mem_hybrid *) res->add_input(std::move(inp));
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -132,8 +132,8 @@ public:
|
||||||
// temperature tuning, used by llama4
|
// temperature tuning, used by llama4
|
||||||
class llm_graph_input_attn_temp : public llm_graph_input_i {
|
class llm_graph_input_attn_temp : public llm_graph_input_i {
|
||||||
public:
|
public:
|
||||||
llm_graph_input_attn_temp(uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale)
|
llm_graph_input_attn_temp(uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale, float f_attn_temp_offset)
|
||||||
: n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {}
|
: n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale), f_attn_temp_offset(f_attn_temp_offset) {}
|
||||||
virtual ~llm_graph_input_attn_temp() = default;
|
virtual ~llm_graph_input_attn_temp() = default;
|
||||||
|
|
||||||
void set_input(const llama_ubatch * ubatch) override;
|
void set_input(const llama_ubatch * ubatch) override;
|
||||||
|
|
@ -142,6 +142,7 @@ public:
|
||||||
|
|
||||||
const uint32_t n_attn_temp_floor_scale;
|
const uint32_t n_attn_temp_floor_scale;
|
||||||
const float f_attn_temp_scale;
|
const float f_attn_temp_scale;
|
||||||
|
const float f_attn_temp_offset;
|
||||||
};
|
};
|
||||||
|
|
||||||
class llm_graph_input_pos_bucket : public llm_graph_input_i {
|
class llm_graph_input_pos_bucket : public llm_graph_input_i {
|
||||||
|
|
@ -224,6 +225,8 @@ public:
|
||||||
|
|
||||||
void set_input(const llama_ubatch * ubatch) override;
|
void set_input(const llama_ubatch * ubatch) override;
|
||||||
|
|
||||||
|
bool can_reuse(const llm_graph_params & params) override;
|
||||||
|
|
||||||
ggml_tensor * s_copy; // I32 [n_rs]
|
ggml_tensor * s_copy; // I32 [n_rs]
|
||||||
|
|
||||||
// views of s_copy, computed once per graph
|
// views of s_copy, computed once per graph
|
||||||
|
|
@ -232,6 +235,10 @@ public:
|
||||||
ggml_tensor * s_copy_extra; // I32 [n_rs - n_seqs]
|
ggml_tensor * s_copy_extra; // I32 [n_rs - n_seqs]
|
||||||
|
|
||||||
const llama_memory_recurrent_context * mctx;
|
const llama_memory_recurrent_context * mctx;
|
||||||
|
|
||||||
|
// used in view offsets, need to match for valid graph reuse
|
||||||
|
uint32_t head;
|
||||||
|
int32_t rs_z;
|
||||||
};
|
};
|
||||||
|
|
||||||
class llm_graph_input_cross_embd : public llm_graph_input_i {
|
class llm_graph_input_cross_embd : public llm_graph_input_i {
|
||||||
|
|
@ -364,22 +371,28 @@ public:
|
||||||
class llm_graph_input_mem_hybrid : public llm_graph_input_i {
|
class llm_graph_input_mem_hybrid : public llm_graph_input_i {
|
||||||
public:
|
public:
|
||||||
llm_graph_input_mem_hybrid(
|
llm_graph_input_mem_hybrid(
|
||||||
|
const llama_cparams & cparams,
|
||||||
std::unique_ptr<llm_graph_input_attn_kv> inp_attn,
|
std::unique_ptr<llm_graph_input_attn_kv> inp_attn,
|
||||||
std::unique_ptr<llm_graph_input_rs> inp_rs,
|
std::unique_ptr<llm_graph_input_rs> inp_rs,
|
||||||
const llama_memory_hybrid_context * mctx) :
|
const llama_memory_hybrid_context * mctx) :
|
||||||
inp_attn(std::move(inp_attn)),
|
inp_attn(std::move(inp_attn)),
|
||||||
inp_rs(std::move(inp_rs)),
|
inp_rs(std::move(inp_rs)),
|
||||||
|
cparams(cparams),
|
||||||
mctx(mctx) { }
|
mctx(mctx) { }
|
||||||
virtual ~llm_graph_input_mem_hybrid() = default;
|
virtual ~llm_graph_input_mem_hybrid() = default;
|
||||||
|
|
||||||
void set_input(const llama_ubatch * ubatch) override;
|
void set_input(const llama_ubatch * ubatch) override;
|
||||||
|
|
||||||
|
bool can_reuse(const llm_graph_params & params) override;
|
||||||
|
|
||||||
std::unique_ptr<llm_graph_input_attn_kv> inp_attn;
|
std::unique_ptr<llm_graph_input_attn_kv> inp_attn;
|
||||||
std::unique_ptr<llm_graph_input_rs> inp_rs;
|
std::unique_ptr<llm_graph_input_rs> inp_rs;
|
||||||
|
|
||||||
llm_graph_input_attn_kv * get_attn() const { return inp_attn.get(); }
|
llm_graph_input_attn_kv * get_attn() const { return inp_attn.get(); }
|
||||||
llm_graph_input_rs * get_recr() const { return inp_rs.get(); }
|
llm_graph_input_rs * get_recr() const { return inp_rs.get(); }
|
||||||
|
|
||||||
|
const llama_cparams cparams;
|
||||||
|
|
||||||
const llama_memory_hybrid_context * mctx;
|
const llama_memory_hybrid_context * mctx;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,8 @@
|
||||||
#include "llama-hparams.h"
|
#include "llama-hparams.h"
|
||||||
|
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
|
|
||||||
void llama_hparams::set_swa_pattern(uint32_t n_pattern, bool dense_first) {
|
void llama_hparams::set_swa_pattern(uint32_t n_pattern, bool dense_first) {
|
||||||
|
|
@ -237,3 +239,7 @@ bool llama_hparams::is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama
|
||||||
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool llama_hparams::use_mrope() const {
|
||||||
|
return rope_sections[0] > 0 && rope_sections[1] > 0;
|
||||||
|
}
|
||||||
|
|
|
||||||
|
|
@ -34,6 +34,7 @@ struct llama_hparams_convnext {
|
||||||
|
|
||||||
struct llama_hparams {
|
struct llama_hparams {
|
||||||
bool vocab_only;
|
bool vocab_only;
|
||||||
|
bool no_alloc;
|
||||||
bool rope_finetuned;
|
bool rope_finetuned;
|
||||||
bool use_par_res;
|
bool use_par_res;
|
||||||
bool swin_norm;
|
bool swin_norm;
|
||||||
|
|
@ -109,6 +110,7 @@ struct llama_hparams {
|
||||||
float rope_freq_base_train_swa;
|
float rope_freq_base_train_swa;
|
||||||
float rope_freq_scale_train;
|
float rope_freq_scale_train;
|
||||||
float rope_freq_scale_train_swa;
|
float rope_freq_scale_train_swa;
|
||||||
|
|
||||||
uint32_t n_ctx_orig_yarn;
|
uint32_t n_ctx_orig_yarn;
|
||||||
float rope_yarn_log_mul = 0.0f;
|
float rope_yarn_log_mul = 0.0f;
|
||||||
|
|
||||||
|
|
@ -164,8 +166,9 @@ struct llama_hparams {
|
||||||
// llama4 smallthinker
|
// llama4 smallthinker
|
||||||
uint32_t n_moe_layer_step = 0;
|
uint32_t n_moe_layer_step = 0;
|
||||||
uint32_t n_no_rope_layer_step = 4;
|
uint32_t n_no_rope_layer_step = 4;
|
||||||
uint32_t n_attn_temp_floor_scale = 8192;
|
uint32_t n_attn_temp_floor_scale = 0;
|
||||||
float f_attn_temp_scale = 0.1;
|
float f_attn_temp_scale = 0.0f;
|
||||||
|
float f_attn_temp_offset = 0.0f; // offset position index
|
||||||
|
|
||||||
// gemma3n altup
|
// gemma3n altup
|
||||||
uint32_t n_altup = 4; // altup_num_inputs
|
uint32_t n_altup = 4; // altup_num_inputs
|
||||||
|
|
@ -272,7 +275,8 @@ struct llama_hparams {
|
||||||
// TODO: think of a better place for this function
|
// TODO: think of a better place for this function
|
||||||
// TODO: pack the SWA params in a struct?
|
// TODO: pack the SWA params in a struct?
|
||||||
static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1);
|
static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1);
|
||||||
|
|
||||||
|
bool use_mrope() const;
|
||||||
};
|
};
|
||||||
|
|
||||||
static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
|
static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@@ -25,6 +25,10 @@ time_meas::~time_meas() {
     }
 }

+void llama_log_get(ggml_log_callback * log_callback, void ** user_data) {
+    ggml_log_get(log_callback, user_data);
+}
+
 void llama_log_set(ggml_log_callback log_callback, void * user_data) {
     ggml_log_set(log_callback, user_data);
     g_logger_state.log_callback = log_callback ? log_callback : llama_log_callback_default;

@@ -37,7 +37,7 @@ void llama_log_callback_default(ggml_log_level level, const char * text, void *
 template <typename T>
 struct no_init {
     T value;
-    no_init() { /* do nothing */ }
+    no_init() = default;
 };

 struct time_meas {

@@ -175,7 +185,15 @@ llama_kv_cache::llama_kv_cache(

     // allocate tensors and initialize the buffers to avoid NaNs in the padding
     for (auto & [buft, ctx] : ctx_map) {
-        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx.get(), buft);
+        ggml_backend_buffer_t buf;
+        if (model.hparams.no_alloc) {
+            buf = ggml_backend_buft_alloc_buffer(buft, /*size =*/ 0); // dummy buffer
+            for (ggml_tensor * t = ggml_get_first_tensor(ctx.get()); t != nullptr; t = ggml_get_next_tensor(ctx.get(), t)) {
+                t->buffer = buf; // set dummy buffer for KV cache so that the backend scheduler won't try to allocate it
+            }
+        } else {
+            buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx.get(), buft); // real buffer
+        }
         if (!buf) {
             throw std::runtime_error("failed to allocate buffer for kv cache");
         }
@@ -482,9 +490,18 @@ llama_pos llama_kv_cache::seq_pos_max(llama_seq_id seq_id) const {

 std::map<ggml_backend_buffer_type_t, size_t> llama_kv_cache::memory_breakdown() const {
     std::map<ggml_backend_buffer_type_t, size_t> ret;
-    for (const auto & [_, bufs] : ctxs_bufs) {
-        ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
+    for (const auto & [ctx, buf] : ctxs_bufs) {
+        ggml_backend_buffer_type_t buft = ggml_backend_buffer_get_type(buf.get());
+
+        if (hparams.no_alloc) {
+            GGML_ASSERT(ggml_backend_buffer_get_base(buf.get()) == nullptr);
+            ret[buft] += ggml_backend_alloc_ctx_tensors_from_buft_size(ctx.get(), buft);
+        } else {
+            // GGML_ASSERT(ggml_backend_buffer_get_base(buf.get()) != nullptr); // multi_buffer does not have a defined base
+            ret[buft] += ggml_backend_buffer_get_size(buf.get());
         }
+    }

     return ret;
 }

@@ -1233,7 +1250,6 @@ void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * u

     // n_tps == n_tokens_per_stream
     const int64_t n_tps = n_tokens/n_stream;
-    const int64_t n_tps_pad = GGML_PAD(n_tps, GGML_KQ_MASK_PAD);

     std::fill(data, data + ggml_nelements(dst), -INFINITY);

@@ -1266,7 +1282,7 @@ void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * u
     const llama_pos p1_x = is_2d ? ubatch->pos[i + ubatch->n_tokens*2] : 0;
     const llama_pos p1_y = is_2d ? ubatch->pos[i + ubatch->n_tokens] : 0;

-    const uint64_t idst = n_kv*(h*n_stream*n_tps_pad + s*n_tps_pad + ii);
+    const uint64_t idst = n_kv*(h*n_stream*n_tps + s*n_tps + ii);

     for (uint32_t j = 0; j < n_kv; ++j) {
         if (cells.is_empty(j)) {
@@ -1373,6 +1389,7 @@ ggml_tensor * llama_kv_cache::build_rope_shift(
     const auto & yarn_ext_factor = cparams.yarn_ext_factor;
     const auto & yarn_beta_fast = cparams.yarn_beta_fast;
     const auto & yarn_beta_slow = cparams.yarn_beta_slow;
+    const auto & yarn_attn_factor = cparams.yarn_attn_factor;

     const auto & n_rot = hparams.n_rot;
     const auto & rope_type = hparams.rope_type == LLAMA_ROPE_TYPE_MROPE || hparams.rope_type == LLAMA_ROPE_TYPE_IMROPE
@@ -1383,12 +1400,6 @@ ggml_tensor * llama_kv_cache::build_rope_shift(
         ? LLAMA_ROPE_TYPE_NEOX
         : hparams.rope_type;

-    // See llm_build_deepseek2() for why attn_factor has to be scaled for YaRN RoPE to work correctly.
-    // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
-    const float yarn_attn_factor = model.arch == LLM_ARCH_DEEPSEEK2
-        ? 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale))
-        : cparams.yarn_attn_factor;
-
     ggml_tensor * tmp;

     if (ggml_is_quantized(cur->type)) {
@@ -1550,9 +1561,11 @@ void llama_kv_cache::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama

     const uint32_t strm = seq_id == -1 ? s : seq_to_stream[seq_id];

+    slot_info sinfo;
+
     bool res = true;
-    res = res && state_read_meta(io, strm, cell_count, seq_id);
-    res = res && state_read_data(io, strm, cell_count);
+    res = res && state_read_meta(io, strm, cell_count, sinfo, seq_id);
+    res = res && state_read_data(io, strm, cell_count, sinfo);

     if (!res) {
         if (seq_id == -1) {
@@ -1691,7 +1704,7 @@ void llama_kv_cache::state_write_data(llama_io_write_i & io, const cell_ranges_t
     }
 }

-bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, llama_seq_id dest_seq_id) {
+bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, slot_info & sinfo, llama_seq_id dest_seq_id) {
     auto & cells = v_cells[strm];
     auto & head = v_heads[strm];

@@ -1728,7 +1741,7 @@ bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32
         ubatch.seq_id[i] = &dest_seq_id;
     }

-    const auto sinfo = find_slot(ubatch, true);
+    sinfo = find_slot(ubatch, false);
     if (sinfo.empty()) {
         LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__);
         return false;
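For reference, the DeepSeek2 special case removed from build_rope_shift above computed the YaRN attention factor from the frequency scale instead of taking it from cparams. A small standalone sketch of that arithmetic, reusing the constant and formula from the removed lines (the freq_scale value is made up for illustration and is not part of the patch):

#include <cmath>
#include <cstdio>

int main() {
    const float freq_scale = 0.25f; // example value only
    // attention factor the removed DeepSeek2 branch computed on the fly:
    const float yarn_attn_factor = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
    printf("yarn_attn_factor = %f\n", yarn_attn_factor);
    return 0;
}

After this change the value comes from cparams.yarn_attn_factor, with the model-specific adjustment handled on the hparams side instead.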
@@ -1738,20 +1751,16 @@ bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32
         // see: https://github.com/ggml-org/llama.cpp/pull/16825#issuecomment-3460868350
         apply_ubatch(sinfo, ubatch);

-        const auto head_cur = sinfo.head();
+        LLAMA_LOG_DEBUG("%s: cell_count = %d, dest_seq_id = %d\n", __func__, cell_count, dest_seq_id);

-        // keep the head at the old position because we will read the KV data into it in state_read_data()
-        head = head_cur;
-        LLAMA_LOG_DEBUG("%s: head_cur = %d, head = %d, cell_count = %d, dest_seq_id = %d\n", __func__, head_cur, head, cell_count, dest_seq_id);
-        // DEBUG CHECK: head_cur should be our first cell, head_cur + cell_count - 1 should be our last cell (verify seq_id and pos values)
-        // Assume that this is one contiguous block of cells
-        GGML_ASSERT(head_cur + cell_count <= cells.size());
-        GGML_ASSERT(cells.pos_get(head_cur) == ubatch.pos[0]);
-        GGML_ASSERT(cells.pos_get(head_cur + cell_count - 1) == ubatch.pos[cell_count - 1]);
-        GGML_ASSERT(cells.seq_has(head_cur, dest_seq_id));
-        GGML_ASSERT(cells.seq_has(head_cur + cell_count - 1, dest_seq_id));
+        // DEBUG CHECK: verify that all cells were allocated and have correct seq_id and pos values
+        GGML_ASSERT(sinfo.n_stream() == 1);
+        GGML_ASSERT(sinfo.idxs[0].size() == cell_count);
+        for (uint32_t i = 0; i < cell_count; ++i) {
+            const uint32_t idx = sinfo.idxs[0][i];
+            GGML_ASSERT(cells.pos_get(idx) == ubatch.pos[i]);
+            GGML_ASSERT(cells.seq_has(idx, dest_seq_id));
+        }
     } else {
         // whole KV cache restore

@@ -1784,15 +1793,24 @@ bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32
             }
         }

+        // Create contiguous slot_info for whole cache restore
+        sinfo.s0 = strm;
+        sinfo.s1 = strm;
+        sinfo.resize(1);
+        sinfo.strm[0] = strm;
+        sinfo.idxs[0].resize(cell_count);
+        for (uint32_t i = 0; i < cell_count; ++i) {
+            sinfo.idxs[0][i] = i;
+        }
+
         head = 0;
     }

     return true;
 }

-bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count) {
+bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, const slot_info & sinfo) {
     auto & cells = v_cells[strm];
-    auto & head = v_heads[strm];

     uint32_t v_trans;
     uint32_t n_layer;
@@ -1842,8 +1860,17 @@ bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32
         }

         if (cell_count) {
-            // Read and set the keys for the whole cell range
-            ggml_backend_tensor_set(k, io.read(cell_count * k_size_row), head * k_size_row, cell_count * k_size_row);
+            if (sinfo.is_contiguous()) {
+                // Fast path: contiguous cells, single memcpy
+                ggml_backend_tensor_set(k, io.read(cell_count * k_size_row), sinfo.head() * k_size_row, cell_count * k_size_row);
+            } else {
+                // Slow path: scatter to non-contiguous positions
+                const void * src = io.read(cell_count * k_size_row);
+                for (uint32_t i = 0; i < cell_count; ++i) {
+                    const size_t dst_offset = sinfo.idxs[0][i] * k_size_row;
+                    ggml_backend_tensor_set(k, (const char*)src + i * k_size_row, dst_offset, k_size_row);
+                }
+            }
         }
     }

@@ -1874,8 +1901,17 @@ bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32
         }

         if (cell_count) {
-            // Read and set the values for the whole cell range
-            ggml_backend_tensor_set(v, io.read(cell_count * v_size_row), head * v_size_row, cell_count * v_size_row);
+            if (sinfo.is_contiguous()) {
+                // Fast path: contiguous cells, single memcpy
+                ggml_backend_tensor_set(v, io.read(cell_count * v_size_row), sinfo.head() * v_size_row, cell_count * v_size_row);
+            } else {
+                // Slow path: scatter to non-contiguous positions
+                const void * src = io.read(cell_count * v_size_row);
+                for (uint32_t i = 0; i < cell_count; ++i) {
+                    const size_t dst_offset = sinfo.idxs[0][i] * v_size_row;
+                    ggml_backend_tensor_set(v, (const char*)src + i * v_size_row, dst_offset, v_size_row);
+                }
+            }
         }
     }
 } else {
@@ -1914,11 +1950,23 @@ bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32
         }

         if (cell_count) {
-            // For each row in the transposed matrix, read the values for the whole cell range
+            if (sinfo.is_contiguous()) {
+                // Fast path: contiguous cells
+                const uint32_t h = sinfo.head();
                 for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
-                    const size_t dst_offset = (head + j * cells.size()) * v_size_el;
+                    const size_t dst_offset = (h + j * cells.size()) * v_size_el;
                     ggml_backend_tensor_set(v, io.read(cell_count * v_size_el), dst_offset, cell_count * v_size_el);
                 }
+            } else {
+                // Slow path: scatter to non-contiguous positions
+                for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
+                    const void * src = io.read(cell_count * v_size_el);
+                    for (uint32_t i = 0; i < cell_count; ++i) {
+                        const size_t dst_offset = (sinfo.idxs[0][i] + j * cells.size()) * v_size_el;
+                        ggml_backend_tensor_set(v, (const char*)src + i * v_size_el, dst_offset, v_size_el);
+                    }
+                }
+            }
         }
     }
 }

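The three restore hunks above all follow the same pattern: when the destination cells form one contiguous block, the saved rows are written with a single call at the head offset, otherwise each row is scattered to its own cell offset. A minimal standalone sketch of that pattern, using a plain byte buffer and memcpy in place of ggml_backend_tensor_set (row_size, dst_idxs and the buffer are illustrative assumptions, not part of the patch):

#include <cstdint>
#include <cstring>
#include <vector>

// Copy cell_count rows of row_size bytes from src into dst.
// If dst_idxs is contiguous (h, h+1, h+2, ...), one memcpy suffices;
// otherwise each row is copied to dst_idxs[i] * row_size individually.
static void restore_rows(std::vector<uint8_t> & dst, const uint8_t * src,
                         const std::vector<uint32_t> & dst_idxs, size_t row_size) {
    bool contiguous = !dst_idxs.empty();
    for (size_t i = 1; i < dst_idxs.size(); ++i) {
        if (dst_idxs[i] != dst_idxs[0] + i) { contiguous = false; break; }
    }
    if (contiguous) {
        // fast path: single copy starting at the head cell
        std::memcpy(dst.data() + (size_t) dst_idxs[0] * row_size, src, dst_idxs.size() * row_size);
    } else {
        // slow path: one copy per row
        for (size_t i = 0; i < dst_idxs.size(); ++i) {
            std::memcpy(dst.data() + (size_t) dst_idxs[i] * row_size, src + i * row_size, row_size);
        }
    }
}

The fast path is what the old code always assumed; the slow path is what makes restoring a sequence into a fragmented cache possible.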
@@ -72,6 +72,23 @@ public:
     void clear() {
         idxs.clear();
     }
+
+    // check if indices are contiguous starting from head()
+    bool is_contiguous() const {
+        if (idxs.empty() || idxs[0].empty()) {
+            return true;
+        }
+        if (idxs.size() > 1) {
+            return false;
+        }
+        const uint32_t h = idxs[0][0];
+        for (size_t i = 0; i < idxs[0].size(); ++i) {
+            if (idxs[0][i] != h + i) {
+                return false;
+            }
+        }
+        return true;
+    }
 };

 using slot_info_vec_t = std::vector<slot_info>;

@@ -264,8 +281,8 @@ private:
     void state_write_meta(llama_io_write_i & io, const cell_ranges_t & cr, llama_seq_id seq_id = -1) const;
     void state_write_data(llama_io_write_i & io, const cell_ranges_t & cr) const;

-    bool state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, llama_seq_id dest_seq_id = -1);
-    bool state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count);
+    bool state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, slot_info & sinfo, llama_seq_id dest_seq_id = -1);
+    bool state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, const slot_info & sinfo);
 };

 class llama_kv_cache_context : public llama_memory_context_i {

@@ -485,7 +485,7 @@ struct llama_mlock::impl {
         if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit)) {
             suggest = false;
         }
-        if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size)) {
+        if (suggest && ((uint64_t)lock_limit.rlim_max > (uint64_t)lock_limit.rlim_cur + size)) {
             suggest = false;
         }
 #endif

@@ -473,6 +473,7 @@ llama_model_loader::llama_model_loader(
         std::vector<std::string> & splits,
         bool use_mmap,
         bool check_tensors,
+        bool no_alloc,
         const llama_model_kv_override * param_overrides_p,
         const llama_model_tensor_buft_override * param_tensor_buft_overrides_p) {
     int trace = 0;
@@ -716,6 +717,7 @@ llama_model_loader::llama_model_loader(

     this->use_mmap = use_mmap;
     this->check_tensors = check_tensors;
+    this->no_alloc = no_alloc;
 }

 std::string llama_model_loader::get_arch_name() const {

@@ -71,6 +71,7 @@ struct llama_model_loader {

     bool use_mmap = false;
     bool check_tensors;
+    bool no_alloc;

     llama_files files;
     llama_ftype ftype;
@@ -97,6 +98,7 @@ struct llama_model_loader {
         std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
         bool use_mmap,
         bool check_tensors,
+        bool no_alloc,
         const llama_model_kv_override * param_overrides_p,
         const llama_model_tensor_buft_override * param_tensor_buft_overrides_p);

@@ -120,6 +120,8 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_16B_A1B: return "16B.A1B";
         case LLM_TYPE_21B_A3B: return "21B.A3B";
         case LLM_TYPE_30B_A3B: return "30B.A3B";
+        case LLM_TYPE_31B_A3_5B: return "31B.A3.5B";
+        case LLM_TYPE_80B_A3B: return "80B.A3B";
         case LLM_TYPE_100B_A6B: return "100B.A6B";
         case LLM_TYPE_106B_A12B: return "106B.A12B";
         case LLM_TYPE_230B_A10B: return "230B.A10B";
@@ -423,8 +425,8 @@ static buft_list_t make_gpu_buft_list(ggml_backend_dev_t dev, llama_split_mode s
 }

 struct llama_model::impl {
-    impl() {}
-    ~impl() {}
+    impl() = default;
+    ~impl() = default;

     uint64_t n_elements = 0;

@@ -461,7 +463,7 @@ llama_model::llama_model(const llama_model_params & params) : params(params), pi
     pimpl->has_tensor_overrides = params.tensor_buft_overrides && params.tensor_buft_overrides[0].pattern;
 }

-llama_model::~llama_model() {}
+llama_model::~llama_model() = default;

 void llama_model::load_stats(llama_model_loader & ml) {
     pimpl->n_elements = ml.n_elements;
@@ -665,6 +667,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
         } else {
             hparams.swa_type = LLAMA_SWA_TYPE_CHUNKED;
             hparams.n_swa = 8192;
+            hparams.n_attn_temp_floor_scale = 8192;
+            hparams.f_attn_temp_scale = 0.1f;
+            hparams.f_attn_temp_offset = 1.0f;
             hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
         }

@@ -1262,18 +1267,25 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             } break;
         case LLM_ARCH_GEMMA3:
             {
+                const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+                if (found_swa && hparams.n_swa > 0) {
                     hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
                     hparams.set_swa_pattern(6);

                     hparams.rope_freq_base_train_swa = 10000.0f;
                     hparams.rope_freq_scale_train_swa = 1.0f;
+                } else {
+                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+                }

-                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
+                hparams.f_final_logit_softcapping = 0.0f;
+                ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

                 switch (hparams.n_layer) {
                     case 18: type = LLM_TYPE_270M; break;
                     case 26: type = LLM_TYPE_1B; break;
+                    case 32: type = LLM_TYPE_8B; break; // Rnj-1
                     case 34: type = LLM_TYPE_4B; break;
                     case 48: type = LLM_TYPE_12B; break;
                     case 62: type = LLM_TYPE_27B; break;
@@ -1597,8 +1609,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
                 ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);

-                switch (hparams.n_layer) {
-                    case 28: type = LLM_TYPE_20B; break;
+                switch (hparams.n_ff_exp) {
+                    case 1408: type = LLM_TYPE_16B; break;
+                    case 1792: type = LLM_TYPE_20B; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
@@ -1624,7 +1637,18 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     // that have no expert_gating_func model parameter set
                     hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
                 }
-                ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false);
+
+                if (ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, 0.0f)) {
+                    // [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
+                    // cancel the factor from the convert script
+                    hparams.rope_yarn_log_mul /= 0.1f;
+                }
+
+                // (optional) temperature tuning - used by mistral-large
+                ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false);
+                ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_LENGTH, hparams.n_attn_temp_floor_scale, false);
+
+                hparams.f_attn_temp_offset = 0.0f;

                 switch (hparams.n_layer) {
                     case 27: type = LLM_TYPE_16B; break;
@@ -1666,6 +1690,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
         case LLM_ARCH_GLM4:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);
                 switch (hparams.n_layer) {
                     case 40: type = LLM_TYPE_9B; break;
                     case 61: type = LLM_TYPE_32B; break;
@@ -1676,6 +1701,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             {
                 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);

                 // MoE parameters
                 ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert);
@@ -1774,6 +1800,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 }
             } break;
         case LLM_ARCH_NEMOTRON_H:
+        case LLM_ARCH_NEMOTRON_H_MOE:
             {
                 ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
                 ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
@@ -1789,7 +1816,14 @@ void llama_model::load_hparams(llama_model_loader & ml) {

                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
+                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
+                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared, false);
+                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
+                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
+
                 switch (hparams.n_layer) {
+                    case 52: type = LLM_TYPE_31B_A3_5B; break; // Nemotron-H_MOE 31B
                     case 56: type = LLM_TYPE_9B; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
@@ -2258,7 +2292,33 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 }

                 switch (hparams.n_layer) {
-                    case 80: type = LLM_TYPE_80B_A3B; break;
+                    case 48: type = LLM_TYPE_80B_A3B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_MISTRAL3:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false);
+
+                ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast, false);
+                ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow, false);
+                ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, 0.0f);
+
+                hparams.f_attn_temp_offset = 0.0f;
+
+                // TODO: maybe add n_attn_temp_floor_scale as a separate KV?
+                if (hparams.f_attn_temp_scale != 0.0f) {
+                    hparams.n_attn_temp_floor_scale = hparams.n_ctx_orig_yarn;
+                    if (hparams.n_attn_temp_floor_scale == 0) {
+                        throw std::runtime_error("invalid n_ctx_orig_yarn for attention temperature scaling");
+                    }
+                }
+
+                switch (hparams.n_layer) {
+                    case 26: type = LLM_TYPE_3B; break;
+                    case 34: type = LLM_TYPE_8B; break;
+                    case 40: type = LLM_TYPE_14B; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
@@ -2575,6 +2635,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         case LLM_ARCH_MINICPM:
         case LLM_ARCH_GRANITE:
         case LLM_ARCH_GRANITE_MOE:
+        case LLM_ARCH_MISTRAL3:
             {
                 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

@@ -3353,9 +3414,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);

                     // optional bias tensors
-                    layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
-                    layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
-                    layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
+                    layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+                    layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+                    layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);

                     layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

@@ -5124,6 +5185,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 }
             } break;
         case LLM_ARCH_NEMOTRON_H:
+        case LLM_ARCH_NEMOTRON_H_MOE:
             {
                 // mamba2 Mixer SSM params
                 // NOTE: int64_t for tensor dimensions
@@ -5134,6 +5196,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 const int64_t n_group = hparams.ssm_n_group;
                 const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_ssm_head;

+                const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
+                const int64_t n_ff_shexp = hparams.n_ff_shexp;
+
                 // embeddings
                 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

@@ -5183,6 +5248,19 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_k_gqa_i}, TENSOR_NOT_REQUIRED);
                         layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_v_gqa_i}, TENSOR_NOT_REQUIRED);
                         layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+                    } else {
+                        if (n_expert != 0) {
+                            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert}, 0);
+                            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert }, 0);
+
+                            // MoE branch
+                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
+                            layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+
+                            // Shared expert branch
+                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
+                            layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp}, 0);
+
                         } else {
                             // mlp layers
                             layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { hparams.n_ff(i), n_embd}, 0);
@@ -5191,6 +5269,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                             layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {hparams.n_ff(i)}, TENSOR_NOT_REQUIRED);
                         }
                     }
+                    }
             } break;
         case LLM_ARCH_EXAONE:
             {
@@ -6530,7 +6609,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), { n_embd, qkvz_dim }, 0);
                     layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0);
                     layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0);
-                    layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), { hparams.ssm_dt_rank }, 0);
+                    layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN, i), { hparams.ssm_dt_rank }, 0);
                     layer.ssm_beta_alpha = create_tensor(tn(LLM_TENSOR_SSM_BETA_ALPHA, "weight", i), { n_embd, ba_dim }, 0);
                     layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), { head_v_dim }, 0);
                     layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), { value_dim, n_embd }, 0);
@@ -6599,9 +6678,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

         std::vector<ggml_backend_buffer_ptr> bufs;
         if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) {
+            GGML_ASSERT(!ml.no_alloc);
             for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
                 // only the mmap region containing the tensors in the model is mapped to the backend buffer
-                // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
+                // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer,
+                // then we could just use metal for all layers
                 // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
                 void * addr = nullptr;
                 size_t first, last; // NOLINT
@@ -6617,9 +6698,16 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 bufs.emplace_back(buf);
                 buf_map.emplace(idx, buf);
             }
+        } else {
+            ggml_backend_buffer_t buf;
+            if (ml.no_alloc) {
+                buf = ggml_backend_buft_alloc_buffer(buft, /*size =*/ 0); // dummy buffer
+                for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
+                    t->buffer = buf; // set dummy buffer for weights so that the backend scheduler won't try to allocate them
+                }
+            } else {
+                buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); // real buffer
             }
-        else {
-            ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
             if (buf == nullptr) {
                 throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
             }
@@ -6674,6 +6762,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             }
         }

+        if (ml.no_alloc) {
+            return true;
+        }
+
         // load tensor data
         for (auto & [ctx, buf_map] : ctx_buf_maps) {
             if (!ml.load_all_data(ctx, buf_map, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
@@ -6716,11 +6808,20 @@ size_t llama_model::n_devices() const {

 std::map<ggml_backend_buffer_type_t, size_t> llama_model::memory_breakdown() const {
     std::map<ggml_backend_buffer_type_t, size_t> ret;
-    for (const auto & [_, bufs] : pimpl->ctxs_bufs) {
+    for (const auto & [ctx, bufs] : pimpl->ctxs_bufs) {
+        if (hparams.no_alloc) {
+            GGML_ASSERT(bufs.size() == 1);
+            ggml_backend_buffer_t buf = bufs[0].get();
+            GGML_ASSERT(ggml_backend_buffer_get_base(buf) == nullptr);
+            ggml_backend_buffer_type_t buft = ggml_backend_buffer_get_type(buf);
+            ret[buft] += ggml_backend_alloc_ctx_tensors_from_buft_size(ctx.get(), buft);
+        } else {
             for (const auto & buf : bufs) {
+                // GGML_ASSERT(ggml_backend_buffer_get_base(buf.get()) != nullptr); // multi_buffer does not have a defined base
                 ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
             }
         }
+    }
     return ret;
 }

@@ -6763,6 +6864,7 @@ void llama_model::print_info() const {
     // hparams
     LLAMA_LOG_INFO("%s: arch = %s\n", __func__, arch_name().c_str());
     LLAMA_LOG_INFO("%s: vocab_only = %d\n", __func__, hparams.vocab_only);
+    LLAMA_LOG_INFO("%s: no_alloc = %d\n", __func__, hparams.no_alloc);

     if (!hparams.vocab_only) {
         LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
@@ -6797,6 +6899,7 @@ void llama_model::print_info() const {
     LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
     LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
     LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
+    LLAMA_LOG_INFO("%s: rope_yarn_log_mul= %.4f\n", __func__, hparams.rope_yarn_log_mul);
     LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
     // MRoPE (Multi-axis Rotary Position Embedding) sections
     if (const auto & s = hparams.rope_sections; s[0] || s[1] || s[2] || s[3]) {
@@ -6819,7 +6922,8 @@ void llama_model::print_info() const {
         arch == LLM_ARCH_PLAMO2 ||
         arch == LLM_ARCH_GRANITE_HYBRID ||
         arch == LLM_ARCH_QWEN3NEXT ||
-        arch == LLM_ARCH_NEMOTRON_H) {
+        arch == LLM_ARCH_NEMOTRON_H ||
+        arch == LLM_ARCH_NEMOTRON_H_MOE) {
         LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
         LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
         LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
@@ -6860,7 +6964,6 @@ void llama_model::print_info() const {
         LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
         LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
         LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
-        LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
     }

     if (arch == LLM_ARCH_QWEN2MOE) {
@@ -6875,7 +6978,8 @@ void llama_model::print_info() const {
     if (arch == LLM_ARCH_MINICPM ||
         arch == LLM_ARCH_GRANITE ||
         arch == LLM_ARCH_GRANITE_MOE ||
-        arch == LLM_ARCH_GRANITE_HYBRID) {
+        arch == LLM_ARCH_GRANITE_HYBRID ||
+        arch == LLM_ARCH_NEMOTRON_H_MOE) {
         LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
         LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
         LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
@@ -7056,7 +7160,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
     if (arch == LLM_ARCH_FALCON_H1) {
         filter_attn = [&](int32_t) { return true; };
         filter_recr = [&](int32_t) { return true; };
-    } else if (arch == LLM_ARCH_NEMOTRON_H) {
+    } else if (arch == LLM_ARCH_NEMOTRON_H || arch == LLM_ARCH_NEMOTRON_H_MOE) {
         filter_attn = [&](int32_t il) {
             return !hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
         };
@@ -7304,7 +7408,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
             } break;
         case LLM_ARCH_GEMMA3:
             {
-                llm = std::make_unique<llm_build_gemma3_iswa>(*this, params);
+                if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
+                    llm = std::make_unique<llm_build_gemma3<true>>(*this, params);
+                } else {
+                    llm = std::make_unique<llm_build_gemma3<false>>(*this, params);
+                }
             } break;
         case LLM_ARCH_GEMMA3N:
             {
@@ -7423,6 +7531,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
                 llm = std::make_unique<llm_build_nemotron>(*this, params);
             } break;
         case LLM_ARCH_NEMOTRON_H:
+        case LLM_ARCH_NEMOTRON_H_MOE:
             {
                 llm = std::make_unique<llm_build_nemotron_h>(*this, params);
             } break;
@@ -7569,6 +7678,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
             {
                 llm = std::make_unique<llm_build_qwen3next>(*this, params);
             } break;
+        case LLM_ARCH_MISTRAL3:
+            {
+                llm = std::make_unique<llm_build_mistral3>(*this, params);
+            } break;
         default:
             GGML_ABORT("fatal error");
     }
@@ -7607,6 +7720,7 @@ llama_model_params llama_model_default_params() {
         /*.check_tensors =*/ false,
         /*.use_extra_bufts =*/ true,
         /*.no_host =*/ false,
+        /*.no_alloc =*/ false,
     };

     return result;
@@ -7706,6 +7820,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_ARWKV7:
         case LLM_ARCH_WAVTOKENIZER_DEC:
         case LLM_ARCH_NEMOTRON_H:
+        case LLM_ARCH_NEMOTRON_H_MOE:
            return LLAMA_ROPE_TYPE_NONE;

         // use what we call a normal RoPE, operating on pairs of consecutive head values
@@ -7726,7 +7841,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_DEEPSEEK2:
         case LLM_ARCH_PLM:
         case LLM_ARCH_CHATGLM:
-        case LLM_ARCH_GLM4:
         case LLM_ARCH_GRANITE:
         case LLM_ARCH_GRANITE_MOE:
         case LLM_ARCH_GRANITE_HYBRID:
@@ -7738,6 +7852,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_ARCEE:
         case LLM_ARCH_ERNIE4_5:
         case LLM_ARCH_ERNIE4_5_MOE:
+        case LLM_ARCH_MISTRAL3:
             return LLAMA_ROPE_TYPE_NORM;

         // the pairs of head values are offset by n_rot/2
@@ -7788,7 +7903,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_LFM2:
         case LLM_ARCH_LFM2MOE:
         case LLM_ARCH_SMALLTHINKER:
-        case LLM_ARCH_GLM4_MOE:
         case LLM_ARCH_SEED_OSS:
         case LLM_ARCH_GROVEMOE:
         case LLM_ARCH_APERTUS:
@@ -7805,6 +7919,11 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_QWEN3VLMOE:
             return LLAMA_ROPE_TYPE_IMROPE;

+        case LLM_ARCH_GLM4:
+            return model->hparams.use_mrope() ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NORM;
+        case LLM_ARCH_GLM4_MOE:
+            return model->hparams.use_mrope() ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NEOX;
+
         // all model arches should be listed explicitly here
         case LLM_ARCH_UNKNOWN:
             GGML_ABORT("unknown architecture");

@@ -114,6 +114,7 @@ enum llm_type {
     LLM_TYPE_16B_A1B,
     LLM_TYPE_21B_A3B, // Ernie MoE small
     LLM_TYPE_30B_A3B,
+    LLM_TYPE_31B_A3_5B,
     LLM_TYPE_80B_A3B, // Qwen3 Next
     LLM_TYPE_100B_A6B,
     LLM_TYPE_106B_A12B, // GLM-4.5-Air

@@ -596,7 +596,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     }

     std::vector<std::string> splits = {};
-    llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides, nullptr);
+    llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
     ml.init_mappings(false); // no prefetching

     llama_model model(llama_model_default_params());
@@ -666,7 +666,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::

     std::map<int, std::string> mapped;
     int blk_id = 0;
-    int pruned_attention_w = 0;

     // make a list of weights
     std::vector<const llama_model_loader::llama_tensor_weight *> tensors;
@@ -674,11 +673,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     for (const auto & it : ml.weights_map) {
         const std::string remapped_name(remap_layer(it.first, prune_list, mapped, blk_id));
         if (remapped_name.empty()) {
-            if (it.first.find("attn_v.weight") != std::string::npos ||
-                it.first.find("attn_qkv.weight") != std::string::npos ||
-                it.first.find("attn_kv_b.weight") != std::string::npos) {
-                pruned_attention_w++;
-            }
             LLAMA_LOG_DEBUG("%s: pruning tensor %s\n", __func__, it.first.c_str());
             continue;
         }
@@ -703,7 +697,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         });
     }

-    bool is_clip_model = false;
     for (const auto * it : tensors) {
         const struct ggml_tensor * tensor = it->tensor;

@@ -717,32 +710,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
             qs.has_output = true;
         }
-
-        is_clip_model |= name.rfind("mm.", 0) == 0; // check the "mm." prefix
     }

     qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;

-    // sanity checks for models that have attention layers
-    if (qs.n_attention_wv != 0 && !is_clip_model)
-    {
-        const auto & n_head_kv_iter = model.hparams.n_head_kv_arr.begin();
-        // attention layers have a non-zero number of kv heads
-        int32_t n_layer_attn = model.hparams.n_layer - std::count(n_head_kv_iter, n_head_kv_iter + model.hparams.n_layer, 0);
-        if (llama_model_has_encoder(&model)) {
-            // now n_layer_attn is the number of attention layers in the encoder
-            // for each decoder block, there are 2 attention layers
-            n_layer_attn += 2 * model.hparams.dec_n_layer;
-        }
-
-        // note: for linear-attention models (such as Qwen3 Next) this is the number of linear layers
-        const int32_t n_layer_recr = std::count(model.hparams.recurrent_layer_arr.begin(), model.hparams.recurrent_layer_arr.end(), true);
-
-        LLAMA_LOG_INFO("%s: n_layer_attn = %d, n_layer_recr = %d, pruned_attention_w = %d\n", __func__, n_layer_attn, n_layer_recr, pruned_attention_w);
-
-        GGML_ASSERT((qs.n_attention_wv == n_layer_attn - pruned_attention_w - n_layer_recr) && "n_attention_wv is unexpected");
-    }
-
     size_t total_size_org = 0;
     size_t total_size_new = 0;

@@ -1884,7 +1884,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             clean_spaces = false;
         } else if (
                 tokenizer_pre == "qwen2" ||
-                tokenizer_pre == "deepseek-r1-qwen") {
+                tokenizer_pre == "deepseek-r1-qwen" ||
+                tokenizer_pre == "kormo") {
             pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
             clean_spaces = false;
         } else if (
@@ -3243,8 +3244,7 @@ void llama_vocab::impl::print_info() const {
 llama_vocab::llama_vocab() : pimpl(new impl(*this)) {
 }

-llama_vocab::~llama_vocab() {
-}
+llama_vocab::~llama_vocab() = default;

 void llama_vocab::load(llama_model_loader & ml, const LLM_KV & kv) {
     pimpl->load(ml, kv);

@ -1,6 +1,9 @@
|
||||||
|
#include "llama.h"
|
||||||
|
|
||||||
#include "llama-impl.h"
|
#include "llama-impl.h"
|
||||||
|
|
||||||
#include "llama-chat.h"
|
#include "llama-chat.h"
|
||||||
|
#include "llama-context.h"
|
||||||
#include "llama-mmap.h"
|
#include "llama-mmap.h"
|
||||||
#include "llama-vocab.h"
|
#include "llama-vocab.h"
|
||||||
#include "llama-model-loader.h"
|
#include "llama-model-loader.h"
|
||||||
|
|
@ -11,11 +14,14 @@
|
||||||
#include "ggml-backend.h"
|
#include "ggml-backend.h"
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
|
#include <cassert>
|
||||||
|
#include <cinttypes>
|
||||||
#include <cstddef>
|
#include <cstddef>
|
||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
#include <ctime>
|
#include <ctime>
|
||||||
|
#include <stdexcept>
|
||||||
|
|
||||||
#if defined(_MSC_VER)
|
#if defined(_MSC_VER)
|
||||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||||
@@ -37,6 +43,646 @@ const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_ty
     GGML_ABORT("fatal error");
 }
 
+struct llama_device_memory_data {
+    int64_t total;
+    int64_t free;
+    llama_memory_breakdown_data mb;
+};
+
+static std::vector<llama_device_memory_data> llama_get_device_memory_data(
+        const char * path_model, const llama_model_params * mparams, const llama_context_params * cparams,
+        std::vector<ggml_backend_dev_t> & devs, uint32_t & hp_ngl, uint32_t & hp_n_ctx_train, uint32_t & hp_n_expert,
+        const ggml_log_level log_level) {
+    struct user_data_t {
+        struct {
+            ggml_log_callback callback;
+            void * user_data;
+        } original_logger;
+        ggml_log_level min_level; // prints below this log level go to debug log
+    };
+    user_data_t ud;
+    llama_log_get(&ud.original_logger.callback, &ud.original_logger.user_data);
+    ud.min_level = log_level;
+
+    llama_log_set([](ggml_log_level level, const char * text, void * user_data) {
+        const user_data_t * ud = (const user_data_t *) user_data;
+        const ggml_log_level level_eff = level >= ud->min_level ? level : GGML_LOG_LEVEL_DEBUG;
+        ud->original_logger.callback(level_eff, text, ud->original_logger.user_data);
+    }, &ud);
+
+    llama_model_params mparams_copy = *mparams;
+    mparams_copy.no_alloc = true;
+    mparams_copy.use_mmap = false;
+
+    llama_model * model = llama_model_load_from_file(path_model, mparams_copy);
+    if (model == nullptr) {
+        llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
+        throw std::runtime_error("failed to load model");
+    }
+
+    llama_context * ctx = llama_init_from_model(model, *cparams);
+    if (ctx == nullptr) {
+        llama_model_free(model);
+        llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
+        throw std::runtime_error("failed to create llama_context from model");
+    }
+
+    std::vector<llama_device_memory_data> ret(model->devices.size());
+
+    std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown = ctx->memory_breakdown();
+
+    for (const auto & [buft, mb] : memory_breakdown) {
+        if (ggml_backend_buft_is_host(buft)) {
+            continue;
+        }
+
+        ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
+        if (!dev) {
+            continue;
+        }
+        for (size_t i = 0; i < ret.size(); i++) {
+            if (model->devices[i] == dev) {
+                ret[i].mb.model   += mb.model;
+                ret[i].mb.context += mb.context;
+                ret[i].mb.compute += mb.compute;
+                break;
+            }
+        }
+    }
+    for (size_t i = 0; i < ret.size(); i++) {
+        size_t free, total;
+        ggml_backend_dev_memory(model->devices[i], &free, &total);
+        ret[i].free  = free;
+        ret[i].total = total;
+    }
+
+    devs           = model->devices;
+    hp_ngl         = model->hparams.n_layer;
+    hp_n_ctx_train = model->hparams.n_ctx_train;
+    hp_n_expert    = model->hparams.n_expert;
+
+    llama_memory_breakdown_print(ctx); // goes to debug log
+
+    llama_free(ctx);
+    llama_model_free(model);
+    llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
+    return ret;
+}
+
+// enum to identify part of a layer for distributing its tensors:
+enum layer_fraction_t {
+    LAYER_FRACTION_NONE = 0, // nothing
+    LAYER_FRACTION_ATTN = 1, // attention
+    LAYER_FRACTION_UP   = 2, // attention + up
+    LAYER_FRACTION_GATE = 3, // attention + up + gate
+    LAYER_FRACTION_MOE  = 4, // everything but sparse MoE weights
+};
+// this enum is only used in llama_params_fit_impl but needs to be defined outside of it to fix a Windows compilation issue
+
+static void llama_params_fit_impl(
+        const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
+        float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
+        size_t margin_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
+    constexpr int64_t MiB = 1024*1024;
+    const int64_t margin = margin_s; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
+    typedef std::vector<llama_device_memory_data> dmds_t;
+    const llama_model_params default_mparams = llama_model_default_params();
+
+    std::vector<ggml_backend_dev_t> devs;
+    uint32_t hp_ngl = 0; // hparams.n_gpu_layers
+    uint32_t hp_nct = 0; // hparams.n_ctx_train
+    uint32_t hp_nex = 0; // hparams.n_expert
+
+    // step 1: get data for default parameters and check whether any changes are necessary in the first place
+
+    LLAMA_LOG_DEBUG("%s: getting device memory data for initial parameters:\n", __func__);
+    const dmds_t dmds_full = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
+    const size_t nd = devs.size(); // number of devices
+    if (nd == 0) {
+        LLAMA_LOG_INFO("%s: no devices with dedicated memory found\n", __func__);
+        return;
+    }
+
+    std::vector<std::string> dev_names;
+    {
+        dev_names.reserve(nd);
+        size_t max_length = 0;
+        for (ggml_backend_dev_t dev : devs) {
+            std::string name = ggml_backend_dev_name(dev);
+            name += " (";
+            name += ggml_backend_dev_description(dev);
+            name += ")";
+            dev_names.push_back(name);
+            max_length = std::max(max_length, name.length());
+        }
+        for (std::string & dn : dev_names) {
+            dn.insert(dn.end(), max_length - dn.length(), ' ');
+        }
+    }
+
+    int64_t sum_total          = 0;
+    int64_t sum_projected_free = 0;
+    int64_t min_projected_free = INT64_MAX;
+    int64_t sum_projected_used = 0;
+    int64_t sum_projected_ctx  = 0;
+
+    if (nd > 1) {
+        LLAMA_LOG_INFO("%s: projected memory use with initial parameters [MiB]:\n", __func__);
+    }
+    for (size_t id = 0; id < nd; id++) {
+        const llama_device_memory_data & dmd = dmds_full[id];
+
+        const int64_t projected_used = dmd.mb.total();
+        const int64_t projected_free = dmd.free - projected_used;
+
+        sum_total          += dmd.total;
+        sum_projected_used += projected_used;
+        sum_projected_free += projected_free;
+        min_projected_free  = std::min(min_projected_free, projected_free);
+        sum_projected_ctx  += dmd.mb.context;
+
+        if (nd > 1) {
+            LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " %s\n",
+                __func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, std::abs(projected_free)/MiB,
+                projected_free >= 0 ? "surplus" : "deficit");
+        }
+    }
+    assert(sum_total >= 0 && sum_projected_used >= 0 && sum_projected_ctx >= 0);
+    assert(sum_projected_used >= sum_projected_ctx);
+    LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
+        __func__, sum_projected_used/MiB, sum_total/MiB);
+    if (min_projected_free >= margin) {
+        if (nd == 1) {
+            LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
+                __func__, min_projected_free/MiB, margin/MiB);
+            return;
+        }
+        LLAMA_LOG_INFO("%s: will leave at least %" PRId64 " >= %" PRId64 " MiB of free memory on all devices, no changes needed\n",
+            __func__, min_projected_free/MiB, margin/MiB);
+        return;
+    }
+
+    // step 2: try reducing memory use by reducing the context size
+
+    {
+        int64_t global_surplus = sum_projected_free - int64_t(nd)*margin;
+        if (global_surplus < 0) {
+            LLAMA_LOG_INFO(nd == 1 ?
+                "%s: cannot fulfill margin of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n" :
+                "%s: cannot fulfill margin of %" PRId64 " MiB on all devices, need to use %" PRId64 " MiB less in total\n",
+                __func__, margin/MiB, -global_surplus/MiB);
+            if (cparams->n_ctx == 0) {
+                if (hp_nct > n_ctx_min) {
+                    const int64_t bytes_per_ctx = sum_projected_ctx / hp_nct;
+                    const uint32_t ctx_reduction = std::min(
+                        uint32_t((-global_surplus + bytes_per_ctx - 1) / bytes_per_ctx), hp_nct - n_ctx_min);
+                    cparams->n_ctx = hp_nct - ctx_reduction;
+                    const int64_t memory_reduction = ctx_reduction * bytes_per_ctx;
+                    global_surplus += memory_reduction;
+                    LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
+                        __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
+                    if (global_surplus >= 0) {
+                        if (nd == 1) {
+                            LLAMA_LOG_INFO("%s: entire model can be fit by reducing context\n", __func__);
+                            return;
+                        }
+                        LLAMA_LOG_INFO("%s: entire model should be fit across devices by reducing context\n", __func__);
+                    }
+                } else {
+                    LLAMA_LOG_INFO("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n",
+                        __func__, hp_nct, n_ctx_min);
+                }
+            } else {
+                LLAMA_LOG_INFO("%s: context size set by user to %" PRIu32 " -> no change\n", __func__, cparams->n_ctx);
+            }
+        }
+    }
+
+    if (mparams->n_gpu_layers != default_mparams.n_gpu_layers) {
+        throw std::runtime_error("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort");
+    }
+    if (nd > 1) {
+        if (!tensor_split) {
+            throw std::runtime_error("did not provide a buffer to write the tensor_split to, abort");
+        }
+        if (mparams->tensor_split) {
+            for (size_t id = 0; id < nd; id++) {
+                if (mparams->tensor_split[id] != 0.0f) {
+                    throw std::runtime_error("model_params::tensor_split already set by user, abort");
+                }
+            }
+        }
+        if (mparams->split_mode == LLAMA_SPLIT_MODE_ROW) {
+            throw std::runtime_error("changing weight allocation for LLAMA_SPLIT_MODE_ROW not implemented, abort");
+        }
+        if (hp_ngl < 2*nd) {
+            throw std::runtime_error("model has only " + std::to_string(hp_ngl) + " layers but need at least "
+                + std::to_string(2*nd) + " to fit memory for " + std::to_string(nd) + " devices, abort");
+        }
+    }
+    if (!tensor_buft_overrides) {
+        throw std::runtime_error("did not provide buffer to set tensor_buft_overrides, abort");
+    }
+    if (mparams->tensor_buft_overrides && (mparams->tensor_buft_overrides->pattern || mparams->tensor_buft_overrides->buft)) {
+        throw std::runtime_error("model_params::tensor_buft_overrides already set by user, abort");
+    }
+
+    // step 3: iteratively fill the back to front with "dense" layers
+    // - for a dense model simply fill full layers, giving each device a contiguous slice of the model
+    // - for a MoE model, same as dense model but with all MoE tensors in system memory
+
+    // utility function that returns a static C string matching the tensors for a specific layer index and layer fraction:
+    auto get_overflow_pattern = [&](const size_t il, const layer_fraction_t lf) -> const char * {
+        constexpr size_t n_strings = 1000;
+        if (il >= n_strings) {
+            throw std::runtime_error("at most " + std::to_string(n_strings) + " model layers are supported");
+        }
+        switch (lf) {
+            case LAYER_FRACTION_ATTN: {
+                static std::array<std::string, n_strings> patterns;
+                if (patterns[il].empty()) {
+                    patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(up|gate|down).*";
+                }
+                return patterns[il].c_str();
+            }
+            case LAYER_FRACTION_UP: {
+                static std::array<std::string, n_strings> patterns;
+                if (patterns[il].empty()) {
+                    patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(gate|down).*";
+                }
+                return patterns[il].c_str();
+            }
+            case LAYER_FRACTION_GATE: {
+                static std::array<std::string, n_strings> patterns;
+                if (patterns[il].empty()) {
+                    patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_down.*";
+                }
+                return patterns[il].c_str();
+            }
+            case LAYER_FRACTION_MOE: {
+                static std::array<std::string, n_strings> patterns;
+                if (patterns[il].empty()) {
+                    patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(up|down|gate)_(ch|)exps";
+                }
+                return patterns[il].c_str();
+            }
+            default:
+                GGML_ABORT("fatal error");
+        }
+    };
+
+    struct ngl_t {
+        uint32_t n_layer = 0; // number of total layers
+        uint32_t n_part  = 0; // number of partial layers, <= n_layer
+
+        // for the first partial layer varying parts can overflow, all further layers use LAYER_FRACTION_MOE:
+        layer_fraction_t overflow_type = LAYER_FRACTION_MOE;
+    };
+
+    const size_t ntbo = llama_max_tensor_buft_overrides();
+
+    // utility function to set n_gpu_layers and tensor_split
+    auto set_ngl_tensor_split_tbo = [&](
+            const std::vector<ngl_t> & ngl_per_device,
+            const std::vector<ggml_backend_buffer_type_t> & overflow_bufts,
+            llama_model_params & mparams,
+            const bool add_nonrepeating) {
+        mparams.n_gpu_layers = 0;
+        for (size_t id = 0; id < nd; id++) {
+            mparams.n_gpu_layers += ngl_per_device[id].n_layer;
+            if (nd > 1) {
+                tensor_split[id] = ngl_per_device[id].n_layer;
+            }
+        }
+        assert(uint32_t(mparams.n_gpu_layers) <= hp_ngl);
+        uint32_t il0 = hp_ngl - mparams.n_gpu_layers; // start index for tensor buft overrides
+
+        if (add_nonrepeating) {
+            mparams.n_gpu_layers += 1;
+            tensor_split[nd - 1] += 1;
+        }
+        mparams.tensor_split = tensor_split;
+
+        size_t itbo = 0;
+        for (size_t id = 0; id < nd; id++) {
+            il0 += ngl_per_device[id].n_layer - ngl_per_device[id].n_part;
+            for (uint32_t il = il0; il < il0 + ngl_per_device[id].n_part; il++) {
+                if (itbo + 1 >= ntbo) {
+                    tensor_buft_overrides[itbo].pattern = nullptr;
+                    tensor_buft_overrides[itbo].buft    = nullptr;
+                    itbo++;
+                    mparams.tensor_buft_overrides = tensor_buft_overrides;
+                    throw std::runtime_error("llama_params_fit_n_tensor_buft_overrides() == "
+                        + std::to_string(ntbo) + " is insufficient for model\n");
+                }
+                tensor_buft_overrides[itbo].pattern = get_overflow_pattern(il, il == il0 ? ngl_per_device[id].overflow_type : LAYER_FRACTION_MOE);
+                tensor_buft_overrides[itbo].buft    = overflow_bufts[id];
+                itbo++;
+            }
+            il0 += ngl_per_device[id].n_part;
+        }
+        tensor_buft_overrides[itbo].pattern = nullptr;
+        tensor_buft_overrides[itbo].buft    = nullptr;
+        itbo++;
+        mparams.tensor_buft_overrides = tensor_buft_overrides;
+    };
+
+    // utility function that returns the memory use per device for given numbers of layers per device
+    auto get_memory_for_layers = [&](
+            const char * func_name,
+            const std::vector<ngl_t> & ngl_per_device,
+            const std::vector<ggml_backend_buffer_type_t> & overflow_bufts,
+            const bool add_nonrepeating) -> std::vector<int64_t> {
+        llama_model_params mparams_copy = *mparams;
+        set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, mparams_copy, add_nonrepeating);
+
+        const dmds_t dmd_nl = llama_get_device_memory_data(
+            path_model, &mparams_copy, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
+
+        LLAMA_LOG_DEBUG("%s: memory for test allocation by device:\n", func_name);
+        for (size_t id = 0; id < nd; id++) {
+            const ngl_t & n = ngl_per_device[id];
+            LLAMA_LOG_DEBUG(
+                "%s: id=%zu, n_layer=%2" PRIu32 ", n_part=%2" PRIu32 ", overflow_type=%d, mem=%6" PRId64 " MiB\n",
+                func_name, id, n.n_layer, n.n_part, int(n.overflow_type), dmd_nl[id].mb.total()/MiB);
+        }
+
+        std::vector<int64_t> ret;
+        ret.reserve(nd);
+        for (const llama_device_memory_data & dmd : dmd_nl) {
+            ret.push_back(dmd.mb.total());
+        }
+        return ret;
+    };
+
+    int64_t global_surplus_cpu_moe = 0;
+    if (hp_nex > 0) {
+        const static std::string pattern_moe_all = "blk\\.\\d+\\.ffn_(up|down|gate)_(ch|)exps"; // matches all MoE tensors
+        ggml_backend_buffer_type_t cpu_buft = ggml_backend_cpu_buffer_type();
+        tensor_buft_overrides[0] = {pattern_moe_all.c_str(), cpu_buft};
+        tensor_buft_overrides[1] = {nullptr, nullptr};
+        mparams->tensor_buft_overrides = tensor_buft_overrides;
+
+        LLAMA_LOG_DEBUG("%s: getting device memory data with all MoE tensors moved to system memory:\n", __func__);
+        const dmds_t dmds_cpu_moe = llama_get_device_memory_data(
+            path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
+
+        for (const llama_device_memory_data & dmd : dmds_cpu_moe) {
+            global_surplus_cpu_moe += dmd.free;
+            global_surplus_cpu_moe -= int64_t(dmd.mb.total()) + margin;
+        }
+
+        if (global_surplus_cpu_moe > 0) {
+            LLAMA_LOG_INFO("%s: with only dense weights in device memory there is a total surplus of %" PRId64 " MiB\n",
+                __func__, global_surplus_cpu_moe/MiB);
+        } else {
+            LLAMA_LOG_INFO("%s: with only dense weights in device memory there is still a total deficit of %" PRId64 " MiB\n",
+                __func__, -global_surplus_cpu_moe/MiB);
+        }
+
+        // reset
+        tensor_buft_overrides[0] = {nullptr, nullptr};
+        mparams->tensor_buft_overrides = tensor_buft_overrides;
+    }
+
+    std::vector<int64_t> targets; // maximum acceptable memory use per device
+    targets.reserve(nd);
+    for (size_t id = 0; id < nd; id++) {
+        targets.push_back(dmds_full[id].free - margin);
+        LLAMA_LOG_DEBUG("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB);
+    }
+
+    // whether for the optimal memory use we expect to load at least some MoE tensors:
+    const bool partial_moe = hp_nex > 0 && global_surplus_cpu_moe > 0;
+
+    std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the partial layers of a device overflow to:
+    overflow_bufts.reserve(nd);
+    for (size_t id = 0; id < nd - 1; ++id) {
+        overflow_bufts.push_back(ggml_backend_dev_buffer_type(devs[id + 1]));
+    }
+    overflow_bufts.push_back(ggml_backend_cpu_buffer_type());
+
+    std::vector<ngl_t> ngl_per_device(nd);
+    std::vector<int64_t> mem = get_memory_for_layers(__func__, ngl_per_device, overflow_bufts, partial_moe);
+    if (hp_nex > 0) {
+        for (size_t id = 0; id < nd; id++) {
+            ngl_per_device[id].overflow_type = LAYER_FRACTION_MOE;
+        }
+    }
+
+    // optimize the number of layers per device using the method of false position:
+    // - ngl_per_device has 0 layers for each device, lower bound
+    // - try a "high" configuration where a device is given all unassigned layers
+    // - interpolate the memory use / layer between low and high linearly to get a guess where it meets our target
+    // - check memory use of our guess, replace either the low or high bound
+    // - once we only have a difference of a single layer, stop and return the lower bound that just barely still fits
+    if (hp_nex == 0) {
+        LLAMA_LOG_INFO("%s: filling dense layers back-to-front:\n", __func__);
+    } else {
+        LLAMA_LOG_INFO("%s: filling dense-only layers back-to-front:\n", __func__);
+    }
+    uint32_t n_unassigned = hp_ngl;
+    for (int id = nd - 1; id >= 0; id--) {
+        std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
+        ngl_per_device_high[id].n_layer = n_unassigned;
+        if (hp_nex > 0) {
+            ngl_per_device_high[id].n_part = ngl_per_device_high[id].n_layer;
+        }
+        if (ngl_per_device_high[id].n_layer > 0) {
+            std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts, partial_moe);
+            if (mem_high[id] > targets[id]) {
+                uint32_t delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
+                while (delta > 1) {
+                    uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
+                    step_size = std::max(step_size, uint32_t(1));
+                    step_size = std::min(step_size, delta - 1);
+
+                    std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
+                    ngl_per_device_test[id].n_layer += step_size;
+                    if (hp_nex) {
+                        ngl_per_device_test[id].n_part += step_size;
+                    }
+                    const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
+
+                    if (mem_test[id] <= targets[id]) {
+                        ngl_per_device = ngl_per_device_test;
+                        mem = mem_test;
+                        n_unassigned -= ngl_per_device[id].n_layer;
+                        LLAMA_LOG_DEBUG("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
+                    } else {
+                        ngl_per_device_high = ngl_per_device_test;
+                        mem_high = mem_test;
+                        LLAMA_LOG_DEBUG("%s: set ngl_per_device_high[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
+                    }
+                    delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
+                }
+            } else {
+                ngl_per_device = ngl_per_device_high;
+                n_unassigned -= ngl_per_device[id].n_layer;
+                LLAMA_LOG_DEBUG("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
+            }
+        }
+
+        const int64_t projected_margin = dmds_full[id].free - mem[id];
+        LLAMA_LOG_INFO(
+            "%s: - %s: %2" PRIu32 " layers, %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
+            __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, mem[id]/MiB, projected_margin/MiB);
+    }
+    if (hp_nex == 0 || global_surplus_cpu_moe <= 0) {
+        set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams, partial_moe);
+        return;
+    }
+
+    // step 4: for a MoE model where all dense tensors fit,
+    //     convert the dense-only layers in the back to full layers in the front until all devices are full
+    //     essentially the same procedure as for the dense-only layers except front-to-back
+    //     also, try fitting at least part of one more layer to reduce waste for "small" GPUs with e.g. 24 GiB VRAM
+
+    size_t id_dense_start = nd;
+    for (int id = nd - 1; id >= 0; id--) {
+        if (ngl_per_device[id].n_layer > 0) {
+            id_dense_start = id;
+            continue;
+        }
+        break;
+    }
+    assert(id_dense_start < nd);
+
+    LLAMA_LOG_INFO("%s: converting dense-only layers to full layers and filling them front-to-back with overflow to next device/system memory:\n", __func__);
+    for (size_t id = 0; id <= id_dense_start; id++) {
+        std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
+        for (size_t jd = id_dense_start; jd < nd; jd++) {
+            const uint32_t n_layer_move = ngl_per_device_high[jd].n_layer;
+            ngl_per_device_high[id].n_layer += n_layer_move;
+            ngl_per_device_high[jd].n_layer -= n_layer_move;
+            ngl_per_device_high[jd].n_part   = 0;
+        }
+        size_t id_dense_start_high = nd - 1;
+        std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts, partial_moe);
+
+        if (mem_high[id] > targets[id]) {
+            assert(ngl_per_device_high[id].n_layer >= ngl_per_device_high[id].n_part);
+            assert(ngl_per_device[id].n_layer >= ngl_per_device[id].n_part);
+            assert((ngl_per_device_high[id].n_layer - ngl_per_device_high[id].n_part)
+                >= ngl_per_device[id].n_layer - ngl_per_device[id].n_part);
+            uint32_t delta = (ngl_per_device_high[id].n_layer - ngl_per_device_high[id].n_part)
+                - (ngl_per_device[id].n_layer - ngl_per_device[id].n_part);
+            while (delta > 1) {
+                uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
+                step_size = std::max(step_size, uint32_t(1));
+                step_size = std::min(step_size, delta - 1);
+
+                std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
+                size_t id_dense_start_test = id_dense_start;
+                uint32_t n_converted_test = 0;
+                for (;id_dense_start_test < nd; id_dense_start_test++) {
+                    const uint32_t n_convert_jd = std::min(step_size - n_converted_test, ngl_per_device_test[id_dense_start_test].n_part);
+                    ngl_per_device_test[id_dense_start_test].n_layer -= n_convert_jd;
+                    ngl_per_device_test[id_dense_start_test].n_part  -= n_convert_jd;
+                    ngl_per_device_test[id].n_layer += n_convert_jd;
+                    n_converted_test += n_convert_jd;
+
+                    if (ngl_per_device_test[id_dense_start_test].n_layer > 0) {
+                        break;
+                    }
+                }
+                const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
+
+                if (mem_test[id] <= targets[id]) {
+                    ngl_per_device = ngl_per_device_test;
+                    mem = mem_test;
+                    id_dense_start = id_dense_start_test;
+                    LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
+                        __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
+                } else {
+                    ngl_per_device_high = ngl_per_device_test;
+                    mem_high = mem_test;
+                    id_dense_start_high = id_dense_start_test;
+                    LLAMA_LOG_DEBUG("%s: set ngl_per_device_high[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start_high=%zu\n",
+                        __func__, id, ngl_per_device_high[id].n_layer, ngl_per_device_high[id].n_part, id_dense_start_high);
+                }
+                delta = (ngl_per_device_high[id].n_layer - ngl_per_device_high[id].n_part)
+                    - (ngl_per_device[id].n_layer - ngl_per_device[id].n_part);
+            }
+        } else {
+            ngl_per_device = ngl_per_device_high;
+            id_dense_start = id_dense_start_high;
+            LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
+                __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
+        }
+
+        // try to fit at least part of one more layer
+        if (ngl_per_device[id_dense_start].n_layer > 0) {
+            std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
+            size_t id_dense_start_test = id_dense_start;
+            ngl_per_device_test[id_dense_start_test].n_layer--;
+            ngl_per_device_test[id_dense_start_test].n_part--;
+            ngl_per_device_test[id].n_layer++;
+            ngl_per_device_test[id].n_part++;
+            if (ngl_per_device_test[id_dense_start_test].n_layer == 0) {
+                id_dense_start_test++;
+            }
+            ngl_per_device_test[id].overflow_type = LAYER_FRACTION_UP;
+            LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__);
+            std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
+            if (mem_test[id] < targets[id]) {
+                ngl_per_device = ngl_per_device_test;
+                mem = mem_test;
+                id_dense_start = id_dense_start_test;
+                LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", UP), id_dense_start=%zu\n",
+                    __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
+
+                ngl_per_device_test[id].overflow_type = LAYER_FRACTION_GATE;
+                LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_GATE\n", __func__);
+                mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
+                if (mem_test[id] < targets[id]) {
+                    ngl_per_device = ngl_per_device_test;
+                    mem = mem_test;
+                    id_dense_start = id_dense_start_test;
+                    LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", GATE), id_dense_start=%zu\n",
+                        __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
+                }
+            } else {
+                ngl_per_device_test[id].overflow_type = LAYER_FRACTION_ATTN;
+                LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_ATTN\n", __func__);
+                mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
+                if (mem_test[id] < targets[id]) {
+                    ngl_per_device = ngl_per_device_test;
+                    mem = mem_test;
+                    id_dense_start = id_dense_start_test;
+                    LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", ATTN), id_dense_start=%zu\n",
+                        __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
+                }
+            }
+        }
+
+        const int64_t projected_margin = dmds_full[id].free - mem[id];
+        LLAMA_LOG_INFO(
+            "%s: - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
+            __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
+    }
+
+    set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams, partial_moe);
+}
+
+bool llama_params_fit(
+        const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
+        float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
+        size_t margin_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
+    const int64_t t0_us = llama_time_us();
+    bool ok = true;
+    try {
+        llama_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margin_s, n_ctx_min, log_level);
+        LLAMA_LOG_INFO("%s: successfully fit params to free device memory\n", __func__);
+    } catch (const std::runtime_error & e) {
+        LLAMA_LOG_WARN("%s: failed to fit params to free device memory: %s\n", __func__, e.what());
+        ok = false;
+    }
+    const int64_t t1_us = llama_time_us();
+    LLAMA_LOG_INFO("%s: fitting params to free memory took %.2f seconds\n", __func__, (t1_us - t0_us) * 1e-6);
+    return ok;
+}
+
 struct llama_sampler_chain_params llama_sampler_chain_default_params() {
     struct llama_sampler_chain_params result = {
         /*.no_perf =*/ true,
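The hunk above adds `llama_params_fit()`, which test-loads the model with `no_alloc` to measure per-device memory, shrinks the context if allowed, and then distributes layers with the method of false position until every device stays under the requested margin. Below is a minimal caller-side sketch; it is not part of the diff, the helper name `load_with_fit` and the chosen margin are illustrative only, and it assumes the matching public declarations (including `llama_max_tensor_buft_overrides()`) are added to `llama.h` as part of the same change.

```cpp
#include <vector>
#include "llama.h"

// hypothetical helper: let llama_params_fit() adjust the params, then load as usual
bool load_with_fit(const char * path, llama_model ** out_model, llama_context ** out_ctx) {
    llama_model_params   mparams = llama_model_default_params();
    llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx = 0; // 0 = derive from the model, which lets the fitter shrink it

    // scratch buffers that llama_params_fit() may fill in and point mparams at
    std::vector<float> tensor_split(llama_max_devices(), 0.0f);
    std::vector<llama_model_tensor_buft_override> tbo(llama_max_tensor_buft_overrides());

    // 1 GiB margin per device, do not shrink the context below 4096, normal log verbosity
    llama_params_fit(path, &mparams, &cparams, tensor_split.data(), tbo.data(),
        /*margin_s =*/ 1024u*1024u*1024u, /*n_ctx_min =*/ 4096, GGML_LOG_LEVEL_INFO);

    *out_model = llama_model_load_from_file(path, mparams);
    if (*out_model == nullptr) {
        return false;
    }
    *out_ctx = llama_init_from_model(*out_model, cparams);
    return *out_ctx != nullptr;
}
```

Note the design choice visible in the implementation: the fitter never overrides values the user already set (non-default `n_gpu_layers`, `tensor_split`, or `tensor_buft_overrides` all abort), so callers opt in simply by leaving those at their defaults.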
@@ -49,6 +695,10 @@ size_t llama_max_devices(void) {
     return 16;
 }
 
+size_t llama_max_tensor_buft_overrides() {
+    return 4096;
+}
+
 bool llama_supports_mmap(void) {
     return llama_mmap::SUPPORTED;
 }

@@ -108,11 +758,12 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
     model.t_start_us = tm.t_start_us;
 
     try {
-        llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides, params.tensor_buft_overrides);
+        llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
 
         ml.print_info();
 
         model.hparams.vocab_only = params.vocab_only;
+        model.hparams.no_alloc   = params.no_alloc;
 
         try {
             model.load_arch(ml);

@@ -1,7 +1,5 @@
 #include "models.h"
 
-
-
 llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_graph_params & params) :
     llm_graph_context(params) {
     // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B
@@ -20,9 +18,15 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
 
     // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
     // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
-    const float mscale = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
+    // And also: https://github.com/ggml-org/llama.cpp/pull/17945 [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
+
+    // first cancel the adjustment from llama_hparams::yarn_attn_factor_adjust to get the original attn_factor
+    GGML_ASSERT(ext_factor >= 0.0f);
+    const float attn_factor_org = attn_factor * (1.0f + 0.1f * logf(1.0f / freq_scale));
+
+    // use the original attn_factor to pre-scale the kq_scale
+    const float mscale = attn_factor_org * (1.0f + 0.1f * hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
     const float kq_scale = 1.0f * mscale * mscale / sqrtf(float(n_embd_head_k));
-    const float attn_factor = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
 
     ggml_tensor * cur;
     ggml_tensor * inpL;
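For reference, the arithmetic of the replacement above, written out with s for `freq_scale`, alpha for `hparams.rope_yarn_log_mul`, and d_k for `n_embd_head_k` (this only restates the code, it is not an additional change):

```latex
\mathrm{attn\_factor_{org}} = \mathrm{attn\_factor}\left(1 + 0.1\,\ln\tfrac{1}{s}\right),\qquad
m = \mathrm{attn\_factor_{org}}\left(1 + 0.1\,\alpha\,\ln\tfrac{1}{s}\right),\qquad
k_{q} = \frac{m^{2}}{\sqrt{d_k}}
```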
@@ -30,6 +34,12 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
     // {n_embd, n_tokens}
     inpL = build_inp_embd(model.tok_embd);
 
+    // (optional) temperature tuning - used by mistral-large
+    ggml_tensor * inp_attn_scale = nullptr;
+    if (hparams.f_attn_temp_scale != 0.0f) {
+        inp_attn_scale = build_inp_attn_scale();
+    }
+
     // inp_pos - contains the positions
     ggml_tensor * inp_pos = build_inp_pos();

@@ -128,6 +138,12 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
             ggml_tensor * Vcur = kv_cmpr;
             cb(Vcur, "Vcur", il);
 
+            if (inp_attn_scale) {
+                // apply llama 4 temperature scaling
+                Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale);
+                cb(Qcur, "Qcur_attn_temp_scaled", il);
+            }
+
             // note: MLA with the absorption optimzation converts into MQA (ie: GQA with 1 group)
             cur = build_attn(inp_attn,
                     model.layers[il].wo, NULL,

@@ -160,6 +176,12 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
             ggml_tensor * Kcur = ggml_concat(ctx0, ggml_repeat(ctx0, k_pe, q_pe), k_nope, 0);
             cb(Kcur, "Kcur", il);
 
+            if (inp_attn_scale) {
+                // apply llama 4 temperature scaling
+                Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale);
+                cb(Qcur, "Qcur_attn_temp_scaled", il);
+            }
+
             // note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups)
             cur = build_attn(inp_attn,
                     model.layers[il].wo, NULL,
@@ -1,6 +1,7 @@
 #include "models.h"
 
-llm_build_gemma3_iswa::llm_build_gemma3_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+template <bool iswa>
+llm_build_gemma3<iswa>::llm_build_gemma3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_k;
 
     ggml_tensor * cur;

@@ -17,13 +18,28 @@ llm_build_gemma3_iswa::llm_build_gemma3_iswa(const llama_model & model, const ll
     ggml_tensor * inp_pos = build_inp_pos();
 
     // TODO: is causal == true correct? might need some changes
-    auto * inp_attn = build_attn_inp_kv_iswa();
+    using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
+    inp_attn_type * inp_attn = nullptr;
+
+    if constexpr (iswa) {
+        inp_attn = build_attn_inp_kv_iswa();
+    } else {
+        inp_attn = build_attn_inp_kv();
+    }
 
     ggml_tensor * inp_out_ids = build_inp_out_ids();
 
     for (int il = 0; il < n_layer; ++il) {
-        const float freq_base_l  = model.get_rope_freq_base (cparams, il);
-        const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
+        float freq_base_l  = 0.0f;
+        float freq_scale_l = 0.0f;
+
+        if constexpr (iswa) {
+            freq_base_l  = model.get_rope_freq_base (cparams, il);
+            freq_scale_l = model.get_rope_freq_scale(cparams, il);
+        } else {
+            freq_base_l  = freq_base;
+            freq_scale_l = freq_scale;
+        }
 
         // norm
         cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
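The Gemma 3 builder above becomes a template over `iswa`, picking the attention-input type at compile time with `std::conditional_t` and branching with `if constexpr`, then instantiating both variants explicitly at the end of the file. A self-contained toy showing the same idiom (the names here are illustrative, not taken from llama.cpp):

```cpp
#include <string>
#include <type_traits>

struct WindowedInput { std::string kind = "iswa"; };
struct StandardInput { std::string kind = "kv";   };

template <bool windowed>
std::string build_input_kind() {
    // the alias resolves at compile time, so only the matching branch is instantiated
    using input_t = std::conditional_t<windowed, WindowedInput, StandardInput>;
    input_t inp;
    if constexpr (windowed) {
        return inp.kind + " (sliding window)";
    } else {
        return inp.kind + " (full attention)";
    }
}

// explicit instantiations, mirroring the `template struct llm_build_gemma3<...>;` lines in the diff
template std::string build_input_kind<true>();
template std::string build_input_kind<false>();
```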
@@ -102,7 +118,7 @@ llm_build_gemma3_iswa::llm_build_gemma3_iswa(const llama_model & model, const ll
         cur = build_norm(cur,
                 model.layers[il].ffn_post_norm, NULL,
                 LLM_NORM_RMS, -1);
-        cb(cur, "ffn_post_norm", -1);
+        cb(cur, "ffn_post_norm", il);
 
         cur = ggml_add(ctx0, cur, sa_out);
 

@@ -124,8 +140,17 @@ llm_build_gemma3_iswa::llm_build_gemma3_iswa(const llama_model & model, const ll
     // lm_head
     cur = build_lora_mm(model.output, cur);
 
+    if (hparams.f_final_logit_softcapping) {
+        cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
+        cur = ggml_tanh(ctx0, cur);
+        cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping);
+    }
+
     cb(cur, "result_output", -1);
     res->t_logits = cur;
 
     ggml_build_forward_expand(gf, cur);
 }
+
+template struct llm_build_gemma3<false>;
+template struct llm_build_gemma3<true>;
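The three added ggml ops (scale by 1/c, tanh, scale by c) are the usual final-logit softcapping, y = c * tanh(x / c), applied element-wise to the logits. A scalar restatement of the same arithmetic, for reference only:

```cpp
#include <cmath>

// scalar version of the softcapping block above; c is hparams.f_final_logit_softcapping
static inline float softcap(float x, float c) {
    return c * std::tanh(x / c);
}
```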
@@ -5,11 +5,20 @@ llm_build_glm4_moe::llm_build_glm4_moe(const llama_model & model, const llm_grap
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
 
+    int sections[4];
+    std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
+
     ggml_tensor * cur;
     ggml_tensor * inpL;
 
     inpL = build_inp_embd(model.tok_embd);
 
+    bool use_mrope = hparams.use_mrope();
+    if (ubatch.embd && !use_mrope) {
+        // unfortunately, we need to forcefully stop here, to avoid users complaining about wrong results
+        GGML_ABORT("This GGUF does not support multimodal. Please reconvert it.");
+    }
+
     // inp_pos - contains the positions
     ggml_tensor * inp_pos = build_inp_pos();

@@ -60,17 +69,25 @@ llm_build_glm4_moe::llm_build_glm4_moe(const llama_model & model, const llm_grap
                 Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
                 cb(Kcur, "Kcur_normed", il);
             }
-            Qcur = ggml_rope_ext(
-                ctx0, Qcur, inp_pos, nullptr,
-                n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                ext_factor, attn_factor, beta_fast, beta_slow
-            );
 
-            Kcur = ggml_rope_ext(
-                ctx0, Kcur, inp_pos, nullptr,
-                n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                ext_factor, attn_factor, beta_fast, beta_slow
-            );
+            if (use_mrope) {
+                Qcur = ggml_rope_multi(ctx0, Qcur, inp_pos, nullptr,
+                    n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+
+                Kcur = ggml_rope_multi(ctx0, Kcur, inp_pos, nullptr,
+                    n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+            } else {
+                // Normal RoPE
+                Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot,
+                    rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+
+                Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot,
+                    rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+            }
 
             cb(Qcur, "Qcur", il);
             cb(Kcur, "Kcur", il);

@@ -8,11 +8,20 @@ llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
 
+    int sections[4];
+    std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
+
     ggml_tensor * cur;
     ggml_tensor * inpL;
 
     inpL = build_inp_embd(model.tok_embd);
 
+    bool use_mrope = hparams.use_mrope();
+    if (ubatch.embd && !use_mrope) {
+        // unfortunately, we need to forcefully stop here, to avoid users complaining about wrong results
+        GGML_ABORT("This GGUF does not support multimodal. Please reconvert it.");
+    }
+
     // inp_pos - contains the positions
     ggml_tensor * inp_pos = build_inp_pos();

@@ -63,11 +72,25 @@ llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params
                 Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float),
                         cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa));
             }
-            Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                    ext_factor, attn_factor, beta_fast, beta_slow);
 
-            Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                    ext_factor, attn_factor, beta_fast, beta_slow);
+            if (use_mrope) {
+                Qcur = ggml_rope_multi(ctx0, Qcur, inp_pos, nullptr,
+                    n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+
+                Kcur = ggml_rope_multi(ctx0, Kcur, inp_pos, nullptr,
+                    n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+            } else {
+                // Normal RoPE
+                Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot,
+                    rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+
+                Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot,
+                    rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+            }
 
             cb(Qcur, "Qcur", il);
             cb(Kcur, "Kcur", il);

@@ -0,0 +1,160 @@
+#include "models.h"
+
+llm_build_mistral3::llm_build_mistral3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    const int64_t n_embd_head = hparams.n_embd_head_v;
+
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+    ggml_tensor * cur;
+    ggml_tensor * inpL;
+
+    inpL = build_inp_embd(model.tok_embd);
+
+    // inp_pos - contains the positions
+    ggml_tensor * inp_pos = build_inp_pos();
+
+    // (optional) temperature tuning
+    ggml_tensor * inp_attn_scale = nullptr;
+    if (hparams.f_attn_temp_scale != 0.0f) {
+        inp_attn_scale = build_inp_attn_scale();
+    }
+
+    auto * inp_attn = build_attn_inp_kv();
+
+    const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        ggml_tensor * inpSA = inpL;
+
+        // norm
+        cur = build_norm(inpL,
+                model.layers[il].attn_norm, NULL,
+                LLM_NORM_RMS, il);
+        cb(cur, "attn_norm", il);
+
+        // self-attention
+        {
+            // rope freq factors for llama3; may return nullptr for llama2 and other models
+            ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+            // compute Q and K and RoPE them
+            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+            cb(Qcur, "Qcur", il);
+            if (model.layers[il].bq) {
+                Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                cb(Qcur, "Qcur", il);
+            }
+            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+            cb(Kcur, "Kcur", il);
+            if (model.layers[il].bk) {
+                Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                cb(Kcur, "Kcur", il);
+            }
+            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+            cb(Vcur, "Vcur", il);
+            if (model.layers[il].bv) {
+                Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                cb(Vcur, "Vcur", il);
+            }
+            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+            Qcur = ggml_rope_ext(
+                    ctx0, Qcur, inp_pos, rope_factors,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                    );
+
+            Kcur = ggml_rope_ext(
+                    ctx0, Kcur, inp_pos, rope_factors,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                    );
+
+            cb(Qcur, "Qcur", il);
+            cb(Kcur, "Kcur", il);
+            cb(Vcur, "Vcur", il);
+
+            if (inp_attn_scale) {
+                // apply llama 4 temperature scaling
+                Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale);
+                cb(Qcur, "Qcur_attn_temp_scaled", il);
+            }
+
+            cur = build_attn(inp_attn,
+                    model.layers[il].wo, model.layers[il].bo,
+                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+            cb(cur, "attn_out", il);
+        }
+        if (il == n_layer - 1 && inp_out_ids) {
+            cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+        }
+        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+        cb(ffn_inp, "ffn_inp", il);
+
+        // feed-forward network (non-MoE)
+        if (model.layers[il].ffn_gate_inp == nullptr) {
+
+            cur = build_norm(ffn_inp,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = build_ffn(cur,
+                    model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
+                    model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+                    model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, il);
+            cb(cur, "ffn_out", il);
+        } else {
+            // MoE branch
+            cur = build_norm(ffn_inp,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = build_moe_ffn(cur,
+                    model.layers[il].ffn_gate_inp,
+                    model.layers[il].ffn_up_exps,
+                    model.layers[il].ffn_gate_exps,
+                    model.layers[il].ffn_down_exps,
+                    nullptr,
+                    n_expert, n_expert_used,
+                    LLM_FFN_SILU, true,
+                    false, 0.0,
+                    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                    il);
+            cb(cur, "ffn_moe_out", il);
+        }
+        cur = ggml_add(ctx0, cur, ffn_inp);
+        cb(cur, "ffn_out", il);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        // input for next layer
+        inpL = cur;
+    }
+    cur = inpL;
+
+    cur = build_norm(cur,
+            model.output_norm, NULL,
+            LLM_NORM_RMS, -1);
+
+    cb(cur, "result_norm", -1);
+    res->t_embd = cur;
+
+    // lm_head
+    cur = build_lora_mm(model.output, cur);
+
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
@@ -179,8 +179,9 @@ struct llm_build_gemma2_iswa : public llm_graph_context {
     llm_build_gemma2_iswa(const llama_model & model, const llm_graph_params & params);
 };
 
-struct llm_build_gemma3_iswa : public llm_graph_context {
-    llm_build_gemma3_iswa(const llama_model & model, const llm_graph_params & params);
+template <bool iswa>
+struct llm_build_gemma3 : public llm_graph_context {
+    llm_build_gemma3(const llama_model & model, const llm_graph_params & params);
 };
 
 struct llm_build_gemma3n_iswa : public llm_graph_context {

@@ -322,6 +323,10 @@ struct llm_build_minimax_m2 : public llm_graph_context {
     llm_build_minimax_m2(const llama_model & model, const llm_graph_params & params);
 };
 
+struct llm_build_mistral3 : public llm_graph_context {
+    llm_build_mistral3(const llama_model & model, const llm_graph_params & params);
+};
+
 struct llm_build_mpt : public llm_graph_context {
     llm_build_mpt(const llama_model & model, const llm_graph_params & params);
 };

@@ -436,23 +441,13 @@ private:
             ggml_tensor * cur,
             ggml_tensor * causal_mask,
             ggml_tensor * identity,
+            ggml_tensor * diag_mask,
             int il);
 
     ggml_tensor * build_layer_ffn(
             ggml_tensor * cur,
             int il);
 
-    ggml_tensor * build_delta_net_recurrent(
-            ggml_tensor * q,
-            ggml_tensor * k,
-            ggml_tensor * v,
-            ggml_tensor * g,
-            ggml_tensor * beta,
-            ggml_tensor * state,
-            ggml_tensor * causal_mask,
-            ggml_tensor * identity,
-            int il);
-
     ggml_tensor * build_delta_net_chunking(
             ggml_tensor * q,
             ggml_tensor * k,

@@ -462,6 +457,16 @@ private:
             ggml_tensor * state,
             ggml_tensor * causal_mask,
             ggml_tensor * identity,
+            ggml_tensor * diag_mask,
+            int il);
+
+    ggml_tensor * build_delta_net_autoregressive(
+            ggml_tensor * q,
+            ggml_tensor * k,
+            ggml_tensor * v,
+            ggml_tensor * g,
+            ggml_tensor * beta,
+            ggml_tensor * state,
             int il);
 
     ggml_tensor * build_norm_gated(
|
|
@ -107,12 +107,41 @@ ggml_tensor * llm_build_nemotron_h::build_attention_layer(ggml_tensor *
 }
 
 ggml_tensor * llm_build_nemotron_h::build_ffn_layer(ggml_tensor * cur, const llama_model & model, const int il) {
+    if (model.layers[il].ffn_gate_inp == nullptr) {
         cur = build_ffn(cur,
                 model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
                 NULL, NULL, NULL,
                 model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
-                NULL, LLM_FFN_RELU_SQR, LLM_FFN_PAR, il);
+                NULL,
+                LLM_FFN_RELU_SQR, LLM_FFN_PAR, il);
         cb(cur, "ffn_out", il);
+    } else {
+        ggml_tensor * ffn_inp = cur;
+        ggml_tensor * moe_out =
+            build_moe_ffn(ffn_inp,
+                    model.layers[il].ffn_gate_inp,
+                    model.layers[il].ffn_up_exps,
+                    nullptr, // no gate
+                    model.layers[il].ffn_down_exps,
+                    model.layers[il].ffn_exp_probs_b,
+                    n_expert, n_expert_used,
+                    LLM_FFN_RELU_SQR, hparams.expert_weights_norm,
+                    true, hparams.expert_weights_scale,
+                    LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID,
+                    il);
+        cb(moe_out, "ffn_moe_out", il);
+
+        ggml_tensor * ffn_shexp = build_ffn(ffn_inp,
+                model.layers[il].ffn_up_shexp, NULL, NULL,
+                NULL /* no gate */ , NULL, NULL,
+                model.layers[il].ffn_down_shexp, NULL, NULL,
+                NULL,
+                LLM_FFN_RELU_SQR, LLM_FFN_PAR, il);
+        cb(ffn_shexp, "ffn_shexp", il);
+
+        cur = ggml_add(ctx0, moe_out, ffn_shexp);
+        cb(cur, "ffn_out", il);
+    }
 
     cur = build_cvec(cur, il);
     cb(cur, "l_out", il);
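The MoE path added above routes tokens through a subset of experts and then adds a dense shared expert on top. As a rough, hedged scalar sketch of that combination (illustrative only; the names and the sigmoid-gating/normalization details are assumptions about what build_moe_ffn does with the arguments shown above, not a copy of it):

#include <algorithm>
#include <cmath>
#include <numeric>
#include <vector>

// Hypothetical per-token combination of routed experts plus a shared expert.
std::vector<float> moe_with_shared_expert(
        const std::vector<float> & router_logits,             // one logit per expert
        const std::vector<std::vector<float>> & expert_out,   // per-expert FFN outputs
        const std::vector<float> & shared_out,                // shared-expert FFN output
        int n_expert_used, float expert_weights_scale) {
    const int n_expert = (int) router_logits.size();
    // sigmoid gating (the diff passes LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID)
    std::vector<float> w(n_expert);
    for (int e = 0; e < n_expert; ++e) w[e] = 1.0f / (1.0f + std::exp(-router_logits[e]));
    // keep the top n_expert_used experts
    std::vector<int> idx(n_expert);
    std::iota(idx.begin(), idx.end(), 0);
    std::partial_sort(idx.begin(), idx.begin() + n_expert_used, idx.end(),
                      [&](int a, int b) { return w[a] > w[b]; });
    // normalize the selected weights and apply the expert weight scale
    float sum = 0.0f;
    for (int i = 0; i < n_expert_used; ++i) sum += w[idx[i]];
    std::vector<float> out = shared_out; // the shared expert is always added
    for (int i = 0; i < n_expert_used; ++i) {
        const float wi = w[idx[i]] / sum * expert_weights_scale;
        for (size_t d = 0; d < out.size(); ++d) out[d] += wi * expert_out[idx[i]][d];
    }
    return out;
}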
@ -31,16 +31,25 @@ llm_build_qwen2::llm_build_qwen2(const llama_model & model, const llm_graph_para
         {
             // compute Q and K and RoPE them
             ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+            cb(Qcur, "Qcur", il);
+            if (model.layers[il].bq) {
                 Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
                 cb(Qcur, "Qcur", il);
+            }
 
             ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+            cb(Kcur, "Kcur", il);
+            if (model.layers[il].bk) {
                 Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
                 cb(Kcur, "Kcur", il);
+            }
 
             ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+            cb(Vcur, "Vcur", il);
+            if (model.layers[il].bv) {
                 Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
                 cb(Vcur, "Vcur", il);
+            }
 
             Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
             Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
@ -17,13 +17,15 @@ llm_build_qwen3next::llm_build_qwen3next(const llama_model & model, const llm_gr
     ggml_tensor * inp_out_ids = build_inp_out_ids();
 
     ggml_tensor * causal_mask =
-        ggml_tri(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, ubatch.n_seq_tokens, ubatch.n_seq_tokens), 1.0f),
+        ggml_tri(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, CHUNK_SIZE, CHUNK_SIZE), 1.0f),
             GGML_TRI_TYPE_LOWER);
 
-    ggml_tensor * identity = ggml_diag(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, ubatch.n_seq_tokens), 1.0f));
+    ggml_tensor * identity = ggml_diag(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, CHUNK_SIZE), 1.0f));
+    ggml_tensor * diag_mask = ggml_add(ctx0, causal_mask, identity);
 
     ggml_build_forward_expand(gf, causal_mask);
     ggml_build_forward_expand(gf, identity);
+    ggml_build_forward_expand(gf, diag_mask);
 
     for (int il = 0; il < n_layer; ++il) {
         ggml_tensor * inpSA = inpL;
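For reference, the three CHUNK_SIZE x CHUNK_SIZE masks built above are constant tensors. A small sketch of the values they should hold, assuming GGML_TRI_TYPE_LOWER here means the strict lower triangle (the separate identity term that gets added on top suggests exactly that):

#include <vector>

// causal_mask: strictly-lower-triangular ones; diag_mask = causal_mask + identity,
// i.e. the lower triangle including the diagonal.
static std::vector<std::vector<float>> make_mask(int chunk_size, bool include_diag) {
    std::vector<std::vector<float>> m(chunk_size, std::vector<float>(chunk_size, 0.0f));
    for (int i = 0; i < chunk_size; ++i)
        for (int j = 0; j < chunk_size; ++j)
            if (j < i || (include_diag && j == i)) m[i][j] = 1.0f;
    return m;
}
// make_mask(CHUNK_SIZE, false) ~ causal_mask, make_mask(CHUNK_SIZE, true) ~ diag_mask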
@ -34,7 +36,7 @@ llm_build_qwen3next::llm_build_qwen3next(const llama_model & model, const llm_gr
         // Determine layer type and build appropriate attention mechanism
         if (hparams.is_recurrent(il)) {
             // Linear attention layer (gated delta net)
-            cur = build_layer_attn_linear(inp->get_recr(), cur, causal_mask, identity, il);
+            cur = build_layer_attn_linear(inp->get_recr(), cur, causal_mask, identity, diag_mask, il);
         } else {
             // Full attention layer
             cur = build_layer_attn(inp->get_attn(), cur, inp_pos, il);
@ -93,14 +95,8 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_chunking(
         ggml_tensor * state,
         ggml_tensor * causal_mask,
         ggml_tensor * identity,
+        ggml_tensor * diag_mask,
         int il) {
-    GGML_ASSERT(ggml_is_contiguous(q));
-    GGML_ASSERT(ggml_is_contiguous(k));
-    GGML_ASSERT(ggml_is_contiguous(v));
-    GGML_ASSERT(ggml_is_contiguous(g));
-    GGML_ASSERT(ggml_is_contiguous(beta));
-    GGML_ASSERT(ggml_is_contiguous(state));
-
     const int64_t S_k = q->ne[0];
     const int64_t H_k = q->ne[1];
     const int64_t n_tokens = q->ne[2];
@ -120,15 +116,10 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_chunking(
 
     GGML_ASSERT(H_k == H_v); // we did a repeat to make sure this is the case
 
-    // TODO: can this ever be false?
-    const bool use_qk_l2norm = true;
-
-    if (use_qk_l2norm) {
     const float eps_norm = hparams.f_norm_rms_eps;
 
     q = ggml_l2_norm(ctx0, q, eps_norm);
     k = ggml_l2_norm(ctx0, k, eps_norm);
-    }
 
     const float scale = 1.0f / sqrtf(S_v);
 
@ -136,8 +127,6 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_chunking(
 
     beta = ggml_sigmoid(ctx0, beta);
 
-    ggml_tensor * causal_diag_mask = ggml_add(ctx0, causal_mask, identity);
-
     cb(q, "q_in", il);
     cb(k, "k_in", il);
     cb(v, "v_in", il);
@ -188,36 +177,21 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_chunking(
     cb(v_beta, "v_beta", il);
     cb(k_beta, "k_beta", il);
 
-    ggml_tensor * chunked_mask =
-        ggml_view_4d(ctx0, causal_mask, chunk_size,
-            chunk_size, causal_mask->ne[2], causal_mask->ne[3],
-            causal_mask->nb[1], causal_mask->nb[2], causal_mask->nb[3], 0);
-
-    ggml_tensor * chunked_diag_mask =
-        ggml_view_4d(ctx0, causal_diag_mask, chunk_size,
-            chunk_size, causal_diag_mask->ne[2], causal_diag_mask->ne[3],
-            causal_diag_mask->nb[1], causal_diag_mask->nb[2], causal_diag_mask->nb[3], 0);
-
-    ggml_tensor * chunked_identity =
-        ggml_view_4d(ctx0, identity, chunk_size,
-            chunk_size, identity->ne[2], identity->ne[3],
-            identity->nb[1], identity->nb[2], identity->nb[3], 0);
-
-    q = ggml_cont_4d(ctx0, q, S_k, chunk_size, n_chunks, H_k * n_seqs);
-    k = ggml_cont_4d(ctx0, k, S_k, chunk_size, n_chunks, H_k * n_seqs);
-    k_beta = ggml_cont_4d(ctx0, k_beta, S_k, chunk_size, n_chunks, H_k * n_seqs);
-    v = ggml_cont_4d(ctx0, v, S_v, chunk_size, n_chunks, H_v * n_seqs);
-    v_beta = ggml_cont_4d(ctx0, v_beta, S_v, chunk_size, n_chunks, H_v * n_seqs);
-
-    g = ggml_cont_4d(ctx0, g, chunk_size, 1, n_chunks, H_k * n_seqs);
-    beta = ggml_cont_4d(ctx0, beta, 1, chunk_size, n_chunks, H_k * n_seqs);
+    q = ggml_reshape_4d(ctx0, q, S_k, chunk_size, n_chunks, H_k * n_seqs);
+    k = ggml_reshape_4d(ctx0, k, S_k, chunk_size, n_chunks, H_k * n_seqs);
+    k_beta = ggml_reshape_4d(ctx0, k_beta, S_k, chunk_size, n_chunks, H_k * n_seqs);
+    v = ggml_reshape_4d(ctx0, v, S_v, chunk_size, n_chunks, H_v * n_seqs);
+    v_beta = ggml_reshape_4d(ctx0, v_beta, S_v, chunk_size, n_chunks, H_v * n_seqs);
+
+    g = ggml_reshape_4d(ctx0, g, chunk_size, 1, n_chunks, H_k * n_seqs);
+    beta = ggml_reshape_4d(ctx0, beta, 1, chunk_size, n_chunks, H_k * n_seqs);
 
     ggml_tensor * g_cumsum = ggml_cumsum(ctx0, g);
 
     cb(g_cumsum, "g_cumsum", il);
 
-    ggml_tensor * gcs_i = ggml_cont_4d(ctx0, g_cumsum, chunk_size, 1, n_chunks, H_v * n_seqs);
-    ggml_tensor * gcs_j = ggml_cont_4d(ctx0, g_cumsum, 1, chunk_size, n_chunks, H_v * n_seqs);
+    ggml_tensor * gcs_i = ggml_reshape_4d(ctx0, g_cumsum, chunk_size, 1, n_chunks, H_v * n_seqs);
+    ggml_tensor * gcs_j = ggml_reshape_4d(ctx0, g_cumsum, 1, chunk_size, n_chunks, H_v * n_seqs);
 
     ggml_tensor * gcs_j_broadcast =
         ggml_repeat_4d(ctx0, gcs_j, chunk_size, chunk_size, n_chunks, H_v * n_seqs);
@ -226,23 +200,23 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_chunking(
 
     cb(decay_mask, "decay_mask", il);
 
-    decay_mask = ggml_mul(ctx0, decay_mask, chunked_diag_mask);
+    decay_mask = ggml_mul(ctx0, decay_mask, diag_mask);
     decay_mask = ggml_exp(ctx0, decay_mask);
-    decay_mask = ggml_mul(ctx0, decay_mask, chunked_diag_mask);
+    decay_mask = ggml_mul(ctx0, decay_mask, diag_mask);
 
     ggml_tensor * kmulkbeta = ggml_mul_mat(ctx0, k, k_beta);
 
     ggml_tensor * k_decay = ggml_mul(ctx0, kmulkbeta, decay_mask);
-    ggml_tensor * attn = ggml_neg(ctx0, ggml_mul(ctx0, k_decay, chunked_mask));
+    ggml_tensor * attn = ggml_neg(ctx0, ggml_mul(ctx0, k_decay, causal_mask));
 
     cb(attn, "attn_pre_solve", il);
 
-    ggml_tensor * attn_lower = ggml_mul(ctx0, attn, chunked_mask);
-    ggml_tensor * lhs = ggml_sub(ctx0, ggml_repeat(ctx0, chunked_identity, attn_lower), attn_lower);
+    ggml_tensor * attn_lower = ggml_mul(ctx0, attn, causal_mask);
+    ggml_tensor * lhs = ggml_sub(ctx0, ggml_repeat(ctx0, identity, attn_lower), attn_lower);
 
     ggml_tensor * lin_solve = ggml_solve_tri(ctx0, lhs, attn, true, true, false);
-    attn = ggml_mul(ctx0, lin_solve, chunked_mask);
-    attn = ggml_add(ctx0, attn, chunked_identity);
+    attn = ggml_mul(ctx0, lin_solve, causal_mask);
+    attn = ggml_add(ctx0, attn, identity);
 
     cb(attn, "attn_solved", il);
 
@ -291,7 +265,7 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_chunking(
         // attn = (q_i @ k_i.transpose(-1, -2) * decay_mask[:, :, i]).masked_fill_(mask, 0)
         attn = ggml_mul_mat(ctx0, k_chunk, q_chunk);
         attn = ggml_mul(ctx0, attn, decay_mask_chunk);
-        attn = ggml_mul(ctx0, attn, ggml_add(ctx0, chunked_identity, chunked_mask));
+        attn = ggml_mul(ctx0, attn, diag_mask);
 
         ggml_tensor * state_t = ggml_cont_4d(ctx0, ggml_permute(ctx0, new_state, 1, 0, 2, 3), S_v, S_v, 1, H_v * n_seqs);
 
@ -361,23 +335,14 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_chunking(
     return ggml_concat(ctx0, flat_output, flat_state, 0);
 }
 
-ggml_tensor * llm_build_qwen3next::build_delta_net_recurrent(
+ggml_tensor * llm_build_qwen3next::build_delta_net_autoregressive(
         ggml_tensor * q,
         ggml_tensor * k,
         ggml_tensor * v,
         ggml_tensor * g,
         ggml_tensor * beta,
         ggml_tensor * state,
-        ggml_tensor * causal_mask,
-        ggml_tensor * identity,
         int il) {
-    GGML_ASSERT(ggml_is_contiguous(q));
-    GGML_ASSERT(ggml_is_contiguous(k));
-    GGML_ASSERT(ggml_is_contiguous(v));
-    GGML_ASSERT(ggml_is_contiguous(g));
-    GGML_ASSERT(ggml_is_contiguous(beta));
-    GGML_ASSERT(ggml_is_contiguous(state));
-
     const int64_t S_k = q->ne[0];
     const int64_t H_k = q->ne[1];
     const int64_t n_tokens = q->ne[2];
@ -386,6 +351,7 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_recurrent(
     const int64_t S_v = v->ne[0];
     const int64_t H_v = v->ne[1];
 
+    GGML_ASSERT(n_tokens == 1); // This function is optimized for single token processing
     GGML_ASSERT(v->ne[2] == n_tokens);
     GGML_ASSERT(k->ne[2] == n_tokens);
     GGML_ASSERT(g->ne[0] == H_v && g->ne[1] == n_tokens && g->ne[2] == n_seqs);
@ -397,215 +363,65 @@ ggml_tensor * llm_build_qwen3next::build_delta_net_recurrent(
 
     GGML_ASSERT(H_k == H_v); // we did a repeat to make sure this is the case
 
-    // TODO: can this ever be false?
-    const bool use_qk_l2norm = true;
-
-    if (use_qk_l2norm) {
     const float eps_norm = hparams.f_norm_rms_eps;
 
     q = ggml_l2_norm(ctx0, q, eps_norm);
     k = ggml_l2_norm(ctx0, k, eps_norm);
-    }
 
     const float scale = 1.0f / sqrtf(S_v);
 
     q = ggml_scale(ctx0, q, scale);
 
     beta = ggml_sigmoid(ctx0, beta);
 
-    ggml_tensor * causal_diag_mask = ggml_add(ctx0, causal_mask, identity);
-
     cb(q, "q_in", il);
     cb(k, "k_in", il);
     cb(v, "v_in", il);
     cb(beta, "beta_in", il);
     cb(g, "g_in", il);
 
-    q = ggml_cont_4d(ctx0, ggml_permute(ctx0, q, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs);
-    k = ggml_cont_4d(ctx0, ggml_permute(ctx0, k, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs);
-    v = ggml_cont_4d(ctx0, ggml_permute(ctx0, v, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs);
-    g = ggml_cont_4d(ctx0, ggml_permute(ctx0, g, 2, 0, 3, 1), n_tokens, 1, H_k, n_seqs);
-
-    beta = ggml_cont(ctx0, ggml_permute(ctx0, beta, 2, 0, 1, 3));
     state = ggml_reshape_4d(ctx0, state, S_v, S_v, H_v, n_seqs);
 
-    cb(q, "q_perm", il);
-    cb(k, "k_perm", il);
-    cb(v, "v_perm", il);
-    cb(beta, "beta_perm", il);
-    cb(g, "g_perm", il);
-    cb(state, "state_in", il);
-
-    GGML_ASSERT(q->ne[1] == n_tokens && q->ne[0] == S_k && q->ne[2] == H_k && q->ne[3] == n_seqs);
-    GGML_ASSERT(k->ne[1] == n_tokens && k->ne[0] == S_k && k->ne[2] == H_k && k->ne[3] == n_seqs);
-    GGML_ASSERT(v->ne[1] == n_tokens && v->ne[0] == S_v && v->ne[2] == H_k && v->ne[3] == n_seqs);
-    GGML_ASSERT(beta->ne[1] == n_tokens && beta->ne[2] == H_k && beta->ne[0] == 1 && beta->ne[3] == n_seqs);
-
-    ggml_tensor * v_beta = ggml_mul(ctx0, v, beta);
-    ggml_tensor * k_beta = ggml_mul(ctx0, k, beta);
-
-    ggml_tensor * g_cumsum = ggml_cumsum(ctx0, g);
-
-    cb(k_beta, "k_beta", il);
-    cb(v_beta, "v_beta", il);
-    cb(g_cumsum, "g_cumsum", il);
-
-    ggml_tensor * gcs_i = ggml_cont_4d(ctx0, g_cumsum, n_tokens, 1, H_v, n_seqs); // [chunk_size, 1, n_tokens, n_seqs]
-    ggml_tensor * gcs_j = ggml_cont_4d(ctx0, g_cumsum, 1, n_tokens, H_v, n_seqs); // [1, chunk_size, n_tokens, n_seqs]
-
-    // Broadcast both tensors to [chunk_size, chunk_size, H_v, n_seqs]
-    // ggml_tensor * gcs_i_broadcast =
-    //     ggml_repeat_4d(ctx0, gcs_i, GGML_DELTA_NET_CHUNK, GGML_DELTA_NET_CHUNK, num_chunks * H_v,
-    //     n_seqs); // [chunk_size, 1, H_v, n_seqs] -> [chunk_size, chunk_size, H_v, n_seqs]
-    // Don't need this, this one will get auto-broadcast
-    ggml_tensor * gcs_j_broadcast =
-        ggml_repeat_4d(ctx0, gcs_j, n_tokens, n_tokens, H_v, n_seqs); // [1, chunk_size, H_v, n_seqs] -> [chunk_size, chunk_size, H_v, n_seqs]
-
-    ggml_tensor * decay_mask = ggml_sub(ctx0, gcs_j_broadcast, gcs_i);
-
-    // Apply lower triangular mask to ensure attention is causal (only past tokens influence current)
-    decay_mask = ggml_mul(ctx0, decay_mask, causal_diag_mask);
-    // Apply exponential to get the decay mask values
-    decay_mask = ggml_exp(ctx0, decay_mask);
-    // Apply lower triangular mask again to ensure only lower triangular values remain
-    decay_mask = ggml_mul(ctx0, decay_mask, causal_diag_mask);
-
-    cb(decay_mask, "decay_mask", il);
-
-    // attn = -((k_beta @ key.transpose(-1, -2)) * decay_mask).masked_fill(mask, 0)
-    ggml_tensor * kmulkbeta = ggml_mul_mat(ctx0, k, k_beta);
-
-    cb(kmulkbeta, "kmulkbeta", il);
-
-    ggml_tensor * k_decay = ggml_mul(ctx0, kmulkbeta, decay_mask);
-    ggml_tensor * attn = ggml_neg(ctx0, ggml_mul(ctx0, k_decay, causal_mask));
-
-    cb(attn, "attn_pre_rec", il);
-
-    // for i in range(1, chunk_size):
-    //     row = attn[..., i, :i].clone()
-    //     sub = attn[..., :i, :i].clone()
-    //     attn[..., i, :i] = row + (row.unsqueeze(-1) * sub).sum(-2)
-    // attn = attn + torch.eye(chunk_size, dtype=attn.dtype, device=attn.device)
-    //
-    // We reduce this to a linear triangular solve: AX = B, where B = attn, A = I - tril(A)
-    ggml_tensor * attn_lower = ggml_mul(ctx0, attn, causal_mask);
-    ggml_tensor * lhs = ggml_sub(ctx0, ggml_repeat(ctx0, identity, attn_lower), attn_lower);
-
-    ggml_tensor * lin_solve = ggml_solve_tri(ctx0, lhs, attn, true, true, false);
-    attn = ggml_mul(ctx0, lin_solve, causal_mask);
-    attn = ggml_add(ctx0, attn, identity);
-
-    // value = attn @ v_beta
-    v = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, v_beta)), attn);
-
-    cb(v, "value_beta", il);
-
-    // k_cumdecay = attn @ (k_beta * g.exp().unsqueeze(-1))
-    ggml_tensor * g_cumsum_t = ggml_cont(ctx0, ggml_transpose(ctx0, g_cumsum));
-    ggml_tensor * gexp = ggml_exp(ctx0, g_cumsum_t);
-
-    cb(gexp, "g_cum_exp", il);
-
-    ggml_tensor * kbeta_gexp = ggml_mul(ctx0, k_beta, gexp);
-
-    cb(kbeta_gexp, "kbeta_gexp", il);
-
-    ggml_tensor * k_cumdecay =
-        ggml_cont(ctx0, ggml_transpose(ctx0, ggml_mul_mat(ctx0, attn, ggml_cont(ctx0, ggml_transpose(ctx0, kbeta_gexp)))));
-
-    cb(k_cumdecay, "k_cumdecay", il);
-
-    // attn = (q_i @ k_i.transpose(-1, -2) * decay_mask[:, :, i]).masked_fill_(mask, 0)
-    attn = ggml_mul_mat(ctx0, k, q);
-    attn = ggml_mul(ctx0, attn, decay_mask);
-    attn = ggml_mul(ctx0, attn, ggml_add(ctx0, identity, causal_mask));
-
-    cb(attn, "attn_decay_key", il);
-
-    ggml_tensor * state_t = ggml_cont(ctx0, ggml_transpose(ctx0, state));
-
-    // v_prime = (k_cumdecay[:, :, i]) @ last_recurrent_state
-    ggml_tensor * v_prime = ggml_mul_mat(ctx0, state_t, k_cumdecay);
-
-    cb(v_prime, "v_prime", il);
-
-    // v_new = v_i - v_prime
-    ggml_tensor * v_new = ggml_sub(ctx0, ggml_repeat(ctx0, v, v_prime), v_prime);
-
-    ggml_tensor * v_new_t = ggml_cont(ctx0, ggml_transpose(ctx0, v_new));
-
-    cb(v_new, "v_new", il);
-
-    // attn_inter = (q_i * g[:, :, i, :, None].exp()) @ last_recurrent_state
-    ggml_tensor * q_g_exp = ggml_mul(ctx0, q, gexp);
-    ggml_tensor * attn_inter = ggml_mul_mat(ctx0, state_t, q_g_exp);
-
-    cb(attn_inter, "attn_inter", il);
-
-    // core_attn_out[:, :, i] = attn_inter + attn @ v_new
-    ggml_tensor * v_attn = ggml_mul_mat(ctx0, v_new_t, attn);
-
-    cb(v_attn, "v_attn", il);
-
-    ggml_tensor * core_attn_out = ggml_add(ctx0, attn_inter, v_attn);
-
-    cb(core_attn_out, "core_attn_out", il);
-
-    // g_last = torch.clamp(g_cum[:, :, -1], max=50.0).exp().unsqueeze(-1).unsqueeze(-1)
-    // g_diff = torch.clamp(g_cum[:, :, -1:] - g_cum, max=50.0).exp()
-    // key_gdiff = key * g_diff.unsqueeze(-1)
-    // kgdmulvnew = (key_gdiff).transpose(-1, -2) @ v_new
-    // last_recurrent_state = last_recurrent_state * g_last + kgdmulvnew
-
-    ggml_tensor * g_cum_last =
-        ggml_cont(ctx0, ggml_view_4d(ctx0, g_cumsum_t, g_cumsum_t->ne[0], 1, g_cumsum_t->ne[2], g_cumsum_t->ne[3],
-            g_cumsum_t->nb[1], g_cumsum_t->nb[2], g_cumsum_t->nb[3],
-            g_cumsum_t->nb[0] * (g_cumsum_t->ne[1] - 1)));
-
-    cb(g_cum_last, "g_cum_last", il);
-
-    ggml_tensor * gexp_last =
-        ggml_reshape_4d(ctx0, ggml_exp(ctx0, g_cum_last), 1, 1, g_cum_last->ne[0] * g_cum_last->ne[2], g_cum_last->ne[3]);
-
-    cb(gexp_last, "gexp_last", il);
-
-    ggml_tensor * g_cum_last_3d =
-        ggml_reshape_3d(ctx0, g_cum_last, g_cum_last->ne[0], g_cum_last->ne[2], g_cum_last->ne[3]);
-
-    cb(g_cum_last_3d, "g_cum_last_3d", il);
-
-    ggml_tensor * g_cumsum_3d = ggml_reshape_3d(ctx0, g_cumsum, g_cumsum->ne[0], g_cumsum->ne[2], g_cumsum->ne[3]);
-
-    cb(g_cumsum_3d, "g_cumsum_3d", il);
-
-    ggml_tensor * g_diff = ggml_neg(ctx0, ggml_sub(ctx0, g_cumsum_3d, g_cum_last_3d));
-
-    cb(g_diff, "g_diff", il);
-
-    ggml_tensor * g_diff_exp = ggml_exp(ctx0, g_diff);
-
-    cb(g_diff_exp, "g_diff_exp", il);
-
-    ggml_tensor * key_gdiff = ggml_mul(ctx0, k,
-        ggml_reshape_4d(ctx0, g_diff_exp, 1, g_diff_exp->ne[0], g_diff_exp->ne[1],
-            g_diff_exp->ne[2] * g_diff_exp->ne[3]));
-
-    cb(key_gdiff, "key_gdiff", il);
-
-    ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, v_new_t, ggml_cont(ctx0, ggml_transpose(ctx0, key_gdiff)));
-
-    cb(kgdmulvnew, "kgdmulvnew", il);
-
-    state = ggml_add(ctx0, ggml_mul(ctx0, state, gexp_last), kgdmulvnew);
-
+    ggml_tensor * g_t = ggml_reshape_4d(ctx0, ggml_transpose(ctx0, g), 1, 1, H_k, n_seqs);
+    ggml_tensor * beta_t = ggml_reshape_4d(ctx0, ggml_transpose(ctx0, beta), 1, 1, H_k, n_seqs);
+
+    // Apply exponential to g_t
+    g_t = ggml_exp(ctx0, g_t);
+
+    // Apply the gated delta rule for the single timestep
+    // last_recurrent_state = last_recurrent_state * g_t
+    state = ggml_mul(ctx0, state, g_t);
+
+    // kv_mem = (last_recurrent_state * k_t.unsqueeze(-1)).sum(dim=-2)
+    ggml_tensor * k_t_unsqueezed = ggml_reshape_4d(ctx0, k, 1, S_v, H_v, n_seqs);
+    ggml_tensor * kv_mem = ggml_mul(ctx0, state, k_t_unsqueezed);
+    // we need to sum over dim=-2, so we transpose, sum, then transpose again
+    kv_mem = ggml_transpose(ctx0, ggml_sum_rows(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, kv_mem))));
+
+    // v_t = v.unsqueeze(2) (we insert the singleton dimension after n_seqs and H_v)
+    ggml_tensor * v_t = ggml_reshape_4d(ctx0, v, S_v, 1, H_v, n_seqs);
+    // delta = (v_t - kv_mem) * beta_t
+    ggml_tensor * v_diff = ggml_sub(ctx0, v_t, kv_mem); // both should be [S_v, 1, H_v, n_seqs]
+    ggml_tensor * delta = ggml_mul(ctx0, v_diff, beta_t);
+
+    // last_recurrent_state = last_recurrent_state + k_t.unsqueeze(-1) * delta
+    ggml_tensor * k_t_delta = ggml_mul(ctx0, ggml_repeat_4d(ctx0, k_t_unsqueezed, S_v, S_v, H_v, n_seqs), delta);
+    state = ggml_add(ctx0, state, k_t_delta);
+
+    // Compute the attention output
+    // core_attn_out = (last_recurrent_state * q_t.unsqueeze(-1)).sum(dim=-2)
+    ggml_tensor * q_t_unsqueezed = ggml_reshape_4d(ctx0, q, 1, S_v, H_v, n_seqs); // unsqueeze q_t
+    ggml_tensor * state_q = ggml_mul(ctx0, state, q_t_unsqueezed);
+    // again, since it's over dim = -2, transpose, sum, transpose back
+    ggml_tensor * core_attn_out =
+        ggml_transpose(ctx0, ggml_sum_rows(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, state_q))));
+
+    // core_attn_out should be [S_v, 1, H_v, n_seqs] after this
+    cb(core_attn_out, "output_tokens", il);
     cb(state, "new_state", il);
 
-    // flatten output
-    ggml_tensor * flat_output =
-        ggml_cont_1d(ctx0, ggml_permute(ctx0, core_attn_out, 0, 2, 1, 3), S_v * H_v * n_tokens * n_seqs);
-
-    ggml_tensor * flat_state = ggml_cont_1d(ctx0, state, S_v * S_v * H_v * n_seqs);
-
+    // flatten output, no need to permute since n_tokens is 1 so [S_v, 1, H_v, n_seqs] and [S_v, H_v, 1, n_seqs] are equivalent memory-layout wise
+    ggml_tensor * flat_output = ggml_reshape_1d(ctx0, core_attn_out, S_v * H_v * n_tokens * n_seqs);
+    ggml_tensor * flat_state = ggml_reshape_1d(ctx0, state, S_v * S_v * H_v * n_seqs);
+
     return ggml_concat(ctx0, flat_output, flat_state, 0);
 }
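The comments in the new single-token path describe the gated delta rule update step by step. A hedged scalar reference sketch of that recurrence for one head (plain C++, not the ggml graph; q and k are assumed to be already L2-normalized, q already scaled, g the log-gate and beta the raw gate logit):

#include <cmath>
#include <vector>

// state is a d_k x d_v matrix stored row-major: state[i * d_v + j].
void delta_rule_step(std::vector<float> & state,
                     const std::vector<float> & q,   // size d_k
                     const std::vector<float> & k,   // size d_k
                     const std::vector<float> & v,   // size d_v
                     float g, float beta,
                     std::vector<float> & out) {     // size d_v
    const int d_k = (int) k.size();
    const int d_v = (int) v.size();
    const float g_t    = std::exp(g);                    // decay gate
    const float beta_t = 1.0f / (1.0f + std::exp(-beta)); // write gate (sigmoid)
    // last_recurrent_state = last_recurrent_state * g_t
    for (float & s : state) s *= g_t;
    // kv_mem = (state * k.unsqueeze(-1)).sum(dim=-2); delta = (v - kv_mem) * beta_t
    std::vector<float> delta(d_v);
    for (int j = 0; j < d_v; ++j) {
        float kv_mem = 0.0f;
        for (int i = 0; i < d_k; ++i) kv_mem += state[i * d_v + j] * k[i];
        delta[j] = (v[j] - kv_mem) * beta_t;
    }
    // state += k.unsqueeze(-1) * delta
    for (int i = 0; i < d_k; ++i)
        for (int j = 0; j < d_v; ++j) state[i * d_v + j] += k[i] * delta[j];
    // core_attn_out = (state * q.unsqueeze(-1)).sum(dim=-2)
    out.assign(d_v, 0.0f);
    for (int j = 0; j < d_v; ++j)
        for (int i = 0; i < d_k; ++i) out[j] += state[i * d_v + j] * q[i];
}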
@ -712,6 +528,7 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
         ggml_tensor * cur,
         ggml_tensor * causal_mask,
         ggml_tensor * identity,
+        ggml_tensor * diag_mask,
         int il) {
     const auto * mctx_cur = inp->mctx;
 
@ -737,11 +554,11 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
     cb(mixed_ba, "linear_attn_mixed_ba", il);
 
     int64_t qkvz_new_dim = 2 * head_k_dim + 2 * head_v_dim * (num_v_heads / num_k_heads);
-    ggml_tensor * mixed_qkvz_reshaped = ggml_cont_4d(ctx0, mixed_qkvz, qkvz_new_dim, num_k_heads, n_seq_tokens, n_seqs);
+    ggml_tensor * mixed_qkvz_reshaped = ggml_reshape_4d(ctx0, mixed_qkvz, qkvz_new_dim, num_k_heads, n_seq_tokens, n_seqs);
 
     // Reshape mixed_ba: [batch, seq_len, hidden_size] -> [batch, seq_len, num_k_heads, 2*num_v_heads/num_k_heads]
     int64_t ba_new_dim = 2 * num_v_heads / num_k_heads;
-    ggml_tensor * mixed_ba_reshaped = ggml_cont_4d(ctx0, mixed_ba, ba_new_dim, num_k_heads, n_seq_tokens, n_seqs);
+    ggml_tensor * mixed_ba_reshaped = ggml_reshape_4d(ctx0, mixed_ba, ba_new_dim, num_k_heads, n_seq_tokens, n_seqs);
 
     // Split mixed_ba into b and a (beta and alpha parameters)
     int64_t split_sizes_ba[2] = {
@ -762,8 +579,6 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
     ggml_tensor * beta = ggml_cont_3d(ctx0, b, num_v_heads, n_seq_tokens, n_seqs);
     ggml_tensor * alpha = ggml_cont_3d(ctx0, a, num_v_heads, n_seq_tokens, n_seqs);
 
-    GGML_ASSERT(ggml_nelements(beta) + ggml_nelements(alpha) == ggml_nelements(mixed_ba));
-
     ggml_tensor * alpha_biased = ggml_add(ctx0, alpha, model.layers[il].ssm_dt);
     ggml_tensor * alpha_softplus = ggml_softplus(ctx0, alpha_biased);
     cb(alpha_softplus, "a_softplus", il);
@ -799,9 +614,6 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
         (split_sizes_qkvz[0] + split_sizes_qkvz[1] + split_sizes_qkvz[2]) * sizeof(float));
     cb(z, "z", il);
 
-    GGML_ASSERT(ggml_nelements(query) + ggml_nelements(key) + ggml_nelements(value) + ggml_nelements(z) ==
-        ggml_nelements(mixed_qkvz));
-
     // After creating query, key, and value_reshaped, reshape each to flatten the head dimensions
     // query: [head_k_dim, num_k_heads, n_tokens, n_seqs] -> [head_k_dim * num_k_heads, n_tokens, n_seqs]
     ggml_tensor * query_flat = ggml_cont_3d(ctx0, query, head_k_dim * num_k_heads, n_seq_tokens, n_seqs);
@ -925,10 +737,13 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
     cb(k_conv, "k_conv_predelta", il);
     cb(v_conv, "v_conv_predelta", il);
 
-    // Choose between build_delta_net_chunking and build_delta_net_recurrent based on n_tokens
-    ggml_tensor * attn_out = n_seq_tokens > CHUNK_SIZE ?
-        build_delta_net_chunking (q_conv, k_conv, v_conv, gate, beta, state, causal_mask, identity, il) :
-        build_delta_net_recurrent(q_conv, k_conv, v_conv, gate, beta, state, causal_mask, identity, il);
+    // Choose between build_delta_net_chunking, build_delta_net_recurrent, and build_delta_net_autoregressive based on n_tokens
+    ggml_tensor * attn_out;
+    if (n_seq_tokens == 1) {
+        attn_out = build_delta_net_autoregressive(q_conv, k_conv, v_conv, gate, beta, state, il);
+    } else {
+        attn_out = build_delta_net_chunking(q_conv, k_conv, v_conv, gate, beta, state, causal_mask, identity, diag_mask, il);
+    }
     cb(attn_out, "attn_out", il);
 
     // The tensors were concatenated 1d, so we need to extract them 1d as well
@ -520,7 +520,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
 
 // use std::wregex to split the text
 static std::vector<size_t> unicode_regex_split_stl(const std::wstring & wtext, const std::wstring & regex_expr, const std::vector<size_t> & offsets) {
-    std::wregex expr(regex_expr);
+    std::wregex expr(regex_expr, std::regex_constants::optimize | std::regex_constants::nosubs);
     std::vector<size_t> bpe_offsets; // store the offset of each word
     bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
     size_t start = 0;

@ -550,7 +550,7 @@ static std::vector<size_t> unicode_regex_split_stl(const std::wstring & wtext, c
 
 // use std::regex to split the text
 static std::vector<size_t> unicode_regex_split_stl(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) {
-    std::regex expr(regex_expr);
+    std::regex expr(regex_expr, std::regex_constants::optimize | std::regex_constants::nosubs);
     std::vector<size_t> bpe_offsets; // store the offset of each word
     bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
     size_t start = 0;
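The tokenizer change above only adds construction flags: std::regex_constants::optimize trades slower construction for faster matching, and nosubs tells the engine that capture groups are never read, so sub-match bookkeeping can be skipped. A minimal standalone example of the same flag combination (illustrative pattern, not the tokenizer regex):

#include <iostream>
#include <regex>
#include <string>

int main() {
    const std::string text = "splitting BPE pre-tokenizer input";
    // same flag combination as the split functions above
    const std::regex expr("[a-zA-Z]+", std::regex_constants::optimize | std::regex_constants::nosubs);
    for (auto it = std::sregex_iterator(text.begin(), text.end(), expr); it != std::sregex_iterator(); ++it) {
        std::cout << it->str() << '\n'; // the full match is still available; only sub-matches are dropped
    }
    return 0;
}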
@ -0,0 +1,121 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-cpp.h"
+#include "clip.h"
+#include "clip-impl.h"
+#include "clip-model.h"
+
+#include <vector>
+#include <functional>
+
+#define DEFAULT_INTERPOLATION_MODE (GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ANTIALIAS)
+
+struct clip_graph {
+    const clip_model & model;
+    const clip_hparams & hparams;
+    projector_type proj_type;
+
+    // we only support single image per batch
+    const clip_image_f32 & img;
+
+    const int patch_size;
+    const int n_patches_x;
+    const int n_patches_y;
+    const int n_patches;
+    const int n_embd;
+    const int n_head;
+    const int d_head;
+    const int n_layer;
+    const int n_mmproj_embd;
+    const float eps;
+    const float kq_scale;
+    const clip_flash_attn_type flash_attn_type;
+
+    // for debugging
+    const bool debug_graph;
+    std::vector<ggml_tensor *> & debug_print_tensors;
+
+    ggml_context_ptr ctx0_ptr;
+    ggml_context * ctx0;
+    ggml_cgraph * gf;
+
+    clip_graph(clip_ctx * ctx, const clip_image_f32 & img);
+
+    virtual ~clip_graph() = default;
+    virtual ggml_cgraph * build() = 0;
+
+    //
+    // utility functions
+    //
+    void cb(ggml_tensor * cur0, const char * name, int il) const;
+
+    // siglip2 naflex
+    ggml_tensor * resize_position_embeddings(uint32_t interpolation_mode = DEFAULT_INTERPOLATION_MODE);
+
+    // build vision transformer (ViT) cgraph
+    // this function should cover most of the models
+    // if your model has specific features, you should probably duplicate this function
+    ggml_tensor * build_vit(
+            ggml_tensor * inp,
+            int64_t n_pos,
+            norm_type norm_t,
+            ffn_op_type ffn_t,
+            ggml_tensor * learned_pos_embd,
+            std::function<ggml_tensor *(ggml_tensor *, const clip_layer &)> add_pos);
+
+    // build the input after conv2d (inp_raw --> patches)
+    // returns tensor with shape [n_embd, n_patches]
+    ggml_tensor * build_inp();
+
+    ggml_tensor * build_inp_raw(int channels = 3);
+
+    ggml_tensor * build_norm(
+            ggml_tensor * cur,
+            ggml_tensor * mw,
+            ggml_tensor * mb,
+            norm_type type,
+            float norm_eps,
+            int il) const;
+
+    ggml_tensor * build_ffn(
+            ggml_tensor * cur,
+            ggml_tensor * up,
+            ggml_tensor * up_b,
+            ggml_tensor * gate,
+            ggml_tensor * gate_b,
+            ggml_tensor * down,
+            ggml_tensor * down_b,
+            ffn_op_type type_op,
+            int il) const;
+
+    ggml_tensor * build_attn(
+            ggml_tensor * wo,
+            ggml_tensor * wo_b,
+            ggml_tensor * q_cur,
+            ggml_tensor * k_cur,
+            ggml_tensor * v_cur,
+            ggml_tensor * kq_mask,
+            float kq_scale,
+            int il) const;
+
+    // implementation of the 2D RoPE without adding a new op in ggml
+    // this is not efficient (use double the memory), but works on all backends
+    // TODO: there was a more efficient which relies on ggml_view and ggml_rope_ext_inplace, but the rope inplace does not work well with non-contiguous tensors ; we should fix that and revert back to the original implementation in https://github.com/ggml-org/llama.cpp/pull/13065
+    ggml_tensor * build_rope_2d(
+            ggml_context * ctx0,
+            ggml_tensor * cur,
+            ggml_tensor * pos_a, // first half
+            ggml_tensor * pos_b, // second half
+            const float freq_base,
+            const bool interleave_freq
+    );
+
+    // aka pixel_shuffle / pixel_unshuffle / patch_merger (Kimi-VL)
+    // support dynamic resolution
+    ggml_tensor * build_patch_merge_permute(ggml_tensor * cur, int scale_factor);
+
+    // Generic function to stack frames for audio processing
+    // Abstracts out the StackAudioFrames logic used by ultravox
+    ggml_tensor * build_stack(ggml_tensor * cur, int32_t stack_factor, int32_t n_embed);
+};
@ -1,3 +1,5 @@
+#pragma once
+
 #include "ggml.h"
 #include "gguf.h"
 #include "clip.h"

@ -13,6 +15,8 @@
 
 // Internal header for clip.cpp
 
+#define MTMD_INTERNAL_HEADER
+
 #define KEY_FTYPE "general.file_type"
 #define KEY_NAME "general.name"
 #define KEY_DESCRIPTION "general.description"

@ -64,6 +68,7 @@
 #define TN_PATCH_EMBD "v.patch_embd.weight" // not rename tensor with ".0" postfix for backwrad compat
 #define TN_PATCH_EMBD_1 "v.patch_embd.weight.1"
 #define TN_PATCH_BIAS "v.patch_embd.bias"
+#define TN_NORM_EMBD "v.norm_embd.%s"
 #define TN_ATTN_QKV "%s.blk.%d.attn_qkv.%s"
 #define TN_ATTN_K "%s.blk.%d.attn_k.%s"
 #define TN_ATTN_Q "%s.blk.%d.attn_q.%s"

@ -82,6 +87,10 @@
 #define TN_LN_PRE "%s.pre_ln.%s"
 #define TN_LN_POST "%s.post_ln.%s"
 #define TN_LLAVA_PROJ "mm.%d.%s"
+#define TN_MM_UP "mm.up.%s"
+#define TN_MM_GATE "mm.gate.%s"
+#define TN_MM_DOWN "mm.down.%s"
+#define TN_MM_POST_NORM "mm.post_norm.%s"
 #define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s"
 #define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
 #define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s"

@ -91,7 +100,7 @@
 #define TN_MM_INP_PROJ "mm.input_projection.weight" // gemma3
 #define TN_MM_SOFT_EMB_N "mm.soft_emb_norm.weight" // gemma3
 #define TN_MM_PROJECTOR "mm.model.fc.weight" // idefics3
-#define TN_MM_PATCH_MERGER "mm.patch_merger.weight" // mistral small 3.1
+#define TN_MM_PATCH_MERGER "mm.patch_merger.%s" // mistral small 3.1, glm4v
 #define TN_TOK_IMG_BREAK "v.token_embd.img_break" // pixtral
 #define TN_TOK_GLM_BOI "adapter.boi" // glm-edge (these embeddings are not in text model)
 #define TN_TOK_GLM_EOI "adapter.eoi" // glm-edge (these embeddings are not in text model)

@ -132,6 +141,10 @@
 // align x to upper multiple of n
 #define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))
 
+// forward declaration
+// TODO: improve this later
+struct clip_ctx;
+
 enum projector_type {
     PROJECTOR_TYPE_MLP,
     PROJECTOR_TYPE_MLP_NORM,

@ -149,6 +162,7 @@ enum projector_type {
     PROJECTOR_TYPE_INTERNVL,
     PROJECTOR_TYPE_LLAMA4,
     PROJECTOR_TYPE_QWEN2A,
+    PROJECTOR_TYPE_GLMA,
     PROJECTOR_TYPE_QWEN25O, // will be replaced by QWEN2A or QWEN25VL depending on clip_ctx
     PROJECTOR_TYPE_VOXTRAL,
     PROJECTOR_TYPE_LFM2,

@ -156,6 +170,7 @@ enum projector_type {
     PROJECTOR_TYPE_LIGHTONOCR,
     PROJECTOR_TYPE_COGVLM,
     PROJECTOR_TYPE_JANUS_PRO,
+    PROJECTOR_TYPE_GLM4V,
     PROJECTOR_TYPE_UNKNOWN,
 };
 

@ -175,6 +190,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
     { PROJECTOR_TYPE_INTERNVL, "internvl"},
     { PROJECTOR_TYPE_LLAMA4, "llama4"},
     { PROJECTOR_TYPE_QWEN2A, "qwen2a"},
+    { PROJECTOR_TYPE_GLMA, "glma"},
     { PROJECTOR_TYPE_QWEN25O, "qwen2.5o"},
     { PROJECTOR_TYPE_VOXTRAL, "voxtral"},
     { PROJECTOR_TYPE_LFM2, "lfm2"},

@ -182,6 +198,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
     { PROJECTOR_TYPE_LIGHTONOCR,"lightonocr"},
     { PROJECTOR_TYPE_COGVLM, "cogvlm"},
     { PROJECTOR_TYPE_JANUS_PRO, "janus_pro"},
+    { PROJECTOR_TYPE_GLM4V, "glm4v"},
 };
 
 static projector_type clip_projector_type_from_string(const std::string & str) {

@ -485,6 +502,8 @@ static void print_tensor_data(ggml_tensor * t, uint8_t * data, int64_t n) {
     }
 }
 
+void clip_debug_encode(clip_ctx * ctx, int h, int w, float fill_value);
+
 //
 // API used internally with mtmd
 //
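As a quick sanity check of the CLIP_ALIGN macro kept in the header above, it rounds x up to the next multiple of n; a minimal standalone sketch (the macro is redefined locally here just for the check):

#define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))
static_assert(CLIP_ALIGN(5, 4) == 8, "5 rounded up to a multiple of 4");
static_assert(CLIP_ALIGN(8, 4) == 8, "already-aligned values are unchanged");
static_assert(CLIP_ALIGN(37, 16) == 48, "37 rounded up to a multiple of 16");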
@ -0,0 +1,300 @@
+#pragma once
+
+#include "ggml.h"
+#include "clip.h"
+#include "clip-impl.h"
+
+#include <vector>
+#include <unordered_set>
+#include <cstdint>
+#include <cmath>
+
+enum ffn_op_type {
+    FFN_GELU,
+    FFN_GELU_ERF,
+    FFN_SILU,
+    FFN_GELU_QUICK,
+};
+
+enum norm_type {
+    NORM_TYPE_NORMAL,
+    NORM_TYPE_RMS,
+};
+
+enum patch_merge_type {
+    PATCH_MERGE_FLAT,
+    PATCH_MERGE_SPATIAL_UNPAD,
+};
+
+struct clip_hparams {
+    int32_t image_size = 0;
+    int32_t patch_size = 0;
+    int32_t n_embd = 0;
+    int32_t n_ff = 0;
+    int32_t projection_dim = 0;
+    int32_t n_head = 0;
+    int32_t n_layer = 0;
+    // idefics3
+    int32_t image_longest_edge = 0;
+    int32_t image_min_pixels = -1;
+    int32_t image_max_pixels = -1;
+    int32_t n_merge = 0; // number of patch merges **per-side**
+
+    float image_mean[3];
+    float image_std[3];
+
+    // for models using dynamic image size, we need to have a smaller image size to warmup
+    // otherwise, user will get OOM everytime they load the model
+    int32_t warmup_image_size = 0;
+    int32_t warmup_audio_size = 3000;
+
+    ffn_op_type ffn_op = FFN_GELU;
+
+    patch_merge_type mm_patch_merge_type = PATCH_MERGE_FLAT;
+
+    float eps = 1e-6;
+    float rope_theta = 0.0;
+
+    std::vector<clip_image_size> image_res_candidates; // for llava-uhd style models
+    int32_t image_crop_resolution;
+    std::unordered_set<int32_t> vision_feature_layer;
+    int32_t attn_window_size = 0;
+    int32_t n_wa_pattern = 0;
+
+    // audio
+    int32_t n_mel_bins = 0; // whisper preprocessor
+    int32_t proj_stack_factor = 0; // ultravox
+
+    // audio-to-mel preprocessor params
+    int32_t audio_chunk_len = -1; // in seconds
+    int32_t audio_sample_rate = -1;
+    int32_t audio_n_fft = -1;
+    int32_t audio_window_len = -1;
+    int32_t audio_hop_len = -1;
+
+    // legacy
+    bool has_llava_projector = false;
+    int minicpmv_version = 0;
+    int32_t minicpmv_query_num = 0; // MiniCPM-V query number
+
+    // custom value provided by user, can be undefined if not set
+    int32_t custom_image_min_tokens = -1;
+    int32_t custom_image_max_tokens = -1;
+
+    void set_limit_image_tokens(int n_tokens_min, int n_tokens_max) {
+        const int cur_merge = n_merge == 0 ? 1 : n_merge;
+        const int patch_area = patch_size * patch_size * cur_merge * cur_merge;
+        image_min_pixels = (custom_image_min_tokens > 0 ? custom_image_min_tokens : n_tokens_min) * patch_area;
+        image_max_pixels = (custom_image_max_tokens > 0 ? custom_image_max_tokens : n_tokens_max) * patch_area;
+        warmup_image_size = static_cast<int>(std::sqrt(image_max_pixels));
+    }
+
+    void set_warmup_n_tokens(int n_tokens) {
+        int n_tok_per_side = static_cast<int>(std::sqrt(n_tokens));
+        GGML_ASSERT(n_tok_per_side * n_tok_per_side == n_tokens && "n_tokens must be n*n");
+        const int cur_merge = n_merge == 0 ? 1 : n_merge;
+        warmup_image_size = n_tok_per_side * patch_size * cur_merge;
+        // TODO: support warmup size for custom token numbers
+    }
+};
+
+struct clip_layer {
+    // attention
+    ggml_tensor * k_w = nullptr;
+    ggml_tensor * k_b = nullptr;
+    ggml_tensor * q_w = nullptr;
+    ggml_tensor * q_b = nullptr;
+    ggml_tensor * v_w = nullptr;
+    ggml_tensor * v_b = nullptr;
+    ggml_tensor * qkv_w = nullptr;
+    ggml_tensor * qkv_b = nullptr;
+
+    ggml_tensor * o_w = nullptr;
+    ggml_tensor * o_b = nullptr;
+
+    ggml_tensor * k_norm = nullptr;
+    ggml_tensor * q_norm = nullptr;
+
+    // layernorm 1
+    ggml_tensor * ln_1_w = nullptr;
+    ggml_tensor * ln_1_b = nullptr;
+
+    ggml_tensor * ff_up_w = nullptr;
+    ggml_tensor * ff_up_b = nullptr;
+    ggml_tensor * ff_gate_w = nullptr;
+    ggml_tensor * ff_gate_b = nullptr;
+    ggml_tensor * ff_down_w = nullptr;
+    ggml_tensor * ff_down_b = nullptr;
+
+    // layernorm 2
+    ggml_tensor * ln_2_w = nullptr;
+    ggml_tensor * ln_2_b = nullptr;
+
+    // layer scale (no bias)
+    ggml_tensor * ls_1_w = nullptr;
+    ggml_tensor * ls_2_w = nullptr;
+
+    // qwen3vl deepstack merger
+    ggml_tensor * deepstack_norm_w = nullptr;
+    ggml_tensor * deepstack_norm_b = nullptr;
+    ggml_tensor * deepstack_fc1_w = nullptr;
+    ggml_tensor * deepstack_fc1_b = nullptr;
+    ggml_tensor * deepstack_fc2_w = nullptr;
+    ggml_tensor * deepstack_fc2_b = nullptr;
+
+    bool has_deepstack() const {
+        return deepstack_fc1_w != nullptr;
+    }
+};
+
+struct clip_model {
+    clip_modality modality = CLIP_MODALITY_VISION;
+    projector_type proj_type = PROJECTOR_TYPE_MLP;
+    clip_hparams hparams;
+
+    // embeddings
+    ggml_tensor * class_embedding = nullptr;
+    ggml_tensor * patch_embeddings_0 = nullptr;
+    ggml_tensor * patch_embeddings_1 = nullptr; // second Conv2D kernel when we decouple Conv3D along temproal dimension (Qwen2VL)
+    ggml_tensor * patch_bias = nullptr;
+    ggml_tensor * position_embeddings = nullptr;
+    ggml_tensor * norm_embd_w = nullptr;
+    ggml_tensor * norm_embd_b = nullptr;
+
+    ggml_tensor * pre_ln_w = nullptr;
+    ggml_tensor * pre_ln_b = nullptr;
+
+    std::vector<clip_layer> layers;
+
+    int32_t n_deepstack_layers = 0; // used by Qwen3-VL, calculated from clip_layer
+
+    ggml_tensor * post_ln_w;
+    ggml_tensor * post_ln_b;
+
+    ggml_tensor * projection; // TODO: rename it to fc (fully connected layer)
+    ggml_tensor * mm_fc_w;
+    ggml_tensor * mm_fc_b;
+    ggml_tensor * mm_ffn_up_w = nullptr;
+    ggml_tensor * mm_ffn_up_b = nullptr;
+    ggml_tensor * mm_ffn_gate_w = nullptr;
+    ggml_tensor * mm_ffn_gate_b = nullptr;
+    ggml_tensor * mm_ffn_down_w = nullptr;
+    ggml_tensor * mm_ffn_down_b = nullptr;
+    ggml_tensor * mm_post_norm_w = nullptr;
+    ggml_tensor * mm_post_norm_b = nullptr;
+
+    // LLaVA projection
+    ggml_tensor * mm_input_norm_w = nullptr;
+    ggml_tensor * mm_input_norm_b = nullptr;
+    ggml_tensor * mm_0_w = nullptr;
+    ggml_tensor * mm_0_b = nullptr;
+    ggml_tensor * mm_2_w = nullptr;
+    ggml_tensor * mm_2_b = nullptr;
+
+    ggml_tensor * image_newline = nullptr;
+
+    // Yi type models with mlp+normalization projection
+    ggml_tensor * mm_1_w = nullptr; // Yi type models have 0, 1, 3, 4
+    ggml_tensor * mm_1_b = nullptr;
+    ggml_tensor * mm_3_w = nullptr;
+    ggml_tensor * mm_3_b = nullptr;
+    ggml_tensor * mm_4_w = nullptr;
+    ggml_tensor * mm_4_b = nullptr;
+
+    // GLMV-Edge projection
+    ggml_tensor * mm_model_adapter_conv_w = nullptr;
+    ggml_tensor * mm_model_adapter_conv_b = nullptr;
+
+    // MobileVLM projection
+    ggml_tensor * mm_model_mlp_1_w = nullptr;
+    ggml_tensor * mm_model_mlp_1_b = nullptr;
+    ggml_tensor * mm_model_mlp_3_w = nullptr;
+    ggml_tensor * mm_model_mlp_3_b = nullptr;
+    ggml_tensor * mm_model_block_1_block_0_0_w = nullptr;
+    ggml_tensor * mm_model_block_1_block_0_1_w = nullptr;
+    ggml_tensor * mm_model_block_1_block_0_1_b = nullptr;
+    ggml_tensor * mm_model_block_1_block_1_fc1_w = nullptr;
+    ggml_tensor * mm_model_block_1_block_1_fc1_b = nullptr;
+    ggml_tensor * mm_model_block_1_block_1_fc2_w = nullptr;
+    ggml_tensor * mm_model_block_1_block_1_fc2_b = nullptr;
+    ggml_tensor * mm_model_block_1_block_2_0_w = nullptr;
+    ggml_tensor * mm_model_block_1_block_2_1_w = nullptr;
+    ggml_tensor * mm_model_block_1_block_2_1_b = nullptr;
+    ggml_tensor * mm_model_block_2_block_0_0_w = nullptr;
+    ggml_tensor * mm_model_block_2_block_0_1_w = nullptr;
+    ggml_tensor * mm_model_block_2_block_0_1_b = nullptr;
+    ggml_tensor * mm_model_block_2_block_1_fc1_w = nullptr;
+    ggml_tensor * mm_model_block_2_block_1_fc1_b = nullptr;
+    ggml_tensor * mm_model_block_2_block_1_fc2_w = nullptr;
+    ggml_tensor * mm_model_block_2_block_1_fc2_b = nullptr;
+    ggml_tensor * mm_model_block_2_block_2_0_w = nullptr;
+    ggml_tensor * mm_model_block_2_block_2_1_w = nullptr;
+    ggml_tensor * mm_model_block_2_block_2_1_b = nullptr;
+
+    // MobileVLM_V2 projection
+    ggml_tensor * mm_model_mlp_0_w = nullptr;
+    ggml_tensor * mm_model_mlp_0_b = nullptr;
+    ggml_tensor * mm_model_mlp_2_w = nullptr;
+    ggml_tensor * mm_model_mlp_2_b = nullptr;
+    ggml_tensor * mm_model_peg_0_w = nullptr;
+    ggml_tensor * mm_model_peg_0_b = nullptr;
+
+    // MINICPMV projection
+    ggml_tensor * mm_model_pos_embed_k = nullptr;
+    ggml_tensor * mm_model_query = nullptr;
+    ggml_tensor * mm_model_proj = nullptr;
+    ggml_tensor * mm_model_kv_proj = nullptr;
+    ggml_tensor * mm_model_attn_q_w = nullptr;
+    ggml_tensor * mm_model_attn_q_b = nullptr;
+    ggml_tensor * mm_model_attn_k_w = nullptr;
+    ggml_tensor * mm_model_attn_k_b = nullptr;
+    ggml_tensor * mm_model_attn_v_w = nullptr;
+    ggml_tensor * mm_model_attn_v_b = nullptr;
+    ggml_tensor * mm_model_attn_o_w = nullptr;
+    ggml_tensor * mm_model_attn_o_b = nullptr;
+    ggml_tensor * mm_model_ln_q_w = nullptr;
+    ggml_tensor * mm_model_ln_q_b = nullptr;
+    ggml_tensor * mm_model_ln_kv_w = nullptr;
+    ggml_tensor * mm_model_ln_kv_b = nullptr;
+    ggml_tensor * mm_model_ln_post_w = nullptr;
+    ggml_tensor * mm_model_ln_post_b = nullptr;
+
+    // gemma3
+    ggml_tensor * mm_input_proj_w = nullptr;
+    ggml_tensor * mm_soft_emb_norm_w = nullptr;
+
+    // pixtral, glm4v
+    ggml_tensor * token_embd_img_break = nullptr;
+    ggml_tensor * mm_patch_merger_w = nullptr;
+    ggml_tensor * mm_patch_merger_b = nullptr;
+
+    // ultravox / whisper encoder
+    ggml_tensor * conv1d_1_w = nullptr;
+    ggml_tensor * conv1d_1_b = nullptr;
+    ggml_tensor * conv1d_2_w = nullptr;
+    ggml_tensor * conv1d_2_b = nullptr;
+    ggml_tensor * mm_norm_pre_w = nullptr;
+    ggml_tensor * mm_norm_pre_b = nullptr;
+    ggml_tensor * mm_norm_mid_w = nullptr;
+
+    // cogvlm
+    ggml_tensor * mm_post_fc_norm_w = nullptr;
+    ggml_tensor * mm_post_fc_norm_b = nullptr;
+    ggml_tensor * mm_h_to_4h_w = nullptr;
|
ggml_tensor * mm_gate_w = nullptr;
|
||||||
|
ggml_tensor * mm_4h_to_h_w = nullptr;
|
||||||
|
ggml_tensor * mm_boi = nullptr;
|
||||||
|
ggml_tensor * mm_eoi = nullptr;
|
||||||
|
|
||||||
|
bool audio_has_avgpool() const {
|
||||||
|
return proj_type == PROJECTOR_TYPE_QWEN2A
|
||||||
|
|| proj_type == PROJECTOR_TYPE_VOXTRAL;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool audio_has_stack_frames() const {
|
||||||
|
return proj_type == PROJECTOR_TYPE_ULTRAVOX
|
||||||
|
|| proj_type == PROJECTOR_TYPE_VOXTRAL;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
const clip_hparams * clip_get_hparams(const struct clip_ctx * ctx);
|
||||||
File diff suppressed because it is too large
@ -7,6 +7,8 @@
// !!! Internal header, to be used by mtmd only !!!

#define MTMD_INTERNAL_HEADER

struct clip_ctx;

struct clip_image_size {
@ -34,6 +36,7 @@ struct clip_context_params {
    enum clip_flash_attn_type flash_attn_type;
    int image_min_tokens;
    int image_max_tokens;
    bool warmup;
};

struct clip_init_result {
@ -101,7 +104,7 @@ bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct
int clip_is_minicpmv(const struct clip_ctx * ctx);
bool clip_is_glm(const struct clip_ctx * ctx);
bool clip_is_qwen2vl(const struct clip_ctx * ctx);
bool clip_is_mrope(const struct clip_ctx * ctx);
bool clip_is_llava(const struct clip_ctx * ctx);
bool clip_is_gemma3(const struct clip_ctx * ctx);
@ -0,0 +1,98 @@
#include "models.h"

ggml_cgraph * clip_graph_cogvlm::build() {
    GGML_ASSERT(model.class_embedding != nullptr);
    GGML_ASSERT(model.position_embeddings != nullptr);

    const int n_pos = n_patches + 1; // +1 for [CLS]

    // build input and concatenate class embedding
    ggml_tensor * inp = build_inp();
    inp = ggml_concat(ctx0, inp, model.class_embedding, 1);

    inp = ggml_add(ctx0, inp, model.position_embeddings);
    cb(inp, "inp_pos", -1);

    ggml_tensor * inpL = inp;

    for (int il = 0; il < n_layer; il++) {
        auto & layer = model.layers[il];
        ggml_tensor * cur = inpL;

        cur = ggml_mul_mat(ctx0, layer.qkv_w, cur);
        cur = ggml_add(ctx0, cur, layer.qkv_b);

        ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, d_head*sizeof(float),
            cur->nb[1], 0);
        ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, d_head*sizeof(float),
            cur->nb[1], n_embd * sizeof(float));
        ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, d_head*sizeof(float),
            cur->nb[1], 2 * n_embd * sizeof(float));

        cb(Qcur, "Qcur", il);
        cb(Kcur, "Kcur", il);
        cb(Vcur, "Vcur", il);

        cur = build_attn(layer.o_w, layer.o_b,
            Qcur, Kcur, Vcur, nullptr, kq_scale, il);
        cb(cur, "attn_out", il);

        cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il);
        cb(cur, "attn_post_norm", il);

        cur = ggml_add(ctx0, cur, inpL);
        inpL = cur;

        cur = build_ffn(cur,
            layer.ff_up_w, layer.ff_up_b,
            layer.ff_gate_w, layer.ff_gate_b,
            layer.ff_down_w, layer.ff_down_b,
            hparams.ffn_op, il);

        cb(cur, "ffn_out", il);

        cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il);
        cb(cur, "ffn_post_norm", il);

        cur = ggml_add(ctx0, cur, inpL);
        cb(cur, "layer_out", il);
        inpL = cur;
    }

    // remove CLS token (like build_llama4 does)
    ggml_tensor * cur = ggml_view_2d(ctx0, inpL,
        n_embd, n_patches,
        ggml_row_size(inpL->type, n_embd), 0);

    // Multiply with mm_model_proj
    cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur);

    // Apply layernorm, weight, bias
    cur = build_norm(cur, model.mm_post_fc_norm_w, model.mm_post_fc_norm_b, NORM_TYPE_NORMAL, 1e-5, -1);

    // Apply GELU
    cur = ggml_gelu_inplace(ctx0, cur);

    // Branch 1: multiply with mm_h_to_4h_w
    ggml_tensor * h_to_4h = ggml_mul_mat(ctx0, model.mm_h_to_4h_w, cur);

    // Branch 2: multiply with mm_gate_w
    ggml_tensor * gate = ggml_mul_mat(ctx0, model.mm_gate_w, cur);

    // Apply silu
    gate = ggml_swiglu_split(ctx0, gate, h_to_4h);

    // Apply mm_4h_to_h_w
    cur = ggml_mul_mat(ctx0, model.mm_4h_to_h_w, gate);

    // Concatenate with boi and eoi
    cur = ggml_concat(ctx0, model.mm_boi, cur, 1);
    cur = ggml_concat(ctx0, cur, model.mm_eoi, 1);

    // build the graph
    ggml_build_forward_expand(gf, cur);

    return gf;
}
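For reference, the gated projector at the end of this graph follows the familiar SwiGLU pattern. A minimal standalone sketch with hypothetical toy values, assuming ggml_swiglu_split(a, b) computes silu(a) * b elementwise:

// SwiGLU gating sketch: out[i] = silu(gate[i]) * up[i]
#include <cmath>
#include <cstdio>
#include <vector>

static float silu(float x) { return x / (1.0f + std::exp(-x)); }

static std::vector<float> swiglu(const std::vector<float> & gate, const std::vector<float> & up) {
    std::vector<float> out(gate.size());
    for (size_t i = 0; i < gate.size(); i++) {
        out[i] = silu(gate[i]) * up[i];
    }
    return out;
}

int main() {
    std::vector<float> gate = {-1.0f, 0.0f, 2.0f}; // hypothetical gate branch (mm_gate_w output)
    std::vector<float> up   = { 0.5f, 1.0f, 3.0f}; // hypothetical up branch (mm_h_to_4h_w output)
    for (float v : swiglu(gate, up)) {
        std::printf("%f\n", v);
    }
}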
@ -0,0 +1,120 @@
#include "models.h"

ggml_cgraph * clip_graph_glm4v::build() {
    GGML_ASSERT(model.patch_bias != nullptr);
    GGML_ASSERT(model.position_embeddings != nullptr);
    GGML_ASSERT(model.class_embedding == nullptr);

    const int batch_size = 1;

    norm_type norm_t = NORM_TYPE_RMS;

    ggml_tensor * inp_raw = build_inp_raw();
    ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);

    int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
    ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches * 4);
    ggml_set_name(positions, "positions");
    ggml_set_input(positions);

    GGML_ASSERT(img.nx % (patch_size * 2) == 0);
    GGML_ASSERT(img.ny % (patch_size * 2) == 0);

    // second conv dimension
    {
        auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
        inp = ggml_add(ctx0, inp, inp_1);

        inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b]
        inp = ggml_cont_4d(
            ctx0, inp,
            n_embd * 2, n_patches_x / 2, n_patches_y, batch_size);
        inp = ggml_reshape_4d(
            ctx0, inp,
            n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2));
        inp = ggml_permute(ctx0, inp, 0, 2, 1, 3);
        inp = ggml_cont_3d(
            ctx0, inp,
            n_embd, n_patches_x * n_patches_y, batch_size);
    }

    // add patch bias
    inp = ggml_add(ctx0, inp, model.patch_bias);
    cb(inp, "patch_bias", -1);

    // pos-conv norm
    inp = build_norm(inp, model.norm_embd_w, model.norm_embd_b, norm_t, eps, -1);

    // calculate absolute position embedding and apply
    ggml_tensor * learned_pos_embd = resize_position_embeddings(GGML_SCALE_MODE_BICUBIC);
    learned_pos_embd = ggml_cont_4d(
        ctx0, learned_pos_embd,
        n_embd * 2, n_patches_x / 2, n_patches_y, batch_size);
    learned_pos_embd = ggml_reshape_4d(
        ctx0, learned_pos_embd,
        n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2));
    learned_pos_embd = ggml_permute(ctx0, learned_pos_embd, 0, 2, 1, 3);
    learned_pos_embd = ggml_cont_3d(
        ctx0, learned_pos_embd,
        n_embd, n_patches_x * n_patches_y, batch_size);
    cb(learned_pos_embd, "learned_pos_embd", -1);

    auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
        return ggml_rope_multi(
            ctx0, cur, positions, nullptr,
            d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION,
            32768, hparams.rope_theta, 1, 0, 1, 32, 1);
    };

    ggml_tensor * cur = build_vit(
        inp, n_patches,
        norm_t,
        hparams.ffn_op,
        learned_pos_embd,
        add_pos);

    cb(cur, "vit_out", -1);
    // cb(ggml_sum(ctx0, cur), "vit_out_sum", -1);

    // GLM4V projector
    // ref: https://github.com/huggingface/transformers/blob/40dc11cd3eb4126652aa41ef8272525affd4a636/src/transformers/models/glm4v/modeling_glm4v.py#L116-L130

    // patch merger (downsample)
    {
        int n_merge = hparams.n_merge;
        GGML_ASSERT(n_merge > 0);

        int n_token_out = n_patches / n_merge / n_merge;
        cur = ggml_reshape_4d(ctx0, cur, n_embd, n_merge, n_merge, n_token_out);
        cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3)); // [n_merge, n_merge, n_embd, n_token_out]
        cur = ggml_conv_2d(ctx0, model.mm_patch_merger_w, cur, n_merge, n_merge, 0, 0, 1, 1);
        cur = ggml_reshape_2d(ctx0, cur, cur->ne[2], n_token_out); // [n_embd_out, n_token_out]

        cur = ggml_add(ctx0, cur, model.mm_patch_merger_b);
    }

    // FC projector
    {
        cur = ggml_mul_mat(ctx0, model.projection, cur);
        // default LayerNorm (post_projection_norm)
        cur = build_norm(cur, model.mm_post_norm_w, model.mm_post_norm_b, NORM_TYPE_NORMAL, 1e-5, -1);
        cur = ggml_gelu_erf(ctx0, cur);
        cb(cur, "after_fc_proj", -1);
    }

    // FFN projector
    {
        cur = build_ffn(cur,
            model.mm_ffn_up_w, model.mm_ffn_up_b,
            model.mm_ffn_gate_w, model.mm_ffn_gate_b,
            model.mm_ffn_down_w, model.mm_ffn_down_b,
            hparams.ffn_op, -1);
        cb(cur, "after_ffn_proj", -1);
        // cb(ggml_sum(ctx0, cur), "merged_sum", -1);
    }

    // build the graph
    ggml_build_forward_expand(gf, cur);

    return gf;
}
@ -0,0 +1,69 @@
#include "models.h"

ggml_cgraph * clip_graph_internvl::build() {
    GGML_ASSERT(model.class_embedding != nullptr);
    GGML_ASSERT(model.position_embeddings != nullptr);

    const int n_pos = n_patches + 1;
    ggml_tensor * inp = build_inp();

    // add CLS token
    inp = ggml_concat(ctx0, inp, model.class_embedding, 1);

    // The larger models use a different ViT, which uses RMS norm instead of layer norm
    // ref: https://github.com/ggml-org/llama.cpp/pull/13443#issuecomment-2869786188
    norm_type norm_t = (hparams.n_embd == 3200 && hparams.n_layer == 45)
        ? NORM_TYPE_RMS // 6B ViT (Used by InternVL 2.5/3 - 26B, 38B, 78B)
        : NORM_TYPE_NORMAL; // 300M ViT (Used by all smaller InternVL models)

    ggml_tensor * cur = build_vit(
        inp, n_pos,
        norm_t,
        hparams.ffn_op,
        model.position_embeddings,
        nullptr);

    // remove CLS token
    cur = ggml_view_2d(ctx0, cur,
        n_embd, n_patches,
        ggml_row_size(cur->type, n_embd), 0);

    // pixel shuffle
    {
        const int scale_factor = model.hparams.n_merge;
        const int bsz = 1; // batch size, always 1 for now since we don't support batching
        const int height = n_patches_y;
        const int width = n_patches_x;
        GGML_ASSERT(scale_factor > 0);
        cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, height / scale_factor, width, bsz);
        cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
        cur = ggml_cont_4d(ctx0, cur,
            n_embd * scale_factor * scale_factor,
            height / scale_factor,
            width / scale_factor,
            bsz);
        cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
        // flatten to 2D
        cur = ggml_cont_2d(ctx0, cur,
            n_embd * scale_factor * scale_factor,
            cur->ne[1] * cur->ne[2]);
    }

    // projector (always using GELU activation)
    {
        // projector LayerNorm uses pytorch's default eps = 1e-5
        // ref: https://huggingface.co/OpenGVLab/InternVL3-8B-Instruct/blob/a34d3e4e129a5856abfd6aa6de79776484caa14e/modeling_internvl_chat.py#L79
        cur = build_norm(cur, model.mm_0_w, model.mm_0_b, NORM_TYPE_NORMAL, 1e-5, -1);
        cur = build_ffn(cur,
            model.mm_1_w, model.mm_1_b,
            nullptr, nullptr,
            model.mm_3_w, model.mm_3_b,
            FFN_GELU,
            -1);
    }

    // build the graph
    ggml_build_forward_expand(gf, cur);

    return gf;
}
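The pixel shuffle above trades spatial resolution for channel width before the projector. A minimal sketch of the bookkeeping, with hypothetical sizes, checking that a factor-s merge keeps the total number of values constant while reducing the token count by s*s (the actual data movement is done by the ggml reshape/permute/cont calls in the graph):

// Pixel-shuffle shape arithmetic sketch (hypothetical sizes)
#include <cassert>
#include <cstdio>

int main() {
    const int d = 1024, W = 32, H = 32, s = 2;   // embedding dim, patch grid, merge factor
    assert(W % s == 0 && H % s == 0);
    const int n_tokens_in  = W * H;
    const int n_tokens_out = (W / s) * (H / s);
    const int d_out        = d * s * s;
    assert(n_tokens_in * d == n_tokens_out * d_out); // no values gained or lost
    std::printf("tokens: %d -> %d, dim: %d -> %d\n", n_tokens_in, n_tokens_out, d, d_out);
}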
@ -0,0 +1,63 @@
#include "models.h"

ggml_cgraph * clip_graph_kimivl::build() {
    // 2D input positions
    ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
    ggml_set_name(pos_h, "pos_h");
    ggml_set_input(pos_h);

    ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
    ggml_set_name(pos_w, "pos_w");
    ggml_set_input(pos_w);

    ggml_tensor * learned_pos_embd = resize_position_embeddings();

    // build ViT with 2D position embeddings
    auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
        // first half is X axis and second half is Y axis
        return build_rope_2d(ctx0, cur, pos_w, pos_h, hparams.rope_theta, false);
    };

    ggml_tensor * inp = build_inp();
    ggml_tensor * cur = build_vit(
        inp, n_patches,
        NORM_TYPE_NORMAL,
        hparams.ffn_op,
        learned_pos_embd,
        add_pos);

    cb(cur, "vit_out", -1);

    {
        // patch_merger
        const int scale_factor = model.hparams.n_merge;
        cur = build_patch_merge_permute(cur, scale_factor);

        // projection norm
        int proj_inp_dim = cur->ne[0];
        cur = ggml_view_2d(ctx0, cur,
            n_embd, cur->ne[1] * scale_factor * scale_factor,
            ggml_row_size(cur->type, n_embd), 0);
        cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm
        cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
        cur = ggml_add(ctx0, cur, model.mm_input_norm_b);
        cur = ggml_view_2d(ctx0, cur,
            proj_inp_dim, cur->ne[1] / scale_factor / scale_factor,
            ggml_row_size(cur->type, proj_inp_dim), 0);
        cb(cur, "proj_inp_normed", -1);

        // projection mlp
        cur = build_ffn(cur,
            model.mm_1_w, model.mm_1_b,
            nullptr, nullptr,
            model.mm_2_w, model.mm_2_b,
            FFN_GELU,
            -1);
        cb(cur, "proj_out", -1);
    }

    // build the graph
    ggml_build_forward_expand(gf, cur);

    return gf;
}
@ -0,0 +1,96 @@
#include "models.h"

ggml_cgraph * clip_graph_llama4::build() {
    GGML_ASSERT(model.class_embedding != nullptr);
    GGML_ASSERT(model.position_embeddings != nullptr);

    const int n_pos = n_patches + 1; // +1 for [CLS]

    // 2D input positions
    ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
    ggml_set_name(pos_h, "pos_h");
    ggml_set_input(pos_h);

    ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
    ggml_set_name(pos_w, "pos_w");
    ggml_set_input(pos_w);

    ggml_tensor * inp = build_inp_raw();

    // Llama4UnfoldConvolution
    {
        ggml_tensor * kernel = ggml_reshape_4d(ctx0, model.patch_embeddings_0,
            patch_size, patch_size, 3, n_embd);
        inp = ggml_im2col(ctx0, kernel, inp, patch_size, patch_size, 0, 0, 1, 1, true, inp->type);
        inp = ggml_mul_mat(ctx0, model.patch_embeddings_0, inp);
        inp = ggml_reshape_2d(ctx0, inp, n_embd, n_patches);
        cb(inp, "patch_conv", -1);
    }

    // add CLS token
    inp = ggml_concat(ctx0, inp, model.class_embedding, 1);

    // build ViT with 2D position embeddings
    auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
        // first half is X axis and second half is Y axis
        // ref: https://github.com/huggingface/transformers/blob/40a493c7ed4f19f08eadb0639cf26d49bfa5e180/src/transformers/models/llama4/modeling_llama4.py#L1312
        // ref: https://github.com/Blaizzy/mlx-vlm/blob/a57156aa87b33cca6e5ee6cfc14dd4ef8f611be6/mlx_vlm/models/llama4/vision.py#L441
        return build_rope_2d(ctx0, cur, pos_w, pos_h, hparams.rope_theta, false);
    };
    ggml_tensor * cur = build_vit(
        inp, n_pos,
        NORM_TYPE_NORMAL,
        hparams.ffn_op,
        model.position_embeddings,
        add_pos);

    // remove CLS token
    cur = ggml_view_2d(ctx0, cur,
        n_embd, n_patches,
        ggml_row_size(cur->type, n_embd), 0);

    // pixel shuffle
    // based on Llama4VisionPixelShuffleMLP
    // https://github.com/huggingface/transformers/blob/2932f318a20d9e54cc7aea052e040164d85de7d6/src/transformers/models/llama4/modeling_llama4.py#L1151
    {
        const int scale_factor = model.hparams.n_merge;
        const int bsz = 1; // batch size, always 1 for now since we don't support batching
        GGML_ASSERT(scale_factor > 0);
        GGML_ASSERT(n_patches_x == n_patches_y); // llama4 only supports square images
        cur = ggml_reshape_4d(ctx0, cur,
            n_embd * scale_factor,
            n_patches_x / scale_factor,
            n_patches_y,
            bsz);
        cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
        cur = ggml_cont_4d(ctx0, cur,
            n_embd * scale_factor * scale_factor,
            n_patches_x / scale_factor,
            n_patches_y / scale_factor,
            bsz);
        //cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
        // flatten to 2D
        cur = ggml_cont_2d(ctx0, cur,
            n_embd * scale_factor * scale_factor,
            n_patches / scale_factor / scale_factor);
        cb(cur, "pixel_shuffle", -1);
    }

    // based on Llama4VisionMLP2 (always uses GELU activation, no bias)
    {
        cur = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, cur);
        cur = ggml_gelu(ctx0, cur);
        cur = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, cur);
        cur = ggml_gelu(ctx0, cur);
        cb(cur, "adapter_mlp", -1);
    }

    // Llama4MultiModalProjector
    cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur);
    cb(cur, "projected", -1);

    // build the graph
    ggml_build_forward_expand(gf, cur);

    return gf;
}
@ -0,0 +1,374 @@
#include "models.h"

// this graph is used by llava, granite and glm
// due to having embedding_stack (used by granite), we cannot reuse build_vit
ggml_cgraph * clip_graph_llava::build() {
    const int batch_size = 1;
    const int n_pos = n_patches + (model.class_embedding ? 1 : 0);

    GGML_ASSERT(n_patches_x == n_patches_y && "only square images supported");

    // Calculate the deepest feature layer based on hparams and projector type
    int max_feature_layer = n_layer;
    {
        // Get the index of the second to last layer; this is the default for models that have a llava projector
        int il_last = hparams.n_layer - 1;
        int deepest_feature_layer = -1;

        if (proj_type == PROJECTOR_TYPE_MINICPMV || proj_type == PROJECTOR_TYPE_GLM_EDGE) {
            il_last += 1;
        }

        // If we set explicit vision feature layers, only go up to the deepest one
        // NOTE: only used by granite-vision models for now
        for (const auto & feature_layer : hparams.vision_feature_layer) {
            if (feature_layer > deepest_feature_layer) {
                deepest_feature_layer = feature_layer;
            }
        }
        max_feature_layer = deepest_feature_layer < 0 ? il_last : deepest_feature_layer;
    }

    ggml_tensor * inp = build_inp();

    // concat class_embeddings and patch_embeddings
    if (model.class_embedding) {
        inp = ggml_concat(ctx0, inp, model.class_embedding, 1);
    }

    ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
    ggml_set_name(positions, "positions");
    ggml_set_input(positions);

    inp = ggml_add(ctx0, inp, ggml_get_rows(ctx0, model.position_embeddings, positions));

    ggml_tensor * inpL = inp;

    // pre-layernorm
    if (model.pre_ln_w) {
        inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, NORM_TYPE_NORMAL, eps, -1);
        cb(inpL, "pre_ln", -1);
    }

    std::vector<ggml_tensor *> embedding_stack;
    const auto & vision_feature_layer = hparams.vision_feature_layer;

    // loop over layers
    for (int il = 0; il < max_feature_layer; il++) {
        auto & layer = model.layers[il];
        ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states

        // If this is an embedding feature layer, save the output.
        // NOTE: 0 index here refers to the input to the encoder.
        if (vision_feature_layer.find(il) != vision_feature_layer.end()) {
            embedding_stack.push_back(cur);
        }

        // layernorm1
        cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il);
        cb(cur, "layer_inp_normed", il);

        // self-attention
        {
            ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
            if (layer.q_b) {
                Qcur = ggml_add(ctx0, Qcur, layer.q_b);
            }

            ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
            if (layer.k_b) {
                Kcur = ggml_add(ctx0, Kcur, layer.k_b);
            }

            ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
            if (layer.v_b) {
                Vcur = ggml_add(ctx0, Vcur, layer.v_b);
            }

            Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos);
            Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos);
            Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos);

            cb(Qcur, "Qcur", il);
            cb(Kcur, "Kcur", il);
            cb(Vcur, "Vcur", il);

            cur = build_attn(layer.o_w, layer.o_b,
                Qcur, Kcur, Vcur, nullptr, kq_scale, il);
            cb(cur, "attn_out", il);
        }

        // re-add the layer input, e.g., residual
        cur = ggml_add(ctx0, cur, inpL);

        inpL = cur; // inpL = residual, cur = hidden_states

        cb(cur, "ffn_inp", il);

        // layernorm2
        cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il);
        cb(cur, "ffn_inp_normed", il);

        // ffn
        cur = build_ffn(cur,
            layer.ff_up_w, layer.ff_up_b,
            layer.ff_gate_w, layer.ff_gate_b,
            layer.ff_down_w, layer.ff_down_b,
            hparams.ffn_op, il);

        cb(cur, "ffn_out", il);

        // residual 2
        cur = ggml_add(ctx0, inpL, cur);
        cb(cur, "layer_out", il);

        inpL = cur;
    }

    // post-layernorm
    if (model.post_ln_w) {
        inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, NORM_TYPE_NORMAL, eps, -1);
    }

    ggml_tensor * embeddings = inpL;

    // process vision feature layers (used by granite)
    {
        // final layer is a vision feature layer
        if (vision_feature_layer.find(max_feature_layer) != vision_feature_layer.end()) {
            embedding_stack.push_back(inpL);
        }

        // If feature layers are explicitly set, stack them (if we have multiple)
        if (!embedding_stack.empty()) {
            embeddings = embedding_stack[0];
            for (size_t i = 1; i < embedding_stack.size(); i++) {
                embeddings = ggml_concat(ctx0, embeddings, embedding_stack[i], 0);
            }
        }
    }

    // llava projector (also used by granite)
    if (hparams.has_llava_projector) {
        embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);

        ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
        ggml_set_name(patches, "patches");
        ggml_set_input(patches);

        // shape [1, 576, 1024]
        // ne is whcn, ne = [1024, 576, 1, 1]
        embeddings = ggml_get_rows(ctx0, embeddings, patches);

        // print_tensor_info(embeddings, "embeddings");

        // llava projector
        if (proj_type == PROJECTOR_TYPE_MLP) {
            embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
            embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);

            embeddings = ggml_gelu(ctx0, embeddings);
            if (model.mm_2_w) {
                embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
                embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
            }
        }
        else if (proj_type == PROJECTOR_TYPE_MLP_NORM) {
            embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
            embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
            // ggml_tensor_printf(embeddings, "mm_0_w",0,true,false);
            // First LayerNorm
            embeddings = ggml_norm(ctx0, embeddings, eps);
            embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_1_w),
                model.mm_1_b);

            // GELU activation
            embeddings = ggml_gelu(ctx0, embeddings);

            // Second linear layer
            embeddings = ggml_mul_mat(ctx0, model.mm_3_w, embeddings);
            embeddings = ggml_add(ctx0, embeddings, model.mm_3_b);

            // Second LayerNorm
            embeddings = ggml_norm(ctx0, embeddings, eps);
            embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_4_w),
                model.mm_4_b);
        }
        else if (proj_type == PROJECTOR_TYPE_LDP) {
            // MobileVLM projector
            int n_patch = 24;
            ggml_tensor * mlp_1 = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, embeddings);
            mlp_1 = ggml_add(ctx0, mlp_1, model.mm_model_mlp_1_b);
            mlp_1 = ggml_gelu(ctx0, mlp_1);
            ggml_tensor * mlp_3 = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, mlp_1);
            mlp_3 = ggml_add(ctx0, mlp_3, model.mm_model_mlp_3_b);
            // mlp_3 shape = [1, 576, 2048], ne = [2048, 576, 1, 1]

            // block 1
            ggml_tensor * block_1 = nullptr;
            {
                // transpose from [1, 576, 2048] --> [1, 2048, 576] --> [1, 2048, 24, 24]
                mlp_3 = ggml_permute(ctx0, mlp_3, 1, 0, 2, 3);
                mlp_3 = ggml_cont_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]);
                // stride = 1, padding = 1, bias is nullptr
                block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1);

                // layer norm
                // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3));
                // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
                block_1 = ggml_norm(ctx0, block_1, eps);
                block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_0_1_w), model.mm_model_block_1_block_0_1_b);
                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));

                // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
                // hardswish
                ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);

                block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0);
                // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
                // pointwise conv
                block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc1_w, block_1);
                block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc1_b);
                block_1 = ggml_relu(ctx0, block_1);
                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc2_w, block_1);
                block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc2_b);
                block_1 = ggml_hardsigmoid(ctx0, block_1);
                // block_1_hw shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1], block_1 shape = [1, 2048], ne = [2048, 1, 1, 1]
                block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
                block_1 = ggml_mul(ctx0, block_1_hw, block_1);

                int w = block_1->ne[0], h = block_1->ne[1];
                block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));

                // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_2_0_w, block_1);
                block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);

                // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
                block_1 = ggml_norm(ctx0, block_1, eps);
                block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_2_1_w), model.mm_model_block_1_block_2_1_b);
                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
                // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
                // residual
                block_1 = ggml_add(ctx0, mlp_3, block_1);
            }

            // block_2
            {
                // stride = 2
                block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1);

                // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
                // layer norm
                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3));
                // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
                block_1 = ggml_norm(ctx0, block_1, eps);
                block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_0_1_w), model.mm_model_block_2_block_0_1_b);
                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
                // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
                // hardswish
                ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);

                // not sure the parameters are right for globalAvgPooling
                block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0);
                // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
                // pointwise conv
                block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc1_w, block_1);
                block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc1_b);
                block_1 = ggml_relu(ctx0, block_1);
                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc2_w, block_1);
                block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc2_b);
                block_1 = ggml_hardsigmoid(ctx0, block_1);

                // block_1_hw shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1], block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
                block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
                block_1 = ggml_mul(ctx0, block_1_hw, block_1);

                int w = block_1->ne[0], h = block_1->ne[1];
                block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
                // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_2_0_w, block_1);
                block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);

                // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
                block_1 = ggml_norm(ctx0, block_1, eps);
                block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_2_1_w), model.mm_model_block_2_block_2_1_b);
                block_1 = ggml_reshape_3d(ctx0, block_1, block_1->ne[0], block_1->ne[1] * block_1->ne[2], block_1->ne[3]);
                // block_1 shape = [1, 144, 2048], ne = [2048, 144, 1]
            }
            embeddings = block_1;
        }
        else if (proj_type == PROJECTOR_TYPE_LDPV2)
        {
            int n_patch = 24;
            ggml_tensor * mlp_0 = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
            mlp_0 = ggml_add(ctx0, mlp_0, model.mm_model_mlp_0_b);
            mlp_0 = ggml_gelu(ctx0, mlp_0);
            ggml_tensor * mlp_2 = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, mlp_0);
            mlp_2 = ggml_add(ctx0, mlp_2, model.mm_model_mlp_2_b);
            // mlp_2 ne = [2048, 576, 1, 1]
            // AVG Pool Layer 2*2, strides = 2
            mlp_2 = ggml_permute(ctx0, mlp_2, 1, 0, 2, 3);
            // mlp_2 ne = [576, 2048, 1, 1]
            mlp_2 = ggml_cont_4d(ctx0, mlp_2, n_patch, n_patch, mlp_2->ne[1], mlp_2->ne[2]);
            // mlp_2 ne [24, 24, 2048, 1]
            mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0);
            // weight ne = [3, 3, 2048, 1]
            ggml_tensor * peg_0 = ggml_conv_2d_dw(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1);
            peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3));
            peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b);
            mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 2, 0, 3));
            peg_0 = ggml_add(ctx0, peg_0, mlp_2);
            peg_0 = ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]);
            embeddings = peg_0;
        }
        else {
            GGML_ABORT("fatal error");
        }
    }

    // glm projector
    else if (proj_type == PROJECTOR_TYPE_GLM_EDGE) {
        size_t gridsz = (size_t)sqrt(embeddings->ne[1]);
        embeddings = ggml_permute(ctx0, embeddings, 1, 0, 2, 3);
        embeddings = ggml_cont_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]);
        embeddings = ggml_conv_2d(ctx0, model.mm_model_adapter_conv_w, embeddings, 2, 2, 0, 0, 1, 1);
        embeddings = ggml_reshape_3d(ctx0, embeddings, embeddings->ne[0]*embeddings->ne[1], embeddings->ne[2], batch_size);
        embeddings = ggml_cont(ctx0, ggml_permute(ctx0, embeddings, 1, 0, 2, 3));
        embeddings = ggml_add(ctx0, embeddings, model.mm_model_adapter_conv_b);
        // GLU
        {
            embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
            embeddings = ggml_norm(ctx0, embeddings, eps);
            embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_q_w), model.mm_model_ln_q_b);
            embeddings = ggml_gelu_inplace(ctx0, embeddings);
            ggml_tensor * x = embeddings;
            embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, embeddings);
            x = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, x);
            embeddings = ggml_swiglu_split(ctx0, embeddings, x);
            embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, embeddings);
        }
        // arrangement of BOI/EOI token embeddings
        // note: these embeddings are not present in text model, hence we cannot process them as text tokens
        // see: https://huggingface.co/THUDM/glm-edge-v-2b/blob/main/siglip.py#L53
        {
            embeddings = ggml_concat(ctx0, model.mm_boi, embeddings, 1); // BOI
            embeddings = ggml_concat(ctx0, embeddings, model.mm_eoi, 1); // EOI
        }
    }

    else {
        GGML_ABORT("llava: unknown projector type");
    }

    // build the graph
    ggml_build_forward_expand(gf, embeddings);

    return gf;
}
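The vision_feature_layer handling above (used by granite) saves the outputs of the selected encoder layers and concatenates them along the feature dimension, so the projector input width becomes n_embd times the number of selected layers. A minimal sketch with hypothetical sizes:

// Feature-layer stacking sketch: concat along the feature dim (ggml_concat(..., 0))
#include <cstdio>
#include <vector>

int main() {
    const int n_embd = 1024;                         // hypothetical encoder width
    std::vector<std::vector<float>> layer_outputs = { // one saved output per selected layer
        std::vector<float>(n_embd, 0.1f),
        std::vector<float>(n_embd, 0.2f),
    };
    std::vector<float> stacked;
    for (const auto & t : layer_outputs) {
        stacked.insert(stacked.end(), t.begin(), t.end());
    }
    std::printf("projector input width: %zu\n", stacked.size()); // n_embd * n_selected_layers
}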
@ -0,0 +1,114 @@
#include "models.h"

ggml_cgraph * clip_graph_minicpmv::build() {
    GGML_ASSERT(model.class_embedding == nullptr);
    const int n_pos = n_patches;
    const int n_embd_proj = n_mmproj_embd;

    // position embeddings for the projector (not for ViT)
    // see: https://huggingface.co/openbmb/MiniCPM-o-2_6/blob/main/resampler.py#L70
    // base frequency omega
    ggml_tensor * omega = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_embd_proj / 4);
    ggml_set_name(omega, "omega");
    ggml_set_input(omega);

    // 2D input positions (using float for sinusoidal embeddings)
    ggml_tensor * pos_h = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_pos);
    ggml_set_name(pos_h, "pos_h");
    ggml_set_input(pos_h);
    ggml_tensor * pos_w = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_pos);
    ggml_set_name(pos_w, "pos_w");
    ggml_set_input(pos_w);

    // for selecting learned pos embd, used by ViT
    struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
    ggml_set_name(positions, "positions");
    ggml_set_input(positions);

    ggml_tensor * learned_pos_embd = ggml_get_rows(ctx0, model.position_embeddings, positions);

    ggml_tensor * inp = build_inp();
    ggml_tensor * embeddings = build_vit(
        inp, n_pos,
        NORM_TYPE_NORMAL,
        hparams.ffn_op,
        learned_pos_embd,
        nullptr);

    // resampler projector (it is just another transformer)

    ggml_tensor * q = model.mm_model_query;
    ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings);

    // norm
    q = build_norm(q, model.mm_model_ln_q_w, model.mm_model_ln_q_b, NORM_TYPE_NORMAL, eps, -1);
    v = build_norm(v, model.mm_model_ln_kv_w, model.mm_model_ln_kv_b, NORM_TYPE_NORMAL, eps, -1);

    // calculate sinusoidal pos embd
    ggml_tensor * pos_embed = nullptr;
    {
        // outer product
        ggml_tensor * omega_b = ggml_repeat_4d(ctx0, omega, omega->ne[0], n_pos, 1, 1); // n_pos rows
        ggml_tensor * theta_x = ggml_mul(ctx0, omega_b, pos_w);
        ggml_tensor * theta_y = ggml_mul(ctx0, omega_b, pos_h);
        // sin and cos
        ggml_tensor * pos_embd_x = ggml_concat(
            ctx0,
            ggml_sin(ctx0, theta_x),
            ggml_cos(ctx0, theta_x),
            0 // concat on first dim
        );
        ggml_tensor * pos_embd_y = ggml_concat(
            ctx0,
            ggml_sin(ctx0, theta_y),
            ggml_cos(ctx0, theta_y),
            0 // concat on first dim
        );
        pos_embed = ggml_concat(ctx0, pos_embd_x, pos_embd_y, 0);
    }

    // k = v + pos_embed
    ggml_tensor * k = ggml_add(ctx0, v, pos_embed);

    // attention
    {
        const int d_head = 128;
        int n_head = n_embd_proj/d_head;
        // Use actual config value if available, otherwise fall back to hardcoded values
        int num_query = hparams.minicpmv_query_num;
        ggml_tensor * Q = ggml_add(ctx0,
            ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q),
            model.mm_model_attn_q_b);
        ggml_tensor * K = ggml_add(ctx0,
            ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k),
            model.mm_model_attn_k_b);
        ggml_tensor * V = ggml_add(ctx0,
            ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v),
            model.mm_model_attn_v_b);

        Q = ggml_reshape_3d(ctx0, Q, d_head, n_head, num_query);
        K = ggml_reshape_3d(ctx0, K, d_head, n_head, n_pos);
        V = ggml_reshape_3d(ctx0, V, d_head, n_head, n_pos);

        cb(Q, "resampler_Q", -1);
        cb(K, "resampler_K", -1);
        cb(V, "resampler_V", -1);

        float resampler_kq_scale = 1.0f / sqrtf(float(d_head));
        embeddings = build_attn(
            model.mm_model_attn_o_w,
            model.mm_model_attn_o_b,
            Q, K, V, nullptr, resampler_kq_scale, -1);
        cb(embeddings, "resampler_attn_out", -1);
    }
    // layernorm
    embeddings = build_norm(embeddings, model.mm_model_ln_post_w, model.mm_model_ln_post_b, NORM_TYPE_NORMAL, eps, -1);

    // projection
    embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings);

    // build the graph
    ggml_build_forward_expand(gf, embeddings);

    return gf;
}
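The resampler position embedding assembled above concatenates sin/cos terms of the x and y patch coordinates, each scaled by a vector of base frequencies omega that the graph receives as an input. A minimal sketch with hypothetical sizes, assuming the usual 10000 frequency base from the referenced resampler.py:

// 2D sinusoidal position embedding sketch:
//   e(w, h) = [sin(w*omega), cos(w*omega), sin(h*omega), cos(h*omega)], |omega| = d/4
#include <cmath>
#include <cstdio>
#include <vector>

static std::vector<float> pos_embed_2d(int w, int h, int d) {
    const int quarter = d / 4;
    std::vector<float> e(d);
    for (int i = 0; i < quarter; i++) {
        // frequency base 10000 is an assumption; the graph takes omega as an input tensor
        const float omega = 1.0f / std::pow(10000.0f, (float) i / quarter);
        e[0*quarter + i] = std::sin(w * omega);
        e[1*quarter + i] = std::cos(w * omega);
        e[2*quarter + i] = std::sin(h * omega);
        e[3*quarter + i] = std::cos(h * omega);
    }
    return e;
}

int main() {
    std::vector<float> e = pos_embed_2d(3, 5, 128); // hypothetical patch position and projector dim
    std::printf("dim = %zu, e[0] = %f\n", e.size(), e[0]);
}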
@ -0,0 +1,6 @@
package models

// #cgo CXXFLAGS: -std=c++17
// #cgo CPPFLAGS: -I${SRCDIR}/../../../include -I${SRCDIR}/../../../common -I${SRCDIR}/../../../vendor
// #cgo CPPFLAGS: -I${SRCDIR}/../../../../../ml/backend/ggml/ggml/include
import "C"
Some files were not shown because too many files have changed in this diff.