
Commit a0caaa2

Merge branch 'ollama:main' into main
2 parents: b394f87 + de52b6c

25 files changed: +570 -63 lines

.github/workflows/test.yaml (+3 -4)

@@ -243,7 +243,7 @@ jobs:
           $env:PATH="$gopath;$gccpath;$env:PATH"
           echo $env:PATH
           if (!(gcc --version | select-string -quiet clang)) { throw "wrong gcc compiler detected - must be clang" }
-          make -j 4
+          make -j 4
       - name: 'Build Unix Go Runners'
         if: ${{ ! startsWith(matrix.os, 'windows-') }}
         run: make -j 4
@@ -310,8 +310,7 @@ jobs:
             arm64) echo ARCH=arm64 ;;
           esac >>$GITHUB_ENV
         shell: bash
-      - run: go build
-      - run: go test -v ./...
+      - run: go test ./...
 
   patches:
     needs: [changes]
@@ -323,4 +322,4 @@ jobs:
           submodules: recursive
       - name: Verify patches carry all the changes
         run: |
-          make apply-patches sync && git diff --compact-summary --exit-code llama
+          make apply-patches sync && git diff --compact-summary --exit-code llama

README.md (+1 -1)

@@ -67,12 +67,12 @@ Here are some example models that can be downloaded:
 
 | Model              | Parameters | Size  | Download                         |
 | ------------------ | ---------- | ----- | -------------------------------- |
+| Llama 3.3          | 70B        | 43GB  | `ollama run llama3.3`            |
 | Llama 3.2          | 3B         | 2.0GB | `ollama run llama3.2`            |
 | Llama 3.2          | 1B         | 1.3GB | `ollama run llama3.2:1b`         |
 | Llama 3.2 Vision   | 11B        | 7.9GB | `ollama run llama3.2-vision`     |
 | Llama 3.2 Vision   | 90B        | 55GB  | `ollama run llama3.2-vision:90b` |
 | Llama 3.1          | 8B         | 4.7GB | `ollama run llama3.1`            |
-| Llama 3.1          | 70B        | 40GB  | `ollama run llama3.1:70b`        |
 | Llama 3.1          | 405B       | 231GB | `ollama run llama3.1:405b`       |
 | Phi 3 Mini         | 3.8B       | 2.3GB | `ollama run phi3`                |
 | Phi 3 Medium       | 14B        | 7.9GB | `ollama run phi3:medium`         |

api/types.go (+2 -2)

@@ -67,7 +67,7 @@ type GenerateRequest struct {
     Raw bool `json:"raw,omitempty"`
 
     // Format specifies the format to return a response in.
-    Format string `json:"format"`
+    Format json.RawMessage `json:"format,omitempty"`
 
     // KeepAlive controls how long the model will stay loaded in memory following
     // this request.
@@ -94,7 +94,7 @@ type ChatRequest struct {
     Stream *bool `json:"stream,omitempty"`
 
     // Format is the format to return the response in (e.g. "json").
-    Format string `json:"format"`
+    Format json.RawMessage `json:"format,omitempty"`
 
     // KeepAlive controls how long the model will stay loaded into memory
     // following the request.
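With `Format` now a `json.RawMessage`, the same field can carry either the JSON string `"json"` or a full JSON schema. A minimal client-side sketch using the public Go client (`github.com/ollama/ollama/api`); the model name, prompt, and schema are illustrative, mirroring the docs example added further down in this commit:

```go
package main

import (
    "context"
    "encoding/json"
    "fmt"
    "log"

    "github.com/ollama/ollama/api"
)

func main() {
    client, err := api.ClientFromEnvironment()
    if err != nil {
        log.Fatal(err)
    }

    // Any valid JSON value fits in the RawMessage field: a schema object here,
    // or simply json.RawMessage(`"json"`) for the old JSON mode.
    schema := json.RawMessage(`{
      "type": "object",
      "properties": {
        "age": {"type": "integer"},
        "available": {"type": "boolean"}
      },
      "required": ["age", "available"]
    }`)

    stream := false
    req := &api.GenerateRequest{
        Model:  "llama3.1:8b",
        Prompt: "Ollama is 22 years old and is busy saving the world. Respond using JSON",
        Format: schema,
        Stream: &stream,
    }

    // Print the (schema-constrained) response once generation finishes.
    err = client.Generate(context.Background(), req, func(resp api.GenerateResponse) error {
        fmt.Println(resp.Response)
        return nil
    })
    if err != nil {
        log.Fatal(err)
    }
}
```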

cmd/cmd.go (+12 -2)

@@ -8,6 +8,7 @@ import (
     "crypto/ed25519"
     "crypto/rand"
     "crypto/sha256"
+    "encoding/json"
     "encoding/pem"
     "errors"
     "fmt"
@@ -1035,10 +1036,14 @@ func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) {
         return nil
     }
 
+    if opts.Format == "json" {
+        opts.Format = `"` + opts.Format + `"`
+    }
+
     req := &api.ChatRequest{
         Model: opts.Model,
         Messages: opts.Messages,
-        Format: opts.Format,
+        Format: json.RawMessage(opts.Format),
         Options: opts.Options,
     }
 
@@ -1120,12 +1125,16 @@ func generate(cmd *cobra.Command, opts runOptions) error {
         }
     }
 
+    if opts.Format == "json" {
+        opts.Format = `"` + opts.Format + `"`
+    }
+
     request := api.GenerateRequest{
         Model: opts.Model,
         Prompt: opts.Prompt,
         Context: generateContext,
         Images: opts.Images,
-        Format: opts.Format,
+        Format: json.RawMessage(opts.Format),
         System: opts.System,
         Options: opts.Options,
         KeepAlive: opts.KeepAlive,
@@ -1445,6 +1454,7 @@ func NewCLI() *cobra.Command {
         envVars["OLLAMA_SCHED_SPREAD"],
         envVars["OLLAMA_TMPDIR"],
         envVars["OLLAMA_FLASH_ATTENTION"],
+        envVars["OLLAMA_KV_CACHE_TYPE"],
         envVars["OLLAMA_LLM_LIBRARY"],
         envVars["OLLAMA_GPU_OVERHEAD"],
         envVars["OLLAMA_LOAD_TIMEOUT"],

convert/tokenizer.go (+23 -4)

@@ -10,6 +10,7 @@ import (
     "log/slog"
     "os"
     "slices"
+    "strings"
 
     "golang.org/x/exp/maps"
 )
@@ -60,7 +61,25 @@ func parseTokenizer(fsys fs.FS, specialTokenTypes []string) (*Tokenizer, error)
         addedTokens[t.Content] = t
     }
 
-    t.Merges = tt.Model.Merges
+    if len(tt.Model.Merges) == 0 {
+        // noop; merges is empty
+    } else if err := json.Unmarshal(tt.Model.Merges, &t.Merges); err == nil {
+        // noop; merges is []string
+    } else if merges, err := func() ([][]string, error) {
+        var merges [][]string
+        if err := json.Unmarshal(tt.Model.Merges, &merges); err != nil {
+            return nil, err
+        }
+
+        return merges, nil
+    }(); err == nil {
+        t.Merges = make([]string, len(merges))
+        for i := range merges {
+            t.Merges[i] = strings.Join(merges[i], " ")
+        }
+    } else {
+        return nil, fmt.Errorf("could not parse tokenizer merges. expected []string or [][]string: %w", err)
+    }
 
     sha256sum := sha256.New()
     for _, pt := range tt.PreTokenizer.PreTokenizers {
@@ -156,9 +175,9 @@ func parseTokenizer(fsys fs.FS, specialTokenTypes []string) (*Tokenizer, error)
 type tokenizer struct {
     AddedTokens []token `json:"added_tokens"`
     Model struct {
-        Type string `json:"type"`
-        Vocab map[string]int `json:"vocab"`
-        Merges []string `json:"merges"`
+        Type string `json:"type"`
+        Vocab map[string]int `json:"vocab"`
+        Merges json.RawMessage `json:"merges"`
     } `json:"model"`
 
     PreTokenizer struct {
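Switching `Merges` to `json.RawMessage` lets the converter accept both layouts found in `tokenizer.json` files: a flat `["a b", "c d"]` list and a nested `[["a", "b"], ["c", "d"]]` list, with the nested form joined by spaces. A reduced, standalone sketch of the same decode-then-fall-back approach (outside the converter, names illustrative); the test cases added below exercise exactly these two shapes:

```go
package main

import (
    "encoding/json"
    "fmt"
    "strings"
)

// parseMerges tries []string first, then falls back to [][]string joined with spaces.
func parseMerges(raw json.RawMessage) ([]string, error) {
    if len(raw) == 0 {
        return nil, nil
    }

    var flat []string
    if err := json.Unmarshal(raw, &flat); err == nil {
        return flat, nil
    }

    var nested [][]string
    if err := json.Unmarshal(raw, &nested); err != nil {
        return nil, fmt.Errorf("expected []string or [][]string: %w", err)
    }

    merges := make([]string, len(nested))
    for i := range nested {
        merges[i] = strings.Join(nested[i], " ")
    }
    return merges, nil
}

func main() {
    for _, doc := range []string{
        `["a b", "c d", "e f"]`,
        `[["a", "b"], ["c", "d"], ["e", "f"]]`,
    } {
        merges, _ := parseMerges(json.RawMessage(doc))
        fmt.Println(merges) // both print [a b c d e f]
    }
}
```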

convert/tokenizer_test.go (+56)

@@ -191,6 +191,62 @@ func TestParseTokenizer(t *testing.T) {
             Pre: "default",
         },
     },
+    {
+        name: "list string merges",
+        fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{
+            "tokenizer.json": strings.NewReader(`{
+                "model": {
+                    "merges": [
+                        "a b",
+                        "c d",
+                        "e f"
+                    ]
+                }
+            }`),
+        }),
+        want: &Tokenizer{
+            Vocabulary: &Vocabulary{
+                Model: "gpt2",
+            },
+            Merges: []string{
+                "a b",
+                "c d",
+                "e f",
+            },
+            Pre: "default",
+        },
+    },
+    {
+        name: "list list string merges",
+        fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{
+            "tokenizer.json": strings.NewReader(`{
+                "model": {
+                    "merges": [
+                        [
+                            "a", "b"
+                        ],
+                        [
+                            "c", "d"
+                        ],
+                        [
+                            "e", "f"
+                        ]
+                    ]
+                }
+            }`),
+        }),
+        want: &Tokenizer{
+            Vocabulary: &Vocabulary{
+                Model: "gpt2",
+            },
+            Merges: []string{
+                "a b",
+                "c d",
+                "e f",
+            },
+            Pre: "default",
+        },
+    },
 }
 
 for _, tt := range cases {

discover/types.go (+14)

@@ -183,3 +183,17 @@ func (si SystemInfo) GetOptimalThreadCount() int {
 
     return coreCount
 }
+
+// For each GPU, check if it does NOT support flash attention
+func (l GpuInfoList) FlashAttentionSupported() bool {
+    for _, gpu := range l {
+        supportsFA := gpu.Library == "metal" ||
+            (gpu.Library == "cuda" && gpu.DriverMajor >= 7) ||
+            gpu.Library == "rocm"
+
+        if !supportsFA {
+            return false
+        }
+    }
+    return true
+}
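`FlashAttentionSupported` returns `true` only when every detected GPU passes the library/driver check (Metal, ROCm, or CUDA with driver major version >= 7). A self-contained sketch of how a caller might combine it with the `OLLAMA_FLASH_ATTENTION` setting listed in `cmd/cmd.go`; the wiring and the local `gpu` type are hypothetical stand-ins for `discover.GpuInfo`, not the server's actual code:

```go
package main

import (
    "fmt"
    "os"
)

// gpu mirrors just the fields the check reads; the real type is discover.GpuInfo.
type gpu struct {
    Library     string
    DriverMajor int
}

// allSupportFlashAttention reproduces the predicate from discover/types.go.
func allSupportFlashAttention(gpus []gpu) bool {
    for _, g := range gpus {
        ok := g.Library == "metal" ||
            (g.Library == "cuda" && g.DriverMajor >= 7) ||
            g.Library == "rocm"
        if !ok {
            return false
        }
    }
    return true
}

func main() {
    gpus := []gpu{{Library: "cuda", DriverMajor: 8}, {Library: "rocm"}}

    // Hypothetical wiring: only honour the user's request when every GPU qualifies.
    if os.Getenv("OLLAMA_FLASH_ATTENTION") == "1" && allSupportFlashAttention(gpus) {
        fmt.Println("flash attention enabled")
    }
}
```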

docs/api.md (+104 -2)

@@ -45,7 +45,7 @@ Generate a response for a given prompt with a provided model. This is a streamin
 
 Advanced parameters (optional):
 
-- `format`: the format to return a response in. Currently the only accepted value is `json`
+- `format`: the format to return a response in. Format can be `json` or a JSON schema
 - `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
 - `system`: system message to (overrides what is defined in the `Modelfile`)
 - `template`: the prompt template to use (overrides what is defined in the `Modelfile`)
@@ -54,6 +54,10 @@ Advanced parameters (optional):
 - `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)
 - `context` (deprecated): the context parameter returned from a previous request to `/generate`, this can be used to keep a short conversational memory
 
+#### Structured outputs
+
+Structured outputs are supported by providing a JSON schema in the `format` parameter. The model will generate a response that matches the schema. See the [structured outputs](#request-structured-outputs) example below.
+
 #### JSON mode
 
 Enable JSON mode by setting the `format` parameter to `json`. This will structure the response as a valid JSON object. See the JSON mode [example](#request-json-mode) below.
@@ -185,6 +189,52 @@ curl http://localhost:11434/api/generate -d '{
 }
 ```
 
+#### Request (Structured outputs)
+
+##### Request
+
+```shell
+curl -X POST http://localhost:11434/api/generate -H "Content-Type: application/json" -d '{
+  "model": "llama3.1:8b",
+  "prompt": "Ollama is 22 years old and is busy saving the world. Respond using JSON",
+  "stream": false,
+  "format": {
+    "type": "object",
+    "properties": {
+      "age": {
+        "type": "integer"
+      },
+      "available": {
+        "type": "boolean"
+      }
+    },
+    "required": [
+      "age",
+      "available"
+    ]
+  }
+}'
+```
+
+##### Response
+
+```json
+{
+  "model": "llama3.1:8b",
+  "created_at": "2024-12-06T00:48:09.983619Z",
+  "response": "{\n \"age\": 22,\n \"available\": true\n}",
+  "done": true,
+  "done_reason": "stop",
+  "context": [1, 2, 3],
+  "total_duration": 1075509083,
+  "load_duration": 567678166,
+  "prompt_eval_count": 28,
+  "prompt_eval_duration": 236000000,
+  "eval_count": 16,
+  "eval_duration": 269000000
+}
```
+
 #### Request (JSON mode)
 
 > [!IMPORTANT]
@@ -456,11 +506,15 @@ The `message` object has the following fields:
 
 Advanced parameters (optional):
 
-- `format`: the format to return a response in. Currently the only accepted value is `json`
+- `format`: the format to return a response in. Format can be `json` or a JSON schema.
 - `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
 - `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects
 - `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)
 
+### Structured outputs
+
+Structured outputs are supported by providing a JSON schema in the `format` parameter. The model will generate a response that matches the schema. See the [Chat request (Structured outputs)](#chat-request-structured-outputs) example below.
+
 ### Examples
 
 #### Chat Request (Streaming)
@@ -551,6 +605,54 @@ curl http://localhost:11434/api/chat -d '{
 }
 ```
 
+#### Chat request (Structured outputs)
+
+##### Request
+
+```shell
+curl -X POST http://localhost:11434/api/chat -H "Content-Type: application/json" -d '{
+  "model": "llama3.1",
+  "messages": [{"role": "user", "content": "Ollama is 22 years old and busy saving the world. Return a JSON object with the age and availability."}],
+  "stream": false,
+  "format": {
+    "type": "object",
+    "properties": {
+      "age": {
+        "type": "integer"
+      },
+      "available": {
+        "type": "boolean"
+      }
+    },
+    "required": [
+      "age",
+      "available"
+    ]
+  },
+  "options": {
+    "temperature": 0
+  }
+}'
+```
+
+##### Response
+
+```json
+{
+  "model": "llama3.1",
+  "created_at": "2024-12-06T00:46:58.265747Z",
+  "message": { "role": "assistant", "content": "{\"age\": 22, \"available\": false}" },
+  "done_reason": "stop",
+  "done": true,
+  "total_duration": 2254970291,
+  "load_duration": 574751416,
+  "prompt_eval_count": 34,
+  "prompt_eval_duration": 1502000000,
+  "eval_count": 12,
+  "eval_duration": 175000000
+}
+```
+
 #### Chat request (With History)
 
 Send a chat message with a conversation history. You can use this same approach to start the conversation using multi-shot or chain-of-thought prompting.
