
Expose generation timings from server & update completions.js #2116

Merged: 4 commits, Jul 5, 2023
Changes from 2 commits
392 changes: 210 additions & 182 deletions examples/server/completion.js.hpp

Large diffs are not rendered by default.

1,556 changes: 805 additions & 751 deletions examples/server/index.html.hpp

Large diffs are not rendered by default.

119 changes: 103 additions & 16 deletions examples/server/public/completion.js
@@ -5,20 +5,29 @@ const paramDefaults = {
stop: ["</s>"]
};

/**
* This function completes the input text using a llama dictionary.
* @param {object} params - The parameters for the completion request.
* @param {object} controller - an instance of AbortController if you need one, or null.
* @param {function} callback - The callback function to call when the completion is done.
* @returns {string} the completed text as a string. Ideally ignored, and you get at it via the callback.
*/
export const llamaComplete = async (params, controller, callback) => {
let generation_settings = null;


// Completes the prompt as a generator. Recommended for most use cases.
//
// Example:
//
// import { llama } from '/completion.js'
//
// const request = llama("Tell me a joke", {n_predict: 800})
// for await (const chunk of request) {
// document.write(chunk.data.content)
// }
//
export async function* llama(prompt, params = {}, config = {}) {
let controller = config.controller;

if (!controller) {
controller = new AbortController();
}
const completionParams = { ...paramDefaults, ...params };

  // we use fetch directly here because the built-in fetchEventSource does not support POST
const completionParams = { ...paramDefaults, ...params, prompt };

const response = await fetch("/completion", {
method: 'POST',
body: JSON.stringify(completionParams),
@@ -36,7 +45,6 @@ export const llamaComplete = async (params, controller, callback) => {
let content = "";

try {

let cont = true;

while (cont) {
@@ -59,18 +67,21 @@
result.data = JSON.parse(result.data);
content += result.data.content;

      // callback
if (callback) {
cont = callback(result) != false;
}
// yield
yield result;

// if we got a stop token from server, we will break here
if (result.data.stop) {
if (result.data.generation_settings) {
generation_settings = result.data.generation_settings;
}
break;
}
}
} catch (e) {
console.error("llama error: ", e);
if (e.name !== 'AbortError') {
console.error("llama error: ", e);
}
throw e;
}
finally {
@@ -79,3 +90,79 @@ export const llamaComplete = async (params, controller, callback) => {

return content;
}

// Call llama, return an event target that you can subscribe to
//
// Example:
//
// import { llamaEventTarget } from '/completion.js'
//
// const conn = llamaEventTarget(prompt)
// conn.addEventListener("message", (chunk) => {
// document.write(chunk.detail.content)
// })
//
export const llamaEventTarget = (prompt, params = {}, config = {}) => {
const eventTarget = new EventTarget();
(async () => {
let content = "";
for await (const chunk of llama(prompt, params, config)) {
if (chunk.data) {
content += chunk.data.content;
eventTarget.dispatchEvent(new CustomEvent("message", { detail: chunk.data }));
}
if (chunk.data.generation_settings) {
eventTarget.dispatchEvent(new CustomEvent("generation_settings", { detail: chunk.data.generation_settings }));
}
if (chunk.data.timings) {
eventTarget.dispatchEvent(new CustomEvent("timings", { detail: chunk.data.timings }));
}
}
eventTarget.dispatchEvent(new CustomEvent("done", { detail: { content } }));
})();
return eventTarget;
}

// Call llama, return a promise that resolves to the completed text. This does not support streaming
//
// Example:
//
// llamaPromise(prompt).then((content) => {
// document.write(content)
// })
//
// or
//
// const content = await llamaPromise(prompt)
// document.write(content)
//
export const llamaPromise = (prompt, params = {}, config = {}) => {
return new Promise(async (resolve, reject) => {
let content = "";
try {
for await (const chunk of llama(prompt, params, config)) {
content += chunk.data.content;
}
resolve(content);
} catch (error) {
reject(error);
}
});
};

/**
* (deprecated)
*/
export const llamaComplete = async (params, controller, callback) => {
for await (const chunk of llama(params.prompt, params, { controller })) {
callback(chunk);
}
}

// Get the model info from the server. This is useful for getting the context window and so on.
export const llamaModelInfo = async () => {
if (!generation_settings) {
generation_settings = await fetch("/model.json").then(r => r.json());
}
return generation_settings;
}
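
For reference, a minimal sketch of how a caller might consume the timings this change exposes through the streamed chunks. The `predicted_per_token_ms` and `predicted_per_second` fields follow the ones read by the updated index.html below; other fields in the timings object are assumed to depend on the server build.

```js
import { llama } from '/completion.js';

// Stream a completion and capture the timing summary that the server
// attaches to the final (stop) chunk.
async function completeWithTimings(prompt) {
  let text = "";
  let timings = null;
  for await (const chunk of llama(prompt, { n_predict: 128 })) {
    text += chunk.data.content;
    if (chunk.data.timings) {
      timings = chunk.data.timings; // only present on the final chunk
    }
  }
  if (timings) {
    console.log(`${timings.predicted_per_token_ms.toFixed()} ms per token, ` +
      `${timings.predicted_per_second.toFixed(2)} tokens per second`);
  }
  return text;
}
```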
135 changes: 78 additions & 57 deletions examples/server/public/index.html
@@ -6,7 +6,6 @@
<title>llama.cpp - chat</title>

<style>

body {
background-color: #fff;
color: #000;
@@ -22,10 +21,6 @@
height: 100%;
}

header, footer {
text-align: center;
}

main {
margin: 3px;
display: flex;
@@ -99,17 +94,26 @@
margin: 0.5em 0;
display: block;
}

header, footer {
text-align: center;
}

footer {
font-size: 80%;
color: #888;
}
</style>

<script type="module">
import {
html, h, signal, effect, computed, render, useSignal, useEffect, useRef
} from '/index.js';

import { llamaComplete } from '/completion.js';
import { llama } from '/completion.js';

const session = signal({
prompt: "This is a conversation between user and llama, a friendly chatbot. respond in markdown.",
prompt: "This is a conversation between user and llama, a friendly chatbot. respond in simple markdown.",
template: "{{prompt}}\n\n{{history}}\n{{char}}:",
historyTemplate: "{{name}}: {{message}}",
transcript: [],
@@ -118,15 +122,6 @@
user: "User",
})

const transcriptUpdate = (transcript) => {
session.value = {
...session.value,
transcript
}
}

const chatStarted = computed(() => session.value.transcript.length > 0)

const params = signal({
n_predict: 400,
temperature: 0.7,
@@ -136,8 +131,18 @@
top_p: 0.5,
})

const llamaStats = signal(null)
const controller = signal(null)

const generating = computed(() => controller.value == null )
const chatStarted = computed(() => session.value.transcript.length > 0)

const transcriptUpdate = (transcript) => {
session.value = {
...session.value,
transcript
}
}

// simple template replace
const template = (str, extraSettings) => {
@@ -158,7 +163,7 @@

transcriptUpdate([...session.value.transcript, ["{{user}}", msg]])

const payload = template(session.value.template, {
const prompt = template(session.value.template, {
message: msg,
history: session.value.transcript.flatMap(([name, message]) => template(session.value.historyTemplate, {name, message})).join("\n"),
});
@@ -168,22 +173,26 @@

const llamaParams = {
...params.value,
prompt: payload,
stop: ["</s>", template("{{char}}:"), template("{{user}}:")],
}

await llamaComplete(llamaParams, controller.value, (message) => {
const data = message.data;
for await (const chunk of llama(prompt, llamaParams, { controller: controller.value })) {
const data = chunk.data;
currentMessage += data.content;

// remove leading whitespace
currentMessage = currentMessage.replace(/^\s+/, "")

transcriptUpdate([...history, ["{{char}}", currentMessage]])

if (data.stop) {
console.log("-->", data, ' response was:', currentMessage, 'transcript state:', session.value.transcript);
console.log("Completion finished: '", currentMessage, "', summary: ", data);
}
})

if (data.timings) {
llamaStats.value = data.timings;
}
}

controller.value = null;
}
@@ -219,13 +228,12 @@
return html`
<form onsubmit=${submit}>
<div>
<textarea type="text" rows=2 onkeypress=${enterSubmits} value="${message}" oninput=${(e) => message.value = e.target.value} placeholder="Say something..."/>

<textarea type="text" rows=2 onkeypress=${enterSubmits} value="${message}" oninput=${(e) => message.value = e.target.value} placeholder="Say something..."/>
</div>
<div class="right">
<button type="submit" disabled=${!generating.value} >Send</button>
<button onclick=${stop} disabled=${generating}>Stop</button>
<button onclick=${reset}>Reset</button>
<button type="submit" disabled=${!generating.value} >Send</button>
<button onclick=${stop} disabled=${generating}>Stop</button>
<button onclick=${reset}>Reset</button>
</div>
</form>
`
@@ -243,7 +251,7 @@
}, [messages])

const chatLine = ([user, msg]) => {
return html`<p key=${msg}><strong>${template(user)}:</strong> <${Markdown} text=${template(msg)} /></p>`
return html`<p key=${msg}><strong>${template(user)}:</strong> <${Markdownish} text=${template(msg)} /></p>`
};

return html`
@@ -313,39 +321,52 @@
</form>
`
}
const Markdown = (params) => {
const md = params.text
.replace(/^#{1,6} (.*)$/gim, '<h3>$1</h3>')
.replace(/\*\*(.*?)\*\*/g, '<strong>$1</strong>')
.replace(/__(.*?)__/g, '<strong>$1</strong>')
.replace(/\*(.*?)\*/g, '<em>$1</em>')
.replace(/_(.*?)_/g, '<em>$1</em>')
.replace(/```.*?\n([\s\S]*?)```/g, '<pre><code>$1</code></pre>')
.replace(/`(.*?)`/g, '<code>$1</code>')
.replace(/\n/gim, '<br />');
return html`<span dangerouslySetInnerHTML=${{ __html: md }} />`;
};
// poor mans markdown replacement
const Markdownish = (params) => {
const md = params.text
.replace(/^#{1,6} (.*)$/gim, '<h3>$1</h3>')
.replace(/\*\*(.*?)\*\*/g, '<strong>$1</strong>')
.replace(/__(.*?)__/g, '<strong>$1</strong>')
.replace(/\*(.*?)\*/g, '<em>$1</em>')
.replace(/_(.*?)_/g, '<em>$1</em>')
.replace(/```.*?\n([\s\S]*?)```/g, '<pre><code>$1</code></pre>')
.replace(/`(.*?)`/g, '<code>$1</code>')
.replace(/\n/gim, '<br />');
return html`<span dangerouslySetInnerHTML=${{ __html: md }} />`;
};

const ModelGenerationInfo = (params) => {
if (!llamaStats.value) {
return html`<span/>`
}
return html`
<span>
${llamaStats.value.predicted_per_token_ms.toFixed()}ms per token, ${llamaStats.value.predicted_per_second.toFixed(2)} tokens per second
</span>
`
}

function App(props) {

return html`
<div id="container">
<header>
<h1>llama.cpp</h1>
</header>

<main id="content">
<${chatStarted.value ? ChatLog : ConfigForm} />
</main>

<footer id="write">
<${MessageInput} />
</footer>

<footer>
<p>Powered by <a href="https://github.com/ggerganov/llama.cpp">llama.cpp</a> and <a href="https://ggml.ai">ggml.ai</a></p>
</footer>
</div>
<div id="container">
<header>
<h1>llama.cpp</h1>
</header>

<main id="content">
<${chatStarted.value ? ChatLog : ConfigForm} />
</main>

<section id="write">
<${MessageInput} />
</section>

<footer>
<p><${ModelGenerationInfo} /></p>
<p>Powered by <a href="https://github.com/ggerganov/llama.cpp">llama.cpp</a> and <a href="https://ggml.ai">ggml.ai</a>.</p>
</footer>
</div>
`;
}
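
Outside the bundled UI, the same timings can be picked up through the event wrapper. A small sketch, assuming the "message", "timings", and "done" event names dispatched by llamaEventTarget in completion.js above:

```js
import { llamaEventTarget } from '/completion.js';

const conn = llamaEventTarget("Tell me a joke", { n_predict: 64 });

// Streamed text chunks arrive as "message" events.
conn.addEventListener("message", (e) => {
  document.write(e.detail.content);
});

// The timing summary is dispatched once, alongside the final chunk.
conn.addEventListener("timings", (e) => {
  console.log("tokens per second:", e.detail.predicted_per_second);
});

// "done" carries the full concatenated completion.
conn.addEventListener("done", (e) => {
  console.log("completed:", e.detail.content);
});
```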
