Initial commit.

pull/520/head
Adam Treat 1 year ago
commit ff2fdecce1

1
.gitignore vendored

@ -0,0 +1 @@
CMakeLists.txt.user

3
.gitmodules vendored

@ -0,0 +1,3 @@
[submodule "ggml"]
path = ggml
url = https://github.com/manyoso/ggml.git

@ -0,0 +1,40 @@
cmake_minimum_required(VERSION 3.16)
project(gpt4all-chat VERSION 0.1 LANGUAGES CXX)
set(CMAKE_AUTOMOC ON)
set(CMAKE_AUTORCC ON)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
find_package(Qt6 6.2 COMPONENTS Quick REQUIRED)
set(GGML_BUILD_EXAMPLES ON CACHE BOOL "ggml: build examples" FORCE)
add_subdirectory(ggml)
qt_add_executable(chat
main.cpp
gptj.h gptj.cpp
llm.h llm.cpp
)
qt_add_qml_module(chat
URI gpt4all-chat
VERSION 1.0
QML_FILES main.qml
RESOURCES icons/send_message.svg icons/stop_generating.svg icons/regenerate.svg
)
set_target_properties(chat PROPERTIES
MACOSX_BUNDLE_GUI_IDENTIFIER my.example.com
MACOSX_BUNDLE_BUNDLE_VERSION ${PROJECT_VERSION}
MACOSX_BUNDLE_SHORT_VERSION_STRING ${PROJECT_VERSION_MAJOR}.${PROJECT_VERSION_MINOR}
MACOSX_BUNDLE TRUE
WIN32_EXECUTABLE TRUE
)
target_compile_definitions(chat
PRIVATE $<$<OR:$<CONFIG:Debug>,$<CONFIG:RelWithDebInfo>>:QT_QML_DEBUG>)
target_link_libraries(chat
PRIVATE Qt6::Quick Qt6::Svg)
target_link_libraries(chat
PRIVATE ggml ggml_utils)

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2023 Adam Treat
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

@ -0,0 +1 @@
# gpt4all-chat

@ -0,0 +1 @@
Subproject commit c9f702ac573a2be4a1b9926979084941f95d0e33

@ -0,0 +1,781 @@
#include "gptj.h"
#include "ggml/ggml.h"
#include "utils.h"
#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <map>
#include <string>
#include <vector>
#include <iostream>
#include <unistd.h>
// default hparams (GPT-J 6B)
struct gptj_hparams {
int32_t n_vocab = 50400;
int32_t n_ctx = 2048;
int32_t n_embd = 4096;
int32_t n_head = 16;
int32_t n_layer = 28;
int32_t n_rot = 64;
int32_t f16 = 1;
};
struct gptj_layer {
// normalization
struct ggml_tensor * ln_1_g;
struct ggml_tensor * ln_1_b;
// attention
struct ggml_tensor * c_attn_q_proj_w;
struct ggml_tensor * c_attn_k_proj_w;
struct ggml_tensor * c_attn_v_proj_w;
struct ggml_tensor * c_attn_proj_w;
// ff
struct ggml_tensor * c_mlp_fc_w;
struct ggml_tensor * c_mlp_fc_b;
struct ggml_tensor * c_mlp_proj_w;
struct ggml_tensor * c_mlp_proj_b;
};
struct gptj_model {
gptj_hparams hparams;
// normalization
struct ggml_tensor * ln_f_g;
struct ggml_tensor * ln_f_b;
struct ggml_tensor * wte; // position embedding
struct ggml_tensor * lmh_g; // language model head
struct ggml_tensor * lmh_b; // language model bias
std::vector<gptj_layer> layers;
// key + value memory
struct ggml_tensor * memory_k;
struct ggml_tensor * memory_v;
//
struct ggml_context * ctx;
std::map<std::string, struct ggml_tensor *> tensors;
};
// load the model's weights from a stream
bool gptj_model_load(const std::string &fname, std::istream &fin, gptj_model & model, gpt_vocab & vocab) {
printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
// verify magic
{
uint32_t magic;
fin.read((char *) &magic, sizeof(magic));
if (magic != 0x67676d6c) {
fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
return false;
}
}
// load hparams
{
auto & hparams = model.hparams;
fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
fin.read((char *) &hparams.f16, sizeof(hparams.f16));
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
printf("%s: n_head = %d\n", __func__, hparams.n_head);
printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
printf("%s: n_rot = %d\n", __func__, hparams.n_rot);
printf("%s: f16 = %d\n", __func__, hparams.f16);
}
// load vocab
{
int32_t n_vocab = 0;
fin.read((char *) &n_vocab, sizeof(n_vocab));
if (n_vocab != model.hparams.n_vocab) {
fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
__func__, fname.c_str(), n_vocab, model.hparams.n_vocab);
return false;
}
std::string word;
for (int i = 0; i < n_vocab; i++) {
uint32_t len;
fin.read((char *) &len, sizeof(len));
word.resize(len);
fin.read((char *) word.data(), len);
vocab.token_to_id[word] = i;
vocab.id_to_token[i] = word;
}
}
// for the big tensors, we have the option to store the data in 16-bit floats or quantized
// in order to save memory and also to speed up the computation
ggml_type wtype = GGML_TYPE_COUNT;
switch (model.hparams.f16) {
case 0: wtype = GGML_TYPE_F32; break;
case 1: wtype = GGML_TYPE_F16; break;
case 2: wtype = GGML_TYPE_Q4_0; break;
case 3: wtype = GGML_TYPE_Q4_1; break;
default:
{
fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n",
__func__, fname.c_str(), model.hparams.f16);
return false;
}
}
const ggml_type wtype2 = GGML_TYPE_F32;
auto & ctx = model.ctx;
size_t ctx_size = 0;
{
const auto & hparams = model.hparams;
const int n_embd = hparams.n_embd;
const int n_layer = hparams.n_layer;
const int n_ctx = hparams.n_ctx;
const int n_vocab = hparams.n_vocab;
ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g
ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b
ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // wte
ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // lmh_g
ctx_size += n_vocab*ggml_type_sizef(GGML_TYPE_F32); // lmh_b
ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g
ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b
ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_q_proj_w
ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_k_proj_w
ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_v_proj_w
ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w
ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w
ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b
ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w
ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v
ctx_size += (5 + 10*n_layer)*256; // object overhead
printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
}
// create the ggml context
{
struct ggml_init_params params = {
.mem_size = ctx_size,
.mem_buffer = NULL,
};
model.ctx = ggml_init(params);
if (!model.ctx) {
fprintf(stderr, "%s: ggml_init() failed\n", __func__);
return false;
}
}
// prepare memory for the weights
{
const auto & hparams = model.hparams;
const int n_embd = hparams.n_embd;
const int n_layer = hparams.n_layer;
const int n_ctx = hparams.n_ctx;
const int n_vocab = hparams.n_vocab;
model.layers.resize(n_layer);
model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
model.lmh_g = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
model.lmh_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_vocab);
// map by name
model.tensors["transformer.wte.weight"] = model.wte;
model.tensors["transformer.ln_f.weight"] = model.ln_f_g;
model.tensors["transformer.ln_f.bias"] = model.ln_f_b;
model.tensors["lm_head.weight"] = model.lmh_g;
model.tensors["lm_head.bias"] = model.lmh_b;
for (int i = 0; i < n_layer; ++i) {
auto & layer = model.layers[i];
layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
layer.c_attn_q_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
layer.c_attn_k_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
layer.c_attn_v_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd);
layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
// map by name
model.tensors["transformer.h." + std::to_string(i) + ".ln_1.weight"] = layer.ln_1_g;
model.tensors["transformer.h." + std::to_string(i) + ".ln_1.bias"] = layer.ln_1_b;
model.tensors["transformer.h." + std::to_string(i) + ".attn.q_proj.weight"] = layer.c_attn_q_proj_w;
model.tensors["transformer.h." + std::to_string(i) + ".attn.k_proj.weight"] = layer.c_attn_k_proj_w;
model.tensors["transformer.h." + std::to_string(i) + ".attn.v_proj.weight"] = layer.c_attn_v_proj_w;
model.tensors["transformer.h." + std::to_string(i) + ".attn.out_proj.weight"] = layer.c_attn_proj_w;
model.tensors["transformer.h." + std::to_string(i) + ".mlp.fc_in.weight"] = layer.c_mlp_fc_w;
model.tensors["transformer.h." + std::to_string(i) + ".mlp.fc_in.bias"] = layer.c_mlp_fc_b;
model.tensors["transformer.h." + std::to_string(i) + ".mlp.fc_out.weight"] = layer.c_mlp_proj_w;
model.tensors["transformer.h." + std::to_string(i) + ".mlp.fc_out.bias"] = layer.c_mlp_proj_b;
}
}
// key + value memory
{
const auto & hparams = model.hparams;
const int n_embd = hparams.n_embd;
const int n_layer = hparams.n_layer;
const int n_ctx = hparams.n_ctx;
const int n_mem = n_layer*n_ctx;
const int n_elements = n_embd*n_mem;
model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
printf("%s: memory_size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem);
}
// load weights
{
int n_tensors = 0;
size_t total_size = 0;
printf("%s: ", __func__);
while (true) {
int32_t n_dims;
int32_t length;
int32_t ftype;
fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
fin.read(reinterpret_cast<char *>(&length), sizeof(length));
fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
if (fin.eof()) {
break;
}
int32_t nelements = 1;
int32_t ne[2] = { 1, 1 };
for (int i = 0; i < n_dims; ++i) {
fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
nelements *= ne[i];
}
std::string name(length, 0);
fin.read(&name[0], length);
if (model.tensors.find(name.data()) == model.tensors.end()) {
fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
return false;
}
auto tensor = model.tensors[name.data()];
if (ggml_nelements(tensor) != nelements) {
fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
return false;
}
if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
__func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
return false;
}
if (0) {
static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ftype_str[ftype], ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
}
size_t bpe = 0;
switch (ftype) {
case 0: bpe = ggml_type_size(GGML_TYPE_F32); break;
case 1: bpe = ggml_type_size(GGML_TYPE_F16); break;
case 2: bpe = ggml_type_size(GGML_TYPE_Q4_0); assert(ne[0] % 64 == 0); break;
case 3: bpe = ggml_type_size(GGML_TYPE_Q4_1); assert(ne[0] % 64 == 0); break;
default:
{
fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
return false;
}
};
if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
__func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
return false;
}
fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
//printf("%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
total_size += ggml_nbytes(tensor);
if (++n_tensors % 8 == 0) {
printf(".");
fflush(stdout);
}
}
printf(" done\n");
printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors);
}
return true;
}
// load the model's weights from a file path
bool gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab & vocab) {
auto fin = std::ifstream(fname, std::ios::binary);
if (!fin) {
fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
return false;
}
bool loaded = gptj_model_load(fname, fin, model, vocab);
fin.close();
return loaded;
}
// evaluate the transformer
//
// - model: the model
// - n_threads: number of threads to use
// - n_past: the context size so far
// - embd_inp: the embeddings of the tokens in the context
// - embd_w: the predicted logits for the next token
//
// The GPT-J model requires about 16MB of memory per input token.
//
bool gptj_eval(
const gptj_model & model,
const int n_threads,
const int n_past,
const std::vector<gpt_vocab::id> & embd_inp,
std::vector<float> & embd_w,
size_t & mem_per_token) {
const int N = embd_inp.size();
const auto & hparams = model.hparams;
const int n_embd = hparams.n_embd;
const int n_layer = hparams.n_layer;
const int n_ctx = hparams.n_ctx;
const int n_head = hparams.n_head;
const int n_vocab = hparams.n_vocab;
const int n_rot = hparams.n_rot;
const int d_key = n_embd/n_head;
static size_t buf_size = 256u*1024*1024;
static void * buf = malloc(buf_size);
if (mem_per_token > 0 && mem_per_token*N > buf_size) {
const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
//printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
// reallocate
buf_size = buf_size_new;
buf = realloc(buf, buf_size);
if (buf == nullptr) {
fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
return false;
}
}
struct ggml_init_params params = {
.mem_size = buf_size,
.mem_buffer = buf,
};
struct ggml_context * ctx0 = ggml_init(params);
struct ggml_cgraph gf = { .n_threads = n_threads };
struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
// wte
struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.wte, embd);
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * cur;
// norm
{
cur = ggml_norm(ctx0, inpL);
// cur = ln_1_g*cur + ln_1_b
cur = ggml_add(ctx0,
ggml_mul(ctx0,
ggml_repeat(ctx0, model.layers[il].ln_1_g, cur),
cur),
ggml_repeat(ctx0, model.layers[il].ln_1_b, cur));
}
struct ggml_tensor * inpSA = cur;
// self-attention
{
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_q_proj_w, cur);
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_k_proj_w, cur);
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_v_proj_w, cur);
// store key and value to memory
if (N >= 1) {
struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past));
ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
}
// Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
struct ggml_tensor * Q =
ggml_permute(ctx0,
ggml_rope(ctx0,
ggml_cpy(ctx0,
Qcur,
ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
n_past, n_rot, 0),
0, 2, 1, 3);
// K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
struct ggml_tensor * K =
ggml_permute(ctx0,
ggml_rope(ctx0,
ggml_reshape_3d(ctx0,
ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
n_embd/n_head, n_head, n_past + N),
n_past, n_rot, 1),
0, 2, 1, 3);
// K * Q
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
// KQ_scaled = KQ / sqrt(n_embd/n_head)
struct ggml_tensor * KQ_scaled =
ggml_scale(ctx0,
KQ,
ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
);
// KQ_masked = mask_past(KQ_scaled)
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
// KQ = soft_max(KQ_masked)
struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
// V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
struct ggml_tensor * V_trans =
ggml_cpy(ctx0,
ggml_permute(ctx0,
ggml_reshape_3d(ctx0,
ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
n_embd/n_head, n_head, n_past + N),
1, 2, 0, 3),
ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd/n_head, n_head));
// KQV = transpose(V) * KQ_soft_max
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
// KQV_merged = KQV.permute(0, 2, 1, 3)
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
// cur = KQV_merged.contiguous().view(n_embd, N)
cur = ggml_cpy(ctx0,
KQV_merged,
ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
// projection (no bias)
cur = ggml_mul_mat(ctx0,
model.layers[il].c_attn_proj_w,
cur);
}
struct ggml_tensor * inpFF = cur;
// feed-forward network
// this is independent of the self-attention result, so it could be done in parallel to the self-attention
{
// note here we pass inpSA instead of cur
cur = ggml_mul_mat(ctx0,
model.layers[il].c_mlp_fc_w,
inpSA);
cur = ggml_add(ctx0,
ggml_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur),
cur);
// GELU activation
cur = ggml_gelu(ctx0, cur);
// projection
// cur = proj_w*cur + proj_b
cur = ggml_mul_mat(ctx0,
model.layers[il].c_mlp_proj_w,
cur);
cur = ggml_add(ctx0,
ggml_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur),
cur);
}
// self-attention + FF
cur = ggml_add(ctx0, cur, inpFF);
// input for next layer
inpL = ggml_add(ctx0, cur, inpL);
}
// norm
{
inpL = ggml_norm(ctx0, inpL);
// inpL = ln_f_g*inpL + ln_f_b
inpL = ggml_add(ctx0,
ggml_mul(ctx0,
ggml_repeat(ctx0, model.ln_f_g, inpL),
inpL),
ggml_repeat(ctx0, model.ln_f_b, inpL));
}
// lm_head
{
inpL = ggml_mul_mat(ctx0, model.lmh_g, inpL);
inpL = ggml_add(ctx0,
ggml_repeat(ctx0, model.lmh_b, inpL),
inpL);
}
// logits -> probs
//inpL = ggml_soft_max(ctx0, inpL);
// run the computation
ggml_build_forward_expand(&gf, inpL);
ggml_graph_compute (ctx0, &gf);
//if (n_past%100 == 0) {
// ggml_graph_print (&gf);
// ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
//}
//embd_w.resize(n_vocab*N);
//memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
// return result for just the last token
embd_w.resize(n_vocab);
memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
if (mem_per_token == 0) {
mem_per_token = ggml_used_mem(ctx0)/N;
}
//printf("used_mem = %zu\n", ggml_used_mem(ctx0));
ggml_free(ctx0);
return true;
}
struct GPTJPrivate {
const std::string modelPath;
bool modelLoaded;
gpt_vocab vocab;
gptj_model model;
int64_t t_main_start_us = 0;
int64_t t_load_us = 0;
int64_t n_threads = 0;
std::mt19937 rng;
};
GPTJ::GPTJ()
: d_ptr(new GPTJPrivate) {
d_ptr->modelLoaded = false;
}
bool GPTJ::loadModel(const std::string &modelPath, std::istream &fin) {
d_ptr->t_main_start_us = ggml_time_us();
std::mt19937 rng(time(NULL));
d_ptr->rng = rng;
// load the model
{
const int64_t t_start_us = ggml_time_us();
if (!gptj_model_load(modelPath, fin, d_ptr->model, d_ptr->vocab)) {
std::cerr << "GPT-J ERROR: failed to load model from" << modelPath;
return false;
}
d_ptr->t_load_us = ggml_time_us() - t_start_us;
}
d_ptr->n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
d_ptr->modelLoaded = true;
return true;
}
GPTJ::~GPTJ()
{
ggml_free(d_ptr->model.ctx);
}
bool GPTJ::isModelLoaded() const
{
return d_ptr->modelLoaded;
}
void GPTJ::prompt(const std::string &prompt, std::function<bool(const std::string&)> response,
int32_t n_predict, int32_t top_k, float top_p, float temp,
int32_t n_batch) {
if (!isModelLoaded()) {
std::cerr << "GPT-J ERROR: prompt won't work with an unloaded model!\n";
return;
}
int n_past = 0;
int64_t t_sample_us = 0;
int64_t t_predict_us = 0;
std::vector<float> logits;
// tokenize the prompt
std::vector<gpt_vocab::id> embd_inp = ::gpt_tokenize(d_ptr->vocab, prompt);
n_predict = std::min(n_predict, d_ptr->model.hparams.n_ctx - (int) embd_inp.size());
std::vector<gpt_vocab::id> embd;
// determine the required inference memory per token:
size_t mem_per_token = 0;
gptj_eval(d_ptr->model, d_ptr->n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
for (int i = embd.size(); i < embd_inp.size() + n_predict; i++) {
// predict
if (embd.size() > 0) {
const int64_t t_start_us = ggml_time_us();
if (!gptj_eval(d_ptr->model, d_ptr->n_threads, n_past, embd, logits, mem_per_token)) {
std::cerr << "GPT-J ERROR: Failed to predict\n";
return;
}
t_predict_us += ggml_time_us() - t_start_us;
}
n_past += embd.size();
embd.clear();
if (i >= embd_inp.size()) {
// sample next token
const int n_vocab = d_ptr->model.hparams.n_vocab;
gpt_vocab::id id = 0;
{
const int64_t t_start_sample_us = ggml_time_us();
id = gpt_sample_top_k_top_p(d_ptr->vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, d_ptr->rng);
t_sample_us += ggml_time_us() - t_start_sample_us;
}
// add it to the context
embd.push_back(id);
} else {
// if here, it means we are still processing the input prompt
for (int k = i; k < embd_inp.size(); k++) {
embd.push_back(embd_inp[k]);
if (embd.size() > n_batch) {
break;
}
}
i += embd.size() - 1;
}
// display text
for (auto id : embd) {
if (!response(d_ptr->vocab.id_to_token[id]))
goto stop_generating;
}
// end of text token
if (embd.back() == 50256) {
break;
}
}
stop_generating:
#if 1
// report timing
{
const int64_t t_main_end_us = ggml_time_us();
std::cout << "GPT-J INFO: mem per token = " << mem_per_token << " bytes\n";
std::cout << "GPT-J INFO: load time = " << d_ptr->t_load_us/1000.0f << " ms\n";
std::cout << "GPT-J INFO: sample time = " << t_sample_us/1000.0f << " ms\n";
std::cout << "GPT-J INFO: predict time = " << t_predict_us/1000.0f << " ms / " << t_predict_us/1000.0f/n_past << " ms per token\n";
std::cout << "GPT-J INFO: total time = " << (t_main_end_us - d_ptr->t_main_start_us)/1000.0f << " ms\n";
fflush(stdout);
fflush(stderr);
}
#endif
return;
}

@ -0,0 +1,24 @@
#ifndef GPTJ_H
#define GPTJ_H
#include <string>
#include <sstream>
#include <functional>
class GPTJPrivate;
class GPTJ {
public:
GPTJ();
~GPTJ();
bool loadModel(const std::string &modelPath, std::istream &fin);
bool isModelLoaded() const;
void prompt(const std::string &prompt, std::function<bool(const std::string&)> response,
int32_t n_predict = 200, int32_t top_k = 40, float top_p = 0.9f, float temp = 0.9f,
int32_t n_batch = 9);
private:
GPTJPrivate *d_ptr;
};
#endif // GPTJ_H

@ -0,0 +1 @@
<svg stroke="#7d7d8e" fill="none" stroke-width="1.5" viewBox="0 0 24 24" stroke-linecap="round" stroke-linejoin="round" class="h-3 w-3" height="1em" width="1em" xmlns="http://www.w3.org/2000/svg"><polyline points="1 4 1 10 7 10"></polyline><polyline points="23 20 23 14 17 14"></polyline><path d="M20.49 9A9 9 0 0 0 5.64 5.64L1 10m22 4l-4.64 4.36A9 9 0 0 1 3.51 15"></path></svg>

After

Width:  |  Height:  |  Size: 380 B

@ -0,0 +1 @@
<svg stroke="#7d7d8e" fill="none" stroke-width="2" viewBox="0 0 24 24" stroke-linecap="round" stroke-linejoin="round" class="h-4 w-4 mr-1" height="1em" width="1em" xmlns="http://www.w3.org/2000/svg"><line x1="22" y1="2" x2="11" y2="13"></line><polygon points="22 2 15 22 11 13 2 9 22 2"></polygon></svg>

After

Width:  |  Height:  |  Size: 304 B

@ -0,0 +1 @@
<svg stroke="#7d7d8e" fill="none" stroke-width="1.5" viewBox="0 0 24 24" stroke-linecap="round" stroke-linejoin="round" class="h-3 w-3" height="1em" width="1em" xmlns="http://www.w3.org/2000/svg"><rect x="3" y="3" width="18" height="18" rx="2" ry="2"></rect></svg>

After

Width:  |  Height:  |  Size: 265 B

@ -0,0 +1,132 @@
#include "llm.h"
#include <QCoreApplication>
#include <QDir>
#include <QFile>
#include <QResource>
class MyLLM: public LLM { };
Q_GLOBAL_STATIC(MyLLM, llmInstance)
LLM *LLM::globalInstance()
{
return llmInstance();
}
GPTJObject::GPTJObject()
: QObject{nullptr}
, m_gptj(new GPTJ)
{
moveToThread(&m_llmThread);
connect(&m_llmThread, &QThread::started, this, &GPTJObject::loadModel);
m_llmThread.setObjectName("llm thread");
m_llmThread.start();
}
bool GPTJObject::loadModel()
{
if (isModelLoaded())
return true;
QString modelName("ggml-model-q4_0.bin");
QFile file(QCoreApplication::applicationDirPath() + QDir::separator() + modelName);
if (file.open(QIODevice::ReadOnly)) {
QByteArray data = file.readAll();
std::istringstream iss(data.toStdString());
m_gptj->loadModel(modelName.toStdString(), iss);
emit isModelLoadedChanged();
}
return m_gptj;
}
bool GPTJObject::isModelLoaded() const
{
return m_gptj->isModelLoaded();
}
void GPTJObject::resetResponse()
{
m_response = std::string();
}
QString GPTJObject::response() const
{
return QString::fromStdString(m_response);
}
bool GPTJObject::handleResponse(const std::string &response)
{
#if 0
printf("%s", response.c_str());
fflush(stdout);
#endif
m_response.append(response);
emit responseChanged();
return !m_stopGenerating;
}
bool GPTJObject::prompt(const QString &prompt)
{
if (!isModelLoaded())
return false;
m_stopGenerating = false;
auto func = std::bind(&GPTJObject::handleResponse, this, std::placeholders::_1);
emit responseStarted();
m_gptj->prompt(prompt.toStdString(), func);
emit responseStopped();
return true;
}
LLM::LLM()
: QObject{nullptr}
, m_gptj(new GPTJObject)
, m_responseInProgress(false)
{
connect(m_gptj, &GPTJObject::isModelLoadedChanged, this, &LLM::isModelLoadedChanged, Qt::QueuedConnection);
connect(m_gptj, &GPTJObject::responseChanged, this, &LLM::responseChanged, Qt::QueuedConnection);
connect(m_gptj, &GPTJObject::responseStarted, this, &LLM::responseStarted, Qt::QueuedConnection);
connect(m_gptj, &GPTJObject::responseStopped, this, &LLM::responseStopped, Qt::QueuedConnection);
connect(this, &LLM::promptRequested, m_gptj, &GPTJObject::prompt, Qt::QueuedConnection);
connect(this, &LLM::resetResponseRequested, m_gptj, &GPTJObject::resetResponse, Qt::BlockingQueuedConnection);
}
bool LLM::isModelLoaded() const
{
return m_gptj->isModelLoaded();
}
void LLM::prompt(const QString &prompt)
{
emit promptRequested(prompt);
}
void LLM::resetResponse()
{
emit resetResponseRequested(); // blocking queued connection
}
void LLM::stopGenerating()
{
m_gptj->stopGenerating();
}
QString LLM::response() const
{
return m_gptj->response();
}
void LLM::responseStarted()
{
m_responseInProgress = true;
emit responseInProgressChanged();
}
void LLM::responseStopped()
{
m_responseInProgress = false;
emit responseInProgressChanged();
}

84
llm.h

@ -0,0 +1,84 @@
#ifndef LLM_H
#define LLM_H
#include <QObject>
#include <QThread>
#include "gptj.h"
class GPTJObject : public QObject
{
Q_OBJECT
Q_PROPERTY(bool isModelLoaded READ isModelLoaded NOTIFY isModelLoadedChanged)
Q_PROPERTY(QString response READ response NOTIFY responseChanged)
public:
GPTJObject();
bool loadModel();
bool isModelLoaded() const;
void resetResponse();
void stopGenerating() { m_stopGenerating = true; }
QString response() const;
public Q_SLOTS:
bool prompt(const QString &prompt);
Q_SIGNALS:
void isModelLoadedChanged();
void responseChanged();
void responseStarted();
void responseStopped();
private:
bool handleResponse(const std::string &response);
private:
GPTJ *m_gptj;
std::stringstream m_debug;
std::string m_response;
QThread m_llmThread;
std::atomic<bool> m_stopGenerating;
};
class LLM : public QObject
{
Q_OBJECT
Q_PROPERTY(bool isModelLoaded READ isModelLoaded NOTIFY isModelLoadedChanged)
Q_PROPERTY(QString response READ response NOTIFY responseChanged)
Q_PROPERTY(bool responseInProgress READ responseInProgress NOTIFY responseInProgressChanged)
public:
static LLM *globalInstance();
Q_INVOKABLE bool isModelLoaded() const;
Q_INVOKABLE void prompt(const QString &prompt);
Q_INVOKABLE void resetResponse();
Q_INVOKABLE void stopGenerating();
QString response() const;
bool responseInProgress() const { return m_responseInProgress; }
Q_SIGNALS:
void isModelLoadedChanged();
void responseChanged();
void responseInProgressChanged();
void promptRequested(const QString &prompt);
void resetResponseRequested();
private Q_SLOTS:
void responseStarted();
void responseStopped();
private:
GPTJObject *m_gptj;
bool m_responseInProgress;
private:
explicit LLM();
~LLM() {}
friend class MyLLM;
};
#endif // LLM_H

@ -0,0 +1,31 @@
#include <QGuiApplication>
#include <QQmlApplicationEngine>
#include <QQmlContext>
#include <QDirIterator>
#include "llm.h"
int main(int argc, char *argv[])
{
QGuiApplication app(argc, argv);
QQmlApplicationEngine engine;
qmlRegisterSingletonInstance("llm", 1, 0, "LLM", LLM::globalInstance());
const QUrl url(u"qrc:/gpt4all-chat/main.qml"_qs);
QObject::connect(&engine, &QQmlApplicationEngine::objectCreated,
&app, [url](QObject *obj, const QUrl &objUrl) {
if (!obj && url == objUrl)
QCoreApplication::exit(-1);
}, Qt::QueuedConnection);
engine.load(url);
#if 1
QDirIterator it("qrc:", QDirIterator::Subdirectories);
while (it.hasNext()) {
qDebug() << it.next();
}
#endif
return app.exec();
}

@ -0,0 +1,233 @@
import QtQuick
import QtQuick.Controls
import llm
Window {
id: window
width: 1280
height: 720
visible: true
title: qsTr("GPT4All Chat")
Rectangle {
id: conversationList
width: 300
anchors.left: parent.left
anchors.top: parent.top
anchors.bottom: parent.bottom
color: "#202123"
Button {
id: newChat
text: qsTr("New chat")
anchors.top: parent.top
anchors.left: parent.left
anchors.right: parent.right
anchors.margins: 15
padding: 15
background: Rectangle {
opacity: .5
border.color: "#7d7d8e"
border.width: 1
radius: 10
color: "#343541"
}
}
}
Rectangle {
id: conversation
color: "#343541"
anchors.left: conversationList.right
anchors.right: parent.right
anchors.bottom: parent.bottom
anchors.top: parent.top
ScrollView {
id: scrollView
anchors.left: parent.left
anchors.right: parent.right
anchors.top: parent.top
anchors.bottom: textInput.top
anchors.bottomMargin: 30
ScrollBar.vertical.policy: ScrollBar.AlwaysOn
ListModel {
id: chatModel
}
Rectangle {
anchors.fill: parent
color: "#444654"
ListView {
id: listView
anchors.fill: parent
header: TextField {
id: modelName
width: parent.width
color: "#d1d5db"
padding: 20
font.pixelSize: 24
text: "Model: GPT-J-6B-4bit"
background: Rectangle {
color: "#444654"
}
focus: false
horizontalAlignment: TextInput.AlignHCenter
}
model: chatModel
delegate: TextArea {
text: currentResponse ? LLM.response : value
width: parent.width
color: "#d1d5db"
wrapMode: Text.WordWrap
focus: false
padding: 20
font.pixelSize: 24
cursorVisible: currentResponse ? LLM.responseInProgress : false
cursorPosition: text.length
background: Rectangle {
color: name === qsTr("Response: ") ? "#444654" : "#343541"
}
leftPadding: 100
Rectangle {
anchors.left: parent.left
anchors.top: parent.top
anchors.leftMargin: 20
anchors.topMargin: 20
width: 30
height: 30
radius: 5
color: name === qsTr("Response: ") ? "#10a37f" : "#ec86bf"
Text {
anchors.centerIn: parent
text: name === qsTr("Response: ") ? "R" : "P"
color: "white"
}
}
}
property bool shouldAutoScroll: true
property bool isAutoScrolling: false
Connections {
target: LLM
function onResponseChanged() {
if (listView.shouldAutoScroll) {
listView.isAutoScrolling = true
listView.positionViewAtEnd()
listView.isAutoScrolling = false
}
}
}
onContentYChanged: {
if (!isAutoScrolling)
shouldAutoScroll = atYEnd
}
Component.onCompleted: {
shouldAutoScroll = true
positionViewAtEnd()
}
footer: Item {
id: bottomPadding
width: parent.width
height: 60
}
}
}
}
Button {
Image {
anchors.verticalCenter: parent.verticalCenter
anchors.left: parent.left
anchors.leftMargin: 15
source: LLM.responseInProgress ? "qrc:/gpt4all-chat/icons/stop_generating.svg" : "qrc:/gpt4all-chat/icons/regenerate.svg"
}
text: LLM.responseInProgress ? qsTr(" Stop generating") : qsTr(" Regenerate response")
onClicked: {
if (LLM.responseInProgress)
LLM.stopGenerating()
else {
LLM.resetResponse()
if (chatModel.count) {
var listElement = chatModel.get(chatModel.count - 1)
if (listElement.name === qsTr("Response: ")) {
listElement.currentResponse = true
listElement.value = LLM.response
LLM.prompt(listElement.prompt)
}
}
}
}
anchors.bottom: textInput.top
anchors.horizontalCenter: textInput.horizontalCenter
anchors.bottomMargin: 40
padding: 15
background: Rectangle {
opacity: .5
border.color: "#7d7d8e"
border.width: 1
radius: 10
color: "#343541"
}
}
TextField {
id: textInput
anchors.left: parent.left
anchors.right: parent.right
anchors.bottom: parent.bottom
anchors.margins: 30
color: "#dadadc"
padding: 20
font.pixelSize: 24
placeholderText: qsTr("Send a message...")
placeholderTextColor: "#7d7d8e"
background: Rectangle {
color: "#40414f"
radius: 10
}
onAccepted: {
LLM.stopGenerating()
if (chatModel.count) {
var listElement = chatModel.get(chatModel.count - 1)
listElement.currentResponse = false
listElement.value = LLM.response
}
chatModel.append({"name": qsTr("Prompt: "), "currentResponse": false, "value": textInput.text})
chatModel.append({"name": qsTr("Response: "), "currentResponse": true, "value": "", "prompt": textInput.text})
LLM.resetResponse()
LLM.prompt(textInput.text)
textInput.text = ""
}
Button {
anchors.right: textInput.right
anchors.verticalCenter: textInput.verticalCenter
anchors.rightMargin: 15
width: 30
height: 30
background: Image {
anchors.centerIn: parent
source: "qrc:/gpt4all-chat/icons/send_message.svg"
}
onClicked: {
textInput.onAccepted()
}
}
}
}
}
Loading…
Cancel
Save