diff --git a/gpt4all-backend/CMakeLists.txt b/gpt4all-backend/CMakeLists.txt index f20404e3..27691d25 100644 --- a/gpt4all-backend/CMakeLists.txt +++ b/gpt4all-backend/CMakeLists.txt @@ -97,11 +97,6 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS) add_library(gptj-${BUILD_VARIANT} SHARED gptj.cpp utils.h utils.cpp llmodel_shared.cpp llmodel_shared.h) prepare_target(gptj llama-mainline) - - add_library(bert-${BUILD_VARIANT} SHARED - bert.cpp utils.h utils.cpp llmodel_shared.cpp llmodel_shared.h) - target_compile_definitions(bert-${BUILD_VARIANT} PRIVATE LLAMA_VERSIONS=>=3 LLAMA_DATE=999999) - prepare_target(bert llama-mainline) endif() endforeach() diff --git a/gpt4all-backend/bert.cpp b/gpt4all-backend/bert.cpp deleted file mode 100644 index 955faae1..00000000 --- a/gpt4all-backend/bert.cpp +++ /dev/null @@ -1,910 +0,0 @@ -#define BERT_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE -#include "bert_impl.h" -#include "llmodel_shared.h" -#include "ggml.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -//#define DEBUG_BERT - -namespace { -const char *modelType_ = "Bert"; -} - -typedef int32_t bert_vocab_id; - -// default hparams (all-MiniLM-L6-v2) -struct bert_hparams -{ - int32_t n_vocab = 30522; - int32_t n_max_tokens = 512; - int32_t n_embd = 256; - int32_t n_intermediate = 1536; - int32_t n_head = 12; - int32_t n_layer = 6; -}; - -struct bert_layer -{ - // normalization - struct ggml_tensor *ln_att_w; - struct ggml_tensor *ln_att_b; - - struct ggml_tensor *ln_out_w; - struct ggml_tensor *ln_out_b; - - // attention - struct ggml_tensor *q_w; - struct ggml_tensor *q_b; - struct ggml_tensor *k_w; - struct ggml_tensor *k_b; - struct ggml_tensor *v_w; - struct ggml_tensor *v_b; - - struct ggml_tensor *o_w; - struct ggml_tensor *o_b; - - // ff - struct ggml_tensor *ff_i_w; - struct ggml_tensor *ff_i_b; - - struct ggml_tensor *ff_o_w; - struct ggml_tensor *ff_o_b; -}; - -struct bert_vocab -{ - std::map token_to_id; - std::map subword_token_to_id; - - std::map _id_to_token; - std::map _id_to_subword_token; -}; - -struct bert_model -{ - bert_hparams hparams; - - // embeddings weights - struct ggml_tensor *word_embeddings; - struct ggml_tensor *token_type_embeddings; - struct ggml_tensor *position_embeddings; - struct ggml_tensor *ln_e_w; - struct ggml_tensor *ln_e_b; - - std::vector layers; - - struct ggml_context *ctx; -}; - -// Replacement for std::vector that doesn't require zero-initialization. 
-struct bert_ctx -{ - bert_model model; - bert_vocab vocab; - - size_t mem_per_token; - int64_t mem_per_input; - int32_t max_batch_n; - llm_buffer buf_compute; - llm_buffer work_buf; -}; - -int32_t bert_n_embd(bert_ctx * ctx) -{ - return ctx->model.hparams.n_embd; -} - -int32_t bert_n_max_tokens(bert_ctx * ctx) -{ - return ctx->model.hparams.n_max_tokens; -} - -const char* bert_vocab_id_to_token(bert_ctx * ctx, bert_vocab_id id) { - bert_vocab & vocab = ctx->vocab; - auto it = vocab._id_to_token.find(id); - if (it != vocab._id_to_token.end()) - { - return it->second.c_str(); - } - it = vocab._id_to_subword_token.find(id); - if (it != vocab._id_to_subword_token.end()) - { - return it->second.c_str(); - } - return "[UNK TOKEN from bert_vocab]"; -} - -// -// Tokenizing -// - -static size_t utf8_len(char src) -{ - const size_t lookup[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4}; - uint8_t highbits = static_cast(src) >> 4; - return lookup[highbits]; -} - -std::string stripAccents(const std::string &inputString) -{ - std::string resultString; - std::map accentMap = {{"À", 'A'},{"Á", 'A'}, - {"Â", 'A'},{"Ã", 'A'},{"Ä", 'A'},{"Å", 'A'},{"à", 'a'},{"á", 'a'}, - {"â", 'a'},{"ã", 'a'},{"ä", 'a'},{"å", 'a'},{"È", 'E'},{"É", 'E'}, - {"Ê", 'E'},{"Ë", 'E'},{"è", 'e'},{"é", 'e'},{"ê", 'e'},{"ë", 'e'}, - {"Ì", 'I'},{"Í", 'I'},{"Î", 'I'},{"Ï", 'I'},{"ì", 'i'},{"í", 'i'}, - {"î", 'i'},{"ï", 'i'},{"Ò", 'O'},{"Ó", 'O'},{"Ô", 'O'},{"Õ", 'O'}, - {"Ö", 'O'},{"ò", 'o'},{"ó", 'o'},{"ô", 'o'},{"õ", 'o'},{"ö", 'o'}, - {"Ù", 'U'},{"Ú", 'U'},{"Û", 'U'},{"Ü", 'U'},{"ù", 'u'},{"ú", 'u'}, - {"û", 'u'},{"ü", 'u'},{"Ý", 'Y'},{"ý", 'y'},{"Ç", 'C'},{"ç", 'c'}, - {"Ñ", 'N'},{"ñ", 'n'}, - }; - - for (size_t i = 0; i < inputString.length();) - { - int len = utf8_len(inputString[i]); - std::string curChar = inputString.substr(i, len); - auto iter = accentMap.find(curChar); - if (iter != accentMap.end()) - { - resultString += iter->second; - } - else - { - resultString += curChar; - } - i += len; - } - - return resultString; -} - -std::string bert_normalize_prompt(const std::string &text) -{ - // TODO: handle chinese characters? 
https://github.com/huggingface/tokenizers/blob/ef5f50605ddf9f8caef1598c0e4853862b9707a7/tokenizers/src/normalizers/bert.rs#L98 - std::string text2 = stripAccents(text); - for (size_t i = 0; i < text2.size(); i += utf8_len(text2[i])) - { - char c = text2[i]; - if (c >= 'A' && c <= 'Z') - text2[i] = c - 'A' + 'a'; - } - return text2; -} - -std::vector bert_tokenize( - struct bert_ctx * ctx, - const char * text) -{ - const bert_vocab &vocab = ctx->vocab; - - std::string str = text; - - std::vector words; - // first split the text into words - { - str = bert_normalize_prompt(str); - - std::string pat = R"([[:punct:]]|[[:alpha:]]+|[[:digit:]]+)"; - - std::regex re(pat); - std::smatch m; - - while (std::regex_search(str, m, re)) - { - for (std::string x : m) - { - words.push_back(x); - } - str = m.suffix(); - } - } - - // find the longest tokens that form the words: - std::vector tokens; - int cls_tok_id = 101; - tokens.push_back(cls_tok_id); - for (const auto &word : words) - { - if (word.size() == 0) - continue; - - int i = 0; - int n = word.size(); - auto *token_map = &vocab.token_to_id; - while (i < n) - { - int j = n; - while (j > i) - { - auto it = token_map->find(word.substr(i, j - i)); - if (it != token_map->end()) - { - tokens.push_back(it->second); - i = j; - token_map = &vocab.subword_token_to_id; - } - --j; - } - if (j == i) - { - fprintf(stderr, "%s: unknown token '%s'\n", __func__, word.substr(i, 1).data()); - token_map = &vocab.subword_token_to_id; - ++i; - } - } - } - - return tokens; -} - -void bert_resize_ctx(bert_ctx * ctx, int32_t new_size) { - int64_t buf_size_new = ctx->mem_per_input * new_size; - - // TODO: Max memory should be a param? Now just 1 GB - int64_t GB = 1 << 30; -#if defined(DEBUG_BERT) - printf("%s: requested_buf_size %lldMB\n", __func__, buf_size_new / (1 << 20)); -#endif - if (buf_size_new > GB) { - int32_t adjusted_new_size = GB / ctx->mem_per_input; - if (adjusted_new_size < 1) adjusted_new_size = 1; -#if defined(DEBUG_BERT) - printf("%s: requested batch size %d, actual new batch size %d\n", __func__, new_size, adjusted_new_size); -#endif - new_size = adjusted_new_size; - buf_size_new = ctx->mem_per_input * new_size; - } - if (new_size > ctx->max_batch_n) { - ctx->buf_compute.resize(buf_size_new); - ctx->max_batch_n = new_size; - } -} - -void bert_eval( - struct bert_ctx *ctx, - int32_t n_threads, - const bert_vocab_id *raw_tokens, - int32_t n_tokens, - float *embeddings) -{ - const bert_model& model = ctx->model; - bool mem_req_mode = !embeddings; - - // batch_embeddings is nullptr for the initial memory requirements run - if (!mem_req_mode && 1 > ctx->max_batch_n) - bert_resize_ctx(ctx, 1); - - const int N = n_tokens; - const auto &tokens = raw_tokens; - - const auto &hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_max_tokens = hparams.n_max_tokens; - const int n_head = hparams.n_head; - - const int d_head = n_embd / n_head; - - std::vector result; - if (N > n_max_tokens) - { - fprintf(stderr, "Too many tokens, maximum is %d\n", n_max_tokens); - return; - } - - auto & mem_per_token = ctx->mem_per_token; - auto & buf_compute = ctx->buf_compute; - - struct ggml_init_params params = { - .mem_size = buf_compute.size, - .mem_buffer = buf_compute.addr, - .no_alloc = false, - }; - - struct ggml_context *ctx0 = ggml_init(params); - struct ggml_cgraph *gf = ggml_new_graph(ctx0); - - // Embeddings. 
word_embeddings + token_type_embeddings + position_embeddings - struct ggml_tensor *token_layer = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - memcpy(token_layer->data, tokens, N * ggml_element_size(token_layer)); - - struct ggml_tensor *token_types = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - ggml_set_zero(token_types); - - struct ggml_tensor *positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - for (int i = 0; i < N; i++) - { - ggml_set_i32_1d(positions, i, i); - } - - struct ggml_tensor *inpL = ggml_get_rows(ctx0, model.word_embeddings, token_layer); - - inpL = ggml_add(ctx0, - ggml_get_rows(ctx0, model.token_type_embeddings, token_types), - inpL); - inpL = ggml_add(ctx0, - ggml_get_rows(ctx0, model.position_embeddings, positions), - inpL); - - // embd norm - { - inpL = ggml_norm(ctx0, inpL, 1e-12f); - - inpL = ggml_add(ctx0, - ggml_mul(ctx0, - ggml_repeat(ctx0, model.ln_e_w, inpL), - inpL), - ggml_repeat(ctx0, model.ln_e_b, inpL)); - } - // layers - for (int il = 0; il < n_layer; il++) - { - struct ggml_tensor *cur = inpL; - - // self-attention - { - struct ggml_tensor *Qcur = cur; - Qcur = ggml_reshape_3d(ctx0, - ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].q_b, Qcur), - ggml_mul_mat(ctx0, model.layers[il].q_w, Qcur)), - d_head, n_head, N); - struct ggml_tensor *Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - - struct ggml_tensor *Kcur = cur; - Kcur = ggml_reshape_3d(ctx0, - ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].k_b, Kcur), - ggml_mul_mat(ctx0, model.layers[il].k_w, Kcur)), - d_head, n_head, N); - struct ggml_tensor *K = ggml_permute(ctx0, Kcur, 0, 2, 1, 3); - - struct ggml_tensor *Vcur = cur; - Vcur = ggml_reshape_3d(ctx0, - ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].v_b, Vcur), - ggml_mul_mat(ctx0, model.layers[il].v_w, Vcur)), - d_head, n_head, N); - struct ggml_tensor *V = ggml_permute(ctx0, Vcur, 0, 2, 1, 3); - - struct ggml_tensor *KQ = ggml_mul_mat(ctx0, K, Q); - // KQ = soft_max(KQ / sqrt(head width)) - KQ = ggml_soft_max( - ctx0, ggml_scale(ctx0, KQ, 1.0f / sqrt((float)d_head)) - ); - - V = ggml_cont(ctx0, ggml_transpose(ctx0, V)); - struct ggml_tensor *KQV = ggml_mul_mat(ctx0, V, KQ); - KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - - cur = ggml_cpy(ctx0, - KQV, - ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); - } - // attention output - cur = ggml_add(ctx0, - ggml_repeat(ctx0, model.layers[il].o_b, cur), - ggml_mul_mat(ctx0, model.layers[il].o_w, cur)); - - // re-add the layer input - cur = ggml_add(ctx0, cur, inpL); - - // attention norm - { - cur = ggml_norm(ctx0, cur, 1e-12f); - - cur = ggml_add(ctx0, - ggml_mul(ctx0, - ggml_repeat(ctx0, model.layers[il].ln_att_w, cur), - cur), - ggml_repeat(ctx0, model.layers[il].ln_att_b, cur)); - } - struct ggml_tensor *att_output = cur; - // intermediate_output = self.intermediate(attention_output) - cur = ggml_mul_mat(ctx0, model.layers[il].ff_i_w, cur); - cur = ggml_add(ctx0, - ggml_repeat(ctx0, model.layers[il].ff_i_b, cur), - cur); - cur = ggml_gelu(ctx0, cur); - - // layer_output = self.output(intermediate_output, attention_output) - cur = ggml_mul_mat(ctx0, model.layers[il].ff_o_w, cur); - cur = ggml_add(ctx0, - ggml_repeat(ctx0, model.layers[il].ff_o_b, cur), - cur); - // attentions bypass the intermediate layer - cur = ggml_add(ctx0, att_output, cur); - - // output norm - { - cur = ggml_norm(ctx0, cur, 1e-12f); - - cur = ggml_add(ctx0, - ggml_mul(ctx0, - ggml_repeat(ctx0, model.layers[il].ln_out_w, cur), - cur), - ggml_repeat(ctx0, model.layers[il].ln_out_b, cur)); - } - inpL = cur; - } - inpL 
= ggml_cont(ctx0, ggml_transpose(ctx0, inpL)); - // pooler - struct ggml_tensor *sum = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, N, 1); - ggml_set_f32(sum, 1.0f / N); - inpL = ggml_mul_mat(ctx0, inpL, sum); - - ggml_tensor *output = inpL; - // run the computation - ggml_build_forward_expand(gf, output); - //ggml_graph_compute_g4a() - ggml_graph_compute_g4a(ctx->work_buf, gf, n_threads); - //ggml_graph_compute(ctx0, gf); - - - // float *dat = ggml_get_data_f32(output); - // pretty_print_tensor(dat, output->ne, output->nb, output->n_dims - 1, ""); - - #ifdef GGML_PERF - // print timing information per ggml operation (for debugging purposes) - // requires GGML_PERF to be defined - ggml_graph_print(gf); - #endif - - if (!mem_req_mode) { - memcpy(embeddings, (float *)ggml_get_data(output), sizeof(float) * n_embd); - } else { - mem_per_token = ggml_used_mem(ctx0) / N; - } - - // printf("used_mem = %zu KB \n", ggml_used_mem(ctx0) / 1024); - // printf("mem_per_token = %zu KB \n", mem_per_token / 1024); - - ggml_free(ctx0); -} - -// -// Loading and setup -// - -void bert_free(bert_ctx * ctx) { - delete ctx; -} - -struct bert_ctx * bert_load_from_file(const char *fname) -{ -#if defined(DEBUG_BERT) - printf("%s: loading model from '%s' - please wait ...\n", __func__, fname); -#endif - - bert_ctx * new_bert = new bert_ctx; - - bert_model & model = new_bert->model; - bert_vocab & vocab = new_bert->vocab; - - struct gguf_init_params params = { - /*.no_alloc = */ false, - /*.ctx = */ &model.ctx, - }; - gguf_context *ggufctx = gguf_init_from_file(fname, params); - if (!ggufctx) { - fprintf(stderr, "%s: gguf_init_from_file() failed\n", __func__); - return nullptr; - } - - printf("%s: gguf version = %d\n", __func__, gguf_get_version(ggufctx)); - printf("%s: gguf alignment = %zu\n", __func__, gguf_get_alignment(ggufctx)); - printf("%s: gguf data offset = %zu\n", __func__, gguf_get_data_offset(ggufctx)); - - // print some standard metadata - { - int keyidx; - - keyidx = gguf_find_key(ggufctx, "general.name"); - if (keyidx != -1) { printf("%s: model name = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } - keyidx = gguf_find_key(ggufctx, "general.description"); - if (keyidx != -1) { printf("%s: model description = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } - keyidx = gguf_find_key(ggufctx, "general.author"); - if (keyidx != -1) { printf("%s: model author = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } - keyidx = gguf_find_key(ggufctx, "general.license"); - if (keyidx != -1) { printf("%s: model license = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } - keyidx = gguf_find_key(ggufctx, "general.architecture"); - if (keyidx != -1) { printf("%s: model architecture = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } - keyidx = gguf_find_key(ggufctx, "general.file_type"); - if (keyidx != -1) { printf("%s: model file type = %" PRIu32 "\n", __func__, gguf_get_val_u32(ggufctx, keyidx)); } - keyidx = gguf_find_key(ggufctx, "gptneox.tensor_data_layout"); - if (keyidx != -1) { printf("%s: model data layout = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } - keyidx = gguf_find_key(ggufctx, "general.source.huggingface.repository"); - if (keyidx != -1) { printf("%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } - } - - // check required metadata - { - // check model architecture kv - int keyidx = gguf_find_key(ggufctx, "general.architecture"); - if (keyidx == -1) { - fprintf(stderr, "%s: gguf model architecture not found!\n", __func__); - return 
nullptr; - } - if (strcmp(gguf_get_val_str(ggufctx, keyidx), "bert") != 0) { - fprintf(stderr, "%s: model architecture not supported!\n", __func__); - return nullptr; - } - } - - // load hparams - { - auto &hparams = model.hparams; - - bool ok = false; - int keyidx; - - do { - keyidx = gguf_find_key(ggufctx, "bert.context_length"); - if (keyidx == -1) { break; } - hparams.n_max_tokens = gguf_get_val_u32(ggufctx, keyidx); - - keyidx = gguf_find_key(ggufctx, "bert.embedding_length"); - if (keyidx == -1) { break; } - hparams.n_embd = gguf_get_val_u32(ggufctx, keyidx); - - keyidx = gguf_find_key(ggufctx, "bert.feed_forward_length"); - if (keyidx == -1) { break; } - hparams.n_intermediate = gguf_get_val_u32(ggufctx, keyidx); - - keyidx = gguf_find_key(ggufctx, "bert.attention.head_count"); - if (keyidx == -1) { break; } - hparams.n_head = gguf_get_val_u32(ggufctx, keyidx); - - keyidx = gguf_find_key(ggufctx, "bert.block_count"); - if (keyidx == -1) { break; } - hparams.n_layer = gguf_get_val_u32(ggufctx, keyidx); - - ok = true; - } while (false); - - if (!ok) { - fprintf(stderr, "%s: required hparam missing!\n", __func__); - return nullptr; - } - -#if defined(DEBUG_BERT) - printf("%s: n_max_tokens = %d\n", __func__, hparams.n_max_tokens); - printf("%s: n_embd = %d\n", __func__, hparams.n_embd); - printf("%s: n_intermediate = %d\n", __func__, hparams.n_intermediate); - printf("%s: n_head = %d\n", __func__, hparams.n_head); - printf("%s: n_layer = %d\n", __func__, hparams.n_layer); -#endif - } - - // load vocab - { - auto & hparams = model.hparams; - - int keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.model"); - if (keyidx == -1) { - fprintf(stderr, "%s: tokenizer model not found!\n", __func__); - return nullptr; - } - if (strcmp(gguf_get_val_str(ggufctx, keyidx), "bert") != 0) { - fprintf(stderr, "%s: tokenizer model not supported!\n", __func__); - return nullptr; - } - - int tokens_keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.tokens"); - if (tokens_keyidx == -1) { - fprintf(stderr, "%s: bert tokenizer vocab not found!\n", __func__); - return nullptr; - } - - hparams.n_vocab = gguf_get_arr_n(ggufctx, tokens_keyidx); - printf("%s: bert tokenizer vocab = %d\n", __func__, int(hparams.n_vocab)); - - for (int i = 0; i < hparams.n_vocab; i++) { - std::string word = gguf_get_arr_str(ggufctx, tokens_keyidx, i); - - if (word[0] == '#' && word[1] == '#') - { - vocab.subword_token_to_id[word.substr(2)] = i; - vocab._id_to_subword_token[i] = word; - } - - if (vocab.token_to_id.count(word) == 0) - { - vocab.token_to_id[word] = i; - vocab._id_to_token[i] = word; - } - } - } - - auto &ctx = model.ctx; - -#if defined(DEBUG_BERT) - printf("%s: ggml ctx size = %6.2f MB\n", __func__, ggml_get_mem_size(ctx) / (1024.0 * 1024.0)); -#endif - - // prepare memory for the weights - { - const int n_layer = model.hparams.n_layer; - model.layers.resize(n_layer); - - model.word_embeddings = ggml_get_tensor(ctx, "token_embd.weight"); - model.token_type_embeddings = ggml_get_tensor(ctx, "token_types.weight"); - model.position_embeddings = ggml_get_tensor(ctx, "position_embd.weight"); - model.ln_e_w = ggml_get_tensor(ctx, "output_norm.weight"); - model.ln_e_b = ggml_get_tensor(ctx, "output_norm.bias"); - - auto name = [](int i, std::string n) { - static std::string key; - key = "blk." + std::to_string(i) + "." 
+ n; - return key.c_str(); - }; - - for (int i = 0; i < n_layer; ++i) - { - auto &layer = model.layers[i]; - - layer.ln_att_w = ggml_get_tensor(ctx, name(i, "attn_norm.weight")); - layer.ln_att_b = ggml_get_tensor(ctx, name(i, "attn_norm.bias")); - layer.ln_out_w = ggml_get_tensor(ctx, name(i, "ffn_norm.weight")); - layer.ln_out_b = ggml_get_tensor(ctx, name(i, "ffn_norm.bias")); - layer.q_w = ggml_get_tensor(ctx, name(i, "attn_q.weight")); - layer.q_b = ggml_get_tensor(ctx, name(i, "attn_q.bias")); - layer.k_w = ggml_get_tensor(ctx, name(i, "attn_k.weight")); - layer.k_b = ggml_get_tensor(ctx, name(i, "attn_k.bias")); - layer.v_w = ggml_get_tensor(ctx, name(i, "attn_v.weight")); - layer.v_b = ggml_get_tensor(ctx, name(i, "attn_v.bias")); - layer.o_w = ggml_get_tensor(ctx, name(i, "attn_output.weight")); - layer.o_b = ggml_get_tensor(ctx, name(i, "attn_output.bias")); - layer.ff_i_w = ggml_get_tensor(ctx, name(i, "ffn_up.weight")); - layer.ff_i_b = ggml_get_tensor(ctx, name(i, "ffn_up.bias")); - layer.ff_o_w = ggml_get_tensor(ctx, name(i, "ffn_down.weight")); - layer.ff_o_b = ggml_get_tensor(ctx, name(i, "ffn_down.bias")); - } - } - - // Calculate space requirements for setting up context buffers later - { - bert_vocab_id tokens[] = {0, 1, 2, 3}; - // TODO: We set the initial buffer size to 16MB and hope it's enough. Maybe there is a better way to do this? - new_bert->buf_compute.resize(16 * 1024 * 1024); - bert_eval(new_bert, 1, tokens, 4, nullptr); - new_bert->max_batch_n = 0; - - // TODO: Max tokens should be a param? - int32_t N = new_bert->model.hparams.n_max_tokens; - new_bert->mem_per_input = 2.2 * (new_bert->mem_per_token * N); // add 10% to account for ggml object overhead - - } -#if defined(DEBUG_BERT) - printf("%s: mem_per_token %ld KB, mem_per_input %ld MB\n", __func__, new_bert->mem_per_token / (1 << 10), new_bert->mem_per_input / (1 << 20)); -#endif - - return new_bert; -} - -struct BertPrivate { - const std::string modelPath; - bool modelLoaded; - bert_ctx *ctx = nullptr; - int64_t n_threads = 0; -}; - -Bert::Bert() : d_ptr(new BertPrivate) { - d_ptr->modelLoaded = false; -} - -Bert::~Bert() { - bert_free(d_ptr->ctx); -} - -bool Bert::loadModel(const std::string &modelPath, int n_ctx, int ngl) -{ - (void)n_ctx; - (void)ngl; - d_ptr->modelLoaded = false; - - auto * ctx = bert_load_from_file(modelPath.c_str()); - fflush(stdout); - if (!ctx) - return false; - - d_ptr->ctx = ctx; - d_ptr->n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); - d_ptr->modelLoaded = true; - return true; -} - -bool Bert::isModelLoaded() const -{ - return d_ptr->modelLoaded; -} - -size_t Bert::requiredMem(const std::string &modelPath, int n_ctx, int ngl) -{ - (void)modelPath; - (void)n_ctx; - (void)ngl; - return 0; -} - -size_t Bert::stateSize() const -{ - return 0; -} - -size_t Bert::saveState(uint8_t */*dest*/) const -{ - return 0; -} - -size_t Bert::restoreState(const uint8_t */*src*/) -{ - return 0; -} - -void Bert::setThreadCount(int32_t n_threads) -{ - d_ptr->n_threads = n_threads; -} - -int32_t Bert::threadCount() const -{ - return d_ptr->n_threads; -} - -std::vector Bert::embedding(const std::string &text) -{ - const int overlap = 32; - const LLModel::Token clsToken = 101; - const size_t contextLength = bert_n_max_tokens(d_ptr->ctx); - typedef std::vector TokenString; - TokenString tokens = ::bert_tokenize(d_ptr->ctx, text.c_str()); -#if defined(DEBUG_BERT) - std::cerr << "embedding: " << tokens.size() - << " contextLength " << contextLength - << "\n"; -#endif - std::vector 
embeddingsSum(bert_n_embd(d_ptr->ctx), 0); - int embeddingsSumTotal = 0; - size_t start_pos = 0; - bool isFirstChunk = true; - while (start_pos < tokens.size()) { - TokenString chunk; - if (!isFirstChunk) - chunk.push_back(clsToken); - const size_t l = isFirstChunk ? contextLength : contextLength - 1; - if (tokens.size() - start_pos > l) { - chunk.insert(chunk.end(), tokens.begin() + start_pos, tokens.begin() + start_pos + l); - start_pos = start_pos + contextLength - overlap; - } else { - chunk.insert(chunk.end(), tokens.begin() + start_pos, tokens.end()); - start_pos = tokens.size(); - } -#if defined(DEBUG_BERT) - std::cerr << "chunk length: " << chunk.size() - << " embeddingsSumTotal " << embeddingsSumTotal - << " contextLength " << contextLength - << " start_pos " << start_pos - << "\n"; -#endif - embeddingsSumTotal++; - std::vector embeddings(bert_n_embd(d_ptr->ctx)); - bert_eval(d_ptr->ctx, d_ptr->n_threads, chunk.data(), chunk.size(), embeddings.data()); - std::transform(embeddingsSum.begin(), embeddingsSum.end(), embeddings.begin(), embeddingsSum.begin(), std::plus()); - isFirstChunk = false; - } - - std::transform(embeddingsSum.begin(), embeddingsSum.end(), embeddingsSum.begin(), [embeddingsSumTotal](float num){ return num / embeddingsSumTotal; }); - double magnitude = std::sqrt(std::inner_product(embeddingsSum.begin(), embeddingsSum.end(), embeddingsSum.begin(), 0.0)); - for (auto &value : embeddingsSum) - value /= magnitude; - std::vector finalEmbeddings(embeddingsSum.begin(), embeddingsSum.end()); - return finalEmbeddings; -} - -std::vector Bert::tokenize(PromptContext &ctx, const std::string &str, bool special) const -{ - (void)ctx; - (void)special; - return ::bert_tokenize(d_ptr->ctx, str.c_str()); -} - -LLModel::Token Bert::sampleToken(PromptContext &/*promptCtx*/) const -{ - return 999 /*!*/; -} - -std::string Bert::tokenToString(Token id) const -{ - return bert_vocab_id_to_token(d_ptr->ctx, id); -} - -bool Bert::evalTokens(PromptContext &ctx, const std::vector &tokens) const -{ - std::vector embeddings(bert_n_embd(d_ptr->ctx)); - int32_t cls = 101; - const bool useCLS = tokens.front() != cls; - if (useCLS) { - std::vector myTokens; - myTokens.push_back(cls); - myTokens.insert(myTokens.end(), tokens.begin(), tokens.end()); - bert_eval(d_ptr->ctx, d_ptr->n_threads, myTokens.data(), myTokens.size(), embeddings.data()); - } else - bert_eval(d_ptr->ctx, d_ptr->n_threads, tokens.data(), tokens.size(), embeddings.data()); - ctx.n_past = 0; // bert does not store any context - return true; -} - -int32_t Bert::contextLength() const -{ - return bert_n_max_tokens(d_ptr->ctx); -} - -const std::vector &Bert::endTokens() const -{ - static const std::vector out = { 102 /*sep*/}; - return out; -} - -std::string get_arch_name(gguf_context *ctx_gguf) { - std::string arch_name; - const int kid = gguf_find_key(ctx_gguf, "general.architecture"); - enum gguf_type ktype = gguf_get_kv_type(ctx_gguf, kid); - if (ktype != GGUF_TYPE_STRING) { - throw std::runtime_error("ERROR: Can't get general architecture from gguf file."); - } - return gguf_get_val_str(ctx_gguf, kid); -} - -#if defined(_WIN32) -#define DLL_EXPORT __declspec(dllexport) -#else -#define DLL_EXPORT __attribute__ ((visibility ("default"))) -#endif - -extern "C" { -DLL_EXPORT bool is_g4a_backend_model_implementation() { - return true; -} - -DLL_EXPORT const char *get_model_type() { - return modelType_; -} - -DLL_EXPORT const char *get_build_variant() { - return GGML_BUILD_VARIANT; -} - -DLL_EXPORT bool magic_match(const char * fname) { 
- struct ggml_context * ctx_meta = NULL; - struct gguf_init_params params = { - /*.no_alloc = */ true, - /*.ctx = */ &ctx_meta, - }; - gguf_context *ctx_gguf = gguf_init_from_file(fname, params); - if (!ctx_gguf) - return false; - - bool isValid = gguf_get_version(ctx_gguf) <= 3; - isValid = isValid && get_arch_name(ctx_gguf) == "bert"; - - gguf_free(ctx_gguf); - return isValid; -} - -DLL_EXPORT LLModel *construct() { - return new Bert; -} -} diff --git a/gpt4all-backend/bert_impl.h b/gpt4all-backend/bert_impl.h deleted file mode 100644 index 610cc2c9..00000000 --- a/gpt4all-backend/bert_impl.h +++ /dev/null @@ -1,45 +0,0 @@ -#ifndef BERT_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE -#error This file is NOT meant to be included outside of bert.cpp. Doing so is DANGEROUS. Be sure to know what you are doing before proceeding to #define BERT_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE -#endif -#ifndef BERT_H -#define BERT_H - -#include -#include -#include -#include -#include "llmodel.h" - -struct BertPrivate; -class Bert : public LLModel { -public: - Bert(); - ~Bert(); - - bool supportsEmbedding() const override { return true; } - bool supportsCompletion() const override { return true; } - bool loadModel(const std::string &modelPath, int n_ctx, int ngl) override; - bool isModelLoaded() const override; - size_t requiredMem(const std::string &modelPath, int n_ctx, int ngl) override; - size_t stateSize() const override; - size_t saveState(uint8_t *dest) const override; - size_t restoreState(const uint8_t *src) override; - void setThreadCount(int32_t n_threads) override; - int32_t threadCount() const override; - - std::vector embedding(const std::string &text) override; - -private: - std::unique_ptr d_ptr; - -protected: - std::vector tokenize(PromptContext &ctx, const std::string &str, bool special) const override; - Token sampleToken(PromptContext &ctx) const override; - std::string tokenToString(Token id) const override; - bool evalTokens(PromptContext &ctx, const std::vector &tokens) const override; - int32_t contextLength() const override; - const std::vector &endTokens() const override; - bool shouldAddBOS() const override { return true; } -}; - -#endif // BERT_H diff --git a/gpt4all-backend/llama.cpp-mainline b/gpt4all-backend/llama.cpp-mainline index 2a086f71..43c20ce8 160000 --- a/gpt4all-backend/llama.cpp-mainline +++ b/gpt4all-backend/llama.cpp-mainline @@ -1 +1 @@ -Subproject commit 2a086f71f5b570a0f047f88d88cf5704aae7ec7c +Subproject commit 43c20ce8004a4eac25ffe89e52bdf94bc7c47c02 diff --git a/gpt4all-backend/llamamodel.cpp b/gpt4all-backend/llamamodel.cpp index d2b25b8a..966bf90f 100644 --- a/gpt4all-backend/llamamodel.cpp +++ b/gpt4all-backend/llamamodel.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -30,6 +31,19 @@ static constexpr int GGUF_VER_MAX = 3; static const char * const modelType_ = "LLaMA"; +static const std::vector KNOWN_ARCHES { + "baichuan", "bert", "bloom", "codeshell", "falcon", "gemma", "gpt2", "llama", "mpt", "nomic-bert", "orion", + "persimmon", "phi2", "plamo", "qwen", "qwen2", "refact", "stablelm", "starcoder" +}; + +static const std::vector EMBEDDING_ARCHES { + "bert", "nomic-bert" +}; + +static bool is_embedding_arch(const std::string &arch) { + return std::find(EMBEDDING_ARCHES.begin(), EMBEDDING_ARCHES.end(), arch) < EMBEDDING_ARCHES.end(); +} + static bool llama_verbose() { const char* var = getenv("GPT4ALL_VERBOSE_LLAMACPP"); return var && *var; @@ -124,7 +138,7 @@ static int32_t 
get_arch_key_u32(std::string const &modelPath, std::string const auto * ctx = load_gguf(modelPath.c_str()); if (!ctx) return -1; - auto arch = get_arch_name(ctx); + std::string arch = get_arch_name(ctx); int32_t value = -1; if (ctx) { @@ -193,7 +207,7 @@ size_t LLamaModel::requiredMem(const std::string &modelPath, int n_ctx, int ngl) return filesize + est_kvcache_size; } -bool LLamaModel::isModelBlacklisted(const std::string &modelPath) { +bool LLamaModel::isModelBlacklisted(const std::string &modelPath) const { auto * ctx = load_gguf(modelPath.c_str()); if (!ctx) { std::cerr << __func__ << ": failed to load " << modelPath << "\n"; @@ -229,6 +243,18 @@ bool LLamaModel::isModelBlacklisted(const std::string &modelPath) { return res; } +bool LLamaModel::isEmbeddingModel(const std::string &modelPath) const { + auto *ctx_gguf = load_gguf(modelPath.c_str()); + if (!ctx_gguf) { + std::cerr << __func__ << ": failed to load GGUF from " << modelPath << "\n"; + return false; + } + + std::string arch = get_arch_name(ctx_gguf); + gguf_free(ctx_gguf); + return is_embedding_arch(arch); +} + bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl) { d_ptr->modelLoaded = false; @@ -287,20 +313,25 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl) if (!d_ptr->model) { fflush(stdout); d_ptr->device = -1; - std::cerr << "LLAMA ERROR: failed to load model from " << modelPath << std::endl; + std::cerr << "LLAMA ERROR: failed to load model from " << modelPath << std::endl; return false; } - const int n_ctx_train = llama_n_ctx_train(d_ptr->model); - if (n_ctx > n_ctx_train) { - std::cerr << "warning: model was trained on only " << n_ctx_train << " context tokens (" - << n_ctx << " specified)\n"; - } - // -- initialize the context -- d_ptr->ctx_params = llama_context_default_params(); + bool isEmbedding = is_embedding_arch(llama_model_arch(d_ptr->model)); + const int n_ctx_train = llama_n_ctx_train(d_ptr->model); + if (isEmbedding) { + d_ptr->ctx_params.n_batch = n_ctx_train; + } else { + if (n_ctx > n_ctx_train) { + std::cerr << "warning: model was trained on only " << n_ctx_train << " context tokens (" + << n_ctx << " specified)\n"; + } + } + d_ptr->ctx_params.n_ctx = n_ctx; d_ptr->ctx_params.seed = params.seed; d_ptr->ctx_params.type_k = params.kv_type; @@ -314,6 +345,9 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl) d_ptr->ctx_params.n_threads = d_ptr->n_threads; d_ptr->ctx_params.n_threads_batch = d_ptr->n_threads; + if (m_supportsEmbedding) + d_ptr->ctx_params.embeddings = true; + d_ptr->ctx = llama_new_context_with_model(d_ptr->model, d_ptr->ctx_params); if (!d_ptr->ctx) { fflush(stdout); @@ -332,6 +366,9 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl) } #endif + m_supportsEmbedding = isEmbedding; + m_supportsCompletion = !isEmbedding; + fflush(stdout); d_ptr->modelLoaded = true; return true; @@ -535,6 +572,320 @@ bool LLamaModel::usingGPUDevice() #endif } +void llama_batch_add( + struct llama_batch & batch, + llama_token id, + llama_pos pos, + const std::vector & seq_ids, + bool logits) { + batch.token [batch.n_tokens] = id; + batch.pos [batch.n_tokens] = pos; + batch.n_seq_id[batch.n_tokens] = seq_ids.size(); + for (size_t i = 0; i < seq_ids.size(); ++i) { + batch.seq_id[batch.n_tokens][i] = seq_ids[i]; + } + batch.logits [batch.n_tokens] = logits; + + batch.n_tokens++; +} + +static void batch_add_seq(llama_batch &batch, const std::vector &tokens, int seq_id) { + for (unsigned i = 0; i < 
tokens.size(); i++) { + llama_batch_add(batch, tokens[i], i, { seq_id }, i == tokens.size() - 1); + } +} + +size_t LLamaModel::embeddingSize() const { + return llama_n_embd(d_ptr->model); +} + +struct EmbModelSpec { + const char *docPrefix; + const char *queryPrefix; + std::vector otherPrefixes = {}; + bool matryoshkaCapable = false; + const char *recommendedDims = nullptr; +}; + +struct EmbModelGroup { + EmbModelSpec spec; + std::vector names; +}; + +static const EmbModelSpec NOPREFIX_SPEC {nullptr, nullptr}; +static const EmbModelSpec NOMIC_SPEC {"search_document", "search_query", {"clustering", "classification"}}; +static const EmbModelSpec E5_SPEC {"passage", "query"}; + +static const EmbModelSpec NOMIC_1_5_SPEC { + "search_document", "search_query", {"clustering", "classification"}, true, "[768, 512, 384, 256, 128]" +}; +static const EmbModelSpec LLM_EMBEDDER_SPEC { + "Represent this document for retrieval", + "Represent this query for retrieving relevant documents", +}; +static const EmbModelSpec BGE_SPEC { + nullptr, "Represent this sentence for searching relevant passages", +}; +static const EmbModelSpec E5_MISTRAL_SPEC { + nullptr, "Instruct: Given a query, retrieve relevant passages that answer the query\nQuery", +}; + +static const EmbModelGroup EMBEDDING_MODEL_SPECS[] { + {NOPREFIX_SPEC, {"all-MiniLM-L6-v1", "all-MiniLM-L12-v1", "all-MiniLM-L6-v2", "all-MiniLM-L12-v2"}}, + {NOMIC_SPEC, {"nomic-embed-text-v1", "nomic-embed-text-v1-ablated", "nomic-embed-text-v1-unsupervised"}}, + {NOMIC_1_5_SPEC, {"nomic-embed-text-v1.5"}}, + {LLM_EMBEDDER_SPEC, {"llm-embedder"}}, + {BGE_SPEC, {"bge-small-en", "bge-base-en", "bge-large-en", + "bge-small-en-v1.5", "bge-base-en-v1.5", "bge-large-en-v1.5"}}, + {E5_SPEC, {"e5-small", "e5-base", "e5-large", + "e5-small-unsupervised", "e5-base-unsupervised", "e5-large-unsupervised", + "e5-small-v2", "e5-base-v2", "e5-large-v2"}}, + {E5_MISTRAL_SPEC, {"e5-mistral-7b-instruct", + "multilingual-e5-small", "multilingual-e5-base", "multilingual-e5-large", + "multilingual-e5-large-instruct"}}, +}; + +static const EmbModelSpec *getEmbedSpec(const std::string &modelName) { + static const auto &specs = EMBEDDING_MODEL_SPECS; + auto it = std::find_if(specs, std::end(specs), + [&modelName](auto &spec) { + auto &names = spec.names; + return std::find(names.begin(), names.end(), modelName) < names.end(); + } + ); + return it < std::end(specs) ? &it->spec : nullptr; +} + +void LLamaModel::embed( + const std::vector &texts, float *embeddings, bool isRetrieval, int dimensionality, bool doMean, + bool atlas +) { + const EmbModelSpec *spec; + std::optional prefix; + if (d_ptr->model && (spec = getEmbedSpec(llama_model_name(d_ptr->model)))) + prefix = isRetrieval ? 
spec->queryPrefix : spec->docPrefix; + + embed(texts, embeddings, prefix, dimensionality, doMean, atlas); +} + +void LLamaModel::embed( + const std::vector &texts, float *embeddings, std::optional prefix, int dimensionality, + bool doMean, bool atlas +) { + if (!d_ptr->model) + throw std::logic_error("no model is loaded"); + + const char *modelName = llama_model_name(d_ptr->model); + if (!m_supportsEmbedding) + throw std::logic_error("not an embedding model: "s + modelName); + + auto *spec = getEmbedSpec(modelName); + if (!spec) + std::cerr << __func__ << ": warning: unknown model " << modelName << "\n"; + + const int32_t n_embd = llama_n_embd(d_ptr->model); + if (dimensionality < 0) { + dimensionality = n_embd; + } else if (spec && dimensionality != n_embd) { + auto msg = [dimensionality, modelName]() { + return "unsupported dimensionality " + std::to_string(dimensionality) + " for model " + modelName; + }; + if (!spec->matryoshkaCapable) + throw std::logic_error(msg() + " (supported: " + std::to_string(n_embd) + ")"); + if (dimensionality == 0 || dimensionality > n_embd) + throw std::logic_error(msg() + " (recommended: " + spec->recommendedDims + ")"); + } + + if (!prefix) { + if (spec) { + prefix = spec->docPrefix; + } else { + std::cerr << __func__ << ": warning: assuming no prefix\n"; + prefix = ""; + } + } else if (spec && prefix != spec->docPrefix && prefix != spec->queryPrefix && + std::find(spec->otherPrefixes.begin(), spec->otherPrefixes.end(), *prefix) == spec->otherPrefixes.end()) + { + std::stringstream ss; + ss << std::quoted(*prefix) << " is not a valid task type for model " << modelName; + throw std::logic_error(ss.str()); + } + + embedInternal(texts, embeddings, *prefix, dimensionality, doMean, atlas, spec); +} + +// MD5 hash of "nomic empty" +static const char EMPTY_PLACEHOLDER[] = "24df574ea1c998de59d5be15e769658e"; + +auto product(double a) -> std::function { + return [a](double b) { return a * b; }; +} + +template +double getL2NormScale(T *start, T *end) { + double magnitude = std::sqrt(std::inner_product(start, end, start, 0.0)); + return 1.0 / std::max(magnitude, 1e-12); +} + +void LLamaModel::embedInternal( + const std::vector &texts, float *embeddings, std::string prefix, int dimensionality, + bool doMean, bool atlas, const EmbModelSpec *spec +) { + typedef std::vector TokenString; + static constexpr int32_t atlasMaxLength = 8192; + static constexpr int chunkOverlap = 8; // Atlas overlaps n_batch-sized chunks of input by 8 tokens + + const llama_token bos_token = llama_token_bos(d_ptr->model); + const llama_token eos_token = llama_token_eos(d_ptr->model); + + assert(shouldAddBOS()); + bool addEOS = llama_vocab_type(d_ptr->model) == LLAMA_VOCAB_TYPE_WPM; + + // no EOS, optional BOS + auto tokenize = [this, addEOS](std::string text, TokenString &tokens, bool addBOS) { + if (!text.empty() && text[0] != ' ') + text = ' ' + text; // normalize for SPM - our fork of llama.cpp doesn't add a space prefix + + tokens.resize(text.length()+4); + int32_t n_tokens = llama_tokenize(d_ptr->model, text.c_str(), text.length(), tokens.data(), tokens.size(), addBOS, false); + assert(addEOS == (eos_token != -1 && tokens[n_tokens - 1] == eos_token)); + tokens.resize(n_tokens - addEOS); // erase EOS/SEP + }; + + // tokenize the texts + std::vector inputs; + for (unsigned i = 0; i < texts.size(); i++) { + auto &text = texts[i]; + auto &inp = inputs.emplace_back(); + tokenize(text, inp, false); + if (atlas && inp.size() > atlasMaxLength) { + if (doMean) { + throw std::logic_error( + "length 
of text at index " + std::to_string(i) + " is " + std::to_string(inp.size()) + + " tokens which exceeds limit of " + std::to_string(atlasMaxLength) + ); + } + inp.resize(atlasMaxLength); + } else if (inp.empty()) { + if (!atlas || !text.empty()) { + std::cerr << __func__ << ": warning: chunking tokenized text at index " << std::to_string(i) + << " into zero tokens\n"; + } + tokenize(EMPTY_PLACEHOLDER, inp, false); + } + } + + // tokenize the prefix + TokenString prefixTokens; + if (prefix.empty()) { + prefixTokens.push_back(bos_token); + } else { + tokenize(prefix + ':', prefixTokens, true); + } + + const uint32_t n_batch = llama_n_batch(d_ptr->ctx); + const uint32_t max_len = n_batch - (prefixTokens.size() + addEOS); // minus BOS/CLS and EOS/SEP + if (chunkOverlap >= max_len) { + throw std::logic_error("max chunk length of " + std::to_string(max_len) + " is smaller than overlap of " + + std::to_string(chunkOverlap) + " tokens"); + } + + // split into max_len-sized chunks + struct split_batch { int idx; TokenString batch; }; + std::vector batches; + for (unsigned i = 0; i < inputs.size(); i++) { + auto &input = inputs[i]; + for (auto it = input.begin(); it < input.end(); it += max_len) { + if (it > input.begin()) { it -= chunkOverlap; } + auto end = std::min(it + max_len, input.end()); + auto &batch = batches.emplace_back(i, prefixTokens).batch; + batch.insert(batch.end(), it, end); + batch.push_back(eos_token); + if (!doMean) { break; /* limit text to one chunk */ } + } + } + inputs.clear(); + + // initialize batch + struct llama_batch batch = llama_batch_init(n_batch, 0, 1); + + // n_texts x n_embd matrix + const int32_t n_embd = llama_n_embd(d_ptr->model); + std::vector embeddingsSum(texts.size() * n_embd); + std::vector embeddingsSumTotal(texts.size()); + std::vector queued_indices; // text indices of batches to be processed + + auto decode = [this, &queued_indices, n_embd, &batch, &embeddingsSum, &embeddingsSumTotal, spec, dimensionality]() { + if (llama_decode(d_ptr->ctx, batch) < 0) + throw std::runtime_error("llama_decode failed"); + + for (int i = 0; i < batch.n_tokens; ++i) { + if (!batch.logits[i]) { continue; } + int i_prompt = queued_indices[batch.seq_id[i][0]]; + auto *out = &embeddingsSum[i_prompt * n_embd]; + + // sequence embeddings aren't available when pooling_type is NONE + auto *embd = llama_get_embeddings_seq(d_ptr->ctx, batch.seq_id[i][0]); + if (!embd) { embd = llama_get_embeddings_ith(d_ptr->ctx, i); } + assert(embd); + + auto *embd_end = embd + n_embd; + + // layer normalization for nomic-embed-text-v1.5 + if (spec && spec->matryoshkaCapable) { + // normalize mean + double mean = std::accumulate(embd, embd_end, 0.0) / n_embd; + std::transform(embd, embd_end, embd, [mean](double f){ return f - mean; }); + + // unbiased sample variance, with Bessel's correction + double variance = std::inner_product(embd, embd_end, embd, 0.0) / (n_embd - 1); + + // trim to matryoshka dim + embd_end = embd + dimensionality; + + // normalize variance + std::transform(embd, embd_end, embd, product(1.0 / std::sqrt(variance + 1e-5))); + } + + // L2 norm + auto scale = getL2NormScale(embd, embd_end); + std::transform(embd, embd_end, out, out, [scale](double e, double o){ return o + scale * e; }); + embeddingsSumTotal[i_prompt]++; + } + }; + + // break into batches + for (auto &inp: batches) { + // encode if at capacity + if (batch.n_tokens + inp.batch.size() > n_batch) { + decode(); + batch.n_tokens = 0; + queued_indices.clear(); + } + + // add to batch + batch_add_seq(batch, inp.batch, 
queued_indices.size()); + queued_indices.push_back(inp.idx); + } + + // final batch + decode(); + + for (unsigned i = 0; i < texts.size(); i++) { + auto *embd = &embeddingsSum[i * n_embd]; + auto *embd_end = embd + dimensionality; + int total = embeddingsSumTotal[i]; + + // average over chunks + std::transform(embd, embd_end, embd, product(1.0 / total)); + + // L2 norm and copy + auto scale = getL2NormScale(embd, embd_end); + std::transform(embd, embd_end, embeddings, product(scale)); + embeddings += dimensionality; + } +} + #if defined(_WIN32) #define DLL_EXPORT __declspec(dllexport) #else @@ -556,23 +907,21 @@ DLL_EXPORT const char *get_build_variant() { DLL_EXPORT bool magic_match(const char *fname) { auto * ctx = load_gguf(fname); - auto arch = get_arch_name(ctx); + std::string arch = get_arch_name(ctx); bool valid = true; - static const std::vector known_arches { - "baichuan", "bloom", "codeshell", "falcon", "gemma", "gpt2", "llama", "mpt", "orion", "persimmon", "phi2", - "plamo", "qwen", "qwen2", "refact", "stablelm", "starcoder" - }; - - if (std::find(known_arches.begin(), known_arches.end(), arch) == known_arches.end()) { + if (std::find(KNOWN_ARCHES.begin(), KNOWN_ARCHES.end(), arch) == KNOWN_ARCHES.end()) { // not supported by this version of llama.cpp - if (!(arch == "gptj" || arch == "bert")) { // we support these via other modules + if (arch != "gptj") { // we support this via another module std::cerr << __func__ << ": unsupported model architecture: " << arch << "\n"; } valid = false; } + if (valid && is_embedding_arch(arch) && gguf_find_key(ctx, (arch + ".pooling_type").c_str()) < 0) + valid = false; // old pre-llama.cpp embedding model, e.g. all-MiniLM-L6-v2-f16.gguf + gguf_free(ctx); return valid; } diff --git a/gpt4all-backend/llamamodel_impl.h b/gpt4all-backend/llamamodel_impl.h index 15cbe1cd..cd9dbd57 100644 --- a/gpt4all-backend/llamamodel_impl.h +++ b/gpt4all-backend/llamamodel_impl.h @@ -11,15 +11,18 @@ #include "llmodel.h" struct LLamaPrivate; +struct EmbModelSpec; + class LLamaModel : public LLModel { public: LLamaModel(); ~LLamaModel(); - bool supportsEmbedding() const override { return false; } - bool supportsCompletion() const override { return true; } + bool supportsEmbedding() const override { return m_supportsEmbedding; } + bool supportsCompletion() const override { return m_supportsCompletion; } bool loadModel(const std::string &modelPath, int n_ctx, int ngl) override; - bool isModelBlacklisted(const std::string &modelPath) override; + bool isModelBlacklisted(const std::string &modelPath) const override; + bool isEmbeddingModel(const std::string &modelPath) const override; bool isModelLoaded() const override; size_t requiredMem(const std::string &modelPath, int n_ctx, int ngl) override; size_t stateSize() const override; @@ -29,12 +32,22 @@ public: int32_t threadCount() const override; std::vector availableGPUDevices(size_t memoryRequired) const override; bool initializeGPUDevice(size_t memoryRequired, const std::string &name) const override; - bool initializeGPUDevice(int device, std::string *unavail_reason) const override; + bool initializeGPUDevice(int device, std::string *unavail_reason = nullptr) const override; bool hasGPUDevice() override; bool usingGPUDevice() override; + size_t embeddingSize() const override; + // user-specified prefix + void embed(const std::vector &texts, float *embeddings, std::optional prefix, + int dimensionality = -1, bool doMean = true, bool atlas = false) override; + // automatic prefix + void embed(const std::vector 
&texts, float *embeddings, bool isRetrieval, int dimensionality = -1, + bool doMean = true, bool atlas = false) override; + private: std::unique_ptr d_ptr; + bool m_supportsEmbedding = false; + bool m_supportsCompletion = false; protected: std::vector tokenize(PromptContext &ctx, const std::string &str, bool special) const override; @@ -44,9 +57,11 @@ protected: int32_t contextLength() const override; const std::vector &endTokens() const override; bool shouldAddBOS() const override; - int32_t maxContextLength(std::string const &modelPath) const override; int32_t layerCount(std::string const &modelPath) const override; + + void embedInternal(const std::vector &texts, float *embeddings, std::string prefix, int dimensionality, + bool doMean, bool atlas, const EmbModelSpec *spec); }; #endif // LLAMAMODEL_H diff --git a/gpt4all-backend/llmodel.cpp b/gpt4all-backend/llmodel.cpp index 506b2c06..94be234d 100644 --- a/gpt4all-backend/llmodel.cpp +++ b/gpt4all-backend/llmodel.cpp @@ -213,21 +213,26 @@ LLModel *LLModel::Implementation::constructDefaultLlama() { } std::vector LLModel::Implementation::availableGPUDevices() { - auto * llama = constructDefaultLlama(); + auto *llama = constructDefaultLlama(); if (llama) { return llama->availableGPUDevices(0); } return {}; } int32_t LLModel::Implementation::maxContextLength(const std::string &modelPath) { - auto * llama = constructDefaultLlama(); + auto *llama = constructDefaultLlama(); return llama ? llama->maxContextLength(modelPath) : -1; } int32_t LLModel::Implementation::layerCount(const std::string &modelPath) { - auto * llama = constructDefaultLlama(); + auto *llama = constructDefaultLlama(); return llama ? llama->layerCount(modelPath) : -1; } +bool LLModel::Implementation::isEmbeddingModel(const std::string &modelPath) { + auto *llama = constructDefaultLlama(); + return llama && llama->isEmbeddingModel(modelPath); +} + void LLModel::Implementation::setImplementationsSearchPath(const std::string& path) { s_implementations_search_path = path; } diff --git a/gpt4all-backend/llmodel.h b/gpt4all-backend/llmodel.h index 5139110d..0c76efd7 100644 --- a/gpt4all-backend/llmodel.h +++ b/gpt4all-backend/llmodel.h @@ -1,13 +1,14 @@ #ifndef LLMODEL_H #define LLMODEL_H -#include -#include -#include -#include -#include #include +#include +#include #include +#include +#include +#include +#include #define LLMODEL_MAX_PROMPT_BATCH 128 @@ -44,6 +45,7 @@ public: static std::vector availableGPUDevices(); static int32_t maxContextLength(const std::string &modelPath); static int32_t layerCount(const std::string &modelPath); + static bool isEmbeddingModel(const std::string &modelPath); static void setImplementationsSearchPath(const std::string &path); static const std::string &implementationsSearchPath(); @@ -83,7 +85,8 @@ public: virtual bool supportsEmbedding() const = 0; virtual bool supportsCompletion() const = 0; virtual bool loadModel(const std::string &modelPath, int n_ctx, int ngl) = 0; - virtual bool isModelBlacklisted(const std::string &modelPath) { (void)modelPath; return false; }; + virtual bool isModelBlacklisted(const std::string &modelPath) const { (void)modelPath; return false; }; + virtual bool isEmbeddingModel(const std::string &modelPath) const { (void)modelPath; return false; } virtual bool isModelLoaded() const = 0; virtual size_t requiredMem(const std::string &modelPath, int n_ctx, int ngl) = 0; virtual size_t stateSize() const { return 0; } @@ -101,7 +104,15 @@ public: bool special = false, std::string *fakeReply = nullptr); - virtual std::vector 
embedding(const std::string &text); + virtual size_t embeddingSize() const { + throw std::logic_error(std::string(implementation().modelType()) + " does not support embeddings"); + } + // user-specified prefix + virtual void embed(const std::vector &texts, float *embeddings, std::optional prefix, + int dimensionality = -1, bool doMean = true, bool atlas = false); + // automatic prefix + virtual void embed(const std::vector &texts, float *embeddings, bool isRetrieval, + int dimensionality = -1, bool doMean = true, bool atlas = false); virtual void setThreadCount(int32_t n_threads) { (void)n_threads; } virtual int32_t threadCount() const { return 1; } diff --git a/gpt4all-backend/llmodel_c.cpp b/gpt4all-backend/llmodel_c.cpp index 6cc630a4..40e41e82 100644 --- a/gpt4all-backend/llmodel_c.cpp +++ b/gpt4all-backend/llmodel_c.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include struct LLModelWrapper { @@ -41,22 +42,22 @@ llmodel_model llmodel_model_create2(const char *model_path, const char *build_va *error = last_error_message.c_str(); } } - return reinterpret_cast(wrapper); + return wrapper; } void llmodel_model_destroy(llmodel_model model) { - delete reinterpret_cast(model); + delete static_cast(model); } size_t llmodel_required_mem(llmodel_model model, const char *model_path, int n_ctx, int ngl) { - LLModelWrapper *wrapper = reinterpret_cast(model); + auto *wrapper = static_cast(model); return wrapper->llModel->requiredMem(model_path, n_ctx, ngl); } bool llmodel_loadModel(llmodel_model model, const char *model_path, int n_ctx, int ngl) { - LLModelWrapper *wrapper = reinterpret_cast(model); + auto *wrapper = static_cast(model); std::string modelPath(model_path); if (wrapper->llModel->isModelBlacklisted(modelPath)) { @@ -69,44 +70,28 @@ bool llmodel_loadModel(llmodel_model model, const char *model_path, int n_ctx, i bool llmodel_isModelLoaded(llmodel_model model) { - LLModelWrapper *wrapper = reinterpret_cast(model); + auto *wrapper = static_cast(model); return wrapper->llModel->isModelLoaded(); } uint64_t llmodel_get_state_size(llmodel_model model) { - LLModelWrapper *wrapper = reinterpret_cast(model); + auto *wrapper = static_cast(model); return wrapper->llModel->stateSize(); } uint64_t llmodel_save_state_data(llmodel_model model, uint8_t *dest) { - LLModelWrapper *wrapper = reinterpret_cast(model); + auto *wrapper = static_cast(model); return wrapper->llModel->saveState(dest); } uint64_t llmodel_restore_state_data(llmodel_model model, const uint8_t *src) { - LLModelWrapper *wrapper = reinterpret_cast(model); + auto *wrapper = static_cast(model); return wrapper->llModel->restoreState(src); } -// Wrapper functions for the C callbacks -bool prompt_wrapper(int32_t token_id, void *user_data) { - llmodel_prompt_callback callback = reinterpret_cast(user_data); - return callback(token_id); -} - -bool response_wrapper(int32_t token_id, const std::string &response, void *user_data) { - llmodel_response_callback callback = reinterpret_cast(user_data); - return callback(token_id, response.c_str()); -} - -bool recalculate_wrapper(bool is_recalculating, void *user_data) { - llmodel_recalculate_callback callback = reinterpret_cast(user_data); - return callback(is_recalculating); -} - void llmodel_prompt(llmodel_model model, const char *prompt, const char *prompt_template, llmodel_prompt_callback prompt_callback, @@ -116,15 +101,11 @@ void llmodel_prompt(llmodel_model model, const char *prompt, bool special, const char *fake_reply) { - LLModelWrapper *wrapper = reinterpret_cast(model); + auto 
*wrapper = static_cast(model); - // Create std::function wrappers that call the C function pointers - std::function prompt_func = - std::bind(&prompt_wrapper, std::placeholders::_1, reinterpret_cast(prompt_callback)); - std::function response_func = - std::bind(&response_wrapper, std::placeholders::_1, std::placeholders::_2, reinterpret_cast(response_callback)); - std::function recalc_func = - std::bind(&recalculate_wrapper, std::placeholders::_1, reinterpret_cast(recalculate_callback)); + auto response_func = [response_callback](int32_t token_id, const std::string &response) { + return response_callback(token_id, response.c_str()); + }; if (size_t(ctx->n_past) < wrapper->promptContext.tokens.size()) wrapper->promptContext.tokens.resize(ctx->n_past); @@ -147,8 +128,8 @@ void llmodel_prompt(llmodel_model model, const char *prompt, auto *fake_reply_p = fake_reply ? &fake_reply_str : nullptr; // Call the C++ prompt method - wrapper->llModel->prompt(prompt, prompt_template, prompt_func, response_func, recalc_func, wrapper->promptContext, - special, fake_reply_p); + wrapper->llModel->prompt(prompt, prompt_template, prompt_callback, response_func, recalculate_callback, + wrapper->promptContext, special, fake_reply_p); // Update the C context by giving access to the wrappers raw pointers to std::vector data // which involves no copies @@ -171,38 +152,60 @@ void llmodel_prompt(llmodel_model model, const char *prompt, ctx->context_erase = wrapper->promptContext.contextErase; } -float *llmodel_embedding(llmodel_model model, const char *text, size_t *embedding_size) -{ - if (model == nullptr || text == nullptr || !strlen(text)) { - *embedding_size = 0; +float *llmodel_embed( + llmodel_model model, const char **texts, size_t *embedding_size, const char *prefix, int dimensionality, + bool do_mean, bool atlas, const char **error +) { + auto *wrapper = static_cast(model); + + if (!texts || !*texts) { + if (error) + *error = strdup("'texts' is NULL or empty"); return nullptr; } - LLModelWrapper *wrapper = reinterpret_cast(model); - std::vector embeddingVector = wrapper->llModel->embedding(text); - float *embedding = (float *)malloc(embeddingVector.size() * sizeof(float)); - if (embedding == nullptr) { - *embedding_size = 0; + + std::vector textsVec; + while (*texts) { textsVec.emplace_back(*texts++); } + + size_t embd_size; + float *embedding; + + try { + embd_size = wrapper->llModel->embeddingSize(); + if (dimensionality > 0 && dimensionality < int(embd_size)) + embd_size = dimensionality; + + embd_size *= textsVec.size(); + + std::optional prefixStr; + if (prefix) { prefixStr = prefix; } + + embedding = new float[embd_size]; + wrapper->llModel->embed(textsVec, embedding, prefixStr, dimensionality, do_mean, atlas); + } catch (std::exception const &e) { + if (error) + *error = strdup(e.what()); return nullptr; } - std::copy(embeddingVector.begin(), embeddingVector.end(), embedding); - *embedding_size = embeddingVector.size(); + + *embedding_size = embd_size; return embedding; } void llmodel_free_embedding(float *ptr) { - free(ptr); + delete[] ptr; } void llmodel_setThreadCount(llmodel_model model, int32_t n_threads) { - LLModelWrapper *wrapper = reinterpret_cast(model); + auto *wrapper = static_cast(model); wrapper->llModel->setThreadCount(n_threads); } int32_t llmodel_threadCount(llmodel_model model) { - LLModelWrapper *wrapper = reinterpret_cast(model); + auto *wrapper = static_cast(model); return wrapper->llModel->threadCount(); } @@ -218,7 +221,7 @@ const char *llmodel_get_implementation_search_path() 
struct llmodel_gpu_device* llmodel_available_gpu_devices(llmodel_model model, size_t memoryRequired, int* num_devices) { - LLModelWrapper *wrapper = reinterpret_cast(model); + auto *wrapper = static_cast(model); std::vector devices = wrapper->llModel->availableGPUDevices(memoryRequired); // Set the num_devices @@ -242,24 +245,24 @@ struct llmodel_gpu_device* llmodel_available_gpu_devices(llmodel_model model, si bool llmodel_gpu_init_gpu_device_by_string(llmodel_model model, size_t memoryRequired, const char *device) { - LLModelWrapper *wrapper = reinterpret_cast(model); + auto *wrapper = static_cast(model); return wrapper->llModel->initializeGPUDevice(memoryRequired, std::string(device)); } bool llmodel_gpu_init_gpu_device_by_struct(llmodel_model model, const llmodel_gpu_device *device) { - LLModelWrapper *wrapper = reinterpret_cast(model); + auto *wrapper = static_cast(model); return wrapper->llModel->initializeGPUDevice(device->index); } bool llmodel_gpu_init_gpu_device_by_int(llmodel_model model, int device) { - LLModelWrapper *wrapper = reinterpret_cast(model); + auto *wrapper = static_cast(model); return wrapper->llModel->initializeGPUDevice(device); } bool llmodel_has_gpu_device(llmodel_model model) { - LLModelWrapper *wrapper = reinterpret_cast(model); + auto *wrapper = static_cast(model); return wrapper->llModel->hasGPUDevice(); } diff --git a/gpt4all-backend/llmodel_c.h b/gpt4all-backend/llmodel_c.h index 3ae0db22..913834ec 100644 --- a/gpt4all-backend/llmodel_c.h +++ b/gpt4all-backend/llmodel_c.h @@ -186,13 +186,23 @@ void llmodel_prompt(llmodel_model model, const char *prompt, * NOTE: If given NULL pointers for the model or text, or an empty text, a NULL pointer will be * returned. Bindings should signal an error when NULL is the return value. * @param model A pointer to the llmodel_model instance. - * @param text A string representing the text to generate an embedding for. + * @param texts A pointer to a NULL-terminated array of strings representing the texts to generate an + * embedding for. * @param embedding_size A pointer to a size_t type that will be set by the call indicating the length * of the returned floating point array. + * @param prefix The model-specific prefix representing the embedding task, without the trailing colon. NULL for no + * prefix. + * @param dimensionality The embedding dimension, for use with Matryoshka-capable models. Set to -1 to for full-size. + * @param do_mean True to average multiple embeddings if the text is longer than the model can accept, False to + * truncate. + * @param atlas Try to be fully compatible with the Atlas API. Currently, this means texts longer than 8192 tokens with + * long_text_mode="mean" will raise an error. Disabled by default. + * @param error Return location for a malloc()ed string that will be set on error, or NULL. * @return A pointer to an array of floating point values passed to the calling method which then will - * be responsible for lifetime of this memory. + * be responsible for lifetime of this memory. NULL if an error occurred. */ -float *llmodel_embedding(llmodel_model model, const char *text, size_t *embedding_size); +float *llmodel_embed(llmodel_model model, const char **texts, size_t *embedding_size, const char *prefix, + int dimensionality, bool do_mean, bool atlas, const char **error); /** * Frees the memory allocated by the llmodel_embedding function. 
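For reference, a minimal sketch of how a caller might drive the new llmodel_embed()/llmodel_free_embedding() C API documented above. The model filename, context size, GPU layer count, and the "auto" build variant are illustrative assumptions, not values taken from this diff; the "search_document" prefix and 512-dimension Matryoshka output come from the NOMIC_1_5_SPEC entry added in llamamodel.cpp.

// Hypothetical usage sketch of the new embedding C API (assumed model file and load parameters).
#include "llmodel_c.h"
#include <cstdio>

int main() {
    const char *error = nullptr;
    const char *path = "nomic-embed-text-v1.5.f16.gguf";  // assumption: any supported embedding GGUF

    llmodel_model model = llmodel_model_create2(path, "auto", &error);
    if (model == nullptr) {
        fprintf(stderr, "create failed: %s\n", error ? error : "(unknown)");
        return 1;
    }
    if (!llmodel_loadModel(model, path, 2048, 100)) {  // n_ctx and ngl are arbitrary example values
        fprintf(stderr, "load failed\n");
        llmodel_model_destroy(model);
        return 1;
    }

    const char *texts[] = {"first document", "second document", nullptr};  // NULL-terminated array
    size_t embd_size = 0;
    // Retrieval-document prefix, 512-dim Matryoshka output, mean-pool long texts, Atlas mode off.
    float *embd = llmodel_embed(model, texts, &embd_size, "search_document", 512, true, false, &error);
    if (embd == nullptr) {
        fprintf(stderr, "embed failed: %s\n", error ? error : "(unknown)");
    } else {
        printf("%zu floats total (%zu per text)\n", embd_size, embd_size / 2);
        llmodel_free_embedding(embd);
    }
    llmodel_model_destroy(model);
    return 0;
}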
diff --git a/gpt4all-backend/llmodel_shared.cpp b/gpt4all-backend/llmodel_shared.cpp
index 15696033..6cc7e905 100644
--- a/gpt4all-backend/llmodel_shared.cpp
+++ b/gpt4all-backend/llmodel_shared.cpp
@@ -3,6 +3,7 @@
 #include
 #include
 #include
+#include
 #include
 
 // TODO(cebtenzzre): replace this with llama_kv_cache_seq_shift for llamamodel (GPT-J needs this as-is)
@@ -267,12 +268,28 @@ void LLModel::generateResponse(std::function
     }
 }
 
-std::vector<float> LLModel::embedding(const std::string &text)
-{
-    (void)text;
-    if (!supportsCompletion()) {
-        std::string errorMessage = "ERROR: this model does not support generating embeddings!\n";
-        std::cerr << implementation().modelType() << errorMessage;
-    }
-    return std::vector<float>();
+void LLModel::embed(
+    const std::vector<std::string> &texts, float *embeddings, std::optional<std::string> prefix, int dimensionality,
+    bool doMean, bool atlas
+) {
+    (void)texts;
+    (void)embeddings;
+    (void)prefix;
+    (void)dimensionality;
+    (void)doMean;
+    (void)atlas;
+    throw std::logic_error(std::string(implementation().modelType()) + " does not support embeddings");
+}
+
+void LLModel::embed(
+    const std::vector<std::string> &texts, float *embeddings, bool isRetrieval, int dimensionality, bool doMean,
+    bool atlas
+) {
+    (void)texts;
+    (void)embeddings;
+    (void)isRetrieval;
+    (void)dimensionality;
+    (void)doMean;
+    (void)atlas;
+    throw std::logic_error(std::string(implementation().modelType()) + " does not support embeddings");
 }
diff --git a/gpt4all-bindings/python/gpt4all/_pyllmodel.py b/gpt4all-bindings/python/gpt4all/_pyllmodel.py
index c623beda..d3633ef6 100644
--- a/gpt4all-bindings/python/gpt4all/_pyllmodel.py
+++ b/gpt4all-bindings/python/gpt4all/_pyllmodel.py
@@ -10,7 +10,7 @@ import sys
 import threading
 from enum import Enum
 from queue import Queue
-from typing import Callable, Iterable, List
+from typing import Callable, Iterable, overload
 
 if sys.version_info >= (3, 9):
     import importlib.resources as importlib_resources
@@ -105,13 +105,18 @@ llmodel.llmodel_prompt.argtypes = [
 
 llmodel.llmodel_prompt.restype = None
 
-llmodel.llmodel_embedding.argtypes = [
+llmodel.llmodel_embed.argtypes = [
     ctypes.c_void_p,
-    ctypes.c_char_p,
+    ctypes.POINTER(ctypes.c_char_p),
     ctypes.POINTER(ctypes.c_size_t),
+    ctypes.c_char_p,
+    ctypes.c_int,
+    ctypes.c_bool,
+    ctypes.c_bool,
+    ctypes.POINTER(ctypes.c_char_p),
 ]
 
-llmodel.llmodel_embedding.restype = ctypes.POINTER(ctypes.c_float)
+llmodel.llmodel_embed.restype = ctypes.POINTER(ctypes.c_float)
 
 llmodel.llmodel_free_embedding.argtypes = [ctypes.POINTER(ctypes.c_float)]
 llmodel.llmodel_free_embedding.restype = None
@@ -287,16 +292,50 @@ class LLModel:
         self.context.repeat_last_n = repeat_last_n
         self.context.context_erase = context_erase
 
-    def generate_embedding(self, text: str) -> List[float]:
+    @overload
+    def generate_embeddings(
+        self, text: str, prefix: str, dimensionality: int, do_mean: bool, atlas: bool,
+    ) -> list[float]: ...
+    @overload
+    def generate_embeddings(
+        self, text: list[str], prefix: str, dimensionality: int, do_mean: bool, atlas: bool,
+    ) -> list[list[float]]: ...
+ + def generate_embeddings(self, text, prefix, dimensionality, do_mean, atlas): if not text: - raise ValueError("Text must not be None or empty") + raise ValueError("text must not be None or empty") + single_text = isinstance(text, str) + if single_text: + text = [text] + + # prepare input embedding_size = ctypes.c_size_t() - c_text = ctypes.c_char_p(text.encode()) - embedding_ptr = llmodel.llmodel_embedding(self.model, c_text, ctypes.byref(embedding_size)) - embedding_array = [embedding_ptr[i] for i in range(embedding_size.value)] + error = ctypes.c_char_p() + c_prefix = ctypes.c_char_p() if prefix is None else prefix.encode() + c_texts = (ctypes.c_char_p * (len(text) + 1))() + for i, t in enumerate(text): + c_texts[i] = t.encode() + + # generate the embeddings + embedding_ptr = llmodel.llmodel_embed( + self.model, c_texts, ctypes.byref(embedding_size), c_prefix, dimensionality, do_mean, atlas, + ctypes.byref(error), + ) + + if embedding_ptr.value is None: + msg = "(unknown error)" if error.value is None else error.value.decode() + raise RuntimeError(f'Failed to generate embeddings: {msg}') + + # extract output + n_embd = embedding_size.value // len(text) + embedding_array = [ + embedding_ptr[i:i + n_embd] + for i in range(0, embedding_size.value, n_embd) + ] llmodel.llmodel_free_embedding(embedding_ptr) - return list(embedding_array) + + return embedding_array[0] if single_text else embedding_array def prompt_model( self, diff --git a/gpt4all-bindings/python/gpt4all/gpt4all.py b/gpt4all-bindings/python/gpt4all/gpt4all.py index ee8ff3fd..b8ea05f6 100644 --- a/gpt4all-bindings/python/gpt4all/gpt4all.py +++ b/gpt4all-bindings/python/gpt4all/gpt4all.py @@ -10,7 +10,7 @@ import time import warnings from contextlib import contextmanager from pathlib import Path -from typing import Any, Dict, Iterable, List, Optional, Union +from typing import Any, Dict, Iterable, List, Optional, Union, overload import requests from requests.exceptions import ChunkedEncodingError @@ -36,6 +36,8 @@ class Embed4All: Python class that handles embeddings for GPT4All. """ + MIN_DIMENSIONALITY = 64 + def __init__(self, model_name: Optional[str] = None, n_threads: Optional[int] = None, **kwargs): """ Constructor @@ -45,17 +47,48 @@ class Embed4All: """ self.gpt4all = GPT4All(model_name or 'all-MiniLM-L6-v2-f16.gguf', n_threads=n_threads, **kwargs) - def embed(self, text: str) -> List[float]: + @overload + def embed( + self, text: str, prefix: str | None = ..., dimensionality: int | None = ..., long_text_mode: str = ..., + atlas: bool = ..., + ) -> list[float]: ... + @overload + def embed( + self, text: list[str], prefix: str | None = ..., dimensionality: int | None = ..., long_text_mode: str = ..., + atlas: bool = ..., + ) -> list[list[float]]: ... + + def embed(self, text, prefix=None, dimensionality=None, long_text_mode="truncate", atlas=False): """ - Generate an embedding. + Generate one or more embeddings. Args: - text: The text document to generate an embedding for. + text: A text or list of texts to generate embeddings for. + prefix: The model-specific prefix representing the embedding task, without the trailing colon. For Nomic + Embed this can be `search_query`, `search_document`, `classification`, or `clustering`. + dimensionality: The embedding dimension, for use with Matryoshka-capable models. Defaults to full-size. + long_text_mode: How to handle texts longer than the model can accept. One of `mean` or `truncate`. + atlas: Try to be fully compatible with the Atlas API. 
Currently, this means texts longer than 8192 tokens + with long_text_mode="mean" will raise an error. Disabled by default. Returns: - An embedding of your document of text. + An embedding or list of embeddings of your text(s). """ - return self.gpt4all.model.generate_embedding(text) + if dimensionality is None: + dimensionality = -1 + else: + if dimensionality <= 0: + raise ValueError(f'Dimensionality must be None or a positive integer, got {dimensionality}') + if dimensionality < self.MIN_DIMENSIONALITY: + warnings.warn( + f'Dimensionality {dimensionality} is less than the suggested minimum of {self.MIN_DIMENSIONALITY}.' + ' Performance may be degraded.' + ) + try: + do_mean = {"mean": True, "truncate": False}[long_text_mode] + except KeyError: + raise ValueError(f"Long text mode must be one of 'mean' or 'truncate', got {long_text_mode!r}") + return self.gpt4all.model.generate_embeddings(text, prefix, dimensionality, do_mean, atlas) class GPT4All: diff --git a/gpt4all-chat/CMakeLists.txt b/gpt4all-chat/CMakeLists.txt index 2ead372f..bc510a0b 100644 --- a/gpt4all-chat/CMakeLists.txt +++ b/gpt4all-chat/CMakeLists.txt @@ -202,8 +202,6 @@ install(TARGETS llamamodel-mainline-default DESTINATION lib COMPONENT ${COMPONEN if(APPLE) install(TARGETS llamamodel-mainline-metal DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN}) endif() -install(TARGETS bert-avxonly DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN}) -install(TARGETS bert-default DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN}) set(CPACK_GENERATOR "IFW") set(CPACK_VERBATIM_VARIABLES YES) diff --git a/gpt4all-chat/chatllm.cpp b/gpt4all-chat/chatllm.cpp index c41431ff..eba9d2f6 100644 --- a/gpt4all-chat/chatllm.cpp +++ b/gpt4all-chat/chatllm.cpp @@ -12,7 +12,6 @@ #define GPTJ_INTERNAL_STATE_VERSION 0 #define LLAMA_INTERNAL_STATE_VERSION 0 -#define BERT_INTERNAL_STATE_VERSION 0 class LLModelStore { public: @@ -386,7 +385,6 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo) switch (m_llModelInfo.model->implementation().modelType()[0]) { case 'L': m_llModelType = LLModelType::LLAMA_; break; case 'G': m_llModelType = LLModelType::GPTJ_; break; - case 'B': m_llModelType = LLModelType::BERT_; break; default: { delete m_llModelInfo.model; @@ -840,7 +838,6 @@ bool ChatLLM::serialize(QDataStream &stream, int version, bool serializeKV) switch (m_llModelType) { case GPTJ_: stream << GPTJ_INTERNAL_STATE_VERSION; break; case LLAMA_: stream << LLAMA_INTERNAL_STATE_VERSION; break; - case BERT_: stream << BERT_INTERNAL_STATE_VERSION; break; default: Q_UNREACHABLE(); } } diff --git a/gpt4all-chat/chatllm.h b/gpt4all-chat/chatllm.h index 3c22f998..d123aafb 100644 --- a/gpt4all-chat/chatllm.h +++ b/gpt4all-chat/chatllm.h @@ -13,7 +13,6 @@ enum LLModelType { GPTJ_, LLAMA_, CHATGPT_, - BERT_, }; struct LLModelInfo { diff --git a/gpt4all-chat/embllm.cpp b/gpt4all-chat/embllm.cpp index 2bfb226d..95c61834 100644 --- a/gpt4all-chat/embllm.cpp +++ b/gpt4all-chat/embllm.cpp @@ -27,7 +27,7 @@ void EmbeddingLLMWorker::wait() bool EmbeddingLLMWorker::loadModel() { - const EmbeddingModels *embeddingModels = ModelList::globalInstance()->embeddingModels(); + const EmbeddingModels *embeddingModels = ModelList::globalInstance()->installedEmbeddingModels(); if (!embeddingModels->count()) return false; @@ -41,7 +41,8 @@ bool EmbeddingLLMWorker::loadModel() return false; } - bool isNomic = fileInfo.fileName().startsWith("nomic"); + auto filename = fileInfo.fileName(); + bool isNomic = filename.startsWith("nomic-") && filename.endsWith(".txt"); if (isNomic) { QFile 
file(filePath); file.open(QIODeviceBase::ReadOnly | QIODeviceBase::Text); @@ -52,16 +53,18 @@ bool EmbeddingLLMWorker::loadModel() } m_model = LLModel::Implementation::construct(filePath.toStdString()); + // NOTE: explicitly loads model on CPU to avoid GPU OOM + // TODO(cebtenzzre): support GPU-accelerated embeddings bool success = m_model->loadModel(filePath.toStdString(), 2048, 0); if (!success) { - qWarning() << "WARNING: Could not load sbert"; + qWarning() << "WARNING: Could not load embedding model"; delete m_model; m_model = nullptr; return false; } - if (m_model->implementation().modelType() != "Bert") { - qWarning() << "WARNING: Model type is not sbert"; + if (!m_model->supportsEmbedding()) { + qWarning() << "WARNING: Model type does not support embeddings"; delete m_model; m_model = nullptr; return false; @@ -79,21 +82,49 @@ bool EmbeddingLLMWorker::isNomic() const return !m_nomicAPIKey.isEmpty(); } +// this function is always called for retrieval tasks std::vector EmbeddingLLMWorker::generateSyncEmbedding(const QString &text) { if (!hasModel() && !loadModel()) { qWarning() << "WARNING: Could not load model for embeddings"; - return std::vector(); + return {}; } if (isNomic()) { qWarning() << "WARNING: Request to generate sync embeddings for non-local model invalid"; - return std::vector(); + return {}; } - return m_model->embedding(text.toStdString()); + std::vector embedding(m_model->embeddingSize()); + try { + m_model->embed({text.toStdString()}, embedding.data(), true); + } catch (const std::exception &e) { + qWarning() << "WARNING: LLModel::embed failed: " << e.what(); + return {}; + } + return embedding; } +void EmbeddingLLMWorker::sendAtlasRequest(const QStringList &texts, const QString &taskType, QVariant userData) { + QJsonObject root; + root.insert("model", "nomic-embed-text-v1"); + root.insert("texts", QJsonArray::fromStringList(texts)); + root.insert("task_type", taskType); + + QJsonDocument doc(root); + + QUrl nomicUrl("https://api-atlas.nomic.ai/v1/embedding/text"); + const QString authorization = QString("Bearer %1").arg(m_nomicAPIKey).trimmed(); + QNetworkRequest request(nomicUrl); + request.setHeader(QNetworkRequest::ContentTypeHeader, "application/json"); + request.setRawHeader("Authorization", authorization.toUtf8()); + request.setAttribute(QNetworkRequest::User, userData); + QNetworkReply *reply = m_networkManager->post(request, doc.toJson(QJsonDocument::Compact)); + connect(qApp, &QCoreApplication::aboutToQuit, reply, &QNetworkReply::abort); + connect(reply, &QNetworkReply::finished, this, &EmbeddingLLMWorker::handleFinished); +} + +// this function is always called for retrieval tasks void EmbeddingLLMWorker::requestSyncEmbedding(const QString &text) { if (!hasModel() && !loadModel()) { @@ -108,25 +139,10 @@ void EmbeddingLLMWorker::requestSyncEmbedding(const QString &text) Q_ASSERT(hasModel()); - QJsonObject root; - root.insert("model", "nomic-embed-text-v1"); - QJsonArray texts; - texts.append(text); - root.insert("texts", texts); - root.insert("task_type", "search_query"); - - QJsonDocument doc(root); - - QUrl nomicUrl("https://api-atlas.nomic.ai/v1/embedding/text"); - const QString authorization = QString("Bearer %1").arg(m_nomicAPIKey).trimmed(); - QNetworkRequest request(nomicUrl); - request.setHeader(QNetworkRequest::ContentTypeHeader, "application/json"); - request.setRawHeader("Authorization", authorization.toUtf8()); - QNetworkReply *reply = m_networkManager->post(request, doc.toJson(QJsonDocument::Compact)); - connect(qApp, 
&QCoreApplication::aboutToQuit, reply, &QNetworkReply::abort); - connect(reply, &QNetworkReply::finished, this, &EmbeddingLLMWorker::handleFinished); + sendAtlasRequest({text}, "search_query"); } +// this function is always called for storage into the database void EmbeddingLLMWorker::requestAsyncEmbedding(const QVector &chunks) { if (!hasModel() && !loadModel()) { @@ -141,33 +157,24 @@ void EmbeddingLLMWorker::requestAsyncEmbedding(const QVector &ch EmbeddingResult result; result.folder_id = c.folder_id; result.chunk_id = c.chunk_id; - result.embedding = m_model->embedding(c.chunk.toStdString()); + // TODO(cebtenzzre): take advantage of batched embeddings + result.embedding.resize(m_model->embeddingSize()); + try { + m_model->embed({c.chunk.toStdString()}, result.embedding.data(), false); + } catch (const std::exception &e) { + qWarning() << "WARNING: LLModel::embed failed:" << e.what(); + return; + } results << result; } emit embeddingsGenerated(results); return; }; - QJsonObject root; - root.insert("model", "nomic-embed-text-v1"); - QJsonArray texts; - - for (auto c : chunks) + QStringList texts; + for (auto &c: chunks) texts.append(c.chunk); - root.insert("texts", texts); - - QJsonDocument doc(root); - - QUrl nomicUrl("https://api-atlas.nomic.ai/v1/embedding/text"); - const QString authorization = QString("Bearer %1").arg(m_nomicAPIKey).trimmed(); - QNetworkRequest request(nomicUrl); - request.setHeader(QNetworkRequest::ContentTypeHeader, "application/json"); - request.setRawHeader("Authorization", authorization.toUtf8()); - request.setAttribute(QNetworkRequest::User, QVariant::fromValue(chunks)); - - QNetworkReply *reply = m_networkManager->post(request, doc.toJson(QJsonDocument::Compact)); - connect(qApp, &QCoreApplication::aboutToQuit, reply, &QNetworkReply::abort); - connect(reply, &QNetworkReply::finished, this, &EmbeddingLLMWorker::handleFinished); + sendAtlasRequest(texts, "search_document", QVariant::fromValue(chunks)); } std::vector jsonArrayToVector(const QJsonArray &jsonArray) { diff --git a/gpt4all-chat/embllm.h b/gpt4all-chat/embllm.h index cde30c60..44ec4ff6 100644 --- a/gpt4all-chat/embllm.h +++ b/gpt4all-chat/embllm.h @@ -1,10 +1,11 @@ #ifndef EMBLLM_H #define EMBLLM_H +#include +#include #include +#include #include -#include -#include #include "../gpt4all-backend/llmodel.h" @@ -51,6 +52,8 @@ private Q_SLOTS: void handleFinished(); private: + void sendAtlasRequest(const QStringList &texts, const QString &taskType, QVariant userData = {}); + QString m_nomicAPIKey; QNetworkAccessManager *m_networkManager; std::vector m_lastResponse; diff --git a/gpt4all-chat/metadata/models3.json b/gpt4all-chat/metadata/models3.json index e6da73f0..e52d3cab 100644 --- a/gpt4all-chat/metadata/models3.json +++ b/gpt4all-chat/metadata/models3.json @@ -247,14 +247,31 @@ "filename": "all-MiniLM-L6-v2-f16.gguf", "filesize": "45887744", "requires": "2.5.0", + "removedIn": "2.7.4", "ramrequired": "1", "parameters": "40 million", "quant": "f16", "type": "Bert", + "embeddingModel": true, "systemPrompt": " ", "description": "LocalDocs text embeddings model
  • For use with LocalDocs feature
  • Used for retrieval augmented generation (RAG)", "url": "https://gpt4all.io/models/gguf/all-MiniLM-L6-v2-f16.gguf" }, + { + "order": "o", + "md5sum": "dd90e2cb7f8e9316ac3796cece9883b5", + "name": "SBert", + "filename": "all-MiniLM-L6-v2.gguf2.f16.gguf", + "filesize": "45949216", + "requires": "2.7.4", + "ramrequired": "1", + "parameters": "40 million", + "quant": "f16", + "type": "Bert", + "embeddingModel": true, + "description": "LocalDocs text embeddings model
    • For use with LocalDocs feature
    • Used for retrieval augmented generation (RAG)", + "url": "https://gpt4all.io/models/gguf/all-MiniLM-L6-v2.gguf2.f16.gguf" + }, { "order": "p", "md5sum": "919de4dd6f25351bcb0223790db1932d", @@ -270,5 +287,39 @@ "url": "https://huggingface.co/TheBloke/em_german_mistral_v01-GGUF/resolve/main/em_german_mistral_v01.Q4_0.gguf", "promptTemplate": "USER: %1 ASSISTANT: ", "systemPrompt": "Du bist ein hilfreicher Assistent. " + }, + { + "order": "q", + "md5sum": "60ea031126f82db8ddbbfecc668315d2", + "disableGUI": "true", + "name": "Nomic Embed Text v1", + "filename": "nomic-embed-text-v1.f16.gguf", + "filesize": "274290560", + "requires": "2.7.4", + "ramrequired": "1", + "parameters": "137 million", + "quant": "f16", + "type": "Bert", + "embeddingModel": true, + "systemPrompt": "", + "description": "nomic-embed-text-v1", + "url": "https://gpt4all.io/models/gguf/nomic-embed-text-v1.f16.gguf" + }, + { + "order": "r", + "md5sum": "a5401e7f7e46ed9fcaed5b60a281d547", + "disableGUI": "true", + "name": "Nomic Embed Text v1.5", + "filename": "nomic-embed-text-v1.5.f16.gguf", + "filesize": "274290560", + "requires": "2.7.4", + "ramrequired": "1", + "parameters": "137 million", + "quant": "f16", + "type": "Bert", + "embeddingModel": true, + "systemPrompt": "", + "description": "nomic-embed-text-v1.5", + "url": "https://gpt4all.io/models/gguf/nomic-embed-text-v1.5.f16.gguf" } ] diff --git a/gpt4all-chat/modellist.cpp b/gpt4all-chat/modellist.cpp index f31bdbc5..db21a4a5 100644 --- a/gpt4all-chat/modellist.cpp +++ b/gpt4all-chat/modellist.cpp @@ -10,8 +10,10 @@ //#define USE_LOCAL_MODELSJSON -#define DEFAULT_EMBEDDING_MODEL "all-MiniLM-L6-v2-f16.gguf" -#define NOMIC_EMBEDDING_MODEL "nomic-embed-text-v1.txt" +const char * const KNOWN_EMBEDDING_MODELS[] { + "all-MiniLM-L6-v2.gguf2.f16.gguf", + "nomic-embed-text-v1.txt", +}; QString ModelInfo::id() const { @@ -223,6 +225,7 @@ void ModelInfo::setContextLength(int l) int ModelInfo::maxContextLength() const { + if (!installed || isOnline) return -1; if (m_maxContextLength != -1) return m_maxContextLength; auto path = (dirpath + filename()).toStdString(); int layers = LLModel::Implementation::maxContextLength(path); @@ -306,9 +309,11 @@ bool ModelInfo::shouldSaveMetadata() const return installed && (isClone() || isDiscovered() || description() == "" /*indicates sideloaded*/); } -EmbeddingModels::EmbeddingModels(QObject *parent) +EmbeddingModels::EmbeddingModels(QObject *parent, bool requireInstalled) : QSortFilterProxyModel(parent) { + m_requireInstalled = requireInstalled; + connect(this, &EmbeddingModels::rowsInserted, this, &EmbeddingModels::countChanged); connect(this, &EmbeddingModels::rowsRemoved, this, &EmbeddingModels::countChanged); connect(this, &EmbeddingModels::modelReset, this, &EmbeddingModels::countChanged); @@ -319,36 +324,41 @@ bool EmbeddingModels::filterAcceptsRow(int sourceRow, const QModelIndex &sourceParent) const { QModelIndex index = sourceModel()->index(sourceRow, 0, sourceParent); - bool isInstalled = sourceModel()->data(index, ModelList::InstalledRole).toBool(); - bool isEmbedding = sourceModel()->data(index, ModelList::FilenameRole).toString() == DEFAULT_EMBEDDING_MODEL || - sourceModel()->data(index, ModelList::FilenameRole).toString() == NOMIC_EMBEDDING_MODEL; - return isInstalled && isEmbedding; + bool isEmbeddingModel = sourceModel()->data(index, ModelList::IsEmbeddingModelRole).toBool(); + bool installed = sourceModel()->data(index, ModelList::InstalledRole).toBool(); + QString filename = sourceModel()->data(index, 
ModelList::FilenameRole).toString(); + auto &known = KNOWN_EMBEDDING_MODELS; + if (std::find(known, std::end(known), filename.toStdString()) == std::end(known)) + return false; // we are currently not prepared to support other embedding models + + return isEmbeddingModel && (!m_requireInstalled || installed); } -int EmbeddingModels::count() const +int EmbeddingModels::defaultModelIndex() const { - return rowCount(); + auto *sourceListModel = qobject_cast(sourceModel()); + if (!sourceListModel) return -1; + + int rows = sourceListModel->rowCount(); + for (int i = 0; i < rows; ++i) { + if (filterAcceptsRow(i, sourceListModel->index(i, 0).parent())) + return i; + } + + return -1; } ModelInfo EmbeddingModels::defaultModelInfo() const { - if (!sourceModel()) - return ModelInfo(); + auto *sourceListModel = qobject_cast(sourceModel()); + if (!sourceListModel) return ModelInfo(); - const ModelList *sourceListModel = qobject_cast(sourceModel()); - if (!sourceListModel) - return ModelInfo(); - - const int rows = sourceListModel->rowCount(); - for (int i = 0; i < rows; ++i) { - QModelIndex sourceIndex = sourceListModel->index(i, 0); - if (filterAcceptsRow(i, sourceIndex.parent())) { - const QString id = sourceListModel->data(sourceIndex, ModelList::IdRole).toString(); - return sourceListModel->modelInfo(id); - } - } + int i = defaultModelIndex(); + if (i < 0) return ModelInfo(); - return ModelInfo(); + QModelIndex sourceIndex = sourceListModel->index(i, 0); + auto id = sourceListModel->data(sourceIndex, ModelList::IdRole).toString(); + return sourceListModel->modelInfo(id); } InstalledModels::InstalledModels(QObject *parent) @@ -365,13 +375,9 @@ bool InstalledModels::filterAcceptsRow(int sourceRow, { QModelIndex index = sourceModel()->index(sourceRow, 0, sourceParent); bool isInstalled = sourceModel()->data(index, ModelList::InstalledRole).toBool(); - bool showInGUI = !sourceModel()->data(index, ModelList::DisableGUIRole).toBool(); - return isInstalled && showInGUI; -} - -int InstalledModels::count() const -{ - return rowCount(); + bool isEmbeddingModel = sourceModel()->data(index, ModelList::IsEmbeddingModelRole).toBool(); + // list installed chat models + return isInstalled && !isEmbeddingModel; } DownloadableModels::DownloadableModels(QObject *parent) @@ -432,8 +438,9 @@ ModelList *ModelList::globalInstance() ModelList::ModelList() : QAbstractListModel(nullptr) - , m_embeddingModels(new EmbeddingModels(this)) + , m_embeddingModels(new EmbeddingModels(this, false /* all models */)) , m_installedModels(new InstalledModels(this)) + , m_installedEmbeddingModels(new EmbeddingModels(this, true /* installed models */)) , m_downloadableModels(new DownloadableModels(this)) , m_asyncModelRequestOngoing(false) , m_discoverLimit(20) @@ -445,6 +452,7 @@ ModelList::ModelList() { m_embeddingModels->setSourceModel(this); m_installedModels->setSourceModel(this); + m_installedEmbeddingModels->setSourceModel(this); m_downloadableModels->setSourceModel(this); connect(MySettings::globalInstance(), &MySettings::modelPathChanged, this, &ModelList::updateModelsFromDirectory); @@ -494,8 +502,8 @@ const QList ModelList::userDefaultModelList() const bool foundUserDefault = false; for (ModelInfo *info : m_models) { - // Only installed models that are meant for GUI are suitable as a default - if (!info->installed || info->disableGUI) + // Only installed chat models are suitable as a default + if (!info->installed || info->isEmbeddingModel) continue; if (info->id() == userDefaultModelName) { @@ -516,13 +524,7 @@ const QList 
ModelList::userDefaultModelList() const int ModelList::defaultEmbeddingModelIndex() const { - QMutexLocker locker(&m_mutex); - for (int i = 0; i < m_models.size(); ++i) { - const ModelInfo *info = m_models.at(i); - const bool isEmbedding = info->filename() == DEFAULT_EMBEDDING_MODEL; - if (isEmbedding) return i; - } - return -1; + return embeddingModels()->defaultModelIndex(); } ModelInfo ModelList::defaultModelInfo() const @@ -692,8 +694,6 @@ QVariant ModelList::dataInternal(const ModelInfo *info, int role) const return info->isDefault; case OnlineRole: return info->isOnline; - case DisableGUIRole: - return info->disableGUI; case DescriptionRole: return info->description(); case RequiresVersionRole: @@ -730,6 +730,8 @@ QVariant ModelList::dataInternal(const ModelInfo *info, int role) const return info->isClone(); case IsDiscoveredRole: return info->isDiscovered(); + case IsEmbeddingModelRole: + return info->isEmbeddingModel; case TemperatureRole: return info->temperature(); case TopPRole: @@ -844,8 +846,6 @@ void ModelList::updateData(const QString &id, const QVector info->isDefault = value.toBool(); break; case OnlineRole: info->isOnline = value.toBool(); break; - case DisableGUIRole: - info->disableGUI = value.toBool(); break; case DescriptionRole: info->setDescription(value.toString()); break; case RequiresVersionRole: @@ -900,6 +900,8 @@ void ModelList::updateData(const QString &id, const QVector } break; } + case IsEmbeddingModelRole: + info->isEmbeddingModel = value.toBool(); break; case TemperatureRole: info->setTemperature(value.toDouble()); break; case TopPRole: @@ -952,11 +954,21 @@ void ModelList::updateData(const QString &id, const QVector } // Extra guarantee that these always remains in sync with filesystem - const QFileInfo fileInfo(info->dirpath + info->filename()); + QString modelPath = info->dirpath + info->filename(); + const QFileInfo fileInfo(modelPath); info->installed = fileInfo.exists(); const QFileInfo incompleteInfo(incompleteDownloadPath(info->filename())); info->isIncomplete = incompleteInfo.exists(); + // check installed, discovered/sideloaded models only (including clones) + if (!info->checkedEmbeddingModel && !info->isEmbeddingModel && info->installed + && (info->isDiscovered() || info->description().isEmpty())) + { + // read GGUF and decide based on model architecture + info->isEmbeddingModel = LLModel::Implementation::isEmbeddingModel(modelPath.toStdString()); + info->checkedEmbeddingModel = true; + } + if (shouldSort) { auto s = m_discoverSort; auto d = m_discoverSortDirection; @@ -983,8 +995,11 @@ void ModelList::resortModel() emit layoutChanged(); } -void ModelList::updateDataByFilename(const QString &filename, const QVector> &data) +void ModelList::updateDataByFilename(const QString &filename, QVector> data) { + if (data.isEmpty()) + return; // no-op + QVector modelsById; { QMutexLocker locker(&m_mutex); @@ -1041,6 +1056,7 @@ QString ModelList::clone(const ModelInfo &model) { ModelList::FilenameRole, model.filename() }, { ModelList::DirpathRole, model.dirpath }, { ModelList::OnlineRole, model.isOnline }, + { ModelList::IsEmbeddingModelRole, model.isEmbeddingModel }, { ModelList::TemperatureRole, model.temperature() }, { ModelList::TopPRole, model.topP() }, { ModelList::MinPRole, model.minP() }, @@ -1164,8 +1180,7 @@ void ModelList::updateModelsFromDirectory() if (!it.fileInfo().isDir()) { QString filename = it.fileName(); - // All files that end with .bin and have 'ggml' somewhere in the name - if (((filename.endsWith(".bin") || 
filename.endsWith(".gguf")) && (/*filename.contains("ggml") ||*/ filename.contains("gguf")) && !filename.startsWith("incomplete")) + if ((filename.endsWith(".gguf") && !filename.startsWith("incomplete")) || (filename.endsWith(".txt") && (filename.startsWith("chatgpt-") || filename.startsWith("nomic-")))) { QString filePath = it.filePath(); @@ -1373,16 +1388,19 @@ void ModelList::parseModelsJsonFile(const QByteArray &jsonData, bool save) QString parameters = obj["parameters"].toString(); QString quant = obj["quant"].toString(); QString type = obj["type"].toString(); + bool isEmbeddingModel = obj["embeddingModel"].toBool(); + + // Some models aren't supported in the GUI at all + if (disableGUI) + continue; // If the current version is strictly less than required version, then skip - if (!requiresVersion.isEmpty() && compareVersions(currentVersion, requiresVersion) < 0) { + if (!requiresVersion.isEmpty() && compareVersions(currentVersion, requiresVersion) < 0) continue; - } // If the version removed is less than or equal to the current version, then skip - if (!versionRemoved.isEmpty() && compareVersions(versionRemoved, currentVersion) <= 0) { + if (!versionRemoved.isEmpty() && compareVersions(versionRemoved, currentVersion) <= 0) continue; - } modelFilesize = ModelList::toFileSize(modelFilesize.toULongLong()); @@ -1406,12 +1424,12 @@ void ModelList::parseModelsJsonFile(const QByteArray &jsonData, bool save) { ModelList::RequiresVersionRole, requiresVersion }, { ModelList::VersionRemovedRole, versionRemoved }, { ModelList::UrlRole, url }, - { ModelList::DisableGUIRole, disableGUI }, { ModelList::OrderRole, order }, { ModelList::RamrequiredRole, ramrequired }, { ModelList::ParametersRole, parameters }, { ModelList::QuantRole, quant }, { ModelList::TypeRole, type }, + { ModelList::IsEmbeddingModelRole, isEmbeddingModel }, }; if (obj.contains("temperature")) data.append({ ModelList::TemperatureRole, obj["temperature"].toDouble() }); @@ -1515,7 +1533,7 @@ void ModelList::parseModelsJsonFile(const QByteArray &jsonData, bool save) { ModelList::FilenameRole, modelFilename }, { ModelList::FilesizeRole, "minimal" }, { ModelList::OnlineRole, true }, - { ModelList::DisableGUIRole, true }, + { ModelList::IsEmbeddingModelRole, true }, { ModelList::DescriptionRole, tr("LocalDocs Nomic Atlas Embed
      ") + nomicEmbedDesc }, { ModelList::RequiresVersionRole, "2.6.3" }, diff --git a/gpt4all-chat/modellist.h b/gpt4all-chat/modellist.h index 0529afbc..6d2034dd 100644 --- a/gpt4all-chat/modellist.h +++ b/gpt4all-chat/modellist.h @@ -16,7 +16,6 @@ struct ModelInfo { Q_PROPERTY(bool calcHash MEMBER calcHash) Q_PROPERTY(bool installed MEMBER installed) Q_PROPERTY(bool isDefault MEMBER isDefault) - Q_PROPERTY(bool disableGUI MEMBER disableGUI) Q_PROPERTY(bool isOnline MEMBER isOnline) Q_PROPERTY(QString description READ description WRITE setDescription) Q_PROPERTY(QString requiresVersion MEMBER requiresVersion) @@ -36,6 +35,7 @@ struct ModelInfo { Q_PROPERTY(QString type READ type WRITE setType) Q_PROPERTY(bool isClone READ isClone WRITE setIsClone) Q_PROPERTY(bool isDiscovered READ isDiscovered WRITE setIsDiscovered) + Q_PROPERTY(bool isEmbeddingModel MEMBER isEmbeddingModel) Q_PROPERTY(double temperature READ temperature WRITE setTemperature) Q_PROPERTY(double topP READ topP WRITE setTopP) Q_PROPERTY(double minP READ minP WRITE setMinP) @@ -104,7 +104,6 @@ public: bool installed = false; bool isDefault = false; bool isOnline = false; - bool disableGUI = false; QString requiresVersion; QString versionRemoved; qint64 bytesReceived = 0; @@ -117,6 +116,8 @@ public: QString order; int ramrequired = -1; QString parameters; + bool isEmbeddingModel = false; + bool checkedEmbeddingModel = false; bool operator==(const ModelInfo &other) const { return m_id == other.m_id; @@ -187,9 +188,10 @@ class EmbeddingModels : public QSortFilterProxyModel Q_OBJECT Q_PROPERTY(int count READ count NOTIFY countChanged) public: - explicit EmbeddingModels(QObject *parent); - int count() const; + EmbeddingModels(QObject *parent, bool requireInstalled); + int count() const { return rowCount(); } + int defaultModelIndex() const; ModelInfo defaultModelInfo() const; Q_SIGNALS: @@ -198,6 +200,9 @@ Q_SIGNALS: protected: bool filterAcceptsRow(int sourceRow, const QModelIndex &sourceParent) const override; + +private: + bool m_requireInstalled; }; class InstalledModels : public QSortFilterProxyModel @@ -206,7 +211,7 @@ class InstalledModels : public QSortFilterProxyModel Q_PROPERTY(int count READ count NOTIFY countChanged) public: explicit InstalledModels(QObject *parent); - int count() const; + int count() const { return rowCount(); } Q_SIGNALS: void countChanged(); @@ -248,8 +253,8 @@ class ModelList : public QAbstractListModel { Q_OBJECT Q_PROPERTY(int count READ count NOTIFY countChanged) - Q_PROPERTY(int defaultEmbeddingModelIndex READ defaultEmbeddingModelIndex NOTIFY defaultEmbeddingModelIndexChanged) - Q_PROPERTY(EmbeddingModels* embeddingModels READ embeddingModels NOTIFY embeddingModelsChanged) + Q_PROPERTY(int defaultEmbeddingModelIndex READ defaultEmbeddingModelIndex) + Q_PROPERTY(EmbeddingModels* installedEmbeddingModels READ installedEmbeddingModels NOTIFY installedEmbeddingModelsChanged) Q_PROPERTY(InstalledModels* installedModels READ installedModels NOTIFY installedModelsChanged) Q_PROPERTY(DownloadableModels* downloadableModels READ downloadableModels NOTIFY downloadableModelsChanged) Q_PROPERTY(QList userDefaultModelList READ userDefaultModelList NOTIFY userDefaultModelListChanged) @@ -282,7 +287,6 @@ public: InstalledRole, DefaultRole, OnlineRole, - DisableGUIRole, DescriptionRole, RequiresVersionRole, VersionRemovedRole, @@ -301,6 +305,7 @@ public: TypeRole, IsCloneRole, IsDiscoveredRole, + IsEmbeddingModelRole, TemperatureRole, TopPRole, TopKRole, @@ -332,7 +337,6 @@ public: roles[InstalledRole] = 
"installed"; roles[DefaultRole] = "isDefault"; roles[OnlineRole] = "isOnline"; - roles[DisableGUIRole] = "disableGUI"; roles[DescriptionRole] = "description"; roles[RequiresVersionRole] = "requiresVersion"; roles[VersionRemovedRole] = "versionRemoved"; @@ -351,6 +355,7 @@ public: roles[TypeRole] = "type"; roles[IsCloneRole] = "isClone"; roles[IsDiscoveredRole] = "isDiscovered"; + roles[IsEmbeddingModelRole] = "isEmbeddingModel"; roles[TemperatureRole] = "temperature"; roles[TopPRole] = "topP"; roles[MinPRole] = "minP"; @@ -373,7 +378,7 @@ public: QVariant data(const QModelIndex &index, int role = Qt::DisplayRole) const override; QVariant data(const QString &id, int role) const; QVariant dataByFilename(const QString &filename, int role) const; - void updateDataByFilename(const QString &filename, const QVector> &data); + void updateDataByFilename(const QString &filename, QVector> data); void updateData(const QString &id, const QVector> &data); int count() const { return m_models.size(); } @@ -396,6 +401,7 @@ public: const QList userDefaultModelList() const; EmbeddingModels *embeddingModels() const { return m_embeddingModels; } + EmbeddingModels *installedEmbeddingModels() const { return m_installedEmbeddingModels; } InstalledModels *installedModels() const { return m_installedModels; } DownloadableModels *downloadableModels() const { return m_downloadableModels; } @@ -433,12 +439,11 @@ public: Q_SIGNALS: void countChanged(); - void embeddingModelsChanged(); + void installedEmbeddingModelsChanged(); void installedModelsChanged(); void downloadableModelsChanged(); void userDefaultModelListChanged(); void asyncModelRequestOngoingChanged(); - void defaultEmbeddingModelIndexChanged(); void discoverLimitChanged(); void discoverSortDirectionChanged(); void discoverSortChanged(); @@ -474,6 +479,7 @@ private: mutable QMutex m_mutex; QNetworkAccessManager m_networkManager; EmbeddingModels *m_embeddingModels; + EmbeddingModels *m_installedEmbeddingModels; InstalledModels *m_installedModels; DownloadableModels *m_downloadableModels; QList m_models; @@ -488,7 +494,7 @@ private: protected: explicit ModelList(); - ~ModelList() {} + ~ModelList() { for (auto *model: m_models) { delete model; } } friend class MyModelList; }; diff --git a/gpt4all-chat/qml/LocalDocsSettings.qml b/gpt4all-chat/qml/LocalDocsSettings.qml index c9ac4abd..e14e0a8c 100644 --- a/gpt4all-chat/qml/LocalDocsSettings.qml +++ b/gpt4all-chat/qml/LocalDocsSettings.qml @@ -14,7 +14,7 @@ MySettingsTab { MySettings.restoreLocalDocsDefaults(); } - property bool hasEmbeddingModel: ModelList.embeddingModels.count !== 0 + property bool hasEmbeddingModel: ModelList.installedEmbeddingModels.count !== 0 showAdvancedSettingsButton: hasEmbeddingModel showRestoreDefaultsButton: hasEmbeddingModel diff --git a/gpt4all-chat/qml/ModelDownloaderDialog.qml b/gpt4all-chat/qml/ModelDownloaderDialog.qml index 71e7bcdd..acbb6290 100644 --- a/gpt4all-chat/qml/ModelDownloaderDialog.qml +++ b/gpt4all-chat/qml/ModelDownloaderDialog.qml @@ -24,7 +24,7 @@ MyDialog { if (showEmbeddingModels) { ModelList.downloadableModels.expanded = true var targetModelIndex = ModelList.defaultEmbeddingModelIndex - modelListView.positionViewAtIndex(targetModelIndex, ListView.Contain) + modelListView.positionViewAtIndex(targetModelIndex, ListView.Beginning) } }