From 4a8ebd841fd06fb277c1e3e68959d5805f075ec4 Mon Sep 17 00:00:00 2001 From: Maxime Labonne <81252890+mlabonne@users.noreply.github.com> Date: Wed, 29 Nov 2023 20:41:55 +0000 Subject: [PATCH] Created using Colaboratory --- ...ma_2_models_using_GGUF_and_llama_cpp.ipynb | 2218 +++++++++++++++++ 1 file changed, 2218 insertions(+) create mode 100644 Quantize_Llama_2_models_using_GGUF_and_llama_cpp.ipynb diff --git a/Quantize_Llama_2_models_using_GGUF_and_llama_cpp.ipynb b/Quantize_Llama_2_models_using_GGUF_and_llama_cpp.ipynb new file mode 100644 index 0000000..cb3fc28 --- /dev/null +++ b/Quantize_Llama_2_models_using_GGUF_and_llama_cpp.ipynb @@ -0,0 +1,2218 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "gpuType": "T4", + "authorship_tag": "ABX9TyMohoDhmmKsuh9OLDHor3GB", + "include_colab_link": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + }, + "accelerator": "GPU", + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "c281b60e104f4c5da547bbdd7208d4bc": { + "model_module": "@jupyter-widgets/controls", + "model_name": "VBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "VBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "VBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_2e2fabac70484c1c8b16fa6ca8fd8537", + "IPY_MODEL_bf53c635fa374420ad850eea22cd1e31", + "IPY_MODEL_065d59126a734c1aa096ba40cd4a129f", + "IPY_MODEL_e8855d5678a342f5a33171aa74d3b7bc" + ], + "layout": "IPY_MODEL_1c8a6b959f9c4443a92f58eff1b03077" + } + }, + "74b084c97f6f46d293a197bf9804460c": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + 
"_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_9fb5726f91734b1da149784680dc9624", + "placeholder": "​", + "style": "IPY_MODEL_202a8eb11eda4e58942113fbeacfdc3d", + "value": "

Copy a token from your Hugging Face\ntokens page and paste it below.
Immediately click login after copying\nyour token or it might be stored in plain text in this notebook file.
" + } + }, + "1409574c4f9742e7a711965dd2c8ad87": { + "model_module": "@jupyter-widgets/controls", + "model_name": "PasswordModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "PasswordModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "PasswordView", + "continuous_update": true, + "description": "Token:", + "description_tooltip": null, + "disabled": false, + "layout": "IPY_MODEL_970d4d3daf854f92bd650dc4da99e1bc", + "placeholder": "​", + "style": "IPY_MODEL_24b1e007921046b1adc61db0f2bf9fc7", + "value": "" + } + }, + "704ecf9409244e0b93612d6a11476346": { + "model_module": "@jupyter-widgets/controls", + "model_name": "CheckboxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "CheckboxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "CheckboxView", + "description": "Add token as git credential?", + "description_tooltip": null, + "disabled": false, + "indent": true, + "layout": "IPY_MODEL_24d3d72f5de54de8a1ded4e528dde332", + "style": "IPY_MODEL_e90cb0ce526a4556bc643ba6c5485661", + "value": true + } + }, + "b1a8d3a9a379415393d9e7d995a40788": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ButtonModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ButtonModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ButtonView", + "button_style": "", + "description": "Login", + "disabled": false, + "icon": "", + "layout": 
"IPY_MODEL_76e7372656b745c889b9283b76c04148", + "style": "IPY_MODEL_ce0204c7e1ff4a51b2648284a2492262", + "tooltip": "" + } + }, + "f928772f92724579b068e984d9eef387": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_6dbb8e8a5ebb40a4ba910b09dde27e1a", + "placeholder": "​", + "style": "IPY_MODEL_7944af54f2564920822d5d4b348896c4", + "value": "\nPro Tip: If you don't already have one, you can create a dedicated\n'notebooks' token with 'write' access, that you can then easily reuse for all\nnotebooks. " + } + }, + "1c8a6b959f9c4443a92f58eff1b03077": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": "center", + "align_self": null, + "border": null, + "bottom": null, + "display": "flex", + "flex": null, + "flex_flow": "column", + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + 
"object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": "50%" + } + }, + "9fb5726f91734b1da149784680dc9624": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "202a8eb11eda4e58942113fbeacfdc3d": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "970d4d3daf854f92bd650dc4da99e1bc": { + "model_module": "@jupyter-widgets/base", 
+ "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "24b1e007921046b1adc61db0f2bf9fc7": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "24d3d72f5de54de8a1ded4e528dde332": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": 
"1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e90cb0ce526a4556bc643ba6c5485661": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "76e7372656b745c889b9283b76c04148": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + 
"grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ce0204c7e1ff4a51b2648284a2492262": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ButtonStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ButtonStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "button_color": null, + "font_weight": "" + } + }, + "6dbb8e8a5ebb40a4ba910b09dde27e1a": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": 
null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7944af54f2564920822d5d4b348896c4": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "1b55372f62494ca0baabf87f7e7f4ba8": { + "model_module": "@jupyter-widgets/controls", + "model_name": "LabelModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "LabelModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "LabelView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_bf612001ad354ea19de6ee45a166a43c", + "placeholder": "​", + "style": "IPY_MODEL_a8e4691970b14955bfb4865bcef5e912", + "value": "Connecting..." 
+ } + }, + "bf612001ad354ea19de6ee45a166a43c": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a8e4691970b14955bfb4865bcef5e912": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "2e2fabac70484c1c8b16fa6ca8fd8537": { + "model_module": "@jupyter-widgets/controls", + "model_name": "LabelModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + 
"_model_name": "LabelModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "LabelView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7eb6de1a979b46f7b234724073f8bc3a", + "placeholder": "​", + "style": "IPY_MODEL_6ae4640196da492fadafeb63f4bc89d2", + "value": "Token is valid (permission: write)." + } + }, + "bf53c635fa374420ad850eea22cd1e31": { + "model_module": "@jupyter-widgets/controls", + "model_name": "LabelModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "LabelModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "LabelView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_cef83433dbea4f529f43722fe78a8baf", + "placeholder": "​", + "style": "IPY_MODEL_845ba8115d5140ac9ee22af4a9e6a03b", + "value": "Your token has been saved in your configured git credential helpers (store)." 
+ } + }, + "065d59126a734c1aa096ba40cd4a129f": { + "model_module": "@jupyter-widgets/controls", + "model_name": "LabelModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "LabelModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "LabelView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_cdd888041aca4dcf8adc785309071fc6", + "placeholder": "​", + "style": "IPY_MODEL_cf63214cb4f8442999fa5b971035fe4f", + "value": "Your token has been saved to /root/.cache/huggingface/token" + } + }, + "e8855d5678a342f5a33171aa74d3b7bc": { + "model_module": "@jupyter-widgets/controls", + "model_name": "LabelModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "LabelModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "LabelView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7d9b22f2b7fe4a749f989e247bce446a", + "placeholder": "​", + "style": "IPY_MODEL_7f8e268db8144adfb09d089784d8411a", + "value": "Login successful" + } + }, + "7eb6de1a979b46f7b234724073f8bc3a": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + 
"grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6ae4640196da492fadafeb63f4bc89d2": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "cef83433dbea4f529f43722fe78a8baf": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": 
null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "845ba8115d5140ac9ee22af4a9e6a03b": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "cdd888041aca4dcf8adc785309071fc6": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": 
null, + "visibility": null, + "width": null + } + }, + "cf63214cb4f8442999fa5b971035fe4f": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "7d9b22f2b7fe4a749f989e247bce446a": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7f8e268db8144adfb09d089784d8411a": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + 
"_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + } + } + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Quantize Llama 2 models using GGUF and llama.cpp\n", + "> 🗣️ [Large Language Model Course](https://github.com/mlabonne/llm-course)\n", + "\n", + "❤️ Created by [@maximelabonne](https://twitter.com/maximelabonne).\n", + "\n", + "## Usage\n", + "\n", + "* `MODEL_ID`: The ID of the model to quantize (e.g., `mlabonne/EvolCodeLlama-7b`).\n", + "* `QUANTIZATION_METHOD`: The quantization method to use.\n", + "\n", + "## Quantization methods\n", + "\n", + "The names of the quantization methods follow the naming convention: \"q\" + the number of bits + the variant used (detailed below). Here is a list of all the possible quant methods and their corresponding use cases, based on model cards made by [TheBloke](https://huggingface.co/TheBloke/):\n", + "\n", + "* `q2_k`: Uses Q4_K for the attention.vw and feed_forward.w2 tensors, Q2_K for the other tensors.\n", + "* `q3_k_l`: Uses Q5_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K\n", + "* `q3_k_m`: Uses Q4_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K\n", + "* `q3_k_s`: Uses Q3_K for all tensors\n", + "* `q4_0`: Original quant method, 4-bit.\n", + "* `q4_1`: Higher accuracy than q4_0 but not as high as q5_0. 
However has quicker inference than q5 models.\n", + "* `q4_k_m`: Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K\n", + "* `q4_k_s`: Uses Q4_K for all tensors\n", + "* `q5_0`: Higher accuracy, higher resource usage and slower inference.\n", + "* `q5_1`: Even higher accuracy, resource usage and slower inference.\n", + "* `q5_k_m`: Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K\n", + "* `q5_k_s`: Uses Q5_K for all tensors\n", + "* `q6_k`: Uses Q8_K for all tensors\n", + "* `q8_0`: Almost indistinguishable from float16. High resource use and slow. Not recommended for most users.\n", + "\n", + "As a rule of thumb, **I recommend using Q5_K_M** as it preserves most of the model's performance. Alternatively, you can use Q4_K_M if you want to save some memory. In general, K_M versions are better than K_S versions. I cannot recommend Q2_K or Q3_* versions, as they drastically decrease model performance." + ], + "metadata": { + "id": "8y_Rk94LzG7I" + } + }, + { + "cell_type": "code", + "source": [ + "# Variables\n", + "MODEL_ID = \"mlabonne/EvolCodeLlama-7b\"\n", + "QUANTIZATION_METHODS = [\"q4_k_m\", \"q5_k_m\"]\n", + "\n", + "# Constants\n", + "MODEL_NAME = MODEL_ID.split('/')[-1]\n", + "\n", + "# Install llama.cpp\n", + "!git clone https://github.com/ggerganov/llama.cpp\n", + "!cd llama.cpp && git pull && make clean && LLAMA_CUBLAS=1 make\n", + "!pip install -r llama.cpp/requirements.txt\n", + "\n", + "# Download model\n", + "!git lfs install\n", + "!git clone https://huggingface.co/{MODEL_ID}\n", + "\n", + "# Convert to fp16\n", + "fp16 = f\"{MODEL_NAME}/{MODEL_NAME.lower()}.fp16.bin\"\n", + "!python llama.cpp/convert.py {MODEL_NAME} --outtype f16 --outfile {fp16}\n", + "\n", + "# Quantize the model for each method in the QUANTIZATION_METHODS list\n", + "for method in QUANTIZATION_METHODS:\n", + " qtype = f\"{MODEL_NAME}/{MODEL_NAME.lower()}.{method.upper()}.gguf\"\n", + " !./llama.cpp/quantize {fp16} {qtype} 
{method}" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "fD24jJxq7t3k", + "outputId": "94954934-0829-44e9-a5e5-262c17e162d0" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "ggml_init_cublas: found 1 CUDA devices:\n", + " Device 0: Tesla T4, compute capability 7.5\n", + "main: build = 1100 (dd0dc36)\n", + "main: quantizing 'EvolCodeLlama-7b/evolcodellama-7b.gguf.fp16.bin' to 'EvolCodeLlama-7b/evolcodellama-7b.gguf.q4_k_s.bin' as Q4_K_S\n", + "llama_model_loader: loaded meta data with 16 key-value pairs and 291 tensors from EvolCodeLlama-7b/evolcodellama-7b.gguf.fp16.bin (version GGUF V1 (support until nov 2023))\n", + "llama_model_loader: - tensor 0: token_embd.weight f16 [ 4096, 32016, 1, 1 ]\n", + "llama_model_loader: - tensor 1: blk.0.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 2: blk.0.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 3: blk.0.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 4: blk.0.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 5: blk.0.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 6: blk.0.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 7: blk.0.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 8: blk.0.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 9: blk.0.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 10: blk.1.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 11: blk.1.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 12: blk.1.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 13: blk.1.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 14: blk.1.ffn_gate.weight f16 [ 4096, 
11008, 1, 1 ]\n", + "llama_model_loader: - tensor 15: blk.1.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 16: blk.1.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 17: blk.1.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 18: blk.1.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 19: blk.2.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 20: blk.2.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 21: blk.2.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 22: blk.2.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 23: blk.2.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 24: blk.2.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 25: blk.2.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 26: blk.2.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 27: blk.2.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 28: blk.3.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 29: blk.3.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 30: blk.3.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 31: blk.3.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 32: blk.3.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 33: blk.3.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 34: blk.3.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 35: blk.3.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 36: blk.3.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 37: blk.4.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", + 
"llama_model_loader: - tensor 38: blk.4.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 39: blk.4.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 40: blk.4.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 41: blk.4.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 42: blk.4.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 43: blk.4.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 44: blk.4.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 45: blk.4.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 46: blk.5.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 47: blk.5.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 48: blk.5.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 49: blk.5.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 50: blk.5.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 51: blk.5.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 52: blk.5.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 53: blk.5.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 54: blk.5.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 55: blk.6.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 56: blk.6.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 57: blk.6.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 58: blk.6.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 59: blk.6.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 60: blk.6.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", + 
"llama_model_loader: - tensor 61: blk.6.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 62: blk.6.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 63: blk.6.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 64: blk.7.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 65: blk.7.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 66: blk.7.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 67: blk.7.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 68: blk.7.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 69: blk.7.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 70: blk.7.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 71: blk.7.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 72: blk.7.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 73: blk.8.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 74: blk.8.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 75: blk.8.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 76: blk.8.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 77: blk.8.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 78: blk.8.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 79: blk.8.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 80: blk.8.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 81: blk.8.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 82: blk.9.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 83: blk.9.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: 
- tensor 84: blk.9.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 85: blk.9.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 86: blk.9.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 87: blk.9.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 88: blk.9.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 89: blk.9.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 90: blk.9.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 91: blk.10.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 92: blk.10.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 93: blk.10.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 94: blk.10.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 95: blk.10.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 96: blk.10.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 97: blk.10.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 98: blk.10.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 99: blk.10.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 100: blk.11.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 101: blk.11.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 102: blk.11.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 103: blk.11.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 104: blk.11.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 105: blk.11.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 106: blk.11.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", + 
"llama_model_loader: - tensor 107: blk.11.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 108: blk.11.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 109: blk.12.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 110: blk.12.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 111: blk.12.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 112: blk.12.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 113: blk.12.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 114: blk.12.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 115: blk.12.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 116: blk.12.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 117: blk.12.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 118: blk.13.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 119: blk.13.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 120: blk.13.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 121: blk.13.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 122: blk.13.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 123: blk.13.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 124: blk.13.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 125: blk.13.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 126: blk.13.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 127: blk.14.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 128: blk.14.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 129: blk.14.attn_v.weight f16 [ 
4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 130: blk.14.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 131: blk.14.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 132: blk.14.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 133: blk.14.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 134: blk.14.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 135: blk.14.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 136: blk.15.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 137: blk.15.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 138: blk.15.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 139: blk.15.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 140: blk.15.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 141: blk.15.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 142: blk.15.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 143: blk.15.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 144: blk.15.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 145: blk.16.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 146: blk.16.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 147: blk.16.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 148: blk.16.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 149: blk.16.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 150: blk.16.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 151: blk.16.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - 
tensor 152: blk.16.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 153: blk.16.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 154: blk.17.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 155: blk.17.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 156: blk.17.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 157: blk.17.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 158: blk.17.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 159: blk.17.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 160: blk.17.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 161: blk.17.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 162: blk.17.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 163: blk.18.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 164: blk.18.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 165: blk.18.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 166: blk.18.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 167: blk.18.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 168: blk.18.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 169: blk.18.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 170: blk.18.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 171: blk.18.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 172: blk.19.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 173: blk.19.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 174: blk.19.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", + 
"llama_model_loader: - tensor 175: blk.19.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 176: blk.19.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 177: blk.19.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 178: blk.19.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 179: blk.19.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 180: blk.19.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 181: blk.20.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 182: blk.20.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 183: blk.20.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 184: blk.20.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 185: blk.20.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 186: blk.20.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 187: blk.20.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 188: blk.20.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 189: blk.20.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 190: blk.21.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 191: blk.21.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 192: blk.21.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 193: blk.21.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 194: blk.21.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 195: blk.21.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 196: blk.21.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 197: 
blk.21.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 198: blk.21.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 199: blk.22.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 200: blk.22.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 201: blk.22.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 202: blk.22.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 203: blk.22.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 204: blk.22.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 205: blk.22.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 206: blk.22.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 207: blk.22.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 208: blk.23.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 209: blk.23.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 210: blk.23.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 211: blk.23.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 212: blk.23.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 213: blk.23.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 214: blk.23.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 215: blk.23.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 216: blk.23.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 217: blk.24.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 218: blk.24.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 219: blk.24.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", + 
"llama_model_loader: - tensor 220: blk.24.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 221: blk.24.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 222: blk.24.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 223: blk.24.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 224: blk.24.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 225: blk.24.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 226: blk.25.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 227: blk.25.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 228: blk.25.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 229: blk.25.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 230: blk.25.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 231: blk.25.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 232: blk.25.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 233: blk.25.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 234: blk.25.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 235: blk.26.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 236: blk.26.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 237: blk.26.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 238: blk.26.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 239: blk.26.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 240: blk.26.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 241: blk.26.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 242: 
blk.26.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 243: blk.26.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 244: blk.27.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 245: blk.27.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 246: blk.27.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 247: blk.27.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 248: blk.27.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 249: blk.27.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 250: blk.27.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 251: blk.27.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 252: blk.27.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 253: blk.28.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 254: blk.28.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 255: blk.28.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 256: blk.28.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 257: blk.28.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 258: blk.28.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 259: blk.28.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 260: blk.28.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 261: blk.28.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 262: blk.29.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 263: blk.29.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 264: blk.29.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", + 
"llama_model_loader: - tensor 265: blk.29.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 266: blk.29.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 267: blk.29.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 268: blk.29.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 269: blk.29.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 270: blk.29.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 271: blk.30.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 272: blk.30.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 273: blk.30.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 274: blk.30.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 275: blk.30.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 276: blk.30.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 277: blk.30.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 278: blk.30.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 279: blk.30.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 280: blk.31.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 281: blk.31.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 282: blk.31.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 283: blk.31.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 284: blk.31.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 285: blk.31.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 286: blk.31.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 287: 
blk.31.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 288: blk.31.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 289: output_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 290: output.weight f16 [ 4096, 32016, 1, 1 ]\n", + "llama_model_loader: - kv 0: general.architecture str \n", + "llama_model_loader: - kv 1: general.name str \n", + "llama_model_loader: - kv 2: llama.context_length u32 \n", + "llama_model_loader: - kv 3: llama.embedding_length u32 \n", + "llama_model_loader: - kv 4: llama.block_count u32 \n", + "llama_model_loader: - kv 5: llama.feed_forward_length u32 \n", + "llama_model_loader: - kv 6: llama.rope.dimension_count u32 \n", + "llama_model_loader: - kv 7: llama.attention.head_count u32 \n", + "llama_model_loader: - kv 8: llama.attention.head_count_kv u32 \n", + "llama_model_loader: - kv 9: llama.attention.layer_norm_rms_epsilon f32 \n", + "llama_model_loader: - kv 10: llama.rope.freq_base f32 \n", + "llama_model_loader: - kv 11: general.file_type u32 \n", + "llama_model_loader: - kv 12: tokenizer.ggml.model str \n", + "llama_model_loader: - kv 13: tokenizer.ggml.tokens arr \n", + "llama_model_loader: - kv 14: tokenizer.ggml.scores arr \n", + "llama_model_loader: - kv 15: tokenizer.ggml.token_type arr \n", + "llama_model_loader: - type f32: 65 tensors\n", + "llama_model_loader: - type f16: 226 tensors\n", + "llama_model_quantize_internal: meta size = 741408 bytes\n", + "[ 1/ 291] token_embd.weight - [ 4096, 32016, 1, 1], type = f16, quantizing to q4_K .. size = 250.12 MB -> 70.35 MB | hist: \n", + "[ 2/ 291] blk.0.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 3/ 291] blk.0.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 4/ 291] blk.0.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. 
size = 32.00 MB -> 11.00 MB | hist: \n", + "[ 5/ 291] blk.0.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 6/ 291] blk.0.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 7/ 291] blk.0.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 8/ 291] blk.0.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MB -> 29.56 MB | hist: \n", + "[ 9/ 291] blk.0.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 10/ 291] blk.0.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 11/ 291] blk.1.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 12/ 291] blk.1.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 13/ 291] blk.1.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MB -> 11.00 MB | hist: \n", + "[ 14/ 291] blk.1.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 15/ 291] blk.1.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 16/ 291] blk.1.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 17/ 291] blk.1.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MB -> 29.56 MB | hist: \n", + "[ 18/ 291] blk.1.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 19/ 291] blk.1.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 20/ 291] blk.2.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. 
size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 21/ 291] blk.2.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 22/ 291] blk.2.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MB -> 11.00 MB | hist: \n", + "[ 23/ 291] blk.2.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 24/ 291] blk.2.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 25/ 291] blk.2.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 26/ 291] blk.2.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MB -> 29.56 MB | hist: \n", + "[ 27/ 291] blk.2.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 28/ 291] blk.2.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 29/ 291] blk.3.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 30/ 291] blk.3.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 31/ 291] blk.3.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MB -> 11.00 MB | hist: \n", + "[ 32/ 291] blk.3.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 33/ 291] blk.3.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 34/ 291] blk.3.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 35/ 291] blk.3.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q5_K .. 
size = 86.00 MB -> 29.56 MB | hist: \n", + "[ 36/ 291] blk.3.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 37/ 291] blk.3.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 38/ 291] blk.4.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 39/ 291] blk.4.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 40/ 291] blk.4.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 41/ 291] blk.4.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 42/ 291] blk.4.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 43/ 291] blk.4.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 44/ 291] blk.4.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 45/ 291] blk.4.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 46/ 291] blk.4.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 47/ 291] blk.5.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 48/ 291] blk.5.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 49/ 291] blk.5.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 50/ 291] blk.5.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 51/ 291] blk.5.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. 
size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 52/ 291] blk.5.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 53/ 291] blk.5.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 54/ 291] blk.5.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 55/ 291] blk.5.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 56/ 291] blk.6.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 57/ 291] blk.6.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 58/ 291] blk.6.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 59/ 291] blk.6.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 60/ 291] blk.6.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 61/ 291] blk.6.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 62/ 291] blk.6.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 63/ 291] blk.6.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 64/ 291] blk.6.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 65/ 291] blk.7.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 66/ 291] blk.7.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 67/ 291] blk.7.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. 
size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 68/ 291] blk.7.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 69/ 291] blk.7.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 70/ 291] blk.7.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 71/ 291] blk.7.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 72/ 291] blk.7.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 73/ 291] blk.7.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 74/ 291] blk.8.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 75/ 291] blk.8.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 76/ 291] blk.8.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 77/ 291] blk.8.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 78/ 291] blk.8.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 79/ 291] blk.8.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 80/ 291] blk.8.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 81/ 291] blk.8.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 82/ 291] blk.8.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 83/ 291] blk.9.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. 
size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 84/ 291] blk.9.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 85/ 291] blk.9.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 86/ 291] blk.9.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 87/ 291] blk.9.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 88/ 291] blk.9.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 89/ 291] blk.9.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 90/ 291] blk.9.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 91/ 291] blk.9.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 92/ 291] blk.10.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 93/ 291] blk.10.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 94/ 291] blk.10.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 95/ 291] blk.10.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 96/ 291] blk.10.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 97/ 291] blk.10.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 98/ 291] blk.10.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. 
size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 99/ 291] blk.10.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 100/ 291] blk.10.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 101/ 291] blk.11.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 102/ 291] blk.11.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 103/ 291] blk.11.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 104/ 291] blk.11.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 105/ 291] blk.11.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 106/ 291] blk.11.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 107/ 291] blk.11.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 108/ 291] blk.11.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 109/ 291] blk.11.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 110/ 291] blk.12.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 111/ 291] blk.12.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 112/ 291] blk.12.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 113/ 291] blk.12.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 114/ 291] blk.12.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. 
size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 115/ 291] blk.12.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 116/ 291] blk.12.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 117/ 291] blk.12.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 118/ 291] blk.12.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 119/ 291] blk.13.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 120/ 291] blk.13.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 121/ 291] blk.13.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 122/ 291] blk.13.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 123/ 291] blk.13.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 124/ 291] blk.13.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 125/ 291] blk.13.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 126/ 291] blk.13.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 127/ 291] blk.13.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 128/ 291] blk.14.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 129/ 291] blk.14.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 130/ 291] blk.14.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. 
size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 131/ 291] blk.14.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 132/ 291] blk.14.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 133/ 291] blk.14.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 134/ 291] blk.14.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 135/ 291] blk.14.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 136/ 291] blk.14.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 137/ 291] blk.15.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 138/ 291] blk.15.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 139/ 291] blk.15.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 140/ 291] blk.15.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 141/ 291] blk.15.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 142/ 291] blk.15.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 143/ 291] blk.15.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 144/ 291] blk.15.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 145/ 291] blk.15.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 146/ 291] blk.16.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. 
size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 147/ 291] blk.16.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 148/ 291] blk.16.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 149/ 291] blk.16.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 150/ 291] blk.16.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 151/ 291] blk.16.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 152/ 291] blk.16.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 153/ 291] blk.16.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 154/ 291] blk.16.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 155/ 291] blk.17.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 156/ 291] blk.17.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 157/ 291] blk.17.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 158/ 291] blk.17.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 159/ 291] blk.17.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 160/ 291] blk.17.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 161/ 291] blk.17.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. 
size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 162/ 291] blk.17.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 163/ 291] blk.17.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 164/ 291] blk.18.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 165/ 291] blk.18.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 166/ 291] blk.18.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 167/ 291] blk.18.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 168/ 291] blk.18.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 169/ 291] blk.18.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 170/ 291] blk.18.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 171/ 291] blk.18.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 172/ 291] blk.18.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 173/ 291] blk.19.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 174/ 291] blk.19.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 175/ 291] blk.19.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 176/ 291] blk.19.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 177/ 291] blk.19.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. 
size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 178/ 291] blk.19.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 179/ 291] blk.19.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 180/ 291] blk.19.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 181/ 291] blk.19.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 182/ 291] blk.20.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 183/ 291] blk.20.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 184/ 291] blk.20.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 185/ 291] blk.20.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 186/ 291] blk.20.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 187/ 291] blk.20.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 188/ 291] blk.20.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 189/ 291] blk.20.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 190/ 291] blk.20.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 191/ 291] blk.21.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 192/ 291] blk.21.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 193/ 291] blk.21.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. 
size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 194/ 291] blk.21.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 195/ 291] blk.21.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 196/ 291] blk.21.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 197/ 291] blk.21.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 198/ 291] blk.21.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 199/ 291] blk.21.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 200/ 291] blk.22.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 201/ 291] blk.22.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 202/ 291] blk.22.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 203/ 291] blk.22.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 204/ 291] blk.22.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 205/ 291] blk.22.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 206/ 291] blk.22.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 207/ 291] blk.22.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 208/ 291] blk.22.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 209/ 291] blk.23.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. 
size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 210/ 291] blk.23.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 211/ 291] blk.23.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 212/ 291] blk.23.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 213/ 291] blk.23.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 214/ 291] blk.23.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 215/ 291] blk.23.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 216/ 291] blk.23.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 217/ 291] blk.23.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 218/ 291] blk.24.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 219/ 291] blk.24.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 220/ 291] blk.24.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 221/ 291] blk.24.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 222/ 291] blk.24.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 223/ 291] blk.24.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 224/ 291] blk.24.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. 
size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 225/ 291] blk.24.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 226/ 291] blk.24.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 227/ 291] blk.25.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 228/ 291] blk.25.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 229/ 291] blk.25.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 230/ 291] blk.25.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 231/ 291] blk.25.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 232/ 291] blk.25.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 233/ 291] blk.25.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 234/ 291] blk.25.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 235/ 291] blk.25.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 236/ 291] blk.26.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 237/ 291] blk.26.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 238/ 291] blk.26.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 239/ 291] blk.26.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 240/ 291] blk.26.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. 
size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 241/ 291] blk.26.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 242/ 291] blk.26.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 243/ 291] blk.26.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 244/ 291] blk.26.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 245/ 291] blk.27.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 246/ 291] blk.27.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 247/ 291] blk.27.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 248/ 291] blk.27.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 249/ 291] blk.27.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 250/ 291] blk.27.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 251/ 291] blk.27.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 252/ 291] blk.27.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 253/ 291] blk.27.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 254/ 291] blk.28.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 255/ 291] blk.28.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 256/ 291] blk.28.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. 
size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 257/ 291] blk.28.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 258/ 291] blk.28.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 259/ 291] blk.28.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 260/ 291] blk.28.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 261/ 291] blk.28.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 262/ 291] blk.28.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 263/ 291] blk.29.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 264/ 291] blk.29.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 265/ 291] blk.29.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 266/ 291] blk.29.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 267/ 291] blk.29.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 268/ 291] blk.29.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 269/ 291] blk.29.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 270/ 291] blk.29.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 271/ 291] blk.29.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 272/ 291] blk.30.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. 
size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 273/ 291] blk.30.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 274/ 291] blk.30.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 275/ 291] blk.30.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 276/ 291] blk.30.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 277/ 291] blk.30.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 278/ 291] blk.30.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 279/ 291] blk.30.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 280/ 291] blk.30.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 281/ 291] blk.31.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 282/ 291] blk.31.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 283/ 291] blk.31.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 284/ 291] blk.31.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 285/ 291] blk.31.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 286/ 291] blk.31.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 287/ 291] blk.31.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. 
size = 86.00 MB -> 24.19 MB | hist: \n", +        "[ 288/ 291]           blk.31.attn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n", +        "[ 289/ 291]            blk.31.ffn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n", +        "[ 290/ 291]                output_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n", +        "[ 291/ 291]                     output.weight - [ 4096, 32016,     1,     1], type =    f16, quantizing to q6_K .. size =   250.12 MB ->   102.59 MB | hist: \n", +        "llama_model_quantize_internal: model size  = 12853.27 MB\n", +        "llama_model_quantize_internal: quant size  =  3677.45 MB\n", +        "\n", +        "main: quantize time = 1089230.46 ms\n", +        "main:    total time = 1089230.46 ms\n" +          ] +        } +      ] +    }, +    { +      "cell_type": "markdown", +      "source": [ +        "## Run inference\n", +        "\n", +        "Here is a simple script to run your quantized models. I'm offloading every layer to the GPU (35 for a 7b parameter model) to speed up inference." +      ], +      "metadata": { +        "id": "WqI1CPiXI4dP" +      } +    }, +    { +      "cell_type": "code", +      "source": [ +        "import os\n", +        "\n", +        "model_list = [file for file in os.listdir(MODEL_NAME) if \"gguf\" in file]\n", +        "\n", +        "prompt = input(\"Enter your prompt: \")\n", +        "chosen_method = input(\"Name of the model (options: \" + \", \".join(model_list) + \"): \")\n", +        "\n", +        "# Verify the chosen method is in the list\n", +        "if chosen_method not in model_list:\n", +        "    print(\"Invalid name\")\n", +        "else:\n", +        "    qtype = f\"{MODEL_NAME}/{chosen_method}\"\n", +        "    !./llama.cpp/main -m {qtype} -n 128 --color -ngl 35 -p \"{prompt}\"" +      ], +      "metadata": { +        "colab": { +          "base_uri": "https://localhost:8080/" +        }, +        "id": "vNPL9WYg78l-", +        "outputId": "3c3e7d2f-f0de-429d-fd97-dab480bc514a" +      }, +      "execution_count": null, +      "outputs": [ +        { +          "output_type": "stream", +          "name": "stdout", +          "text": [ +            "Enter your prompt: prompt\n", +            "Please specify the quantization method to run the model (options: q4_k_s): q4_k_s\n", +            "main: build = 1100 (dd0dc36)\n", +            "main: seed  = 
1693227123\n", + "ggml_init_cublas: found 1 CUDA devices:\n", + " Device 0: Tesla T4, compute capability 7.5\n", + "llama_model_loader: loaded meta data with 17 key-value pairs and 291 tensors from EvolCodeLlama-7b/evolcodellama-7b.gguf.q4_k_s.bin (version GGUF V2 (latest))\n", + "llama_model_loader: - tensor 0: token_embd.weight q4_K [ 4096, 32016, 1, 1 ]\n", + "llama_model_loader: - tensor 1: blk.0.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 2: blk.0.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 3: blk.0.attn_v.weight q5_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 4: blk.0.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 5: blk.0.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 6: blk.0.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 7: blk.0.ffn_down.weight q5_K [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 8: blk.0.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 9: blk.0.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 10: blk.1.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 11: blk.1.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 12: blk.1.attn_v.weight q5_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 13: blk.1.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 14: blk.1.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 15: blk.1.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 16: blk.1.ffn_down.weight q5_K [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 17: blk.1.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 18: blk.1.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 19: blk.2.attn_q.weight q4_K [ 4096, 4096, 1, 1 
]\n", + "llama_model_loader: - tensor 20: blk.2.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 21: blk.2.attn_v.weight q5_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 22: blk.2.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 23: blk.2.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 24: blk.2.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 25: blk.2.ffn_down.weight q5_K [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 26: blk.2.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 27: blk.2.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 28: blk.3.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 29: blk.3.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 30: blk.3.attn_v.weight q5_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 31: blk.3.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 32: blk.3.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 33: blk.3.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 34: blk.3.ffn_down.weight q5_K [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 35: blk.3.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 36: blk.3.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 37: blk.4.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 38: blk.4.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 39: blk.4.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 40: blk.4.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 41: blk.4.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 42: blk.4.ffn_up.weight q4_K [ 4096, 11008, 
1, 1 ]\n", + "llama_model_loader: - tensor 43: blk.4.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 44: blk.4.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 45: blk.4.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 46: blk.5.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 47: blk.5.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 48: blk.5.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 49: blk.5.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 50: blk.5.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 51: blk.5.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 52: blk.5.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 53: blk.5.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 54: blk.5.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 55: blk.6.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 56: blk.6.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 57: blk.6.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 58: blk.6.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 59: blk.6.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 60: blk.6.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 61: blk.6.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 62: blk.6.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 63: blk.6.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 64: blk.7.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 65: blk.7.attn_k.weight q4_K [ 4096, 4096, 1, 
1 ]\n", + "llama_model_loader: - tensor 66: blk.7.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 67: blk.7.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 68: blk.7.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 69: blk.7.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 70: blk.7.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 71: blk.7.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 72: blk.7.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 73: blk.8.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 74: blk.8.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 75: blk.8.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 76: blk.8.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 77: blk.8.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 78: blk.8.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 79: blk.8.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 80: blk.8.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 81: blk.8.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 82: blk.9.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 83: blk.9.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 84: blk.9.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 85: blk.9.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 86: blk.9.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 87: blk.9.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 88: blk.9.ffn_down.weight q4_K [ 11008, 
4096, 1, 1 ]\n", + "llama_model_loader: - tensor 89: blk.9.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 90: blk.9.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 91: blk.10.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 92: blk.10.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 93: blk.10.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 94: blk.10.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 95: blk.10.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 96: blk.10.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 97: blk.10.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 98: blk.10.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 99: blk.10.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 100: blk.11.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 101: blk.11.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 102: blk.11.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 103: blk.11.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 104: blk.11.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 105: blk.11.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 106: blk.11.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 107: blk.11.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 108: blk.11.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 109: blk.12.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 110: blk.12.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 111: 
blk.12.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 112: blk.12.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 113: blk.12.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 114: blk.12.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 115: blk.12.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 116: blk.12.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 117: blk.12.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 118: blk.13.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 119: blk.13.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 120: blk.13.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 121: blk.13.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 122: blk.13.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 123: blk.13.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 124: blk.13.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 125: blk.13.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 126: blk.13.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 127: blk.14.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 128: blk.14.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 129: blk.14.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 130: blk.14.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 131: blk.14.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 132: blk.14.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 133: blk.14.ffn_down.weight q4_K [ 11008, 
4096, 1, 1 ]\n", + "llama_model_loader: - tensor 134: blk.14.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 135: blk.14.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 136: blk.15.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 137: blk.15.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 138: blk.15.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 139: blk.15.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 140: blk.15.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 141: blk.15.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 142: blk.15.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 143: blk.15.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 144: blk.15.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 145: blk.16.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 146: blk.16.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 147: blk.16.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 148: blk.16.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 149: blk.16.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 150: blk.16.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 151: blk.16.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 152: blk.16.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 153: blk.16.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 154: blk.17.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 155: blk.17.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 
156: blk.17.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 157: blk.17.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 158: blk.17.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 159: blk.17.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 160: blk.17.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 161: blk.17.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 162: blk.17.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 163: blk.18.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 164: blk.18.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 165: blk.18.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 166: blk.18.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 167: blk.18.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 168: blk.18.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 169: blk.18.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 170: blk.18.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 171: blk.18.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 172: blk.19.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 173: blk.19.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 174: blk.19.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 175: blk.19.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 176: blk.19.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 177: blk.19.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 178: blk.19.ffn_down.weight q4_K [ 
11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 179: blk.19.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 180: blk.19.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 181: blk.20.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 182: blk.20.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 183: blk.20.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 184: blk.20.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 185: blk.20.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 186: blk.20.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 187: blk.20.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 188: blk.20.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 189: blk.20.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 190: blk.21.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 191: blk.21.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 192: blk.21.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 193: blk.21.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 194: blk.21.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 195: blk.21.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 196: blk.21.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 197: blk.21.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 198: blk.21.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 199: blk.22.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 200: blk.22.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - 
tensor 201: blk.22.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 202: blk.22.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 203: blk.22.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 204: blk.22.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 205: blk.22.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 206: blk.22.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 207: blk.22.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 208: blk.23.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 209: blk.23.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 210: blk.23.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 211: blk.23.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 212: blk.23.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 213: blk.23.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 214: blk.23.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 215: blk.23.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 216: blk.23.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 217: blk.24.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 218: blk.24.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 219: blk.24.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 220: blk.24.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 221: blk.24.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 222: blk.24.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 223: blk.24.ffn_down.weight 
q4_K [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 224: blk.24.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 225: blk.24.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 226: blk.25.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 227: blk.25.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 228: blk.25.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 229: blk.25.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 230: blk.25.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 231: blk.25.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 232: blk.25.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 233: blk.25.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 234: blk.25.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 235: blk.26.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 236: blk.26.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 237: blk.26.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 238: blk.26.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 239: blk.26.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 240: blk.26.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 241: blk.26.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 242: blk.26.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 243: blk.26.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 244: blk.27.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 245: blk.27.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", + 
"llama_model_loader: - tensor 246: blk.27.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 247: blk.27.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 248: blk.27.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 249: blk.27.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 250: blk.27.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 251: blk.27.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 252: blk.27.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 253: blk.28.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 254: blk.28.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 255: blk.28.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 256: blk.28.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 257: blk.28.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 258: blk.28.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 259: blk.28.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 260: blk.28.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 261: blk.28.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 262: blk.29.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 263: blk.29.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 264: blk.29.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 265: blk.29.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 266: blk.29.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 267: blk.29.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 268: 
blk.29.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 269: blk.29.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 270: blk.29.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 271: blk.30.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 272: blk.30.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 273: blk.30.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 274: blk.30.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 275: blk.30.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 276: blk.30.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 277: blk.30.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 278: blk.30.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 279: blk.30.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 280: blk.31.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 281: blk.31.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 282: blk.31.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 283: blk.31.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 284: blk.31.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 285: blk.31.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 286: blk.31.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 287: blk.31.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 288: blk.31.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 289: output_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 290: output.weight q6_K [ 4096, 32016, 1, 1 ]\n", + 
"llama_model_loader: - kv 0: general.architecture str \n", + "llama_model_loader: - kv 1: general.name str \n", + "llama_model_loader: - kv 2: llama.context_length u32 \n", + "llama_model_loader: - kv 3: llama.embedding_length u32 \n", + "llama_model_loader: - kv 4: llama.block_count u32 \n", + "llama_model_loader: - kv 5: llama.feed_forward_length u32 \n", + "llama_model_loader: - kv 6: llama.rope.dimension_count u32 \n", + "llama_model_loader: - kv 7: llama.attention.head_count u32 \n", + "llama_model_loader: - kv 8: llama.attention.head_count_kv u32 \n", + "llama_model_loader: - kv 9: llama.attention.layer_norm_rms_epsilon f32 \n", + "llama_model_loader: - kv 10: llama.rope.freq_base f32 \n", + "llama_model_loader: - kv 11: general.file_type u32 \n", + "llama_model_loader: - kv 12: tokenizer.ggml.model str \n", + "llama_model_loader: - kv 13: tokenizer.ggml.tokens arr \n", + "llama_model_loader: - kv 14: tokenizer.ggml.scores arr \n", + "llama_model_loader: - kv 15: tokenizer.ggml.token_type arr \n", + "llama_model_loader: - kv 16: general.quantization_version u32 \n", + "llama_model_loader: - type f32: 65 tensors\n", + "llama_model_loader: - type q4_K: 217 tensors\n", + "llama_model_loader: - type q5_K: 8 tensors\n", + "llama_model_loader: - type q6_K: 1 tensors\n", + "llm_load_print_meta: format = GGUF V2 (latest)\n", + "llm_load_print_meta: arch = llama\n", + "llm_load_print_meta: vocab type = SPM\n", + "llm_load_print_meta: n_vocab = 32016\n", + "llm_load_print_meta: n_merges = 0\n", + "llm_load_print_meta: n_ctx_train = 16384\n", + "llm_load_print_meta: n_ctx = 512\n", + "llm_load_print_meta: n_embd = 4096\n", + "llm_load_print_meta: n_head = 32\n", + "llm_load_print_meta: n_head_kv = 32\n", + "llm_load_print_meta: n_layer = 32\n", + "llm_load_print_meta: n_rot = 128\n", + "llm_load_print_meta: n_gqa = 1\n", + "llm_load_print_meta: f_norm_eps = 1.0e-05\n", + "llm_load_print_meta: f_norm_rms_eps = 1.0e-05\n", + "llm_load_print_meta: n_ff = 11008\n", + 
"llm_load_print_meta: freq_base = 1000000.0\n", + "llm_load_print_meta: freq_scale = 1\n", + "llm_load_print_meta: model type = 7B\n", + "llm_load_print_meta: model ftype = mostly Q4_K - Small\n", + "llm_load_print_meta: model size = 6.74 B\n", + "llm_load_print_meta: general.name = LLaMA\n", + "llm_load_print_meta: BOS token = 1 ''\n", + "llm_load_print_meta: EOS token = 2 ''\n", + "llm_load_print_meta: UNK token = 0 ''\n", + "llm_load_print_meta: LF token = 13 '<0x0A>'\n", + "llm_load_tensors: ggml ctx size = 0.09 MB\n", + "llm_load_tensors: using CUDA for GPU acceleration\n", + "llm_load_tensors: mem required = 70.44 MB (+ 256.00 MB per state)\n", + "llm_load_tensors: offloading 32 repeating layers to GPU\n", + "llm_load_tensors: offloading non-repeating layers to GPU\n", + "llm_load_tensors: offloading v cache to GPU\n", + "llm_load_tensors: offloading k cache to GPU\n", + "llm_load_tensors: offloaded 35/35 layers to GPU\n", + "llm_load_tensors: VRAM used: 3864 MB\n", + "..................................................................................................\n", + "llama_new_context_with_model: kv self size = 256.00 MB\n", + "llama_new_context_with_model: compute buffer total size = 71.94 MB\n", + "llama_new_context_with_model: VRAM scratch buffer: 70.53 MB\n", + "\n", + "system_info: n_threads = 2 / 2 | AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | \n", + "sampling: repeat_last_n = 64, repeat_penalty = 1.100000, presence_penalty = 0.000000, frequency_penalty = 0.000000, top_k = 40, tfs_z = 1.000000, top_p = 0.950000, typical_p = 1.000000, temp = 0.800000, mirostat = 0, mirostat_lr = 0.100000, mirostat_ent = 5.000000\n", + "generate: n_ctx = 512, n_batch = 512, n_predict = 128, n_keep = 0\n", + "\n", + "\n", + "\u001b[33m prompt\u001b[0m.\t\t\t\t\n", + "\t\t\t\t\tif( !this->m_pMiscSettings ) { return; }\t// If 
no misc settings, do nothing\n", + "\t\t\t\t\t\n", + "\t\t\t\t\t// Get the value of the checkbox for \"Always on top\"\n", + "\t\t\t\t\tbool alwaysOnTop = this->m_pMiscSettings->GetBool(L\"AlwaysOnTop\", false);\n", + "\t\t\t\t\tthis->SetWindowPos((alwaysOnTop ? HWND_TOPMOST : HWND_NOTOPMOST\n", + "llama_print_timings: load time = 1392.10 ms\n", + "llama_print_timings: sample time = 147.99 ms / 128 runs ( 1.16 ms per token, 864.92 tokens per second)\n", + "llama_print_timings: prompt eval time = 261.80 ms / 2 tokens ( 130.90 ms per token, 7.64 tokens per second)\n", + "llama_print_timings: eval time = 5923.18 ms / 127 runs ( 46.64 ms per token, 21.44 tokens per second)\n", + "llama_print_timings: total time = 6370.96 ms\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Push to hub\n", + "\n", + "To push your model to the hub, you'll need to input your Hugging Face token (https://huggingface.co/settings/tokens) in Google Colab's \"Secrets\" tab. The following code creates a new repo with the \"-GGUF\" suffix. Don't forget to change the `username` variable." 
+ ], + "metadata": { + "id": "Ar8pO7bb80US" + } + }, + { + "cell_type": "code", + "source": [ + "!pip install -q huggingface_hub\n", + "from huggingface_hub import create_repo, HfApi\n", + "from google.colab import userdata\n", + "\n", + "# Defined in the secrets tab in Google Colab\n", + "hf_token = userdata.get('huggingface')\n", + "\n", + "api = HfApi()\n", + "username = \"mlabonne\"\n", + "\n", + "# Create empty repo\n", + "create_repo(\n", + " repo_id = f\"{username}/{MODEL_NAME}-GGUF\",\n", + " repo_type=\"model\",\n", + " exist_ok=True,\n", + " token=hf_token\n", + ")\n", + "\n", + "# Upload gguf files\n", + "api.upload_folder(\n", + " folder_path=MODEL_NAME,\n", + " repo_id=f\"{username}/{MODEL_NAME}-GGUF\",\n", + " allow_patterns=f\"*.gguf\",\n", + " token=hf_token\n", + ")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 163, + "referenced_widgets": [ + "c281b60e104f4c5da547bbdd7208d4bc", + "74b084c97f6f46d293a197bf9804460c", + "1409574c4f9742e7a711965dd2c8ad87", + "704ecf9409244e0b93612d6a11476346", + "b1a8d3a9a379415393d9e7d995a40788", + "f928772f92724579b068e984d9eef387", + "1c8a6b959f9c4443a92f58eff1b03077", + "9fb5726f91734b1da149784680dc9624", + "202a8eb11eda4e58942113fbeacfdc3d", + "970d4d3daf854f92bd650dc4da99e1bc", + "24b1e007921046b1adc61db0f2bf9fc7", + "24d3d72f5de54de8a1ded4e528dde332", + "e90cb0ce526a4556bc643ba6c5485661", + "76e7372656b745c889b9283b76c04148", + "ce0204c7e1ff4a51b2648284a2492262", + "6dbb8e8a5ebb40a4ba910b09dde27e1a", + "7944af54f2564920822d5d4b348896c4", + "1b55372f62494ca0baabf87f7e7f4ba8", + "bf612001ad354ea19de6ee45a166a43c", + "a8e4691970b14955bfb4865bcef5e912", + "2e2fabac70484c1c8b16fa6ca8fd8537", + "bf53c635fa374420ad850eea22cd1e31", + "065d59126a734c1aa096ba40cd4a129f", + "e8855d5678a342f5a33171aa74d3b7bc", + "7eb6de1a979b46f7b234724073f8bc3a", + "6ae4640196da492fadafeb63f4bc89d2", + "cef83433dbea4f529f43722fe78a8baf", + "845ba8115d5140ac9ee22af4a9e6a03b", + 
"cdd888041aca4dcf8adc785309071fc6", + "cf63214cb4f8442999fa5b971035fe4f", + "7d9b22f2b7fe4a749f989e247bce446a", + "7f8e268db8144adfb09d089784d8411a" + ] + }, + "id": "UOyKfUD-8jmh", + "outputId": "3c8df47b-f350-4251-a19f-4b9fb1116381" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/268.8 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[91m━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m71.7/268.8 kB\u001b[0m \u001b[31m2.0 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m268.8/268.8 kB\u001b[0m \u001b[31m3.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "VBox(children=(HTML(value='