From 4a8ebd841fd06fb277c1e3e68959d5805f075ec4 Mon Sep 17 00:00:00 2001 From: Maxime Labonne <81252890+mlabonne@users.noreply.github.com> Date: Wed, 29 Nov 2023 20:41:55 +0000 Subject: [PATCH] Created using Colaboratory --- ...ma_2_models_using_GGUF_and_llama_cpp.ipynb | 2218 +++++++++++++++++ 1 file changed, 2218 insertions(+) create mode 100644 Quantize_Llama_2_models_using_GGUF_and_llama_cpp.ipynb diff --git a/Quantize_Llama_2_models_using_GGUF_and_llama_cpp.ipynb b/Quantize_Llama_2_models_using_GGUF_and_llama_cpp.ipynb new file mode 100644 index 0000000..cb3fc28 --- /dev/null +++ b/Quantize_Llama_2_models_using_GGUF_and_llama_cpp.ipynb @@ -0,0 +1,2218 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "gpuType": "T4", + "authorship_tag": "ABX9TyMohoDhmmKsuh9OLDHor3GB", + "include_colab_link": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + }, + "accelerator": "GPU", + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "c281b60e104f4c5da547bbdd7208d4bc": { + "model_module": "@jupyter-widgets/controls", + "model_name": "VBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "VBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "VBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_2e2fabac70484c1c8b16fa6ca8fd8537", + "IPY_MODEL_bf53c635fa374420ad850eea22cd1e31", + "IPY_MODEL_065d59126a734c1aa096ba40cd4a129f", + "IPY_MODEL_e8855d5678a342f5a33171aa74d3b7bc" + ], + "layout": "IPY_MODEL_1c8a6b959f9c4443a92f58eff1b03077" + } + }, + "74b084c97f6f46d293a197bf9804460c": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + 
"_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_9fb5726f91734b1da149784680dc9624", + "placeholder": "​", + "style": "IPY_MODEL_202a8eb11eda4e58942113fbeacfdc3d", + "value": "

Copy a token from your Hugging Face\ntokens page and paste it below.
Immediately click login after copying\nyour token or it might be stored in plain text in this notebook file.
" + } + }, + "1409574c4f9742e7a711965dd2c8ad87": { + "model_module": "@jupyter-widgets/controls", + "model_name": "PasswordModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "PasswordModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "PasswordView", + "continuous_update": true, + "description": "Token:", + "description_tooltip": null, + "disabled": false, + "layout": "IPY_MODEL_970d4d3daf854f92bd650dc4da99e1bc", + "placeholder": "​", + "style": "IPY_MODEL_24b1e007921046b1adc61db0f2bf9fc7", + "value": "" + } + }, + "704ecf9409244e0b93612d6a11476346": { + "model_module": "@jupyter-widgets/controls", + "model_name": "CheckboxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "CheckboxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "CheckboxView", + "description": "Add token as git credential?", + "description_tooltip": null, + "disabled": false, + "indent": true, + "layout": "IPY_MODEL_24d3d72f5de54de8a1ded4e528dde332", + "style": "IPY_MODEL_e90cb0ce526a4556bc643ba6c5485661", + "value": true + } + }, + "b1a8d3a9a379415393d9e7d995a40788": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ButtonModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ButtonModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ButtonView", + "button_style": "", + "description": "Login", + "disabled": false, + "icon": "", + "layout": 
"IPY_MODEL_76e7372656b745c889b9283b76c04148", + "style": "IPY_MODEL_ce0204c7e1ff4a51b2648284a2492262", + "tooltip": "" + } + }, + "f928772f92724579b068e984d9eef387": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_6dbb8e8a5ebb40a4ba910b09dde27e1a", + "placeholder": "​", + "style": "IPY_MODEL_7944af54f2564920822d5d4b348896c4", + "value": "\nPro Tip: If you don't already have one, you can create a dedicated\n'notebooks' token with 'write' access, that you can then easily reuse for all\nnotebooks. " + } + }, + "1c8a6b959f9c4443a92f58eff1b03077": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": "center", + "align_self": null, + "border": null, + "bottom": null, + "display": "flex", + "flex": null, + "flex_flow": "column", + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + 
"object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": "50%" + } + }, + "9fb5726f91734b1da149784680dc9624": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "202a8eb11eda4e58942113fbeacfdc3d": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "970d4d3daf854f92bd650dc4da99e1bc": { + "model_module": "@jupyter-widgets/base", 
+ "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "24b1e007921046b1adc61db0f2bf9fc7": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "24d3d72f5de54de8a1ded4e528dde332": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": 
"1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e90cb0ce526a4556bc643ba6c5485661": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "76e7372656b745c889b9283b76c04148": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + 
"grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ce0204c7e1ff4a51b2648284a2492262": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ButtonStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ButtonStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "button_color": null, + "font_weight": "" + } + }, + "6dbb8e8a5ebb40a4ba910b09dde27e1a": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": 
null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7944af54f2564920822d5d4b348896c4": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "1b55372f62494ca0baabf87f7e7f4ba8": { + "model_module": "@jupyter-widgets/controls", + "model_name": "LabelModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "LabelModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "LabelView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_bf612001ad354ea19de6ee45a166a43c", + "placeholder": "​", + "style": "IPY_MODEL_a8e4691970b14955bfb4865bcef5e912", + "value": "Connecting..." 
+ } + }, + "bf612001ad354ea19de6ee45a166a43c": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a8e4691970b14955bfb4865bcef5e912": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "2e2fabac70484c1c8b16fa6ca8fd8537": { + "model_module": "@jupyter-widgets/controls", + "model_name": "LabelModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + 
"_model_name": "LabelModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "LabelView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7eb6de1a979b46f7b234724073f8bc3a", + "placeholder": "​", + "style": "IPY_MODEL_6ae4640196da492fadafeb63f4bc89d2", + "value": "Token is valid (permission: write)." + } + }, + "bf53c635fa374420ad850eea22cd1e31": { + "model_module": "@jupyter-widgets/controls", + "model_name": "LabelModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "LabelModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "LabelView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_cef83433dbea4f529f43722fe78a8baf", + "placeholder": "​", + "style": "IPY_MODEL_845ba8115d5140ac9ee22af4a9e6a03b", + "value": "Your token has been saved in your configured git credential helpers (store)." 
+ } + }, + "065d59126a734c1aa096ba40cd4a129f": { + "model_module": "@jupyter-widgets/controls", + "model_name": "LabelModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "LabelModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "LabelView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_cdd888041aca4dcf8adc785309071fc6", + "placeholder": "​", + "style": "IPY_MODEL_cf63214cb4f8442999fa5b971035fe4f", + "value": "Your token has been saved to /root/.cache/huggingface/token" + } + }, + "e8855d5678a342f5a33171aa74d3b7bc": { + "model_module": "@jupyter-widgets/controls", + "model_name": "LabelModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "LabelModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "LabelView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7d9b22f2b7fe4a749f989e247bce446a", + "placeholder": "​", + "style": "IPY_MODEL_7f8e268db8144adfb09d089784d8411a", + "value": "Login successful" + } + }, + "7eb6de1a979b46f7b234724073f8bc3a": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + 
"grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6ae4640196da492fadafeb63f4bc89d2": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "cef83433dbea4f529f43722fe78a8baf": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": 
null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "845ba8115d5140ac9ee22af4a9e6a03b": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "cdd888041aca4dcf8adc785309071fc6": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": 
null, + "visibility": null, + "width": null + } + }, + "cf63214cb4f8442999fa5b971035fe4f": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "7d9b22f2b7fe4a749f989e247bce446a": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7f8e268db8144adfb09d089784d8411a": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + 
"_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + } + } + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Quantize Llama 2 models using GGUF and llama.cpp\n", + "> 🗣️ [Large Language Model Course](https://github.com/mlabonne/llm-course)\n", + "\n", + "❤️ Created by [@maximelabonne](https://twitter.com/maximelabonne).\n", + "\n", + "## Usage\n", + "\n", + "* `MODEL_ID`: The ID of the model to quantize (e.g., `mlabonne/EvolCodeLlama-7b`).\n", + "* `QUANTIZATION_METHOD`: The quantization method to use.\n", + "\n", + "## Quantization methods\n", + "\n", + "The names of the quantization methods follow the naming convention: \"q\" + the number of bits + the variant used (detailed below). Here is a list of all the possible quant methods and their corresponding use cases, based on model cards made by [TheBloke](https://huggingface.co/TheBloke/):\n", + "\n", + "* `q2_k`: Uses Q4_K for the attention.vw and feed_forward.w2 tensors, Q2_K for the other tensors.\n", + "* `q3_k_l`: Uses Q5_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K\n", + "* `q3_k_m`: Uses Q4_K for the attention.wv, attention.wo, and feed_forward.w2 tensors, else Q3_K\n", + "* `q3_k_s`: Uses Q3_K for all tensors\n", + "* `q4_0`: Original quant method, 4-bit.\n", + "* `q4_1`: Higher accuracy than q4_0 but not as high as q5_0. 
However has quicker inference than q5 models.\n", + "* `q4_k_m`: Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K\n", + "* `q4_k_s`: Uses Q4_K for all tensors\n", + "* `q5_0`: Higher accuracy, higher resource usage and slower inference.\n", + "* `q5_1`: Even higher accuracy, resource usage and slower inference.\n", + "* `q5_k_m`: Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K\n", + "* `q5_k_s`: Uses Q5_K for all tensors\n", + "* `q6_k`: Uses Q8_K for all tensors\n", + "* `q8_0`: Almost indistinguishable from float16. High resource use and slow. Not recommended for most users.\n", + "\n", + "As a rule of thumb, **I recommend using Q5_K_M** as it preserves most of the model's performance. Alternatively, you can use Q4_K_M if you want to save some memory. In general, K_M versions are better than K_S versions. I cannot recommend Q2_K or Q3_* versions, as they drastically decrease model performance." + ], + "metadata": { + "id": "8y_Rk94LzG7I" + } + }, + { + "cell_type": "code", + "source": [ + "# Variables\n", + "MODEL_ID = \"mlabonne/EvolCodeLlama-7b\"\n", + "QUANTIZATION_METHODS = [\"q4_k_m\", \"q5_k_m\"]\n", + "\n", + "# Constants\n", + "MODEL_NAME = MODEL_ID.split('/')[-1]\n", + "\n", + "# Install llama.cpp\n", + "!git clone https://github.com/ggerganov/llama.cpp\n", + "!cd llama.cpp && git pull && make clean && LLAMA_CUBLAS=1 make\n", + "!pip install -r llama.cpp/requirements.txt\n", + "\n", + "# Download model\n", + "!git lfs install\n", + "!git clone https://huggingface.co/{MODEL_ID}\n", + "\n", + "# Convert to fp16\n", + "fp16 = f\"{MODEL_NAME}/{MODEL_NAME.lower()}.fp16.bin\"\n", + "!python llama.cpp/convert.py {MODEL_NAME} --outtype f16 --outfile {fp16}\n", + "\n", + "# Quantize the model for each method in the QUANTIZATION_METHODS list\n", + "for method in QUANTIZATION_METHODS:\n", + " qtype = f\"{MODEL_NAME}/{MODEL_NAME.lower()}.{method.upper()}.gguf\"\n", + " !./llama.cpp/quantize {fp16} {qtype} 
{method}" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "fD24jJxq7t3k", + "outputId": "94954934-0829-44e9-a5e5-262c17e162d0" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "ggml_init_cublas: found 1 CUDA devices:\n", + " Device 0: Tesla T4, compute capability 7.5\n", + "main: build = 1100 (dd0dc36)\n", + "main: quantizing 'EvolCodeLlama-7b/evolcodellama-7b.gguf.fp16.bin' to 'EvolCodeLlama-7b/evolcodellama-7b.gguf.q4_k_s.bin' as Q4_K_S\n", + "llama_model_loader: loaded meta data with 16 key-value pairs and 291 tensors from EvolCodeLlama-7b/evolcodellama-7b.gguf.fp16.bin (version GGUF V1 (support until nov 2023))\n", + "llama_model_loader: - tensor 0: token_embd.weight f16 [ 4096, 32016, 1, 1 ]\n", + "llama_model_loader: - tensor 1: blk.0.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 2: blk.0.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 3: blk.0.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 4: blk.0.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 5: blk.0.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 6: blk.0.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 7: blk.0.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 8: blk.0.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 9: blk.0.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 10: blk.1.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 11: blk.1.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 12: blk.1.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 13: blk.1.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 14: blk.1.ffn_gate.weight f16 [ 4096, 
11008, 1, 1 ]\n", + "llama_model_loader: - tensor 15: blk.1.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 16: blk.1.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 17: blk.1.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 18: blk.1.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 19: blk.2.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 20: blk.2.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 21: blk.2.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 22: blk.2.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 23: blk.2.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 24: blk.2.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 25: blk.2.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 26: blk.2.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 27: blk.2.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 28: blk.3.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 29: blk.3.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 30: blk.3.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 31: blk.3.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 32: blk.3.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 33: blk.3.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 34: blk.3.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 35: blk.3.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 36: blk.3.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 37: blk.4.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", + 
"llama_model_loader: - tensor 38: blk.4.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 39: blk.4.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 40: blk.4.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 41: blk.4.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 42: blk.4.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 43: blk.4.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 44: blk.4.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 45: blk.4.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 46: blk.5.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 47: blk.5.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 48: blk.5.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 49: blk.5.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 50: blk.5.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 51: blk.5.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 52: blk.5.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 53: blk.5.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 54: blk.5.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 55: blk.6.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 56: blk.6.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 57: blk.6.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 58: blk.6.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 59: blk.6.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 60: blk.6.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", + 
"llama_model_loader: - tensor 61: blk.6.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 62: blk.6.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 63: blk.6.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 64: blk.7.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 65: blk.7.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 66: blk.7.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 67: blk.7.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 68: blk.7.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 69: blk.7.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 70: blk.7.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 71: blk.7.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 72: blk.7.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 73: blk.8.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 74: blk.8.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 75: blk.8.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 76: blk.8.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 77: blk.8.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 78: blk.8.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 79: blk.8.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 80: blk.8.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 81: blk.8.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 82: blk.9.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 83: blk.9.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: 
- tensor 84: blk.9.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 85: blk.9.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 86: blk.9.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 87: blk.9.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 88: blk.9.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 89: blk.9.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 90: blk.9.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 91: blk.10.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 92: blk.10.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 93: blk.10.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 94: blk.10.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 95: blk.10.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 96: blk.10.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 97: blk.10.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 98: blk.10.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 99: blk.10.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 100: blk.11.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 101: blk.11.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 102: blk.11.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 103: blk.11.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 104: blk.11.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 105: blk.11.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 106: blk.11.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", + 
"llama_model_loader: - tensor 107: blk.11.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 108: blk.11.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 109: blk.12.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 110: blk.12.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 111: blk.12.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 112: blk.12.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 113: blk.12.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 114: blk.12.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 115: blk.12.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 116: blk.12.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 117: blk.12.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 118: blk.13.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 119: blk.13.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 120: blk.13.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 121: blk.13.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 122: blk.13.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 123: blk.13.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 124: blk.13.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 125: blk.13.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 126: blk.13.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 127: blk.14.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 128: blk.14.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 129: blk.14.attn_v.weight f16 [ 
4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 130: blk.14.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 131: blk.14.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 132: blk.14.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 133: blk.14.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 134: blk.14.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 135: blk.14.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 136: blk.15.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 137: blk.15.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 138: blk.15.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 139: blk.15.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 140: blk.15.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 141: blk.15.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 142: blk.15.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 143: blk.15.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 144: blk.15.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 145: blk.16.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 146: blk.16.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 147: blk.16.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 148: blk.16.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 149: blk.16.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 150: blk.16.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 151: blk.16.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - 
tensor 152: blk.16.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 153: blk.16.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 154: blk.17.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 155: blk.17.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 156: blk.17.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 157: blk.17.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 158: blk.17.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 159: blk.17.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 160: blk.17.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 161: blk.17.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 162: blk.17.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 163: blk.18.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 164: blk.18.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 165: blk.18.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 166: blk.18.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 167: blk.18.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 168: blk.18.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 169: blk.18.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 170: blk.18.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 171: blk.18.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 172: blk.19.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 173: blk.19.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 174: blk.19.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", + 
"llama_model_loader: - tensor 175: blk.19.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 176: blk.19.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 177: blk.19.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 178: blk.19.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 179: blk.19.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 180: blk.19.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 181: blk.20.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 182: blk.20.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 183: blk.20.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 184: blk.20.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 185: blk.20.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 186: blk.20.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 187: blk.20.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 188: blk.20.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 189: blk.20.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 190: blk.21.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 191: blk.21.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 192: blk.21.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 193: blk.21.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 194: blk.21.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 195: blk.21.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 196: blk.21.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 197: 
blk.21.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 198: blk.21.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 199: blk.22.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 200: blk.22.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 201: blk.22.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 202: blk.22.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 203: blk.22.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 204: blk.22.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 205: blk.22.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 206: blk.22.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 207: blk.22.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 208: blk.23.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 209: blk.23.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 210: blk.23.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 211: blk.23.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 212: blk.23.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 213: blk.23.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 214: blk.23.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 215: blk.23.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 216: blk.23.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 217: blk.24.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 218: blk.24.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 219: blk.24.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", + 
"llama_model_loader: - tensor 220: blk.24.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 221: blk.24.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 222: blk.24.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 223: blk.24.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 224: blk.24.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 225: blk.24.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 226: blk.25.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 227: blk.25.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 228: blk.25.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 229: blk.25.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 230: blk.25.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 231: blk.25.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 232: blk.25.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 233: blk.25.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 234: blk.25.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 235: blk.26.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 236: blk.26.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 237: blk.26.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 238: blk.26.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 239: blk.26.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 240: blk.26.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 241: blk.26.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 242: 
blk.26.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 243: blk.26.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 244: blk.27.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 245: blk.27.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 246: blk.27.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 247: blk.27.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 248: blk.27.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 249: blk.27.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 250: blk.27.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 251: blk.27.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 252: blk.27.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 253: blk.28.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 254: blk.28.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 255: blk.28.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 256: blk.28.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 257: blk.28.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 258: blk.28.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 259: blk.28.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 260: blk.28.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 261: blk.28.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 262: blk.29.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 263: blk.29.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 264: blk.29.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", + 
"llama_model_loader: - tensor 265: blk.29.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 266: blk.29.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 267: blk.29.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 268: blk.29.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 269: blk.29.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 270: blk.29.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 271: blk.30.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 272: blk.30.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 273: blk.30.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 274: blk.30.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 275: blk.30.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 276: blk.30.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 277: blk.30.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 278: blk.30.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 279: blk.30.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 280: blk.31.attn_q.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 281: blk.31.attn_k.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 282: blk.31.attn_v.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 283: blk.31.attn_output.weight f16 [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 284: blk.31.ffn_gate.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 285: blk.31.ffn_up.weight f16 [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 286: blk.31.ffn_down.weight f16 [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 287: 
blk.31.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 288: blk.31.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 289: output_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 290: output.weight f16 [ 4096, 32016, 1, 1 ]\n", + "llama_model_loader: - kv 0: general.architecture str \n", + "llama_model_loader: - kv 1: general.name str \n", + "llama_model_loader: - kv 2: llama.context_length u32 \n", + "llama_model_loader: - kv 3: llama.embedding_length u32 \n", + "llama_model_loader: - kv 4: llama.block_count u32 \n", + "llama_model_loader: - kv 5: llama.feed_forward_length u32 \n", + "llama_model_loader: - kv 6: llama.rope.dimension_count u32 \n", + "llama_model_loader: - kv 7: llama.attention.head_count u32 \n", + "llama_model_loader: - kv 8: llama.attention.head_count_kv u32 \n", + "llama_model_loader: - kv 9: llama.attention.layer_norm_rms_epsilon f32 \n", + "llama_model_loader: - kv 10: llama.rope.freq_base f32 \n", + "llama_model_loader: - kv 11: general.file_type u32 \n", + "llama_model_loader: - kv 12: tokenizer.ggml.model str \n", + "llama_model_loader: - kv 13: tokenizer.ggml.tokens arr \n", + "llama_model_loader: - kv 14: tokenizer.ggml.scores arr \n", + "llama_model_loader: - kv 15: tokenizer.ggml.token_type arr \n", + "llama_model_loader: - type f32: 65 tensors\n", + "llama_model_loader: - type f16: 226 tensors\n", + "llama_model_quantize_internal: meta size = 741408 bytes\n", + "[ 1/ 291] token_embd.weight - [ 4096, 32016, 1, 1], type = f16, quantizing to q4_K .. size = 250.12 MB -> 70.35 MB | hist: \n", + "[ 2/ 291] blk.0.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 3/ 291] blk.0.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 4/ 291] blk.0.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. 
size = 32.00 MB -> 11.00 MB | hist: \n", + "[ 5/ 291] blk.0.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 6/ 291] blk.0.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 7/ 291] blk.0.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 8/ 291] blk.0.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MB -> 29.56 MB | hist: \n", + "[ 9/ 291] blk.0.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 10/ 291] blk.0.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 11/ 291] blk.1.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 12/ 291] blk.1.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 13/ 291] blk.1.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MB -> 11.00 MB | hist: \n", + "[ 14/ 291] blk.1.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 15/ 291] blk.1.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 16/ 291] blk.1.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 17/ 291] blk.1.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MB -> 29.56 MB | hist: \n", + "[ 18/ 291] blk.1.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 19/ 291] blk.1.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 20/ 291] blk.2.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. 
size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 21/ 291] blk.2.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 22/ 291] blk.2.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MB -> 11.00 MB | hist: \n", + "[ 23/ 291] blk.2.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 24/ 291] blk.2.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 25/ 291] blk.2.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 26/ 291] blk.2.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MB -> 29.56 MB | hist: \n", + "[ 27/ 291] blk.2.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 28/ 291] blk.2.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 29/ 291] blk.3.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 30/ 291] blk.3.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 31/ 291] blk.3.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MB -> 11.00 MB | hist: \n", + "[ 32/ 291] blk.3.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 33/ 291] blk.3.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 34/ 291] blk.3.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 35/ 291] blk.3.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q5_K .. 
size = 86.00 MB -> 29.56 MB | hist: \n", + "[ 36/ 291] blk.3.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 37/ 291] blk.3.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 38/ 291] blk.4.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 39/ 291] blk.4.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 40/ 291] blk.4.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 41/ 291] blk.4.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 42/ 291] blk.4.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 43/ 291] blk.4.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 44/ 291] blk.4.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 45/ 291] blk.4.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 46/ 291] blk.4.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 47/ 291] blk.5.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 48/ 291] blk.5.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 49/ 291] blk.5.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 50/ 291] blk.5.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 51/ 291] blk.5.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. 
size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 52/ 291] blk.5.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 53/ 291] blk.5.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 54/ 291] blk.5.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 55/ 291] blk.5.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 56/ 291] blk.6.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 57/ 291] blk.6.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 58/ 291] blk.6.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 59/ 291] blk.6.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 60/ 291] blk.6.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 61/ 291] blk.6.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 62/ 291] blk.6.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 63/ 291] blk.6.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 64/ 291] blk.6.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 65/ 291] blk.7.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 66/ 291] blk.7.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 67/ 291] blk.7.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. 
size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 68/ 291] blk.7.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 69/ 291] blk.7.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 70/ 291] blk.7.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 71/ 291] blk.7.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 72/ 291] blk.7.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 73/ 291] blk.7.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 74/ 291] blk.8.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 75/ 291] blk.8.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 76/ 291] blk.8.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 77/ 291] blk.8.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 78/ 291] blk.8.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 79/ 291] blk.8.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 80/ 291] blk.8.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 81/ 291] blk.8.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 82/ 291] blk.8.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 83/ 291] blk.9.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. 
size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 84/ 291] blk.9.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 85/ 291] blk.9.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 86/ 291] blk.9.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 87/ 291] blk.9.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 88/ 291] blk.9.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 89/ 291] blk.9.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 90/ 291] blk.9.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 91/ 291] blk.9.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 92/ 291] blk.10.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 93/ 291] blk.10.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 94/ 291] blk.10.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 95/ 291] blk.10.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 96/ 291] blk.10.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 97/ 291] blk.10.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 98/ 291] blk.10.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. 
size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 99/ 291] blk.10.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 100/ 291] blk.10.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 101/ 291] blk.11.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 102/ 291] blk.11.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 103/ 291] blk.11.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 104/ 291] blk.11.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 105/ 291] blk.11.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 106/ 291] blk.11.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 107/ 291] blk.11.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 108/ 291] blk.11.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 109/ 291] blk.11.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 110/ 291] blk.12.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 111/ 291] blk.12.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 112/ 291] blk.12.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 113/ 291] blk.12.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 114/ 291] blk.12.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. 
size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 115/ 291] blk.12.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 116/ 291] blk.12.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 117/ 291] blk.12.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 118/ 291] blk.12.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 119/ 291] blk.13.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 120/ 291] blk.13.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 121/ 291] blk.13.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 122/ 291] blk.13.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 123/ 291] blk.13.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 124/ 291] blk.13.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 125/ 291] blk.13.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 126/ 291] blk.13.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 127/ 291] blk.13.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 128/ 291] blk.14.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 129/ 291] blk.14.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 130/ 291] blk.14.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. 
size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 131/ 291] blk.14.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 132/ 291] blk.14.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 133/ 291] blk.14.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 134/ 291] blk.14.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 135/ 291] blk.14.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 136/ 291] blk.14.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 137/ 291] blk.15.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 138/ 291] blk.15.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 139/ 291] blk.15.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 140/ 291] blk.15.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 141/ 291] blk.15.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 142/ 291] blk.15.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 143/ 291] blk.15.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 144/ 291] blk.15.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 145/ 291] blk.15.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 146/ 291] blk.16.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. 
size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 147/ 291] blk.16.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 148/ 291] blk.16.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 149/ 291] blk.16.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 150/ 291] blk.16.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 151/ 291] blk.16.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 152/ 291] blk.16.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 153/ 291] blk.16.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 154/ 291] blk.16.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 155/ 291] blk.17.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 156/ 291] blk.17.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 157/ 291] blk.17.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 158/ 291] blk.17.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 159/ 291] blk.17.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 160/ 291] blk.17.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 161/ 291] blk.17.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. 
size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 162/ 291] blk.17.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 163/ 291] blk.17.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 164/ 291] blk.18.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 165/ 291] blk.18.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 166/ 291] blk.18.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 167/ 291] blk.18.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 168/ 291] blk.18.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 169/ 291] blk.18.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 170/ 291] blk.18.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 171/ 291] blk.18.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 172/ 291] blk.18.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 173/ 291] blk.19.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 174/ 291] blk.19.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 175/ 291] blk.19.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 176/ 291] blk.19.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 177/ 291] blk.19.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. 
size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 178/ 291] blk.19.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 179/ 291] blk.19.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 180/ 291] blk.19.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 181/ 291] blk.19.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 182/ 291] blk.20.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 183/ 291] blk.20.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 184/ 291] blk.20.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 185/ 291] blk.20.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 186/ 291] blk.20.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 187/ 291] blk.20.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 188/ 291] blk.20.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 189/ 291] blk.20.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 190/ 291] blk.20.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 191/ 291] blk.21.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 192/ 291] blk.21.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 193/ 291] blk.21.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. 
size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 194/ 291] blk.21.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 195/ 291] blk.21.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 196/ 291] blk.21.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 197/ 291] blk.21.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 198/ 291] blk.21.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 199/ 291] blk.21.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 200/ 291] blk.22.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 201/ 291] blk.22.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 202/ 291] blk.22.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 203/ 291] blk.22.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 204/ 291] blk.22.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 205/ 291] blk.22.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 206/ 291] blk.22.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 207/ 291] blk.22.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 208/ 291] blk.22.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 209/ 291] blk.23.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. 
size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 210/ 291] blk.23.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 211/ 291] blk.23.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 212/ 291] blk.23.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 213/ 291] blk.23.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 214/ 291] blk.23.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 215/ 291] blk.23.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 216/ 291] blk.23.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 217/ 291] blk.23.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 218/ 291] blk.24.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 219/ 291] blk.24.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 220/ 291] blk.24.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 221/ 291] blk.24.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 222/ 291] blk.24.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 223/ 291] blk.24.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 224/ 291] blk.24.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. 
size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 225/ 291] blk.24.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 226/ 291] blk.24.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 227/ 291] blk.25.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 228/ 291] blk.25.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 229/ 291] blk.25.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 230/ 291] blk.25.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 231/ 291] blk.25.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 232/ 291] blk.25.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 233/ 291] blk.25.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 234/ 291] blk.25.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 235/ 291] blk.25.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 236/ 291] blk.26.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 237/ 291] blk.26.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 238/ 291] blk.26.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 239/ 291] blk.26.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 240/ 291] blk.26.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. 
size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 241/ 291] blk.26.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 242/ 291] blk.26.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 243/ 291] blk.26.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 244/ 291] blk.26.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 245/ 291] blk.27.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 246/ 291] blk.27.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 247/ 291] blk.27.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 248/ 291] blk.27.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 249/ 291] blk.27.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 250/ 291] blk.27.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 251/ 291] blk.27.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 252/ 291] blk.27.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 253/ 291] blk.27.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 254/ 291] blk.28.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 255/ 291] blk.28.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 256/ 291] blk.28.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. 
size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 257/ 291] blk.28.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 258/ 291] blk.28.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 259/ 291] blk.28.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 260/ 291] blk.28.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 261/ 291] blk.28.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 262/ 291] blk.28.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 263/ 291] blk.29.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 264/ 291] blk.29.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 265/ 291] blk.29.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 266/ 291] blk.29.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 267/ 291] blk.29.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 268/ 291] blk.29.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 269/ 291] blk.29.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 270/ 291] blk.29.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 271/ 291] blk.29.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 272/ 291] blk.30.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. 
size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 273/ 291] blk.30.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 274/ 291] blk.30.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 275/ 291] blk.30.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 276/ 291] blk.30.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 277/ 291] blk.30.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 278/ 291] blk.30.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 279/ 291] blk.30.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 280/ 291] blk.30.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n", + "[ 281/ 291] blk.31.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 282/ 291] blk.31.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 283/ 291] blk.31.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 284/ 291] blk.31.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q4_K .. size = 32.00 MB -> 9.00 MB | hist: \n", + "[ 285/ 291] blk.31.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 286/ 291] blk.31.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q4_K .. size = 86.00 MB -> 24.19 MB | hist: \n", + "[ 287/ 291] blk.31.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q4_K .. 
size = 86.00 MB -> 24.19 MB | hist: \n", +        "[ 288/ 291]           blk.31.attn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n", +        "[ 289/ 291]            blk.31.ffn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n", +        "[ 290/ 291]                output_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n", +        "[ 291/ 291]                     output.weight - [ 4096, 32016,     1,     1], type =    f16, quantizing to q6_K .. size =   250.12 MB ->   102.59 MB | hist: \n", +        "llama_model_quantize_internal: model size  = 12853.27 MB\n", +        "llama_model_quantize_internal: quant size  =  3677.45 MB\n", +        "\n", +        "main: quantize time = 1089230.46 ms\n", +        "main:    total time = 1089230.46 ms\n" +          ] +        } +      ] +    }, +    { +      "cell_type": "markdown", +      "source": [ +        "## Run inference\n", +        "\n", +        "Here is a simple script to run your quantized models. I'm offloading every layer to the GPU (35 for a 7b parameter model) to speed up inference." +      ], +      "metadata": { +        "id": "WqI1CPiXI4dP" +      } +    }, +    { +      "cell_type": "code", +      "source": [ +        "import os\n", +        "\n", +        "model_list = [file for file in os.listdir(MODEL_NAME) if \"gguf\" in file]\n", +        "\n", +        "prompt = input(\"Enter your prompt: \")\n", +        "chosen_method = input(\"Name of the model (options: \" + \", \".join(model_list) + \"): \")\n", +        "\n", +        "# Verify the chosen method is in the list\n", +        "if chosen_method not in model_list:\n", +        "    print(\"Invalid name\")\n", +        "else:\n", +        "    qtype = f\"{MODEL_NAME}/{chosen_method}\"\n", +        "    !./llama.cpp/main -m {qtype} -n 128 --color -ngl 35 -p \"{prompt}\"" +      ], +      "metadata": { +        "colab": { +          "base_uri": "https://localhost:8080/" +        }, +        "id": "vNPL9WYg78l-", +        "outputId": "3c3e7d2f-f0de-429d-fd97-dab480bc514a" +      }, +      "execution_count": null, +      "outputs": [ +        { +          "output_type": "stream", +          "name": "stdout", +          "text": [ +            "Enter your prompt: prompt\n", +            "Please specify the quantization method to run the model (options: q4_k_s): q4_k_s\n", +            "main: build = 1100 (dd0dc36)\n", +            "main: seed  = 
1693227123\n", + "ggml_init_cublas: found 1 CUDA devices:\n", + " Device 0: Tesla T4, compute capability 7.5\n", + "llama_model_loader: loaded meta data with 17 key-value pairs and 291 tensors from EvolCodeLlama-7b/evolcodellama-7b.gguf.q4_k_s.bin (version GGUF V2 (latest))\n", + "llama_model_loader: - tensor 0: token_embd.weight q4_K [ 4096, 32016, 1, 1 ]\n", + "llama_model_loader: - tensor 1: blk.0.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 2: blk.0.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 3: blk.0.attn_v.weight q5_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 4: blk.0.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 5: blk.0.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 6: blk.0.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 7: blk.0.ffn_down.weight q5_K [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 8: blk.0.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 9: blk.0.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 10: blk.1.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 11: blk.1.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 12: blk.1.attn_v.weight q5_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 13: blk.1.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 14: blk.1.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 15: blk.1.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 16: blk.1.ffn_down.weight q5_K [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 17: blk.1.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 18: blk.1.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 19: blk.2.attn_q.weight q4_K [ 4096, 4096, 1, 1 
]\n", + "llama_model_loader: - tensor 20: blk.2.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 21: blk.2.attn_v.weight q5_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 22: blk.2.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 23: blk.2.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 24: blk.2.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 25: blk.2.ffn_down.weight q5_K [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 26: blk.2.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 27: blk.2.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 28: blk.3.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 29: blk.3.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 30: blk.3.attn_v.weight q5_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 31: blk.3.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 32: blk.3.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 33: blk.3.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 34: blk.3.ffn_down.weight q5_K [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 35: blk.3.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 36: blk.3.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 37: blk.4.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 38: blk.4.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 39: blk.4.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 40: blk.4.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 41: blk.4.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 42: blk.4.ffn_up.weight q4_K [ 4096, 11008, 
1, 1 ]\n", + "llama_model_loader: - tensor 43: blk.4.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 44: blk.4.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 45: blk.4.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 46: blk.5.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 47: blk.5.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 48: blk.5.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 49: blk.5.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 50: blk.5.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 51: blk.5.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 52: blk.5.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 53: blk.5.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 54: blk.5.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 55: blk.6.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 56: blk.6.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 57: blk.6.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 58: blk.6.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 59: blk.6.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 60: blk.6.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 61: blk.6.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 62: blk.6.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 63: blk.6.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 64: blk.7.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 65: blk.7.attn_k.weight q4_K [ 4096, 4096, 1, 
1 ]\n", + "llama_model_loader: - tensor 66: blk.7.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 67: blk.7.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 68: blk.7.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 69: blk.7.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 70: blk.7.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 71: blk.7.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 72: blk.7.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 73: blk.8.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 74: blk.8.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 75: blk.8.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 76: blk.8.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 77: blk.8.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 78: blk.8.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 79: blk.8.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 80: blk.8.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 81: blk.8.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 82: blk.9.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 83: blk.9.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 84: blk.9.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 85: blk.9.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 86: blk.9.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 87: blk.9.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 88: blk.9.ffn_down.weight q4_K [ 11008, 
4096, 1, 1 ]\n", + "llama_model_loader: - tensor 89: blk.9.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 90: blk.9.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 91: blk.10.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 92: blk.10.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 93: blk.10.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 94: blk.10.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 95: blk.10.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 96: blk.10.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 97: blk.10.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 98: blk.10.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 99: blk.10.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 100: blk.11.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 101: blk.11.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 102: blk.11.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 103: blk.11.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 104: blk.11.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 105: blk.11.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 106: blk.11.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 107: blk.11.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 108: blk.11.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 109: blk.12.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 110: blk.12.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 111: 
blk.12.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 112: blk.12.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 113: blk.12.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 114: blk.12.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 115: blk.12.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 116: blk.12.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 117: blk.12.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 118: blk.13.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 119: blk.13.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 120: blk.13.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 121: blk.13.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 122: blk.13.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 123: blk.13.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 124: blk.13.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 125: blk.13.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 126: blk.13.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 127: blk.14.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 128: blk.14.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 129: blk.14.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 130: blk.14.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 131: blk.14.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 132: blk.14.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 133: blk.14.ffn_down.weight q4_K [ 11008, 
4096, 1, 1 ]\n", + "llama_model_loader: - tensor 134: blk.14.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 135: blk.14.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 136: blk.15.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 137: blk.15.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 138: blk.15.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 139: blk.15.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 140: blk.15.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 141: blk.15.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 142: blk.15.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 143: blk.15.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 144: blk.15.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 145: blk.16.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 146: blk.16.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 147: blk.16.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 148: blk.16.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 149: blk.16.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 150: blk.16.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 151: blk.16.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 152: blk.16.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 153: blk.16.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 154: blk.17.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 155: blk.17.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 
156: blk.17.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 157: blk.17.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 158: blk.17.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 159: blk.17.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 160: blk.17.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 161: blk.17.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 162: blk.17.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 163: blk.18.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 164: blk.18.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 165: blk.18.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 166: blk.18.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 167: blk.18.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 168: blk.18.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 169: blk.18.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 170: blk.18.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 171: blk.18.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 172: blk.19.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 173: blk.19.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 174: blk.19.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 175: blk.19.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 176: blk.19.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 177: blk.19.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 178: blk.19.ffn_down.weight q4_K [ 
11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 179: blk.19.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 180: blk.19.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 181: blk.20.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 182: blk.20.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 183: blk.20.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 184: blk.20.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 185: blk.20.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 186: blk.20.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 187: blk.20.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 188: blk.20.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 189: blk.20.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 190: blk.21.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 191: blk.21.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 192: blk.21.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 193: blk.21.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 194: blk.21.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 195: blk.21.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 196: blk.21.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 197: blk.21.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 198: blk.21.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 199: blk.22.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 200: blk.22.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - 
tensor 201: blk.22.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 202: blk.22.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 203: blk.22.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 204: blk.22.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 205: blk.22.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 206: blk.22.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 207: blk.22.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 208: blk.23.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 209: blk.23.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 210: blk.23.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 211: blk.23.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 212: blk.23.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 213: blk.23.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 214: blk.23.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 215: blk.23.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 216: blk.23.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 217: blk.24.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 218: blk.24.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 219: blk.24.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 220: blk.24.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 221: blk.24.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 222: blk.24.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 223: blk.24.ffn_down.weight 
q4_K [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 224: blk.24.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 225: blk.24.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 226: blk.25.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 227: blk.25.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 228: blk.25.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 229: blk.25.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 230: blk.25.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 231: blk.25.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 232: blk.25.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 233: blk.25.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 234: blk.25.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 235: blk.26.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 236: blk.26.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 237: blk.26.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 238: blk.26.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 239: blk.26.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 240: blk.26.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 241: blk.26.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 242: blk.26.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 243: blk.26.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 244: blk.27.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 245: blk.27.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", + 
"llama_model_loader: - tensor 246: blk.27.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 247: blk.27.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 248: blk.27.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 249: blk.27.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 250: blk.27.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 251: blk.27.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 252: blk.27.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 253: blk.28.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 254: blk.28.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 255: blk.28.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 256: blk.28.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 257: blk.28.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 258: blk.28.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 259: blk.28.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 260: blk.28.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 261: blk.28.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 262: blk.29.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 263: blk.29.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 264: blk.29.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 265: blk.29.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 266: blk.29.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 267: blk.29.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 268: 
blk.29.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 269: blk.29.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 270: blk.29.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 271: blk.30.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 272: blk.30.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 273: blk.30.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 274: blk.30.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 275: blk.30.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 276: blk.30.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 277: blk.30.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 278: blk.30.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 279: blk.30.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 280: blk.31.attn_q.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 281: blk.31.attn_k.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 282: blk.31.attn_v.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 283: blk.31.attn_output.weight q4_K [ 4096, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 284: blk.31.ffn_gate.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 285: blk.31.ffn_up.weight q4_K [ 4096, 11008, 1, 1 ]\n", + "llama_model_loader: - tensor 286: blk.31.ffn_down.weight q4_K [ 11008, 4096, 1, 1 ]\n", + "llama_model_loader: - tensor 287: blk.31.attn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 288: blk.31.ffn_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 289: output_norm.weight f32 [ 4096, 1, 1, 1 ]\n", + "llama_model_loader: - tensor 290: output.weight q6_K [ 4096, 32016, 1, 1 ]\n", + 
"llama_model_loader: - kv 0: general.architecture str \n", + "llama_model_loader: - kv 1: general.name str \n", + "llama_model_loader: - kv 2: llama.context_length u32 \n", + "llama_model_loader: - kv 3: llama.embedding_length u32 \n", + "llama_model_loader: - kv 4: llama.block_count u32 \n", + "llama_model_loader: - kv 5: llama.feed_forward_length u32 \n", + "llama_model_loader: - kv 6: llama.rope.dimension_count u32 \n", + "llama_model_loader: - kv 7: llama.attention.head_count u32 \n", + "llama_model_loader: - kv 8: llama.attention.head_count_kv u32 \n", + "llama_model_loader: - kv 9: llama.attention.layer_norm_rms_epsilon f32 \n", + "llama_model_loader: - kv 10: llama.rope.freq_base f32 \n", + "llama_model_loader: - kv 11: general.file_type u32 \n", + "llama_model_loader: - kv 12: tokenizer.ggml.model str \n", + "llama_model_loader: - kv 13: tokenizer.ggml.tokens arr \n", + "llama_model_loader: - kv 14: tokenizer.ggml.scores arr \n", + "llama_model_loader: - kv 15: tokenizer.ggml.token_type arr \n", + "llama_model_loader: - kv 16: general.quantization_version u32 \n", + "llama_model_loader: - type f32: 65 tensors\n", + "llama_model_loader: - type q4_K: 217 tensors\n", + "llama_model_loader: - type q5_K: 8 tensors\n", + "llama_model_loader: - type q6_K: 1 tensors\n", + "llm_load_print_meta: format = GGUF V2 (latest)\n", + "llm_load_print_meta: arch = llama\n", + "llm_load_print_meta: vocab type = SPM\n", + "llm_load_print_meta: n_vocab = 32016\n", + "llm_load_print_meta: n_merges = 0\n", + "llm_load_print_meta: n_ctx_train = 16384\n", + "llm_load_print_meta: n_ctx = 512\n", + "llm_load_print_meta: n_embd = 4096\n", + "llm_load_print_meta: n_head = 32\n", + "llm_load_print_meta: n_head_kv = 32\n", + "llm_load_print_meta: n_layer = 32\n", + "llm_load_print_meta: n_rot = 128\n", + "llm_load_print_meta: n_gqa = 1\n", + "llm_load_print_meta: f_norm_eps = 1.0e-05\n", + "llm_load_print_meta: f_norm_rms_eps = 1.0e-05\n", + "llm_load_print_meta: n_ff = 11008\n", + 
"llm_load_print_meta: freq_base = 1000000.0\n", + "llm_load_print_meta: freq_scale = 1\n", + "llm_load_print_meta: model type = 7B\n", + "llm_load_print_meta: model ftype = mostly Q4_K - Small\n", + "llm_load_print_meta: model size = 6.74 B\n", + "llm_load_print_meta: general.name = LLaMA\n", + "llm_load_print_meta: BOS token = 1 ''\n", + "llm_load_print_meta: EOS token = 2 ''\n", + "llm_load_print_meta: UNK token = 0 ''\n", + "llm_load_print_meta: LF token = 13 '<0x0A>'\n", + "llm_load_tensors: ggml ctx size = 0.09 MB\n", + "llm_load_tensors: using CUDA for GPU acceleration\n", + "llm_load_tensors: mem required = 70.44 MB (+ 256.00 MB per state)\n", + "llm_load_tensors: offloading 32 repeating layers to GPU\n", + "llm_load_tensors: offloading non-repeating layers to GPU\n", + "llm_load_tensors: offloading v cache to GPU\n", + "llm_load_tensors: offloading k cache to GPU\n", + "llm_load_tensors: offloaded 35/35 layers to GPU\n", + "llm_load_tensors: VRAM used: 3864 MB\n", + "..................................................................................................\n", + "llama_new_context_with_model: kv self size = 256.00 MB\n", + "llama_new_context_with_model: compute buffer total size = 71.94 MB\n", + "llama_new_context_with_model: VRAM scratch buffer: 70.53 MB\n", + "\n", + "system_info: n_threads = 2 / 2 | AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | \n", + "sampling: repeat_last_n = 64, repeat_penalty = 1.100000, presence_penalty = 0.000000, frequency_penalty = 0.000000, top_k = 40, tfs_z = 1.000000, top_p = 0.950000, typical_p = 1.000000, temp = 0.800000, mirostat = 0, mirostat_lr = 0.100000, mirostat_ent = 5.000000\n", + "generate: n_ctx = 512, n_batch = 512, n_predict = 128, n_keep = 0\n", + "\n", + "\n", + "\u001b[33m prompt\u001b[0m.\t\t\t\t\n", + "\t\t\t\t\tif( !this->m_pMiscSettings ) { return; }\t// If 
no misc settings, do nothing\n", + "\t\t\t\t\t\n", + "\t\t\t\t\t// Get the value of the checkbox for \"Always on top\"\n", + "\t\t\t\t\tbool alwaysOnTop = this->m_pMiscSettings->GetBool(L\"AlwaysOnTop\", false);\n", + "\t\t\t\t\tthis->SetWindowPos((alwaysOnTop ? HWND_TOPMOST : HWND_NOTOPMOST\n", + "llama_print_timings: load time = 1392.10 ms\n", + "llama_print_timings: sample time = 147.99 ms / 128 runs ( 1.16 ms per token, 864.92 tokens per second)\n", + "llama_print_timings: prompt eval time = 261.80 ms / 2 tokens ( 130.90 ms per token, 7.64 tokens per second)\n", + "llama_print_timings: eval time = 5923.18 ms / 127 runs ( 46.64 ms per token, 21.44 tokens per second)\n", + "llama_print_timings: total time = 6370.96 ms\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Push to hub\n", + "\n", + "To push your model to the hub, you'll need to input your Hugging Face token (https://huggingface.co/settings/tokens) in Google Colab's \"Secrets\" tab. The following code creates a new repo with the \"-GGUF\" suffix. Don't forget to change the `username` variable." 
+ ], + "metadata": { + "id": "Ar8pO7bb80US" + } + }, + { + "cell_type": "code", + "source": [ + "!pip install -q huggingface_hub\n", + "from huggingface_hub import create_repo, HfApi\n", + "from google.colab import userdata\n", + "\n", + "# Defined in the secrets tab in Google Colab\n", + "hf_token = userdata.get('huggingface')\n", + "\n", + "api = HfApi()\n", + "username = \"mlabonne\"\n", + "\n", + "# Create empty repo\n", + "create_repo(\n", + " repo_id = f\"{username}/{MODEL_NAME}-GGUF\",\n", + " repo_type=\"model\",\n", + " exist_ok=True,\n", + " token=hf_token\n", + ")\n", + "\n", + "# Upload gguf files\n", + "api.upload_folder(\n", + " folder_path=MODEL_NAME,\n", + " repo_id=f\"{username}/{MODEL_NAME}-GGUF\",\n", + " allow_patterns=f\"*.gguf\",\n", + " token=hf_token\n", + ")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 163, + "referenced_widgets": [ + "c281b60e104f4c5da547bbdd7208d4bc", + "74b084c97f6f46d293a197bf9804460c", + "1409574c4f9742e7a711965dd2c8ad87", + "704ecf9409244e0b93612d6a11476346", + "b1a8d3a9a379415393d9e7d995a40788", + "f928772f92724579b068e984d9eef387", + "1c8a6b959f9c4443a92f58eff1b03077", + "9fb5726f91734b1da149784680dc9624", + "202a8eb11eda4e58942113fbeacfdc3d", + "970d4d3daf854f92bd650dc4da99e1bc", + "24b1e007921046b1adc61db0f2bf9fc7", + "24d3d72f5de54de8a1ded4e528dde332", + "e90cb0ce526a4556bc643ba6c5485661", + "76e7372656b745c889b9283b76c04148", + "ce0204c7e1ff4a51b2648284a2492262", + "6dbb8e8a5ebb40a4ba910b09dde27e1a", + "7944af54f2564920822d5d4b348896c4", + "1b55372f62494ca0baabf87f7e7f4ba8", + "bf612001ad354ea19de6ee45a166a43c", + "a8e4691970b14955bfb4865bcef5e912", + "2e2fabac70484c1c8b16fa6ca8fd8537", + "bf53c635fa374420ad850eea22cd1e31", + "065d59126a734c1aa096ba40cd4a129f", + "e8855d5678a342f5a33171aa74d3b7bc", + "7eb6de1a979b46f7b234724073f8bc3a", + "6ae4640196da492fadafeb63f4bc89d2", + "cef83433dbea4f529f43722fe78a8baf", + "845ba8115d5140ac9ee22af4a9e6a03b", + 
"cdd888041aca4dcf8adc785309071fc6", + "cf63214cb4f8442999fa5b971035fe4f", + "7d9b22f2b7fe4a749f989e247bce446a", + "7f8e268db8144adfb09d089784d8411a" + ] + }, + "id": "UOyKfUD-8jmh", + "outputId": "3c8df47b-f350-4251-a19f-4b9fb1116381" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/268.8 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[91m━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m71.7/268.8 kB\u001b[0m \u001b[31m2.0 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m268.8/268.8 kB\u001b[0m \u001b[31m3.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "VBox(children=(HTML(value='