From 1bad14298082df4bbe180934a9b3296b7d503a55 Mon Sep 17 00:00:00 2001
From: Maxime Labonne <81252890+mlabonne@users.noreply.github.com>
Date: Sat, 22 Jul 2023 22:16:56 +0100
Subject: [PATCH] Created using Colaboratory

---
 4_bit_LLM_Quantization_with_GPTQ.ipynb | 252 +++++++++++++++++++++++++
 1 file changed, 252 insertions(+)
 create mode 100644 4_bit_LLM_Quantization_with_GPTQ.ipynb

diff --git a/4_bit_LLM_Quantization_with_GPTQ.ipynb b/4_bit_LLM_Quantization_with_GPTQ.ipynb
new file mode 100644
index 0000000..f2dec8d
--- /dev/null
+++ b/4_bit_LLM_Quantization_with_GPTQ.ipynb
@@ -0,0 +1,252 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": [],
+      "gpuType": "T4",
+      "authorship_tag": "ABX9TyOS2QEuJ1BDI/3IFsLsFIZo",
+      "include_colab_link": true
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    },
+    "accelerator": "GPU"
+  },
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "view-in-github",
+        "colab_type": "text"
+      },
+      "source": [
+        "\"Open"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "# 4-bit LLM Quantization with GPTQ\n",
+        "> 🗣️ [Large Language Model Course](https://github.com/mlabonne/llm-course)\n",
+        "\n",
+        "❤️ Created by [@maximelabonne](https://twitter.com/maximelabonne).\n",
+        "\n",
+        "Companion notebook to execute the code from the following article: https://mlabonne.github.io/blog/4bit_quantization/"
+      ],
+      "metadata": {
+        "id": "yezrHxYvg_wR"
+      }
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "BhufqqQAaz6e"
+      },
+      "outputs": [],
+      "source": [
+        "!BUILD_CUDA_EXT=0 pip install -q auto-gptq transformers"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "import random\n",
+        "\n",
+        "from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig\n",
+        "from datasets import load_dataset\n",
+        "import torch\n",
+        "from transformers import AutoTokenizer\n",
+        "\n",
+        "\n",
+        "# Define base model and output directory\n",
+        "model_id = \"gpt2\"\n",
+        "out_dir = model_id + \"-GPTQ\""
+      ],
+      "metadata": {
+        "id": "dg8NyBL0ZNyw"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Load quantize config, model and tokenizer\n",
+        "quantize_config = BaseQuantizeConfig(\n",
+        "    bits=4,\n",
+        "    group_size=128,\n",
+        "    damp_percent=0.01,\n",
+        "    desc_act=False,\n",
+        ")\n",
+        "model = AutoGPTQForCausalLM.from_pretrained(model_id, quantize_config)\n",
+        "tokenizer = AutoTokenizer.from_pretrained(model_id)"
+      ],
+      "metadata": {
+        "id": "C9352jN0ZP6I"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
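+    {
+      "cell_type": "markdown",
+      "source": [
+        "With `bits=4` and `group_size=128`, weights are stored as 4-bit integers, with one scale and zero-point shared by each group of 128 weights. The next cell is a back-of-the-envelope sketch of the raw storage savings; it assumes auto-gptq exposes the wrapped Hugging Face model as `model.model`, and it ignores both the per-group metadata and the modules (such as embeddings) that GPTQ leaves unquantized."
+      ],
+      "metadata": {}
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Back-of-the-envelope estimate of the 4-bit weight-storage savings.\n",
+        "# Assumption: `model.model` exposes the wrapped Hugging Face model;\n",
+        "# per-group scales/zero-points and unquantized modules are ignored.\n",
+        "n_params = sum(p.numel() for p in model.model.parameters())\n",
+        "fp32_mb = n_params * 4 / 1024**2    # FP32: 4 bytes per weight\n",
+        "int4_mb = n_params * 0.5 / 1024**2  # 4-bit: 0.5 byte per weight\n",
+        "print(f\"{n_params/1e6:.0f}M parameters: ~{fp32_mb:.0f} MB in FP32 vs ~{int4_mb:.0f} MB in 4-bit\")"
+      ],
+      "metadata": {},
+      "execution_count": null,
+      "outputs": []
+    },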
"e4ebd71a-2854-4347-cebe-08cf040d1eb6" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "WARNING:datasets.builder:Found cached dataset json (/root/.cache/huggingface/datasets/allenai___json/allenai--c4-6e494e9c0ee1404e/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)\n", + "Token indices sequence length is longer than the specified maximum sequence length for this model (2441065 > 1024). Running this sequence through the model will result in indexing errors\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "%%time\n", + "\n", + "# Quantize with GPTQ\n", + "model.quantize(\n", + " examples_ids,\n", + " batch_size=1,\n", + " use_triton=True,\n", + ")\n", + "\n", + "# Save model and tokenizer\n", + "model.save_quantized(out_dir, use_safetensors=True)\n", + "tokenizer.save_pretrained(out_dir)" + ], + "metadata": { + "id": "ETsG2iYrXaUg", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "e48b825e-0ebc-4a73-dbfd-b5571cafd24e" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "CPU times: user 4min 35s, sys: 3.49 s, total: 4min 39s\n", + "Wall time: 5min 8s\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "('gpt2-GPTQ/tokenizer_config.json',\n", + " 'gpt2-GPTQ/special_tokens_map.json',\n", + " 'gpt2-GPTQ/vocab.json',\n", + " 'gpt2-GPTQ/merges.txt',\n", + " 'gpt2-GPTQ/added_tokens.json',\n", + " 'gpt2-GPTQ/tokenizer.json')" + ] + }, + "metadata": {}, + "execution_count": 5 + } + ] + }, + { + "cell_type": "code", + "source": [ + "device = \"cuda:0\" if torch.cuda.is_available() else \"cpu\"\n", + "\n", + "# Reload model and tokenizer\n", + "model = AutoGPTQForCausalLM.from_quantized(\n", + " out_dir,\n", + " device=device,\n", + " use_triton=True,\n", + " use_safetensors=True,\n", + ")\n", + "tokenizer = AutoTokenizer.from_pretrained(out_dir)" + ], + "metadata": { + "id": "nktu1FsdZ9sd", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "9943c829-1b58-474a-f245-6aefa09d81dc" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "WARNING:accelerate.utils.modeling:The safetensors archive passed at gpt2-GPTQ/gptq_model-4bit-128g.safetensors does not contain metadata. Make sure to save your model with the `save_pretrained` method. Defaulting to 'pt' metadata.\n", + "WARNING:auto_gptq.modeling._base:GPT2GPTQForCausalLM hasn't fused attention module yet, will skip inject fused attention.\n", + "WARNING:auto_gptq.modeling._base:GPT2GPTQForCausalLM hasn't fused mlp module yet, will skip inject fused mlp.\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "from transformers import pipeline\n", + "\n", + "generator = pipeline('text-generation', model=model, tokenizer=tokenizer)\n", + "result = generator(\"I have a dream\", do_sample=True, max_length=50)[0]['generated_text']\n", + "print(result)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "cRhIGrXdiFdt", + "outputId": "6dca2078-6f01-44da-9895-3a03bdfb4b5b" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "The model 'GPT2GPTQForCausalLM' is not supported for text-generation. 
+  ]
+}
\ No newline at end of file