@@ -11,6 +11,8 @@ from transformers import (
    GPT2LMHeadModel,
    GPTJForCausalLM,
    GPTNeoForCausalLM,
    GPTNeoXForCausalLM,
    OPTForCausalLM,
    PreTrainedModel,
    PreTrainedTokenizer,
    pipeline,
@@ -19,7 +21,7 @@ from transformers import (
from manifest.api.models.model import Model
class GPT2Pipeline:
class GPTPipeline:
    """Custom GPT3 Pipeline."""
    def __init__(
@@ -33,7 +35,6 @@ class GPT2Pipeline:
            if (device == -1 or not torch.cuda.is_available())
            else torch.device(f"cuda:{device}")
        )
        self.model = self.model.to(self.device)  # type: ignore
    def __call__(self, text: str, **kwargs: Any) -> List[Dict[str, str]]:
        """Generate from text.
@@ -59,28 +60,42 @@ class GPT2Pipeline:
            num_return_sequences=kwargs.get("num_return_sequences"),
        )
        generated_sequences = [
            {"generated_text": self.tokenizer.decode(output_seq)}
            {
                "generated_text": self.tokenizer.decode(
                    output_seq, skip_special_tokens=True
                )
            }
            for output_seq in output_sequences
        ]
        return generated_sequences
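
# Maps each supported model name to the transformers class used to load its weights.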
MODEL_REGISTRY = {
    "EleutherAI/gpt-j-6B": GPTJForCausalLM,
    "EleutherAI/gpt-neo-125M": GPTNeoForCausalLM,
    "EleutherAI/gpt-neo-1.3B": GPTNeoForCausalLM,
    "EleutherAI/gpt-neo-2.7B": GPTNeoForCausalLM,
    "EleutherAI/gpt-j-6B": GPTJForCausalLM,
    "EleutherAI/gpt-neox-20b": GPTNeoXForCausalLM,
    "facebook/opt-1.3b": OPTForCausalLM,
    "facebook/opt-2.7b": OPTForCausalLM,
    "facebook/opt-6.7b": OPTForCausalLM,
    "facebook/opt-13b": OPTForCausalLM,
    "facebook/opt-30b": OPTForCausalLM,
    "gpt2": GPT2LMHeadModel,
    "bigscience/T0pp": AutoModelForSeq2SeqLM,
    "bigscience/T0_3B": AutoModelForSeq2SeqLM,
}
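
# Maps each supported model name to the pipeline class (or pipeline factory) used for generation.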
MODEL_PIPELINE = {
    "EleutherAI/gpt-j-6B": GPT2Pipeline,
    "EleutherAI/gpt-neo-125M": GPT2Pipeline,
    "EleutherAI/gpt-neo-1.3B": GPT2Pipeline,
    "EleutherAI/gpt-neo-2.7B": GPT2Pipeline,
    "gpt2": GPT2Pipeline,
    "EleutherAI/gpt-neo-1.3B": GPTPipeline,
    "EleutherAI/gpt-neo-2.7B": GPTPipeline,
    "EleutherAI/gpt-j-6B": GPTPipeline,
    "EleutherAI/gpt-neox-20b": GPTPipeline,
    "facebook/opt-1.3b": GPTPipeline,
    "facebook/opt-2.7b": GPTPipeline,
    "facebook/opt-6.7b": GPTPipeline,
    "facebook/opt-13b": GPTPipeline,
    "facebook/opt-30b": GPTPipeline,
    "gpt2": GPTPipeline,
    "bigscience/T0pp": partial(pipeline, "text2text-generation"),
    "bigscience/T0_3B": partial(pipeline, "text2text-generation"),
}
@@ -89,7 +104,14 @@ MODEL_PIPELINE = {
class HuggingFaceModel(Model):
    """Huggingface model."""
    def __init__(self, model_name: str, cache_dir: str, device: int):
    def __init__(
        self,
        model_name: str,
        cache_dir: str,
        device: int,
        use_accelerate: bool,
        use_fp32: bool,
    ):
        """
        Initialize model.
@@ -97,6 +119,10 @@ class HuggingFaceModel(Model):
        Args:
            model_name: model name string.
            cache_dir: cache directory for model.
            device: device to use for model.
            use_accelerate: whether to use accelerate for multi-gpu inference.
            use_fp32: use fp32 for model weights.
        """
        # Check if providing path
        self.model_path = model_name
@@ -107,19 +133,80 @@ class HuggingFaceModel(Model):
            model_name = config["_name_or_path"]
        self.model_name = model_name
        print("Model Name:", self.model_name, "Model Path:", self.model_path)
        try:
            tokenizer = AutoTokenizer.from_pretrained(model_name)
        except ValueError:
            tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
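        # Load weights in half precision by default to save memory; use fp32 only when requested.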
        dtype = torch.float32 if use_fp32 else torch.float16
        model = MODEL_REGISTRY[model_name].from_pretrained(  # type: ignore
            self.model_path, cache_dir=cache_dir
            self.model_path, cache_dir=cache_dir, torch_dtype=dtype
        )
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        if use_accelerate:
            self._dispatch_accelerate_model(model)
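            # Weights are already spread across devices by accelerate; the pipeline only needs a GPU id for its input tensors.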
            device = 0
        else:
            if device > -1:
                model = model.to(device)  # type: ignore
        self.pipeline = MODEL_PIPELINE[model_name](  # type: ignore
            model=model, tokenizer=tokenizer, device=device
        )
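        # Decoder-only pipelines return the prompt as part of the generated text; the T0 (seq2seq) pipeline does not.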
        self.returns_input = "gpt" in model_name
        self.returns_input = "T0" not in model_name
    def get_init_params(self) -> Dict:
        """Return init params to determine what model is being used."""
        return {"model_name": self.model_name, "model_path": self.model_path}
    def _dispatch_accelerate_model(self, model: PreTrainedModel) -> None:
        """
        Load model with accelerate.
        Adapted from https://colab.research.google.com/drive/14wnxMvD9zsiBQo2FtT
        pxn6w2cpXCcb-7#scrollTo=y8Ne7jJdaF9F&uniqifier=1
        Args:
            model: loaded model to dispatch across devices.
        """
        from accelerate import dispatch_model, infer_auto_device_map
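        # Tie input/output embeddings first so shared weights are assigned to the same device in the map.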
        model.tie_weights()  # type: ignore
        # Get the model where we can infer devices from
        if hasattr(model, "model"):
            # OPT
            main_model = model.model  # type: ignore
            model_getter = "model."
        else:
            # Eleuther Neo and J
            main_model = model
            model_getter = ""
        raw_device_map = infer_auto_device_map(
            main_model,
            no_split_module_classes=[
                "OPTDecoderLayer",
                "GPTNeoBlock",
                "GPTJBlock",
                "GPTNeoXLayer",
            ],
            dtype=model.dtype,  # type: ignore
        )
        # Hacky fix for Eleuther getting the "weight" of embeddings
        device_map = {}
        for k, v in raw_device_map.items():
            if k in {"wte", "wpe"}:
                device_map[f"{model_getter}{k}.weight"] = v
            else:
                device_map[f"{model_getter}{k}"] = v
        # For OPT models
        if "lm_head" not in device_map:
            if "disk" in device_map.values():
                device_map["lm_head"] = "disk"
            else:
                device_map["lm_head"] = max(device_map.values())
        print("Device Map", device_map)
        dispatch_model(model, device_map=device_map)
        return
    def generate(self, prompt: str, **kwargs: Any) -> List[str]:
        """
        Generate the prompt from model.
@@ -134,12 +221,15 @@ class HuggingFaceModel(Model):
        """
        num_return = kwargs.get("n")
        final_results = []
        encoded_prompt = self.pipeline.tokenizer.encode(
        # Add tokens for length
        encoded_prompt_with_special = self.pipeline.tokenizer.encode(prompt)
        # Remove tokens as the pipeline removes special tokens upon return
        encoded_prompt_without_special = self.pipeline.tokenizer.encode(
            prompt, add_special_tokens=False
        )
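        # For causal LMs, generate's max_length counts the prompt tokens too, so the prompt length is added to max_tokens.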
        result = self.pipeline(
            prompt,
            max_length=kwargs.get("max_tokens") + len(encoded_prompt),
            max_length=kwargs.get("max_tokens") + len(encoded_prompt_with_special),
            temperature=kwargs.get("temperature"),
            repetition_penalty=kwargs.get("repetition_penalty"),
            top_k=kwargs.get("top_k"),
@@ -148,7 +238,7 @@ class HuggingFaceModel(Model):
            num_return_sequences=num_return,
        )
        # Correctly returns prompt without extra spaces
        decoded_prompt = self.pipeline.tokenizer.decode(encoded_prompt)
        decoded_prompt = self.pipeline.tokenizer.decode(encoded_prompt_without_special)
        if self.returns_input:
            start_idx = len(decoded_prompt)
        else: