feat: add local huggingface embedding models (#76)

pull/82/head
Laurel Orr 1 year ago committed by GitHub
parent 40de0e7f59
commit 0fb192a0a2

@ -69,19 +69,24 @@
"```\n",
"python3 manifest/api/app.py --model_type huggingface --model_name_or_path EleutherAI/gpt-neo-125M --device 0\n",
"```\n",
"in a separate `screen` or `tmux`."
"or\n",
"```\n",
"python3 manifest/api/app.py --model_type sentence_transformers --model_name_or_path all-mpnet-base-v2 --device 0\n",
"```\n",
"\n",
"in a separate `screen` or `tmux`. Make sure to note the port. You can change this with `export FLASK_PORT=<port>`."
]
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'model_name': 'EleutherAI/gpt-neo-125M', 'model_path': 'EleutherAI/gpt-neo-125M'}\n"
"{'model_name': 'all-mpnet-base-v2', 'model_path': 'all-mpnet-base-v2', 'client_name': 'huggingfaceembedding'}\n"
]
}
],
@ -91,7 +96,7 @@
"# Local hosted GPT Neo 125M\n",
"manifest = Manifest(\n",
" client_name=\"huggingfaceembedding\",\n",
" client_connection=\"http://127.0.0.1:6001\",\n",
" client_connection=\"http://127.0.0.1:6000\",\n",
" cache_name=\"sqlite\",\n",
" cache_connection=\"my_sqlite_manifest.sqlite\"\n",
")\n",
@ -100,12 +105,24 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(768,)\n",
"(768,) (768,)\n"
]
}
],
"source": [
"emb = manifest.run(\"Is this an embedding?\")\n",
"emb2 = manifest.run(\"Is this an embedding?\", aggregation=\"mean\")"
"print(emb.shape)\n",
"\n",
"emb = manifest.run([\"Is this an embedding?\", \"Bananas!!!\"])\n",
"print(emb[0].shape, emb[1].shape)"
]
}
],
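Once the embedding server is up, a quick way to confirm which model is loaded (and on which port) is to hit the `/params` endpoint directly; a minimal sketch, assuming the server was launched with `FLASK_PORT=6000` to match the notebook's `client_connection`:

```python
# Not part of the diff: query the running server's /params endpoint (POST, per the clients).
# The port 6000 is an assumption taken from the notebook's client_connection.
import requests

params = requests.post("http://127.0.0.1:6000/params").json()
print(params)
# e.g. {'model_name': 'all-mpnet-base-v2', 'model_path': 'all-mpnet-base-v2'}
```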

@ -16,6 +16,7 @@ from manifest.api.models.huggingface import (
CrossModalEncoderModel,
TextGenerationModel,
)
from manifest.api.models.sentence_transformer import SentenceTransformerModel
from manifest.api.response import ModelResponse
os.environ["TOKENIZERS_PARALLELISM"] = "false"
@ -28,6 +29,7 @@ model_type = None
PORT = int(os.environ.get("FLASK_PORT", 5000))
MODEL_CONSTRUCTORS = {
"huggingface": TextGenerationModel,
"sentence_transformers": SentenceTransformerModel,
"huggingface_crossmodal": CrossModalEncoderModel,
"diffuser": DiffuserModel,
}
@ -198,11 +200,14 @@ def completions() -> Response:
@app.route("/embed", methods=["POST"])
def embed() -> Dict:
def embed() -> Response:
"""Get embed for generation."""
modality = request.json["modality"]
if "modality" in request.json:
modality = request.json["modality"]
else:
modality = "text"
if modality == "text":
prompts = request.json["prompts"]
prompts = request.json["prompt"]
elif modality == "image":
import base64
@ -210,19 +215,36 @@ def embed() -> Dict:
prompts = [
Image.open(io.BytesIO(base64.b64decode(data)))
for data in request.json["prompts"]
for data in request.json["prompt"]
]
else:
raise ValueError("modality must be text or image")
results = []
embeddings = model.embed(prompts)
for embedding in embeddings:
results.append(embedding.tolist())
try:
results = []
embeddings = model.embed(prompts)
for embedding in embeddings:
results.append(
{
"array": embedding,
"logprob": None,
"tokens": None,
"token_logprobs": None,
}
)
# transform the result into the openai format
# return Response(results, response_type="text_completion").__dict__()
return {"result": results}
return Response(
json.dumps(
ModelResponse(results, response_type="embedding_generation").__dict__()
),
status=200,
)
except Exception as e:
logger.error(e)
return Response(
json.dumps({"message": str(e)}),
status=400,
)
@app.route("/score_sequence", methods=["POST"])
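For reference, the reworked `/embed` route can be exercised without the Manifest client; a rough sketch (URL, port, and prompts are placeholders), showing that embeddings now come back in the `ModelResponse` format, with each vector under `choices[i]["array"]`:

```python
# Illustrative only: call the updated /embed endpoint directly.
# "modality" is optional and defaults to "text"; the request key is "prompt".
import requests

resp = requests.post(
    "http://127.0.0.1:6000/embed",  # port assumed from the notebook above
    json={"prompt": ["Is this an embedding?", "Bananas!!!"]},
).json()
vectors = [choice["array"] for choice in resp["choices"]]
print(len(vectors), len(vectors[0]))  # 2 embeddings; 768 dims for all-mpnet-base-v2
```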

@ -1,7 +1,8 @@
"""Huggingface model."""
"""Diffuser model."""
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union
import numpy as np
import torch
from diffusers import StableDiffusionPipeline
@ -93,6 +94,19 @@ class DiffuserModel(Model):
# Return None for logprobs and token logprobs
return [(im, None, None, None) for im in result["images"]]
@torch.no_grad()
def embed(self, prompt: Union[str, List[str]], **kwargs: Any) -> np.ndarray:
"""
Embed the prompt from model.
Args:
prompt: prompt to embed from.
Returns:
list of embeddings (list of length 1 for 1 embedding).
"""
raise NotImplementedError("Embed not supported for diffusers")
@torch.no_grad()
def score_sequence(
self, prompt: Union[str, List[str]], **kwargs: Any

@ -535,15 +535,32 @@ class TextGenerationModel(HuggingFaceModel):
@torch.no_grad()
def embed(self, prompt: Union[str, List[str]], **kwargs: Any) -> np.ndarray:
"""
Compute embedding for prompts.
Embed the prompt from model.
Args:
prompt: prompt to generate from.
prompt: prompt to embed from.
Returns:
embedding
list of embeddings (list of length 1 for 1 embedding).
"""
pass
if isinstance(prompt, str):
prompt = [prompt]
encoded_prompt = self.pipeline.tokenizer(
prompt,
max_length=self.pipeline.max_length,
truncation=True,
padding=True,
return_tensors="pt",
)
encoded_prompt = encoded_prompt.to(self.pipeline.device)
# Get last hidden state
output = self.pipeline.model( # type: ignore
**encoded_prompt,
output_hidden_states=True,
return_dict=True,
)
last_hidden_state = output["hidden_states"][-1][:, -1, :]
return last_hidden_state.cpu().numpy()
@torch.no_grad()
def generate(
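The new `TextGenerationModel.embed` pools the final layer's hidden state at the last position. Below is a standalone sketch of the same pooling written against plain `transformers`, for illustration only (the model name, max length, and pad-token handling are assumptions, not taken from the diff):

```python
# Rough equivalent of the new embed() pooling, using gpt2 as an assumed example model.
import torch
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # gpt2 has no pad token by default
model = AutoModel.from_pretrained("gpt2")

with torch.no_grad():
    enc = tokenizer(
        ["Why is the sky green?", "Cats are butterflies"],
        max_length=1024, truncation=True, padding=True, return_tensors="pt",
    )
    out = model(**enc, output_hidden_states=True, return_dict=True)
    # Same pooling as the diff: last layer, last position -> (batch, hidden_size)
    emb = out["hidden_states"][-1][:, -1, :].cpu().numpy()

print(emb.shape)  # (2, 768) for gpt2
```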

@ -1,14 +1,12 @@
"""Model class."""
from abc import ABC, abstractmethod
from typing import Any, Dict, List, Tuple, Union
import numpy as np
class Model(ABC):
class Model:
"""Model class."""
@abstractmethod
def __init__(
self,
model_name_or_path: str,
@ -41,7 +39,6 @@ class Model(ABC):
"""
raise NotImplementedError()
@abstractmethod
def get_init_params(self) -> Dict:
"""Return init params to determine what model is being used."""
raise NotImplementedError()
@ -66,13 +63,13 @@ class Model(ABC):
def embed(self, prompt: Union[str, List[str]], **kwargs: Any) -> np.ndarray:
"""
Compute embedding for prompts.
Embed the prompt from model.
Args:
prompt: prompt to generate from.
prompt: prompt to embed from.
Returns:
embedding
list of embeddings (list of length 1 for 1 embedding).
"""
raise NotImplementedError()

@ -0,0 +1,113 @@
"""Sentence transformer model."""
from typing import Any, Dict, List, Optional, Tuple, Union
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
from manifest.api.models.model import Model
class SentenceTransformerModel(Model):
"""SentenceTransformer model."""
def __init__(
self,
model_name_or_path: str,
model_type: Optional[str] = None,
model_config: Optional[str] = None,
cache_dir: Optional[str] = None,
device: int = 0,
use_accelerate: bool = False,
use_parallelize: bool = False,
use_bitsandbytes: bool = False,
use_deepspeed: bool = False,
perc_max_gpu_mem_red: float = 1.0,
use_fp16: bool = False,
):
"""
Initialize model.
All arguments will be passed in the request from Manifest.
Args:
model_name_or_path: model name string.
model_config: model config string.
cache_dir: cache directory for model.
device: device to use for model.
use_accelerate: whether to use accelerate for multi-gpu inference.
use_parallelize: use HF default parallelize
use_bitsandbytes: use HF bits and bytes
use_deepspeed: use deepspeed
perc_max_gpu_mem_red: percent max memory reduction in accelerate
use_fp16: use fp16 for model weights.
"""
if use_accelerate or use_parallelize or use_bitsandbytes or use_deepspeed:
raise ValueError(
"Cannot use accelerate or parallelize or "
"bitsandbytes or deepspeeed with sentence transformers"
)
# Check if providing path
self.model_name = model_name_or_path
print("Model Name:", self.model_name)
torch_device = (
torch.device("cpu")
if (device == -1 or not torch.cuda.is_available())
else torch.device(f"cuda:{device}")
)
self.embedding_model = SentenceTransformer(self.model_name, device=torch_device)
self.embedding_model.to(torch_device)
self.embedding_model.eval()
def get_init_params(self) -> Dict:
"""Return init params to determine what model is being used."""
return {"model_name": self.model_name, "model_path": self.model_name}
@torch.no_grad()
def generate(
self, prompt: Union[str, List[str]], **kwargs: Any
) -> List[Tuple[Any, float, List[int], List[float]]]:
"""
Generate the prompt from model.
Outputs must be generated text and score, not including prompt.
Args:
prompt: prompt to generate from.
Returns:
list of generated text (list of length 1 for 1 generation).
"""
raise NotImplementedError("Generate not supported for sentence transformers")
@torch.no_grad()
def embed(self, prompt: Union[str, List[str]], **kwargs: Any) -> np.ndarray:
"""
Embed the prompt from model.
Args:
prompt: prompt to embed from.
Returns:
list of embeddings (list of length 1 for 1 embedding).
"""
if isinstance(prompt, str):
prompt = [prompt]
return self.embedding_model.encode(prompt)
@torch.no_grad()
def score_sequence(
self, prompt: Union[str, List[str]], **kwargs: Any
) -> List[Tuple[float, List[int], List[float]]]:
"""
Score a sequence of choices.
Args:
prompt (:obj:`str` or :obj:`List[str]`):
The prompt to score the choices against.
**kwargs:
Additional keyword arguments passed along to the :obj:`__call__` method.
"""
raise NotImplementedError(
"Score sequence not supported for sentence transformers"
)
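A quick local smoke test of the new class, mirroring the unit test added further down (all-mpnet-base-v2 on CPU):

```python
# Sketch based on the test added in this PR; device=-1 forces CPU.
from manifest.api.models.sentence_transformer import SentenceTransformerModel

model = SentenceTransformerModel(
    model_name_or_path="all-mpnet-base-v2",
    use_accelerate=False,
    use_parallelize=False,
    use_bitsandbytes=False,
    use_deepspeed=False,
    use_fp16=False,
    device=-1,
)
embeddings = model.embed(["Why is the sky green?", "Cats are butterflies"])
print(embeddings.shape)  # (2, 768) for all-mpnet-base-v2
```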

@ -16,17 +16,24 @@ class ModelResponse:
"text_completion",
"prompt_logit_score",
"image_generation",
"embedding_generation",
}:
raise ValueError(
f"Invalid response type: {self.response_type}. "
"Must be one of: text_completion, prompt_logit_score, image_generation."
"Must be one of: text_completion, prompt_logit_score, "
"image_generation, embedding_generation."
)
self.response_id = str(uuid.uuid4())
self.created = int(time.time())
def __dict__(self) -> Dict[str, Any]: # type: ignore
"""Return dictionary representation of response."""
key = "text" if self.response_type != "image_generation" else "array"
key = (
"text"
if self.response_type
not in {"prompt_logit_score", "image_generation", "embedding_generation"}
else "array"
)
return {
"id": self.response_id,
"object": self.response_type,

@ -10,6 +10,7 @@ ARRAY_CACHE_TYPES = {
"diffuser",
"tomadiffuser",
"openaiembedding",
"huggingfaceembedding",
}

@ -82,8 +82,9 @@ class DiffuserClient(Client):
Returns:
model params.
"""
res = requests.post(self.host + "/params")
return res.json()
res = requests.post(self.host + "/params").json()
res["client_name"] = self.NAME
return res
def format_response(self, response: Dict, request: Dict) -> Dict[str, Any]:
"""

@ -76,8 +76,9 @@ class HuggingFaceClient(Client):
Returns:
model params.
"""
res = requests.post(self.host + "/params")
return res.json()
res = requests.post(self.host + "/params").json()
res["client_name"] = self.NAME
return res
def get_score_prompt_request(
self,

@ -0,0 +1,89 @@
"""Hugging Face client."""
import logging
from typing import Any, Dict, Optional, Tuple
import numpy as np
import requests
from manifest.clients.client import Client
from manifest.request import EmbeddingRequest
logger = logging.getLogger(__name__)
class HuggingFaceEmbeddingClient(Client):
"""HuggingFaceEmbedding client."""
# User param -> (client param, default value)
PARAMS: Dict[str, Tuple[str, Any]] = {}
REQUEST_CLS = EmbeddingRequest
NAME = "huggingfaceembedding"
def connect(
self,
connection_str: Optional[str] = None,
client_args: Dict[str, Any] = {},
) -> None:
"""
Connect to the HuggingFace url.
Args:
connection_str: connection string.
client_args: client arguments.
"""
if not connection_str:
raise ValueError("Must provide connection string")
self.host = connection_str.rstrip("/")
for key in self.PARAMS:
setattr(self, key, client_args.pop(key, self.PARAMS[key][1]))
def close(self) -> None:
"""Close the client."""
pass
def get_generation_url(self) -> str:
"""Get generation URL."""
return self.host + "/embed"
def get_generation_header(self) -> Dict[str, str]:
"""
Get generation header.
Returns:
header.
"""
return {}
def supports_batch_inference(self) -> bool:
"""Return whether the client supports batch inference."""
return True
def get_model_params(self) -> Dict:
"""
Get model params.
By getting model params from the server, we can add to request
and make sure cache keys are unique to model.
Returns:
model params.
"""
res = requests.post(self.host + "/params").json()
res["client_name"] = self.NAME
return res
def format_response(self, response: Dict, request: Dict) -> Dict[str, Any]:
"""
Format response to dict.
Args:
response: response
request: request
Return:
response as dict
"""
# Convert array to np.array
for choice in response["choices"]:
choice["array"] = np.array(choice["array"])
return response
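End to end, the new client is selected by name just like the other backends; a minimal sketch mirroring the notebook and the integration test (the connection URL is an assumption):

```python
# Illustrative usage of the new huggingfaceembedding client via Manifest.
import numpy as np
from manifest import Manifest

manifest = Manifest(
    client_name="huggingfaceembedding",
    client_connection="http://127.0.0.1:6000",  # assumed local embedding server
)
emb = manifest.run("Is this an embedding?")
assert isinstance(emb, np.ndarray)  # format_response converts "array" back to np.ndarray
```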

@ -13,6 +13,7 @@ from manifest.clients.ai21 import AI21Client
from manifest.clients.cohere import CohereClient
from manifest.clients.dummy import DummyClient
from manifest.clients.huggingface import HuggingFaceClient
from manifest.clients.huggingface_embedding import HuggingFaceEmbeddingClient
from manifest.clients.openai import OpenAIClient
from manifest.clients.openai_chat import OpenAIChatClient
from manifest.clients.openai_embedding import OpenAIEmbeddingClient
@ -30,6 +31,7 @@ CLIENT_CONSTRUCTORS = {
CohereClient.NAME: CohereClient,
AI21Client.NAME: AI21Client,
HuggingFaceClient.NAME: HuggingFaceClient,
HuggingFaceEmbeddingClient.NAME: HuggingFaceEmbeddingClient,
DummyClient.NAME: DummyClient,
TOMAClient.NAME: TOMAClient,
}

@ -1,5 +1,5 @@
"""Request object."""
from typing import Any, Dict, List, Literal, Optional, Tuple, Union
from typing import Any, Dict, List, Optional, Tuple, Union
from pydantic import BaseModel
@ -99,8 +99,7 @@ class LMRequest(Request):
class EmbeddingRequest(Request):
"""Embedding Request object."""
# Aggregate method (if applicable)
aggregation_method: Optional[Literal["last_token", "mean"]] = None
pass
class DiffusionRequest(Request):

@ -17,6 +17,10 @@ RESPONSE_CONSTRUCTORS = {
"logits_key": "token_logprobs",
"item_key": "array",
},
"huggingfaceembedding": {
"logits_key": "token_logprobs",
"item_key": "array",
},
}

@ -9,6 +9,7 @@ module = [
"deepspeed",
"numpy",
"diffusers",
"sentence_transformers",
"sqlitedict",
"dill",
"accelerate",

@ -42,12 +42,13 @@ REQUIRED = [
# What packages are optional?
EXTRAS = {
"api": [
"accelerate>=0.10.0",
"deepspeed>=0.7.0",
"diffusers>=0.6.0",
"Flask>=2.1.2",
"accelerate>=0.10.0",
"transformers>=4.20.0,<4.26.0",
"sentence_transformers>=2.2.0",
"torch>=1.8.0",
"transformers>=4.20.0,<4.26.0",
],
"app": [
"fastapi>=0.70.0",

@ -4,9 +4,11 @@ import math
import os
from subprocess import PIPE, Popen
import numpy as np
import pytest
from manifest.api.models.huggingface import MODEL_REGISTRY, TextGenerationModel
from manifest.api.models.sentence_transformer import SentenceTransformerModel
NOCUDA = 0
try:
@ -147,6 +149,37 @@ def test_gpt_score() -> None:
assert isinstance(result[1][1], list)
def test_embed() -> None:
"""Test embedding pipeline."""
model = TextGenerationModel(
model_name_or_path="gpt2",
use_accelerate=False,
use_parallelize=False,
use_bitsandbytes=False,
use_deepspeed=False,
use_fp16=False,
device=-1,
)
inputs = ["Why is the sky green?", "Cats are butterflies"]
embeddings = model.embed(inputs)
assert isinstance(embeddings, np.ndarray)
assert embeddings.shape == (2, 768)
model2 = SentenceTransformerModel(
model_name_or_path="all-mpnet-base-v2",
use_accelerate=False,
use_parallelize=False,
use_bitsandbytes=False,
use_deepspeed=False,
use_fp16=False,
device=-1,
)
inputs = ["Why is the sky green?", "Cats are butterflies"]
embeddings = model2.embed(inputs)
assert isinstance(embeddings, np.ndarray)
assert embeddings.shape == (2, 768)
def test_batch_gpt_generate() -> None:
"""Test pipeline generation from a gpt model."""
model = TextGenerationModel(

@ -551,6 +551,101 @@ def test_local_huggingface(sqlite_cache: str) -> None:
)
@pytest.mark.skipif(not MODEL_ALIVE, reason=f"No model at {URL}")
@pytest.mark.usefixtures("sqlite_cache")
def test_local_huggingfaceembedding(sqlite_cache: str) -> None:
"""Test openaichat client."""
client = Manifest(
client_name="huggingfaceembedding",
client_connection=URL,
cache_name="sqlite",
cache_connection=sqlite_cache,
)
res = client.run("Why are there carrots?")
assert isinstance(res, np.ndarray)
response = cast(
Response, client.run("Why are there carrots?", return_response=True)
)
assert isinstance(response.get_response(), np.ndarray)
assert np.allclose(response.get_response(), res)
client = Manifest(
client_name="huggingfaceembedding",
client_connection=URL,
cache_name="sqlite",
cache_connection=sqlite_cache,
)
res = client.run("Why are there apples?")
assert isinstance(res, np.ndarray)
response = cast(Response, client.run("Why are there apples?", return_response=True))
assert isinstance(response.get_response(), np.ndarray)
assert np.allclose(response.get_response(), res)
assert response.is_cached() is True
response = cast(Response, client.run("Why are there apples?", return_response=True))
assert response.is_cached() is True
res_list = client.run(["Why are there apples?", "Why are there bananas?"])
assert (
isinstance(res_list, list)
and len(res_list) == 2
and isinstance(res_list[0], np.ndarray)
)
response = cast(
Response,
client.run(
["Why are there apples?", "Why are there mangos?"], return_response=True
),
)
assert (
isinstance(response.get_response(), list) and len(response.get_response()) == 2
)
response = cast(
Response, client.run("Why are there bananas?", return_response=True)
)
assert response.is_cached() is True
response = cast(
Response, client.run("Why are there oranges?", return_response=True)
)
assert response.is_cached() is False
res_list = asyncio.run(
client.arun_batch(["Why are there pears?", "Why are there oranges?"])
)
assert (
isinstance(res_list, list)
and len(res_list) == 2
and isinstance(res_list[0], np.ndarray)
)
response = cast(
Response,
asyncio.run(
client.arun_batch(
["Why are there pinenuts?", "Why are there cocoa?"],
return_response=True,
)
),
)
assert (
isinstance(response.get_response(), list)
and len(res_list) == 2
and isinstance(res_list[0], np.ndarray)
)
response = cast(
Response, client.run("Why are there oranges?", return_response=True)
)
assert response.is_cached() is True
@pytest.mark.skipif(not OPENAI_ALIVE, reason="No openai key set")
@pytest.mark.usefixtures("sqlite_cache")
def test_openai(sqlite_cache: str) -> None:
