feat: add local huggingface embedding models (#76)

pull/82/head
Laurel Orr 1 year ago committed by GitHub
parent 40de0e7f59
commit 0fb192a0a2

@ -69,19 +69,24 @@
"```\n",
"python3 manifest/api/app.py --model_type huggingface --model_name_or_path EleutherAI/gpt-neo-125M --device 0\n",
"```\n",
"in a separate `screen` or `tmux`."
"or\n",
"```\n",
"python3 manifest/api/app.py --model_type sentence_transformers --model_name_or_path all-mpnet-base-v2 --device 0\n",
"```\n",
"\n",
"in a separate `screen` or `tmux`. Make sure to note the port. You can change this with `export FLASK_PORT=<port>`."
]
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'model_name': 'EleutherAI/gpt-neo-125M', 'model_path': 'EleutherAI/gpt-neo-125M'}\n"
"{'model_name': 'all-mpnet-base-v2', 'model_path': 'all-mpnet-base-v2', 'client_name': 'huggingfaceembedding'}\n"
]
}
],
@ -91,7 +96,7 @@
"# Local hosted GPT Neo 125M\n",
"manifest = Manifest(\n",
" client_name=\"huggingfaceembedding\",\n",
" client_connection=\"http://127.0.0.1:6001\",\n",
" client_connection=\"http://127.0.0.1:6000\",\n",
" cache_name=\"sqlite\",\n",
" cache_connection=\"my_sqlite_manifest.sqlite\"\n",
")\n",
@ -100,12 +105,24 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(768,)\n",
"(768,) (768,)\n"
]
}
],
"source": [
"emb = manifest.run(\"Is this an embedding?\")\n",
"emb2 = manifest.run(\"Is this an embedding?\", aggregation=\"mean\")"
"print(emb.shape)\n",
"\n",
"emb = manifest.run([\"Is this an embedding?\", \"Bananas!!!\"])\n",
"print(emb[0].shape, emb[1].shape)"
]
}
],
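Once the embedding server is up, a quick way to confirm which model is loaded (and on which port) is to hit the `/params` endpoint directly; a minimal sketch, assuming the server was launched with `FLASK_PORT=6000` to match the notebook's `client_connection`:

```python
# Not part of the diff: query the running server's /params endpoint (POST, per the clients).
# The port 6000 is an assumption taken from the notebook's client_connection.
import requests

params = requests.post("http://127.0.0.1:6000/params").json()
print(params)
# e.g. {'model_name': 'all-mpnet-base-v2', 'model_path': 'all-mpnet-base-v2'}
```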

@ -16,6 +16,7 @@ from manifest.api.models.huggingface import (
CrossModalEncoderModel,
TextGenerationModel,
)
from manifest.api.models.sentence_transformer import SentenceTransformerModel
from manifest.api.response import ModelResponse
os.environ["TOKENIZERS_PARALLELISM"] = "false"
@ -28,6 +29,7 @@ model_type = None
PORT = int(os.environ.get("FLASK_PORT", 5000))
MODEL_CONSTRUCTORS = {
"huggingface": TextGenerationModel,
"sentence_transformers": SentenceTransformerModel,
"huggingface_crossmodal": CrossModalEncoderModel,
"diffuser": DiffuserModel,
}
@ -198,11 +200,14 @@ def completions() -> Response:
@app.route("/embed", methods=["POST"])
def embed() -> Dict:
def embed() -> Response:
"""Get embed for generation."""
modality = request.json["modality"]
if "modality" in request.json:
modality = request.json["modality"]
else:
modality = "text"
if modality == "text":
prompts = request.json["prompts"]
prompts = request.json["prompt"]
elif modality == "image":
import base64
@ -210,19 +215,36 @@ def embed() -> Dict:
prompts = [
Image.open(io.BytesIO(base64.b64decode(data)))
for data in request.json["prompts"]
for data in request.json["prompt"]
]
else:
raise ValueError("modality must be text or image")
results = []
embeddings = model.embed(prompts)
for embedding in embeddings:
results.append(embedding.tolist())
try:
results = []
embeddings = model.embed(prompts)
for embedding in embeddings:
results.append(
{
"array": embedding,
"logprob": None,
"tokens": None,
"token_logprobs": None,
}
)
# transform the result into the openai format
# return Response(results, response_type="text_completion").__dict__()
return {"result": results}
return Response(
json.dumps(
ModelResponse(results, response_type="embedding_generation").__dict__()
),
status=200,
)
except Exception as e:
logger.error(e)
return Response(
json.dumps({"message": str(e)}),
status=400,
)
@app.route("/score_sequence", methods=["POST"])
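For reference, the reworked `/embed` route can be exercised without the Manifest client; a rough sketch (URL, port, and prompts are placeholders), showing that embeddings now come back in the `ModelResponse` format, with each vector under `choices[i]["array"]`:

```python
# Illustrative only: call the updated /embed endpoint directly.
# "modality" is optional and defaults to "text"; the request key is "prompt".
import requests

resp = requests.post(
    "http://127.0.0.1:6000/embed",  # port assumed from the notebook above
    json={"prompt": ["Is this an embedding?", "Bananas!!!"]},
).json()
vectors = [choice["array"] for choice in resp["choices"]]
print(len(vectors), len(vectors[0]))  # 2 embeddings; 768 dims for all-mpnet-base-v2
```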

@ -1,7 +1,8 @@
"""Huggingface model."""
"""Diffuser model."""
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union
import numpy as np
import torch
from diffusers import StableDiffusionPipeline
@ -93,6 +94,19 @@ class DiffuserModel(Model):
# Return None for logprobs and token logprobs
return [(im, None, None, None) for im in result["images"]]
@torch.no_grad()
def embed(self, prompt: Union[str, List[str]], **kwargs: Any) -> np.ndarray:
"""
Embed the prompt from model.
Args:
prompt: prompt to embed from.
Returns:
list of embeddings (list of length 1 for 1 embedding).
"""
raise NotImplementedError("Embed not supported for diffusers")
@torch.no_grad()
def score_sequence(
self, prompt: Union[str, List[str]], **kwargs: Any

@ -535,15 +535,32 @@ class TextGenerationModel(HuggingFaceModel):
@torch.no_grad()
def embed(self, prompt: Union[str, List[str]], **kwargs: Any) -> np.ndarray:
"""
Compute embedding for prompts.
Embed the prompt from model.
Args:
prompt: prompt to generate from.
prompt: prompt to embed from.
Returns:
embedding
list of embeddings (list of length 1 for 1 embedding).
"""
pass
if isinstance(prompt, str):
prompt = [prompt]
encoded_prompt = self.pipeline.tokenizer(
prompt,
max_length=self.pipeline.max_length,
truncation=True,
padding=True,
return_tensors="pt",
)
encoded_prompt = encoded_prompt.to(self.pipeline.device)
# Get last hidden state
output = self.pipeline.model( # type: ignore
**encoded_prompt,
output_hidden_states=True,
return_dict=True,
)
last_hidden_state = output["hidden_states"][-1][:, -1, :]
return last_hidden_state.cpu().numpy()
@torch.no_grad()
def generate(
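The new `TextGenerationModel.embed` pools the final layer's hidden state at the last position. Below is a standalone sketch of the same pooling written against plain `transformers`, for illustration only (the model name, max length, and pad-token handling are assumptions, not taken from the diff):

```python
# Rough equivalent of the new embed() pooling, using gpt2 as an assumed example model.
import torch
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # gpt2 has no pad token by default
model = AutoModel.from_pretrained("gpt2")

with torch.no_grad():
    enc = tokenizer(
        ["Why is the sky green?", "Cats are butterflies"],
        max_length=1024, truncation=True, padding=True, return_tensors="pt",
    )
    out = model(**enc, output_hidden_states=True, return_dict=True)
    # Same pooling as the diff: last layer, last position -> (batch, hidden_size)
    emb = out["hidden_states"][-1][:, -1, :].cpu().numpy()

print(emb.shape)  # (2, 768) for gpt2
```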

@ -1,14 +1,12 @@
"""Model class."""
from abc import ABC, abstractmethod
from typing import Any, Dict, List, Tuple, Union
import numpy as np
class Model(ABC):
class Model:
"""Model class."""
@abstractmethod
def __init__(
self,
model_name_or_path: str,
@ -41,7 +39,6 @@ class Model(ABC):
"""
raise NotImplementedError()
@abstractmethod
def get_init_params(self) -> Dict:
"""Return init params to determine what model is being used."""
raise NotImplementedError()
@ -66,13 +63,13 @@ class Model(ABC):
def embed(self, prompt: Union[str, List[str]], **kwargs: Any) -> np.ndarray:
"""
Compute embedding for prompts.
Embed the prompt from model.
Args:
prompt: prompt to generate from.
prompt: prompt to embed from.
Returns:
embedding
list of embeddings (list of length 1 for 1 embedding).
"""
raise NotImplementedError()

@ -0,0 +1,113 @@
"""Sentence transformer model."""
from typing import Any, Dict, List, Optional, Tuple, Union
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
from manifest.api.models.model import Model
class SentenceTransformerModel(Model):
"""SentenceTransformer model."""
def __init__(
self,
model_name_or_path: str,
model_type: Optional[str] = None,
model_config: Optional[str] = None,
cache_dir: Optional[str] = None,
device: int = 0,
use_accelerate: bool = False,
use_parallelize: bool = False,
use_bitsandbytes: bool = False,
use_deepspeed: bool = False,
perc_max_gpu_mem_red: float = 1.0,
use_fp16: bool = False,
):
"""
Initialize model.
All arguments will be passed in the request from Manifest.
Args:
model_name_or_path: model name string.
model_config: model config string.
cache_dir: cache directory for model.
device: device to use for model.
use_accelerate: whether to use accelerate for multi-gpu inference.
use_parallelize: use HF default parallelize
use_bitsandbytes: use HF bits and bytes
use_deepspeed: use deepspeed
perc_max_gpu_mem_red: percent max memory reduction in accelerate
use_fp16: use fp16 for model weights.
"""
if use_accelerate or use_parallelize or use_bitsandbytes or use_deepspeed:
raise ValueError(
"Cannot use accelerate or parallelize or "
"bitsandbytes or deepspeeed with sentence transformers"
)
# Check if providing path
self.model_name = model_name_or_path
print("Model Name:", self.model_name)
torch_device = (
torch.device("cpu")
if (device == -1 or not torch.cuda.is_available())
else torch.device(f"cuda:{device}")
)
self.embedding_model = SentenceTransformer(self.model_name, device=torch_device)
self.embedding_model.to(torch_device)
self.embedding_model.eval()
def get_init_params(self) -> Dict:
"""Return init params to determine what model is being used."""
return {"model_name": self.model_name, "model_path": self.model_name}
@torch.no_grad()
def generate(
self, prompt: Union[str, List[str]], **kwargs: Any
) -> List[Tuple[Any, float, List[int], List[float]]]:
"""
Generate the prompt from model.
Outputs must be generated text and score, not including prompt.
Args:
prompt: prompt to generate from.
Returns:
list of generated text (list of length 1 for 1 generation).
"""
raise NotImplementedError("Generate not supported for sentence transformers")
@torch.no_grad()
def embed(self, prompt: Union[str, List[str]], **kwargs: Any) -> np.ndarray:
"""
Embed the prompt from model.
Args:
prompt: prompt to embed from.
Returns:
list of embeddings (list of length 1 for 1 embedding).
"""
if isinstance(prompt, str):
prompt = [prompt]
return self.embedding_model.encode(prompt)
@torch.no_grad()
def score_sequence(
self, prompt: Union[str, List[str]], **kwargs: Any
) -> List[Tuple[float, List[int], List[float]]]:
"""
Score a sequence of choices.
Args:
prompt (:obj:`str` or :obj:`List[str]`):
The prompt to score the choices against.
**kwargs:
Additional keyword arguments passed along to the :obj:`__call__` method.
"""
raise NotImplementedError(
"Score sequence not supported for sentence transformers"
)
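A quick local smoke test of the new class, mirroring the unit test added further down (all-mpnet-base-v2 on CPU):

```python
# Sketch based on the test added in this PR; device=-1 forces CPU.
from manifest.api.models.sentence_transformer import SentenceTransformerModel

model = SentenceTransformerModel(
    model_name_or_path="all-mpnet-base-v2",
    use_accelerate=False,
    use_parallelize=False,
    use_bitsandbytes=False,
    use_deepspeed=False,
    use_fp16=False,
    device=-1,
)
embeddings = model.embed(["Why is the sky green?", "Cats are butterflies"])
print(embeddings.shape)  # (2, 768) for all-mpnet-base-v2
```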

@ -16,17 +16,24 @@ class ModelResponse:
"text_completion",
"prompt_logit_score",
"image_generation",
"embedding_generation",
}:
raise ValueError(
f"Invalid response type: {self.response_type}. "
"Must be one of: text_completion, prompt_logit_score, image_generation."
"Must be one of: text_completion, prompt_logit_score, "
"image_generation, embedding_generation."
)
self.response_id = str(uuid.uuid4())
self.created = int(time.time())
def __dict__(self) -> Dict[str, Any]: # type: ignore
"""Return dictionary representation of response."""
key = "text" if self.response_type != "image_generation" else "array"
key = (
"text"
if self.response_type
not in {"prompt_logit_score", "image_generation", "embedding_generation"}
else "array"
)
return {
"id": self.response_id,
"object": self.response_type,

@ -10,6 +10,7 @@ ARRAY_CACHE_TYPES = {
"diffuser",
"tomadiffuser",
"openaiembedding",
"huggingfaceembedding",
}

@ -82,8 +82,9 @@ class DiffuserClient(Client):
Returns:
model params.
"""
res = requests.post(self.host + "/params")
return res.json()
res = requests.post(self.host + "/params").json()
res["client_name"] = self.NAME
return res
def format_response(self, response: Dict, request: Dict) -> Dict[str, Any]:
"""

@ -76,8 +76,9 @@ class HuggingFaceClient(Client):
Returns:
model params.
"""
res = requests.post(self.host + "/params")
return res.json()
res = requests.post(self.host + "/params").json()
res["client_name"] = self.NAME
return res
def get_score_prompt_request(
self,

@ -0,0 +1,89 @@
"""Hugging Face client."""
import logging
from typing import Any, Dict, Optional, Tuple
import numpy as np
import requests
from manifest.clients.client import Client
from manifest.request import EmbeddingRequest
logger = logging.getLogger(__name__)
class HuggingFaceEmbeddingClient(Client):
"""HuggingFaceEmbedding client."""
# User param -> (client param, default value)
PARAMS: Dict[str, Tuple[str, Any]] = {}
REQUEST_CLS = EmbeddingRequest
NAME = "huggingfaceembedding"
def connect(
self,
connection_str: Optional[str] = None,
client_args: Dict[str, Any] = {},
) -> None:
"""
Connect to the HuggingFace url.
Args:
connection_str: connection string.
client_args: client arguments.
"""
if not connection_str:
raise ValueError("Must provide connection string")
self.host = connection_str.rstrip("/")
for key in self.PARAMS:
setattr(self, key, client_args.pop(key, self.PARAMS[key][1]))
def close(self) -> None:
"""Close the client."""
pass
def get_generation_url(self) -> str:
"""Get generation URL."""
return self.host + "/embed"
def get_generation_header(self) -> Dict[str, str]:
"""
Get generation header.
Returns:
header.
"""
return {}
def supports_batch_inference(self) -> bool:
"""Return whether the client supports batch inference."""
return True
def get_model_params(self) -> Dict:
"""
Get model params.
By getting model params from the server, we can add to request
and make sure cache keys are unique to model.
Returns:
model params.
"""
res = requests.post(self.host + "/params").json()
res["client_name"] = self.NAME
return res
def format_response(self, response: Dict, request: Dict) -> Dict[str, Any]:
"""
Format response to dict.
Args:
response: response
request: request
Return:
response as dict
"""
# Convert array to np.array
for choice in response["choices"]:
choice["array"] = np.array(choice["array"])
return response
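End to end, the new client is selected by name just like the other backends; a minimal sketch mirroring the notebook and the integration test (the connection URL is an assumption):

```python
# Illustrative usage of the new huggingfaceembedding client via Manifest.
import numpy as np
from manifest import Manifest

manifest = Manifest(
    client_name="huggingfaceembedding",
    client_connection="http://127.0.0.1:6000",  # assumed local embedding server
)
emb = manifest.run("Is this an embedding?")
assert isinstance(emb, np.ndarray)  # format_response converts "array" back to np.ndarray
```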

@ -13,6 +13,7 @@ from manifest.clients.ai21 import AI21Client
from manifest.clients.cohere import CohereClient
from manifest.clients.dummy import DummyClient
from manifest.clients.huggingface import HuggingFaceClient
from manifest.clients.huggingface_embedding import HuggingFaceEmbeddingClient
from manifest.clients.openai import OpenAIClient
from manifest.clients.openai_chat import OpenAIChatClient
from manifest.clients.openai_embedding import OpenAIEmbeddingClient
@ -30,6 +31,7 @@ CLIENT_CONSTRUCTORS = {
CohereClient.NAME: CohereClient,
AI21Client.NAME: AI21Client,
HuggingFaceClient.NAME: HuggingFaceClient,
HuggingFaceEmbeddingClient.NAME: HuggingFaceEmbeddingClient,
DummyClient.NAME: DummyClient,
TOMAClient.NAME: TOMAClient,
}

@ -1,5 +1,5 @@
"""Request object."""
from typing import Any, Dict, List, Literal, Optional, Tuple, Union
from typing import Any, Dict, List, Optional, Tuple, Union
from pydantic import BaseModel
@ -99,8 +99,7 @@ class LMRequest(Request):
class EmbeddingRequest(Request):
"""Embedding Request object."""
# Aggregate method (if applicable)
aggregation_method: Optional[Literal["last_token", "mean"]] = None
pass
class DiffusionRequest(Request):

@ -17,6 +17,10 @@ RESPONSE_CONSTRUCTORS = {
"logits_key": "token_logprobs",
"item_key": "array",
},
"huggingfaceembedding": {
"logits_key": "token_logprobs",
"item_key": "array",
},
}

@ -9,6 +9,7 @@ module = [
"deepspeed",
"numpy",
"diffusers",
"sentence_transformers",
"sqlitedict",
"dill",
"accelerate",

@ -42,12 +42,13 @@ REQUIRED = [
# What packages are optional?
EXTRAS = {
"api": [
"accelerate>=0.10.0",
"deepspeed>=0.7.0",
"diffusers>=0.6.0",
"Flask>=2.1.2",
"accelerate>=0.10.0",
"transformers>=4.20.0,<4.26.0",
"sentence_transformers>=2.2.0",
"torch>=1.8.0",
"transformers>=4.20.0,<4.26.0",
],
"app": [
"fastapi>=0.70.0",

@ -4,9 +4,11 @@ import math
import os
from subprocess import PIPE, Popen
import numpy as np
import pytest
from manifest.api.models.huggingface import MODEL_REGISTRY, TextGenerationModel
from manifest.api.models.sentence_transformer import SentenceTransformerModel
NOCUDA = 0
try:
@ -147,6 +149,37 @@ def test_gpt_score() -> None:
assert isinstance(result[1][1], list)
def test_embed() -> None:
"""Test embedding pipeline."""
model = TextGenerationModel(
model_name_or_path="gpt2",
use_accelerate=False,
use_parallelize=False,
use_bitsandbytes=False,
use_deepspeed=False,
use_fp16=False,
device=-1,
)
inputs = ["Why is the sky green?", "Cats are butterflies"]
embeddings = model.embed(inputs)
assert isinstance(embeddings, np.ndarray)
assert embeddings.shape == (2, 768)
model2 = SentenceTransformerModel(
model_name_or_path="all-mpnet-base-v2",
use_accelerate=False,
use_parallelize=False,
use_bitsandbytes=False,
use_deepspeed=False,
use_fp16=False,
device=-1,
)
inputs = ["Why is the sky green?", "Cats are butterflies"]
embeddings = model2.embed(inputs)
assert isinstance(embeddings, np.ndarray)
assert embeddings.shape == (2, 768)
def test_batch_gpt_generate() -> None:
"""Test pipeline generation from a gpt model."""
model = TextGenerationModel(

@ -551,6 +551,101 @@ def test_local_huggingface(sqlite_cache: str) -> None:
)
@pytest.mark.skipif(not MODEL_ALIVE, reason=f"No model at {URL}")
@pytest.mark.usefixtures("sqlite_cache")
def test_local_huggingfaceembedding(sqlite_cache: str) -> None:
"""Test openaichat client."""
client = Manifest(
client_name="huggingfaceembedding",
client_connection=URL,
cache_name="sqlite",
cache_connection=sqlite_cache,
)
res = client.run("Why are there carrots?")
assert isinstance(res, np.ndarray)
response = cast(
Response, client.run("Why are there carrots?", return_response=True)
)
assert isinstance(response.get_response(), np.ndarray)
assert np.allclose(response.get_response(), res)
client = Manifest(
client_name="huggingfaceembedding",
client_connection=URL,
cache_name="sqlite",
cache_connection=sqlite_cache,
)
res = client.run("Why are there apples?")
assert isinstance(res, np.ndarray)
response = cast(Response, client.run("Why are there apples?", return_response=True))
assert isinstance(response.get_response(), np.ndarray)
assert np.allclose(response.get_response(), res)
assert response.is_cached() is True
response = cast(Response, client.run("Why are there apples?", return_response=True))
assert response.is_cached() is True
res_list = client.run(["Why are there apples?", "Why are there bananas?"])
assert (
isinstance(res_list, list)
and len(res_list) == 2
and isinstance(res_list[0], np.ndarray)
)
response = cast(
Response,
client.run(
["Why are there apples?", "Why are there mangos?"], return_response=True
),
)
assert (
isinstance(response.get_response(), list) and len(response.get_response()) == 2
)
response = cast(
Response, client.run("Why are there bananas?", return_response=True)
)
assert response.is_cached() is True
response = cast(
Response, client.run("Why are there oranges?", return_response=True)
)
assert response.is_cached() is False
res_list = asyncio.run(
client.arun_batch(["Why are there pears?", "Why are there oranges?"])
)
assert (
isinstance(res_list, list)
and len(res_list) == 2
and isinstance(res_list[0], np.ndarray)
)
response = cast(
Response,
asyncio.run(
client.arun_batch(
["Why are there pinenuts?", "Why are there cocoa?"],
return_response=True,
)
),
)
assert (
isinstance(response.get_response(), list)
and len(res_list) == 2
and isinstance(res_list[0], np.ndarray)
)
response = cast(
Response, client.run("Why are there oranges?", return_response=True)
)
assert response.is_cached() is True
@pytest.mark.skipif(not OPENAI_ALIVE, reason="No openai key set")
@pytest.mark.usefixtures("sqlite_cache")
def test_openai(sqlite_cache: str) -> None:
