fix: add dtype to cache (#52)

pull/82/head
Laurel Orr 1 year ago
parent e351bd5315
commit 8ced666df8

@ -1,4 +1,8 @@
Unreleased 0.0.2
0.0.3 - Unreleased
---------------------
0.0.2 - 2022-01-31
---------------------
Added
^^^^^
@ -6,9 +10,7 @@ Added
* Standard request base model for all language inputs.
* ChatGPT client. Requires CHATGPT_SESSION_KEY to be passed in.
* Diffusion model support
Fixed
^^^^^^^^
* Together model support
Removed
^^^^^^^

File diff suppressed because one or more lines are too long

@ -0,0 +1,98 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2\n",
"\n",
"%env TOMA_URL=https://staging.together.xyz/api"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from manifest import Manifest\n",
"\n",
"# The responses are not fast\n",
"manifest = Manifest(\n",
" client_name=\"toma\",\n",
")\n",
"\n",
"print(manifest.run(\"What is the color of an apple?\"))"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"With a cache"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from manifest import Manifest\n",
"\n",
"# The responses are not fast\n",
"manifest = Manifest(\n",
" client_name=\"toma\",\n",
" cache_name=\"sqlite\",\n",
" cache_connection=\"my_manifest_cache.sqlite\",\n",
")\n",
"\n",
"res = manifest.run(\"What is the color of an apple?\", return_response=True)\n",
"print(res.get_response())\n",
"print(\"Is Cached?\", res.is_cached())\n",
"\n",
"res = manifest.run(\"What is the color of an apple?\", return_response=True)\n",
"print(res.get_response())\n",
"print(\"Is Cached?\", res.is_cached())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "manifest",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "fddffe4ac3b9f00470127629076101c1b5f38ecb1e7358b567d19305425e9491"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}

@ -11,7 +11,11 @@ import pkg_resources
from flask import Flask, Response, request
from manifest.api.models.diffuser import DiffuserModel
from manifest.api.models.huggingface import CrossModalEncoderModel, TextGenerationModel
from manifest.api.models.huggingface import (
MODEL_GENTYPE_REGISTRY,
CrossModalEncoderModel,
TextGenerationModel,
)
from manifest.api.response import ModelResponse
os.environ["TOKENIZERS_PARALLELISM"] = "false"
@ -40,6 +44,13 @@ def parse_args() -> argparse.Namespace:
help="Model type used for finding constructor.",
choices=MODEL_CONSTRUCTORS.keys(),
)
parser.add_argument(
"--model_generation_type",
default=None,
type=str,
help="Model generation type.",
choices=MODEL_GENTYPE_REGISTRY.keys(),
)
parser.add_argument(
"--model_name_or_path",
default=None,
@ -104,6 +115,7 @@ def main() -> None:
raise ValueError(f"Port {PORT} is already in use.")
global model_type
model_type = kwargs.model_type
model_gen_type = kwargs.model_generation_type
model_name_or_path = kwargs.model_name_or_path
if not model_name_or_path:
raise ValueError("Must provide model_name_or_path.")
@ -133,6 +145,7 @@ def main() -> None:
global model
model = MODEL_CONSTRUCTORS[model_type](
model_name_or_path,
model_type=model_gen_type,
cache_dir=kwargs.cache_dir,
device=kwargs.device,
use_accelerate=kwargs.use_accelerate_multigpu,
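
The practical effect of the new --model_generation_type flag is that a Hugging Face checkpoint absent from MODEL_REGISTRY can still be served: the flag is threaded through as model_type to the model constructor, which then picks the right transformers class from MODEL_GENTYPE_REGISTRY. A minimal sketch mirroring the new test added further down (the model name is illustrative only, and loading it requires network access and enough memory):

from manifest.api.models.huggingface import TextGenerationModel

# Causal LM that is not in MODEL_REGISTRY; the generation type selects the constructor.
model = TextGenerationModel(
    model_name_or_path="NinedayWang/PolyCoder-160M",
    model_type="text-generation",
)
print(model.generate("Why is the sky green?", max_tokens=10))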

@ -14,6 +14,7 @@ class DiffuserModel(Model):
def __init__(
self,
model_name_or_path: str,
model_type: Optional[str] = None,
model_config: Optional[str] = None,
cache_dir: Optional[str] = None,
device: int = 0,
@ -93,17 +94,16 @@ class DiffuserModel(Model):
return [(im, None) for im in result["images"]]
@torch.no_grad()
def logits_scoring(
self, prompt: Union[str, List[str]], gold_choices: List[str], **kwargs: Any
) -> List[Tuple[Any, float]]:
def score_sequence(
self, prompt: Union[str, List[str]], **kwargs: Any
) -> List[float]:
"""
Given the prompt and gold choices, choose the best choice with max logits.
Score a sequence of choices.
Args:
prompt: prompt to generate from.
gold_choices: list of choices to choose from.
Returns:
the returned gold choice
prompt (:obj:`str` or :obj:`List[str]`):
The prompt to score the choices against.
**kwargs:
Additional keyword arguments passed along to the :obj:`__call__` method.
"""
raise NotImplementedError("Logits scoring not supported for diffusers")
raise NotImplementedError("Score sequence not supported for diffusers")

@ -61,7 +61,11 @@ MODEL_REGISTRY = {
"google/flan-t5-l": AutoModelForSeq2SeqLM, # 800M
"google/flan-t5-xl": AutoModelForSeq2SeqLM, # 3B
"google/flan-t5-xxl": AutoModelForSeq2SeqLM, # 11B
"allenai/tk-instruct-3b-def": AutoModelForSeq2SeqLM,
}
MODEL_GENTYPE_REGISTRY = {
"text-generation": AutoModelForCausalLM,
"text2text-generation": AutoModelForSeq2SeqLM,
}
@ -187,6 +191,7 @@ class HuggingFaceModel(Model):
def __init__(
self,
model_name_or_path: str,
model_type: Optional[str] = None,
model_config: Optional[str] = None,
cache_dir: Optional[str] = None,
device: int = 0,
@ -227,6 +232,13 @@ class HuggingFaceModel(Model):
config = json.load(open(Path(self.model_path) / "config.json"))
model_name_or_path = config["_name_or_path"]
self.model_name = model_name_or_path
self.model_type = model_type
if self.model_name not in MODEL_REGISTRY and self.model_type is None:
raise ValueError(
f"{self.model_name} is not in our registry. Please specify "
"--model_generation_type as either text-generation (for Causal)"
" or text2text-generation (for Seq2Seq)"
)
print("Model Name:", self.model_name, "Model Path:", self.model_path)
def get_init_params(self) -> Dict:
@ -316,6 +328,7 @@ class CrossModalEncoderModel(HuggingFaceModel):
def __init__(
self,
model_name_or_path: str,
model_type: Optional[str] = None,
model_config: Optional[str] = None,
cache_dir: Optional[str] = None,
device: int = 0,
@ -345,6 +358,7 @@ class CrossModalEncoderModel(HuggingFaceModel):
"""
super().__init__(
model_name_or_path,
model_type,
model_config,
cache_dir,
device,
@ -359,7 +373,9 @@ class CrossModalEncoderModel(HuggingFaceModel):
# TODO: make this generalizable
self.processor = CLIPProcessor.from_pretrained(self.model_path)
model = MODEL_REGISTRY[self.model_name].from_pretrained(
model = MODEL_REGISTRY.get(
self.model_name, MODEL_GENTYPE_REGISTRY.get(self.model_type, None)
).from_pretrained(
self.model_path,
cache_dir=cache_dir,
)
@ -370,7 +386,6 @@ class CrossModalEncoderModel(HuggingFaceModel):
if (device == -1 or not torch.cuda.is_available())
else torch.device(f"cuda:{device}")
)
print("T", torch_device)
self.model = model.to(torch_device) # type: ignore
@torch.no_grad()
@ -401,6 +416,7 @@ class TextGenerationModel(HuggingFaceModel):
def __init__(
self,
model_name_or_path: str,
model_type: Optional[str] = None,
model_config: Optional[str] = None,
cache_dir: Optional[str] = None,
device: int = 0,
@ -430,6 +446,7 @@ class TextGenerationModel(HuggingFaceModel):
"""
super().__init__(
model_name_or_path,
model_type,
model_config,
cache_dir,
device,
@ -455,7 +472,9 @@ class TextGenerationModel(HuggingFaceModel):
if use_bitsandbytes:
print("WARNING!!! Cannot use sampling with bitsandbytes.")
max_memory = get_max_memory(perc_max_gpu_mem_red)
model = MODEL_REGISTRY[self.model_name].from_pretrained( # type: ignore
model = MODEL_REGISTRY.get(
self.model_name, MODEL_GENTYPE_REGISTRY.get(self.model_type, None)
).from_pretrained( # type: ignore
self.model_path,
cache_dir=cache_dir,
load_in_8bit=True,
@ -465,14 +484,18 @@ class TextGenerationModel(HuggingFaceModel):
else:
try:
# Try to explicitly find an fp16 copy (gpt-j-6B for example)
model = MODEL_REGISTRY[self.model_name].from_pretrained( # type: ignore
model = MODEL_REGISTRY.get(
self.model_name, MODEL_GENTYPE_REGISTRY.get(self.model_type, None)
).from_pretrained( # type: ignore
self.model_path,
cache_dir=cache_dir,
revision="float16",
torch_dtype=torch.float16,
)
except Exception:
model = MODEL_REGISTRY[self.model_name].from_pretrained( # type: ignore
model = MODEL_REGISTRY.get(
self.model_name, MODEL_GENTYPE_REGISTRY.get(self.model_type, None)
).from_pretrained( # type: ignore
self.model_path, cache_dir=cache_dir, torch_dtype=dtype
)
model.eval()
@ -555,188 +578,6 @@ class TextGenerationModel(HuggingFaceModel):
]
return final_results
@torch.no_grad()
def logits_scoring(
self, prompt: Union[str, List[str]], gold_choices: List[str], **kwargs: Any
) -> List[Tuple[Any, float]]:
"""
Given the prompt and gold choices, choose the best choice with max logits.
Args:
prompt: prompt to generate from.
gold_choices: list of choices to choose from.
Returns:
the returned gold choice
"""
if isinstance(prompt, str):
prompt = [prompt]
max_input_len = self.pipeline.max_length
if self.is_encdec:
# Adapted from https://github.com/bigscience-workshop/t-zero
tokenized_inputs = self.pipeline.tokenizer(
prompt,
padding="longest",
max_length=max_input_len,
truncation=True,
add_special_tokens=False,
)
# Get max target length
max_target_len = max(
[
len(self.pipeline.tokenizer(ans_choi)["input_ids"])
for ans_choi in gold_choices
]
)
tokenized_targets = [
self.pipeline.tokenizer(
ans_choi,
# padding is on the right here.
padding="max_length",
max_length=min(max_target_len, max_input_len),
truncation=True,
)
for ans_choi in gold_choices
]
# Repeat input ids for each choice to form a batch
features = {
k: [tokenized_inputs[k] for _ in range(len(gold_choices))]
for k in tokenized_inputs.keys()
}
# Add choice tokens + mask
features["labels"] = [
[tokenized_targets[k]["input_ids"]] * len(tokenized_inputs["input_ids"])
for k in range(len(gold_choices))
]
features["labels_attention_mask"] = [
[tokenized_targets[k]["attention_mask"]]
* len(tokenized_inputs["input_ids"])
for k in range(len(gold_choices))
]
else:
tokenized_inputs = self.pipeline.tokenizer(
prompt,
max_length=max_input_len,
truncation=True,
padding=False,
add_special_tokens=False,
)
tokenized_targets = [
self.pipeline.tokenizer(
# Add starting whitespace for gpt
ans_choi,
max_length=max_input_len,
truncation=True,
padding=False,
add_special_tokens=False,
)
for ans_choi in gold_choices
]
features = {
k: [] for k in list(tokenized_inputs.keys()) + ["labels_attention_mask"]
}
max_effective_input_len = 0
for tokenized_targ in tokenized_targets:
for k in tokenized_inputs.keys():
batched_features = []
for prompt_i in range(len(tokenized_inputs[k])):
# Make sure to leave room for the outputs
batched_features.append(
tokenized_inputs[k][prompt_i][
: min(
len(tokenized_inputs[k][prompt_i]),
max_input_len - len(tokenized_targ[k]),
)
]
+ tokenized_targ[k]
)
max_effective_input_len = max(
max_effective_input_len, len(batched_features[-1])
)
features[k].append(batched_features)
# Manually add labels_attention_mask
batched_features = []
for prompt_i in range(len(tokenized_inputs["input_ids"])):
batched_features.append(
[0]
* min(
len(tokenized_inputs["input_ids"][prompt_i]),
max_input_len - len(tokenized_targ["input_ids"]),
)
+ [1] * len(tokenized_targ["input_ids"])
)
features["labels_attention_mask"].append(batched_features)
# Manually pad to max effective length
for k in features.keys():
for targ_i in range(len(features[k])):
for prompt_i in range(len(features[k][targ_i])):
if k == "input_ids":
features[k][targ_i][prompt_i] += [
self.pipeline.tokenizer.pad_token_id
] * (
max_effective_input_len
- len(features[k][targ_i][prompt_i])
)
elif k in ["attention_mask", "labels_attention_mask"]:
features[k][targ_i][prompt_i] += [0] * (
max_effective_input_len
- len(features[k][targ_i][prompt_i])
)
else:
raise ValueError(f"Unknown key {k} for decoder only models")
features["labels"] = features["input_ids"]
# Convert to tensors
tensor_features = {}
for k in features:
tensor_features[k] = torch.LongTensor(features[k]).to(self.pipeline.device)
if self.is_encdec:
gold_l, bsz, seq_len = tensor_features["labels"].shape
stacked_logits = self.pipeline.model( # type: ignore
input_ids=tensor_features["input_ids"].reshape(gold_l * bsz, -1),
attention_mask=tensor_features["attention_mask"].reshape(
gold_l * bsz, -1
),
labels=tensor_features["labels"].reshape(gold_l * bsz, -1),
).logits
stacked_logits = stacked_logits.reshape(gold_l, bsz, seq_len, -1)
# Adapted from https://github.com/bigscience-workshop/t-zero
masked_log_probs = tensor_features["labels_attention_mask"].unsqueeze(
-1
) * torch.log_softmax(stacked_logits, dim=-1)
seq_token_log_probs = torch.gather(
masked_log_probs, -1, tensor_features["labels"].unsqueeze(-1)
)
else:
stacked_logits = self.pipeline.model( # type: ignore
input_ids=tensor_features["input_ids"],
attention_mask=tensor_features["attention_mask"],
).logits
# For causal decoders, shift logits and labels
labels_attention_mask = tensor_features["labels_attention_mask"].unsqueeze(
-1
)[..., 1:, :]
masked_log_probs = (
labels_attention_mask.float()
* torch.log_softmax(stacked_logits.float(), dim=-1)[..., :-1, :]
)
seq_token_log_probs = torch.gather(
masked_log_probs, -1, tensor_features["labels"][..., 1:].unsqueeze(-1)
)
seq_token_log_probs = seq_token_log_probs.squeeze(dim=-1)
seq_log_prob = seq_token_log_probs.sum(dim=-1)
# Averaging over output sequence length for GPT
if not self.is_encdec:
seq_log_prob = seq_log_prob * (1 / (seq_token_log_probs != 0).sum(dim=-1))
prediction = seq_log_prob.argmax(dim=0)
return [
(gold_choices[int(p)], seq_log_prob[int(p), i].item())
for i, p in enumerate(prediction)
]
@torch.no_grad()
def score_sequence(
self, prompt: Union[str, List[str]], **kwargs: Any
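
The lookup pattern repeated above, MODEL_REGISTRY.get(name, MODEL_GENTYPE_REGISTRY.get(model_type, None)), falls back from an exact model name to its generation type; if both lookups miss, the early ValueError added in __init__ fires instead of a confusing AttributeError on None.from_pretrained. A self-contained sketch of that resolution order (the helper name and the string stand-ins for the transformers classes are hypothetical):

from typing import Optional

# Stand-ins for the real transformers classes, to keep the sketch self-contained.
MODEL_REGISTRY = {"gpt2": "AutoModelForCausalLM"}
MODEL_GENTYPE_REGISTRY = {
    "text-generation": "AutoModelForCausalLM",
    "text2text-generation": "AutoModelForSeq2SeqLM",
}

def resolve_model_cls(model_name: str, model_type: Optional[str]) -> str:
    """Hypothetical helper: exact names win, otherwise fall back to the generation type."""
    model_cls = MODEL_REGISTRY.get(model_name, MODEL_GENTYPE_REGISTRY.get(model_type, None))
    if model_cls is None:
        raise ValueError(f"{model_name} is not registered; pass --model_generation_type")
    return model_cls

print(resolve_model_cls("gpt2", None))                                      # registered name
print(resolve_model_cls("NinedayWang/PolyCoder-160M", "text-generation"))   # fallback path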

@ -12,6 +12,7 @@ class Model(ABC):
def __init__(
self,
model_name_or_path: str,
model_type: str,
cache_dir: str,
device: int,
use_accelerate: bool,
@ -28,6 +29,7 @@ class Model(ABC):
Args:
model_name_or_path: model name string.
model_type: model type string for when model_name not in registry.
cache_dir: cache directory for model.
device: device to use for model.
use_accelerate: whether to use accelerate for multi-gpu inference.

@ -98,6 +98,7 @@ class ArrayCache:
"offset": self.cur_offset,
"flatten_size": len(arr),
"shape": arr_shape,
"dtype": arr.dtype,
}
self.cur_offset += len(arr)
return
@ -112,4 +113,4 @@ class ArrayCache:
arr = memmap[
file_data["offset"] : file_data["offset"] + file_data["flatten_size"]
]
return arr.reshape(file_data["shape"])
return arr.reshape(file_data["shape"]).astype(file_data["dtype"])
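
For context on the fix itself: the cache stores flattened arrays in a memmapped file, and a memmap has one on-disk dtype, so without recording the original dtype an integer array written through a float64 memmap comes back as float64. A minimal sketch of that round trip, not the library's implementation (the file name and the float64 backing dtype are assumptions):

import numpy as np

arr = np.random.randint(0, 3, size=(10, 10))                  # int64 on most platforms
memmap = np.memmap("example.mmap", dtype="float64", mode="w+", shape=(100,))
memmap[: arr.size] = arr.flatten()                            # values are cast to float64

restored = np.asarray(memmap[: arr.size]).reshape(arr.shape)  # dtype is float64, not int64
restored = restored.astype(arr.dtype)                         # cast back with the stored dtype
assert restored.dtype == arr.dtype
assert np.allclose(restored, arr)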

@ -15,7 +15,7 @@ class ChatGPTClient(Client):
"""ChatGPT Client class."""
# No params for ChatGPT
PARAMS = {}
PARAMS: Dict[str, Tuple[str, Any]] = {}
REQUEST_CLS = LMRequest
def connect(

@ -35,7 +35,7 @@ EXTRAS = {
"diffusers>=0.6.0",
"Flask>=2.1.2",
"accelerate>=0.10.0",
"transformers>=4.20.0",
"transformers>=4.20.0,<4.26.0",
"torch>=1.8.0",
"numpy>=1.20.0",
],

@ -28,6 +28,7 @@ def test_put_get(tmpdir: Path) -> None:
cache.max_memmap_size = 120
cache.put("key", arr)
assert np.allclose(cache.get("key"), arr)
assert cache.get("key").dtype == arr.dtype
assert cache.cur_file_idx == 0
assert cache.cur_offset == 100
assert cache.hash2arrloc["key"] == {
@ -35,11 +36,13 @@ def test_put_get(tmpdir: Path) -> None:
"offset": 0,
"flatten_size": 100,
"shape": (10, 10),
"dtype": np.dtype("float64"),
}
arr2 = np.random.rand(10, 10)
arr2 = np.random.randint(0, 3, size=(10, 10))
cache.put("key2", arr2)
assert np.allclose(cache.get("key2"), arr2)
assert cache.get("key2").dtype == arr2.dtype
assert cache.cur_file_idx == 1
assert cache.cur_offset == 100
assert cache.hash2arrloc["key2"] == {
@ -47,6 +50,7 @@ def test_put_get(tmpdir: Path) -> None:
"offset": 0,
"flatten_size": 100,
"shape": (10, 10),
"dtype": np.dtype("int64"),
}
cache = ArrayCache(tmpdir)
@ -55,12 +59,14 @@ def test_put_get(tmpdir: Path) -> None:
"offset": 0,
"flatten_size": 100,
"shape": (10, 10),
"dtype": np.dtype("float64"),
}
assert cache.hash2arrloc["key2"] == {
"file_idx": 1,
"offset": 0,
"flatten_size": 100,
"shape": (10, 10),
"dtype": np.dtype("int64"),
}
assert np.allclose(cache.get("key"), arr)
assert np.allclose(cache.get("key2"), arr2)

@ -6,7 +6,7 @@ from subprocess import PIPE, Popen
import pytest
from manifest.api.models.huggingface import TextGenerationModel
from manifest.api.models.huggingface import MODEL_REGISTRY, TextGenerationModel
NOCUDA = 0
try:
@ -37,6 +37,17 @@ if NOCUDA == 0:
NOCUDA = 1
def test_load_non_registry_model() -> None:
"""Test load model not in registry."""
model_name = "NinedayWang/PolyCoder-160M"
assert model_name not in MODEL_REGISTRY
model = TextGenerationModel(
model_name_or_path=model_name, model_type="text-generation"
)
result = model.generate("Why is the sky green?", max_tokens=10)
assert result is not None
def test_gpt_generate() -> None:
"""Test pipeline generation from a gpt model."""
model = TextGenerationModel(
@ -67,12 +78,6 @@ def test_gpt_generate() -> None:
assert result[0][0] == "\n\nThe sky is"
assert math.isclose(round(result[0][1], 3), -6.046)
result = model.logits_scoring(inputs, gold_choices=[" blue sky", " green sky"])
assert result is not None
assert len(result) == 1
assert result[0][0] == " blue sky"
assert math.isclose(round(result[0][1], 3), -6.999)
# Truncate max length
model.pipeline.max_length = 5
result = model.generate(inputs, max_tokens=2)
@ -112,12 +117,6 @@ def test_encdec_generate() -> None:
assert result[0][0] == "What is the sky green"
assert math.isclose(round(result[0][1], 3), -5.144)
result = model.logits_scoring(inputs, gold_choices=[" blue sky", " green sky"])
assert result is not None
assert len(result) == 1
assert result[0][0] == " green sky"
assert math.isclose(round(result[0][1], 3), -13.538)
# Truncate max length
model.pipeline.max_length = 5
result = model.generate(inputs, max_tokens=2)
@ -174,16 +173,6 @@ def test_batch_gpt_generate() -> None:
assert result[1][0] == " not the only ones who"
assert math.isclose(round(result[1][1], 3), -9.978)
result = model.logits_scoring(
inputs, gold_choices=[" purple sky", " green sky", " blue sky"]
)
assert result is not None
assert len(result) == 2
assert result[0][0] == " blue sky"
assert math.isclose(round(result[0][1], 3), -6.999)
assert result[1][0] == " blue sky"
assert math.isclose(round(result[1][1], 3), -8.212)
# Truncate max length
model.pipeline.max_length = 5
result = model.generate(inputs, max_tokens=2)
@ -223,18 +212,6 @@ def test_batch_encdec_generate() -> None:
assert result[1][0] == "a great way to"
assert math.isclose(round(result[1][1], 3), -6.353)
result = model.logits_scoring(
inputs, gold_choices=[" purple sky", " green sky", " blue sky"]
)
assert result is not None
assert len(result) == 2
assert result[0][0] == " green sky"
assert math.isclose(round(result[0][1], 3), -13.538)
assert result[1][0] == " blue sky"
assert math.isclose(round(result[1][1], 3), -41.503) or math.isclose(
round(result[1][1], 3), -41.504
)
# Truncate max length
model.pipeline.max_length = 5
result = model.generate(inputs, max_tokens=2)
