Laurel/multigpuhf (#15)

* feat: added user session logging

* fix: removed raw queries in tests

* build: move torch to dev dependency

* fix: adding accelerate to api

Addresses https://github.com/HazyResearch/manifest/issues/14

* build: fix torch github
Laurel Orr 2 years ago committed by GitHub
parent d29d10de28
commit 99bcc76fde

@@ -157,7 +157,7 @@ Before submitting a PR, run
```bash
export REDIS_PORT="6380" # or whatever port your local Redis is running on for these tests
cd <REDIS_PATH>
docker run -d -p 127.0.0.1:${REDIS_PORT}:6380 -v `pwd`:`pwd` -w `pwd` --name manifest_redis_test redis
docker run -d -p 127.0.0.1:${REDIS_PORT}:6379 -v `pwd`:`pwd` -w `pwd` --name manifest_redis_test redis
make test
```
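For reference, a minimal sketch of how a test could reach the container started above by reading `REDIS_PORT` (the helper name is hypothetical, not part of this diff):

```python
# Illustrative only: connect to the test Redis container on the exported port.
import os

import redis  # already a project dependency (see pyproject.toml below)


def get_test_redis_client() -> redis.Redis:
    port = int(os.environ.get("REDIS_PORT", "6379"))
    return redis.Redis(host="127.0.0.1", port=port)


get_test_redis_client().ping()  # raises ConnectionError if the container is not up
```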

@@ -46,6 +46,17 @@ def parse_args() -> argparse.Namespace:
parser.add_argument(
"--device", type=int, default=-1, help="Model device. -1 for CPU."
)
parser.add_argument(
"--fp32", action="store_true", help="Use fp32 for model params."
)
parser.add_argument(
"--use_accelerate_multigpu",
action="store_true",
help=(
"Use accelerate for multi gpu inference. "
"This will override --device parameter."
),
)
args = parser.parse_args()
return args
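A rough sketch of how the new flags surface through `parse_args` (the `--model_type`/`--model_name` flags are assumed from their use in `main()` below; values are illustrative):

```python
# Illustrative only: simulate a CLI invocation enabling multi-GPU accelerate inference.
import sys

sys.argv = [
    "app.py",  # script name is hypothetical
    "--model_type", "huggingface",        # assumed flag/value
    "--model_name", "facebook/opt-6.7b",  # assumed flag
    "--use_accelerate_multigpu",
    "--fp32",
]
args = parse_args()
assert args.use_accelerate_multigpu and args.fp32
assert args.device == -1  # default; effectively ignored once accelerate takes over
```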
@@ -55,11 +66,18 @@ def main() -> None:
kwargs = parse_args()
model_type = kwargs.model_type
model_name = kwargs.model_name
use_accelerate = kwargs.use_accelerate_multigpu
if use_accelerate:
logger.info("Using accelerate. Overriding the --device argument.")
# Global model
global model
model = MODEL_CONSTRUCTORS[model_type](
model_name, cache_dir=kwargs.cache_dir, device=kwargs.device
model_name,
cache_dir=kwargs.cache_dir,
device=kwargs.device,
use_accelerate=use_accelerate,
use_fp32=kwargs.fp32,
)
app.run(host="0.0.0.0", port=PORT)

@@ -11,6 +11,8 @@ from transformers import (
GPT2LMHeadModel,
GPTJForCausalLM,
GPTNeoForCausalLM,
GPTNeoXForCausalLM,
OPTForCausalLM,
PreTrainedModel,
PreTrainedTokenizer,
pipeline,
@@ -19,7 +21,7 @@ from transformers import (
from manifest.api.models.model import Model
class GPT2Pipeline:
class GPTPipeline:
"""Custom GPT3 Pipeline."""
def __init__(
@@ -33,7 +35,6 @@ class GPT2Pipeline:
if (device == -1 or not torch.cuda.is_available())
else torch.device(f"cuda:{device}")
)
self.model = self.model.to(self.device) # type: ignore
def __call__(self, text: str, **kwargs: Any) -> List[Dict[str, str]]:
"""Generate from text.
@@ -59,28 +60,42 @@ class GPT2Pipeline:
num_return_sequences=kwargs.get("num_return_sequences"),
)
generated_sequences = [
{"generated_text": self.tokenizer.decode(output_seq)}
{
"generated_text": self.tokenizer.decode(
output_seq, skip_special_tokens=True
)
}
for output_seq in output_sequences
]
return generated_sequences
MODEL_REGISTRY = {
"EleutherAI/gpt-j-6B": GPTJForCausalLM,
"EleutherAI/gpt-neo-125M": GPTNeoForCausalLM,
"EleutherAI/gpt-neo-1.3B": GPTNeoForCausalLM,
"EleutherAI/gpt-neo-2.7B": GPTNeoForCausalLM,
"EleutherAI/gpt-j-6B": GPTJForCausalLM,
"EleutherAI/gpt-neox-20b": GPTNeoXForCausalLM,
"facebook/opt-1.3b": OPTForCausalLM,
"facebook/opt-2.7b": OPTForCausalLM,
"facebook/opt-6.7b": OPTForCausalLM,
"facebook/opt-13b": OPTForCausalLM,
"facebook/opt-30b": OPTForCausalLM,
"gpt2": GPT2LMHeadModel,
"bigscience/T0pp": AutoModelForSeq2SeqLM,
"bigscience/T0_3B": AutoModelForSeq2SeqLM,
}
MODEL_PIPELINE = {
"EleutherAI/gpt-j-6B": GPT2Pipeline,
"EleutherAI/gpt-neo-125M": GPT2Pipeline,
"EleutherAI/gpt-neo-1.3B": GPT2Pipeline,
"EleutherAI/gpt-neo-2.7B": GPT2Pipeline,
"gpt2": GPT2Pipeline,
"EleutherAI/gpt-neo-1.3B": GPTPipeline,
"EleutherAI/gpt-neo-2.7B": GPTPipeline,
"EleutherAI/gpt-j-6B": GPTPipeline,
"EleutherAI/gpt-neox-20b": GPTPipeline,
"facebook/opt-1.3b": GPTPipeline,
"facebook/opt-2.7b": GPTPipeline,
"facebook/opt-6.7b": GPTPipeline,
"facebook/opt-13b": GPTPipeline,
"facebook/opt-30b": GPTPipeline,
"gpt2": GPTPipeline,
"bigscience/T0pp": partial(pipeline, "text2text-generation"),
"bigscience/T0_3B": partial(pipeline, "text2text-generation"),
}
@@ -89,7 +104,14 @@ MODEL_PIPELINE = {
class HuggingFaceModel(Model):
"""Huggingface model."""
def __init__(self, model_name: str, cache_dir: str, device: int):
def __init__(
self,
model_name: str,
cache_dir: str,
device: int,
use_accelerate: bool,
use_fp32: bool,
):
"""
Initialize model.
@@ -97,6 +119,10 @@ class HuggingFaceModel(Model):
Args:
model_name: model name string.
cache_dir: cache directory for model.
device: device to use for model.
use_accelerate: whether to use accelerate for multi-gpu inference.
use_fp32: use fp32 for model weights.
"""
# Check if providing path
self.model_path = model_name
@@ -107,19 +133,80 @@ class HuggingFaceModel(Model):
model_name = config["_name_or_path"]
self.model_name = model_name
print("Model Name:", self.model_name, "Model Path:", self.model_path)
try:
tokenizer = AutoTokenizer.from_pretrained(model_name)
except ValueError:
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
dtype = torch.float32 if use_fp32 else torch.float16
model = MODEL_REGISTRY[model_name].from_pretrained( # type: ignore
self.model_path, cache_dir=cache_dir
self.model_path, cache_dir=cache_dir, torch_dtype=dtype
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
if use_accelerate:
self._dispatch_accelerate_model(model)
device = 0
else:
if device > -1:
model = model.to(device) # type: ignore
self.pipeline = MODEL_PIPELINE[model_name]( # type: ignore
model=model, tokenizer=tokenizer, device=device
)
self.returns_input = "gpt" in model_name
self.returns_input = "T0" not in model_name
def get_init_params(self) -> Dict:
"""Return init params to determine what model is being used."""
return {"model_name": self.model_name, "model_path": self.model_path}
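A hypothetical construction sketch for the widened `__init__` signature (argument values are illustrative, not taken from the diff):

```python
# Illustrative only: load an OPT checkpoint in fp16 and shard it across visible GPUs.
model = HuggingFaceModel(
    model_name="facebook/opt-6.7b",
    cache_dir="/tmp/hf_cache",
    device=0,              # effectively ignored when use_accelerate is True
    use_accelerate=True,   # dispatch layers across GPUs via accelerate
    use_fp32=False,        # keep the fp16 weights
)
print(model.get_init_params())
# {'model_name': 'facebook/opt-6.7b', 'model_path': 'facebook/opt-6.7b'}
```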
def _dispatch_accelerate_model(self, model: PreTrainedModel) -> None:
"""
Load model with accelerate.
Adapted from https://colab.research.google.com/drive/14wnxMvD9zsiBQo2FtT
pxn6w2cpXCcb-7#scrollTo=y8Ne7jJdaF9F&uniqifier=1
Args:
model: loaded Hugging Face model to shard across devices.
"""
from accelerate import dispatch_model, infer_auto_device_map
model.tie_weights() # type: ignore
# Get the model where we can infer devices from
if hasattr(model, "model"):
# OPT
main_model = model.model # type: ignore
model_getter = "model."
else:
# Eleuther Neo and J
main_model = model
model_getter = ""
raw_device_map = infer_auto_device_map(
main_model,
no_split_module_classes=[
"OPTDecoderLayer",
"GPTNeoBlock",
"GPTJBlock",
"GPTNeoXLayer",
],
dtype=model.dtype, # type: ignore
)
# Hacky fix for Eleuther models: embedding entries need an explicit ".weight" suffix
device_map = {}
for k, v in raw_device_map.items():
if k in {"wte", "wpe"}:
device_map[f"{model_getter}{k}.weight"] = v
else:
device_map[f"{model_getter}{k}"] = v
# For OPT models
if "lm_head" not in device_map:
if "disk" in device_map.values():
device_map["lm_head"] = "disk"
else:
device_map["lm_head"] = max(device_map.values())
print("Device Map", device_map)
dispatch_model(model, device_map=device_map)
return
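For intuition, the device map built above is a plain module-name-to-device dict; a hypothetical result for an OPT model split across two GPUs might look roughly like this (module names follow OPT's layout, device assignments are illustrative):

```python
# Illustrative only: shape of the device_map handed to accelerate.dispatch_model.
device_map = {
    "model.decoder.embed_tokens": 0,
    "model.decoder.embed_positions": 0,
    "model.decoder.layers.0": 0,
    # ... remaining decoder layers spread across devices ...
    "model.decoder.layers.31": 1,
    "model.decoder.final_layer_norm": 1,
    "lm_head": 1,  # filled in by the fallback above when it is missing from the map
}
```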
def generate(self, prompt: str, **kwargs: Any) -> List[str]:
"""
Generate the prompt from model.
@@ -134,12 +221,15 @@ class HuggingFaceModel(Model):
"""
num_return = kwargs.get("n")
final_results = []
encoded_prompt = self.pipeline.tokenizer.encode(
# Add tokens for length
encoded_prompt_with_special = self.pipeline.tokenizer.encode(prompt)
# Remove tokens as the pipeline removes special tokens upon return
encoded_prompt_without_special = self.pipeline.tokenizer.encode(
prompt, add_special_tokens=False
)
result = self.pipeline(
prompt,
max_length=kwargs.get("max_tokens") + len(encoded_prompt),
max_length=kwargs.get("max_tokens") + len(encoded_prompt_with_special),
temperature=kwargs.get("temperature"),
repetition_penalty=kwargs.get("repetition_penalty"),
top_k=kwargs.get("top_k"),
@@ -148,7 +238,7 @@ class HuggingFaceModel(Model):
num_return_sequences=num_return,
)
# Correctly returns prompt without extra spaces
decoded_prompt = self.pipeline.tokenizer.decode(encoded_prompt)
decoded_prompt = self.pipeline.tokenizer.decode(encoded_prompt_without_special)
if self.returns_input:
start_idx = len(decoded_prompt)
else:

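A quick sketch of why the two encodings are kept separate: seq2seq tokenizers such as T0's append an end-of-sequence token, while GPT-style tokenizers usually add nothing, so the with-special length is only used for the generation budget (model choice below is illustrative):

```python
# Illustrative only: special tokens change the encoded length for T5/T0-style tokenizers.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("bigscience/T0_3B")
prompt = "What is the capital of France?"
with_special = tok.encode(prompt)                               # ends with </s>
without_special = tok.encode(prompt, add_special_tokens=False)  # raw prompt ids only
assert len(with_special) == len(without_special) + 1
```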
@@ -31,9 +31,9 @@ class Session:
"""
manifest_home = Path(os.environ.get("MANIFEST_SESSION_HOME", Path.home()))
db_file = manifest_home / ".manifest" / "session.db"
db_file.parent.mkdir(parents=True, exist_ok=True)
self.conn = sqlite3.connect(str(db_file))
self.db_file = manifest_home / ".manifest" / "session.db"
self.db_file.parent.mkdir(parents=True, exist_ok=True)
self.conn = sqlite3.connect(str(self.db_file))
self._create_table()
if not session_id:
self.session_id = str(uuid.uuid4())
@@ -49,6 +49,22 @@ class Session:
"""Close the client."""
self.conn.close()
@classmethod
def get_session_keys(cls, db_file: Path) -> List[str]:
"""Get available session keys from cached file."""
try:
conn = sqlite3.connect(str(db_file))
query = """SELECT DISTINCT session_id FROM queries"""
cur = conn.cursor()
res = cur.execute(query)
return [x[0] for x in res.fetchall()]
except sqlite3.OperationalError:
logger.info(
"There is no database with the 'queries' table. "
"Are you sure you are using the right session file"
)
return []
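A minimal usage sketch for the new classmethod (the import path is an assumption; the tests below show the in-repo usage):

```python
# Illustrative only: list the session ids stored in the default session database.
from pathlib import Path

from manifest.session import Session  # module path assumed

db_file = Path.home() / ".manifest" / "session.db"
print(Session.get_session_keys(db_file))  # e.g. ["dog_days"] once queries are logged
```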
def _execute_query(self, query: str, *args: Any) -> Any:
"""
Execute query with optional args.

@@ -16,19 +16,21 @@ version = "0.0.1"
"Bug Tracker" = "https://github.com/HazyResearch/manifest/issues"
[tool.poetry.dependencies]
python = "^3.8"
python = "^3.10"
sqlitedict = "^2.0.0"
openai = "^0.18.1"
redis = "^4.3.1"
dill = "^0.3.5"
Flask = "^2.1.2"
transformers = "^4.19.2"
transformers = "^4.20.0"
requests = "^2.27.1"
tqdm = "^4.64.0"
uuid = "^1.30"
accelerate = "^0.10.0"
[tool.poetry.dev-dependencies]
torch = "^1.0"
# torch = { url = "https://download.pytorch.org/whl/cu113/torch-1.11.0%2Bcu113-cp310-cp310-linux_x86_64.whl" }
torch = "1.11.0"
black = "^22.3.0"
flake8 = "^4.0.0"
flake8-docstrings = "^1.6.0"
@@ -64,6 +66,7 @@ module = [
"sqlitedict",
"dill",
"tqdm.auto",
"accelerate",
]
[tool.isort]

@@ -11,6 +11,7 @@ def test_init(session_cache):
"""Test session initialization."""
session = Session()
assert isinstance(session.conn, sqlite3.Connection)
assert session.db_file == session_cache / ".manifest" / "session.db"
assert session.query_id == 0
assert (session_cache / ".manifest" / "session.db").exists()
# Remove session cache file.
@@ -18,9 +19,11 @@ def test_init(session_cache):
session = Session("dog_days")
assert isinstance(session.conn, sqlite3.Connection)
assert session.db_file == session_cache / ".manifest" / "session.db"
assert session.query_id == 0
assert session.session_id == "dog_days"
assert (session_cache / ".manifest" / "session.db").exists()
session.close()
@pytest.mark.usefixtures("session_cache")
@@ -44,6 +47,7 @@ def test_log_query(session_cache):
(query_key, response_key),
(query_key2, response_key2),
]
session.close()
@pytest.mark.usefixtures("session_cache")
@@ -57,3 +61,20 @@ def test_resume_query(session_cache):
session = Session(session_id="dog_days")
assert session.query_id == 1
@pytest.mark.usefixtures("session_cache")
def test_session_keys(session_cache):
"""Test get session keys."""
# Assert empty before queries
assert Session.get_session_keys(session_cache / ".manifest" / "session.db") == []
# Add queries and make sure session is logged
session = Session(session_id="dog_days")
query_key = {"query": "What is your name?", "time": "now"}
response_key = {"response": "I don't have a name", "engine": "nodel"}
session.log_query(query_key, response_key)
session.close()
assert Session.get_session_keys(session_cache / ".manifest" / "session.db") == [
"dog_days"
]
