fix: add dtype to cache (#52)

pull/82/head
Laurel Orr 1 year ago
parent e351bd5315
commit 8ced666df8

@ -1,4 +1,8 @@
Unreleased 0.0.2
0.0.3 - Unreleased
---------------------
0.0.2 - 2022-01-31
---------------------
Added
^^^^^
@ -6,9 +10,7 @@ Added
* Standard request base model for all language inputs.
* ChatGPT client. Requires CHATGPT_SESSION_KEY to be passed in.
* Diffusion model support
Fixed
^^^^^^^^
* Together model support
Removed
^^^^^^^

File diff suppressed because one or more lines are too long

@ -0,0 +1,98 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2\n",
"\n",
"%env TOMA_URL=https://staging.together.xyz/api"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from manifest import Manifest\n",
"\n",
"# The responses are not fast\n",
"manifest = Manifest(\n",
" client_name=\"toma\",\n",
")\n",
"\n",
"print(manifest.run(\"What is the color of an apple?\"))"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"With a cache"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from manifest import Manifest\n",
"\n",
"# The responses are not fast\n",
"manifest = Manifest(\n",
" client_name=\"toma\",\n",
" cache_name=\"sqlite\",\n",
" cache_connection=\"my_manifest_cache.sqlite\",\n",
")\n",
"\n",
"res = manifest.run(\"What is the color of an apple?\", return_response=True)\n",
"print(res.get_response())\n",
"print(\"Is Cached?\", res.is_cached())\n",
"\n",
"res = manifest.run(\"What is the color of an apple?\", return_response=True)\n",
"print(res.get_response())\n",
"print(\"Is Cached?\", res.is_cached())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "manifest",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "fddffe4ac3b9f00470127629076101c1b5f38ecb1e7358b567d19305425e9491"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}

@ -11,7 +11,11 @@ import pkg_resources
from flask import Flask, Response, request
from manifest.api.models.diffuser import DiffuserModel
from manifest.api.models.huggingface import CrossModalEncoderModel, TextGenerationModel
from manifest.api.models.huggingface import (
MODEL_GENTYPE_REGISTRY,
CrossModalEncoderModel,
TextGenerationModel,
)
from manifest.api.response import ModelResponse
os.environ["TOKENIZERS_PARALLELISM"] = "false"
@ -40,6 +44,13 @@ def parse_args() -> argparse.Namespace:
help="Model type used for finding constructor.",
choices=MODEL_CONSTRUCTORS.keys(),
)
parser.add_argument(
"--model_generation_type",
default=None,
type=str,
help="Model generation type.",
choices=MODEL_GENTYPE_REGISTRY.keys(),
)
parser.add_argument(
"--model_name_or_path",
default=None,
@ -104,6 +115,7 @@ def main() -> None:
raise ValueError(f"Port {PORT} is already in use.")
global model_type
model_type = kwargs.model_type
model_gen_type = kwargs.model_generation_type
model_name_or_path = kwargs.model_name_or_path
if not model_name_or_path:
raise ValueError("Must provide model_name_or_path.")
@ -133,6 +145,7 @@ def main() -> None:
global model
model = MODEL_CONSTRUCTORS[model_type](
model_name_or_path,
model_type=model_gen_type,
cache_dir=kwargs.cache_dir,
device=kwargs.device,
use_accelerate=kwargs.use_accelerate_multigpu,
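
The practical effect of the new --model_generation_type flag is that a Hugging Face checkpoint absent from MODEL_REGISTRY can still be served: the flag is threaded through as model_type to the model constructor, which then picks the right transformers class from MODEL_GENTYPE_REGISTRY. A minimal sketch mirroring the new test added further down (the model name is illustrative only, and loading it requires network access and enough memory):

from manifest.api.models.huggingface import TextGenerationModel

# Causal LM that is not in MODEL_REGISTRY; the generation type selects the constructor.
model = TextGenerationModel(
    model_name_or_path="NinedayWang/PolyCoder-160M",
    model_type="text-generation",
)
print(model.generate("Why is the sky green?", max_tokens=10))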

@ -14,6 +14,7 @@ class DiffuserModel(Model):
def __init__(
self,
model_name_or_path: str,
model_type: Optional[str] = None,
model_config: Optional[str] = None,
cache_dir: Optional[str] = None,
device: int = 0,
@ -93,17 +94,16 @@ class DiffuserModel(Model):
return [(im, None) for im in result["images"]]
@torch.no_grad()
def logits_scoring(
self, prompt: Union[str, List[str]], gold_choices: List[str], **kwargs: Any
) -> List[Tuple[Any, float]]:
def score_sequence(
self, prompt: Union[str, List[str]], **kwargs: Any
) -> List[float]:
"""
Given the prompt and gold choices, choose the best choice with max logits.
Score a sequence of choices.
Args:
prompt: prompt to generate from.
gold_choices: list of choices to choose from.
Returns:
the returned gold choice
prompt (:obj:`str` or :obj:`List[str]`):
The prompt to score the choices against.
**kwargs:
Additional keyword arguments passed along to the :obj:`__call__` method.
"""
raise NotImplementedError("Logits scoring not supported for diffusers")
raise NotImplementedError("Score sequence not supported for diffusers")

@ -61,7 +61,11 @@ MODEL_REGISTRY = {
"google/flan-t5-l": AutoModelForSeq2SeqLM, # 800M
"google/flan-t5-xl": AutoModelForSeq2SeqLM, # 3B
"google/flan-t5-xxl": AutoModelForSeq2SeqLM, # 11B
"allenai/tk-instruct-3b-def": AutoModelForSeq2SeqLM,
}
MODEL_GENTYPE_REGISTRY = {
"text-generation": AutoModelForCausalLM,
"text2text-generation": AutoModelForSeq2SeqLM,
}
@ -187,6 +191,7 @@ class HuggingFaceModel(Model):
def __init__(
self,
model_name_or_path: str,
model_type: Optional[str] = None,
model_config: Optional[str] = None,
cache_dir: Optional[str] = None,
device: int = 0,
@ -227,6 +232,13 @@ class HuggingFaceModel(Model):
config = json.load(open(Path(self.model_path) / "config.json"))
model_name_or_path = config["_name_or_path"]
self.model_name = model_name_or_path
self.model_type = model_type
if self.model_name not in MODEL_REGISTRY and self.model_type is None:
raise ValueError(
f"{self.model_name} is not in our registry. Please specify "
"--model_generation_type as either text-generation (for Causal)"
" or text2text-generation (for Seq2Seq)"
)
print("Model Name:", self.model_name, "Model Path:", self.model_path)
def get_init_params(self) -> Dict:
@ -316,6 +328,7 @@ class CrossModalEncoderModel(HuggingFaceModel):
def __init__(
self,
model_name_or_path: str,
model_type: Optional[str] = None,
model_config: Optional[str] = None,
cache_dir: Optional[str] = None,
device: int = 0,
@ -345,6 +358,7 @@ class CrossModalEncoderModel(HuggingFaceModel):
"""
super().__init__(
model_name_or_path,
model_type,
model_config,
cache_dir,
device,
@ -359,7 +373,9 @@ class CrossModalEncoderModel(HuggingFaceModel):
# TODO: make this generalizable
self.processor = CLIPProcessor.from_pretrained(self.model_path)
model = MODEL_REGISTRY[self.model_name].from_pretrained(
model = MODEL_REGISTRY.get(
self.model_name, MODEL_GENTYPE_REGISTRY.get(self.model_type, None)
).from_pretrained(
self.model_path,
cache_dir=cache_dir,
)
@ -370,7 +386,6 @@ class CrossModalEncoderModel(HuggingFaceModel):
if (device == -1 or not torch.cuda.is_available())
else torch.device(f"cuda:{device}")
)
print("T", torch_device)
self.model = model.to(torch_device) # type: ignore
@torch.no_grad()
@ -401,6 +416,7 @@ class TextGenerationModel(HuggingFaceModel):
def __init__(
self,
model_name_or_path: str,
model_type: Optional[str] = None,
model_config: Optional[str] = None,
cache_dir: Optional[str] = None,
device: int = 0,
@ -430,6 +446,7 @@ class TextGenerationModel(HuggingFaceModel):
"""
super().__init__(
model_name_or_path,
model_type,
model_config,
cache_dir,
device,
@ -455,7 +472,9 @@ class TextGenerationModel(HuggingFaceModel):
if use_bitsandbytes:
print("WARNING!!! Cannot use sampling with bitsandbytes.")
max_memory = get_max_memory(perc_max_gpu_mem_red)
model = MODEL_REGISTRY[self.model_name].from_pretrained( # type: ignore
model = MODEL_REGISTRY.get(
self.model_name, MODEL_GENTYPE_REGISTRY.get(self.model_type, None)
).from_pretrained( # type: ignore
self.model_path,
cache_dir=cache_dir,
load_in_8bit=True,
@ -465,14 +484,18 @@ class TextGenerationModel(HuggingFaceModel):
else:
try:
# Try to explicitly find an fp16 copy (gpt-j-6B for example)
model = MODEL_REGISTRY[self.model_name].from_pretrained( # type: ignore
model = MODEL_REGISTRY.get(
self.model_name, MODEL_GENTYPE_REGISTRY.get(self.model_type, None)
).from_pretrained( # type: ignore
self.model_path,
cache_dir=cache_dir,
revision="float16",
torch_dtype=torch.float16,
)
except Exception:
model = MODEL_REGISTRY[self.model_name].from_pretrained( # type: ignore
model = MODEL_REGISTRY.get(
self.model_name, MODEL_GENTYPE_REGISTRY.get(self.model_type, None)
).from_pretrained( # type: ignore
self.model_path, cache_dir=cache_dir, torch_dtype=dtype
)
model.eval()
@ -555,188 +578,6 @@ class TextGenerationModel(HuggingFaceModel):
]
return final_results
@torch.no_grad()
def logits_scoring(
self, prompt: Union[str, List[str]], gold_choices: List[str], **kwargs: Any
) -> List[Tuple[Any, float]]:
"""
Given the prompt and gold choices, choose the best choice with max logits.
Args:
prompt: prompt to generate from.
gold_choices: list of choices to choose from.
Returns:
the returned gold choice
"""
if isinstance(prompt, str):
prompt = [prompt]
max_input_len = self.pipeline.max_length
if self.is_encdec:
# Adapted from https://github.com/bigscience-workshop/t-zero
tokenized_inputs = self.pipeline.tokenizer(
prompt,
padding="longest",
max_length=max_input_len,
truncation=True,
add_special_tokens=False,
)
# Get max target length
max_target_len = max(
[
len(self.pipeline.tokenizer(ans_choi)["input_ids"])
for ans_choi in gold_choices
]
)
tokenized_targets = [
self.pipeline.tokenizer(
ans_choi,
# padding is on the right here.
padding="max_length",
max_length=min(max_target_len, max_input_len),
truncation=True,
)
for ans_choi in gold_choices
]
# Repeat input ids for each choice to form a batch
features = {
k: [tokenized_inputs[k] for _ in range(len(gold_choices))]
for k in tokenized_inputs.keys()
}
# Add choice tokens + mask
features["labels"] = [
[tokenized_targets[k]["input_ids"]] * len(tokenized_inputs["input_ids"])
for k in range(len(gold_choices))
]
features["labels_attention_mask"] = [
[tokenized_targets[k]["attention_mask"]]
* len(tokenized_inputs["input_ids"])
for k in range(len(gold_choices))
]
else:
tokenized_inputs = self.pipeline.tokenizer(
prompt,
max_length=max_input_len,
truncation=True,
padding=False,
add_special_tokens=False,
)
tokenized_targets = [
self.pipeline.tokenizer(
# Add starting whitespace for gpt
ans_choi,
max_length=max_input_len,
truncation=True,
padding=False,
add_special_tokens=False,
)
for ans_choi in gold_choices
]
features = {
k: [] for k in list(tokenized_inputs.keys()) + ["labels_attention_mask"]
}
max_effective_input_len = 0
for tokenized_targ in tokenized_targets:
for k in tokenized_inputs.keys():
batched_features = []
for prompt_i in range(len(tokenized_inputs[k])):
# Make sure to leave room for the outputs
batched_features.append(
tokenized_inputs[k][prompt_i][
: min(
len(tokenized_inputs[k][prompt_i]),
max_input_len - len(tokenized_targ[k]),
)
]
+ tokenized_targ[k]
)
max_effective_input_len = max(
max_effective_input_len, len(batched_features[-1])
)
features[k].append(batched_features)
# Manually add labels_attention_mask
batched_features = []
for prompt_i in range(len(tokenized_inputs["input_ids"])):
batched_features.append(
[0]
* min(
len(tokenized_inputs["input_ids"][prompt_i]),
max_input_len - len(tokenized_targ["input_ids"]),
)
+ [1] * len(tokenized_targ["input_ids"])
)
features["labels_attention_mask"].append(batched_features)
# Manually pad to max effective length
for k in features.keys():
for targ_i in range(len(features[k])):
for prompt_i in range(len(features[k][targ_i])):
if k == "input_ids":
features[k][targ_i][prompt_i] += [
self.pipeline.tokenizer.pad_token_id
] * (
max_effective_input_len
- len(features[k][targ_i][prompt_i])
)
elif k in ["attention_mask", "labels_attention_mask"]:
features[k][targ_i][prompt_i] += [0] * (
max_effective_input_len
- len(features[k][targ_i][prompt_i])
)
else:
raise ValueError(f"Unknown key {k} for decoder only models")
features["labels"] = features["input_ids"]
# Convert to tensors
tensor_features = {}
for k in features:
tensor_features[k] = torch.LongTensor(features[k]).to(self.pipeline.device)
if self.is_encdec:
gold_l, bsz, seq_len = tensor_features["labels"].shape
stacked_logits = self.pipeline.model( # type: ignore
input_ids=tensor_features["input_ids"].reshape(gold_l * bsz, -1),
attention_mask=tensor_features["attention_mask"].reshape(
gold_l * bsz, -1
),
labels=tensor_features["labels"].reshape(gold_l * bsz, -1),
).logits
stacked_logits = stacked_logits.reshape(gold_l, bsz, seq_len, -1)
# Adapted from https://github.com/bigscience-workshop/t-zero
masked_log_probs = tensor_features["labels_attention_mask"].unsqueeze(
-1
) * torch.log_softmax(stacked_logits, dim=-1)
seq_token_log_probs = torch.gather(
masked_log_probs, -1, tensor_features["labels"].unsqueeze(-1)
)
else:
stacked_logits = self.pipeline.model( # type: ignore
input_ids=tensor_features["input_ids"],
attention_mask=tensor_features["attention_mask"],
).logits
# For causal decoders, shift logits and labels
labels_attention_mask = tensor_features["labels_attention_mask"].unsqueeze(
-1
)[..., 1:, :]
masked_log_probs = (
labels_attention_mask.float()
* torch.log_softmax(stacked_logits.float(), dim=-1)[..., :-1, :]
)
seq_token_log_probs = torch.gather(
masked_log_probs, -1, tensor_features["labels"][..., 1:].unsqueeze(-1)
)
seq_token_log_probs = seq_token_log_probs.squeeze(dim=-1)
seq_log_prob = seq_token_log_probs.sum(dim=-1)
# Averaging over output sequence length for GPT
if not self.is_encdec:
seq_log_prob = seq_log_prob * (1 / (seq_token_log_probs != 0).sum(dim=-1))
prediction = seq_log_prob.argmax(dim=0)
return [
(gold_choices[int(p)], seq_log_prob[int(p), i].item())
for i, p in enumerate(prediction)
]
@torch.no_grad()
def score_sequence(
self, prompt: Union[str, List[str]], **kwargs: Any
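
The lookup pattern repeated above, MODEL_REGISTRY.get(name, MODEL_GENTYPE_REGISTRY.get(model_type, None)), falls back from an exact model name to its generation type; if both lookups miss, the early ValueError added in __init__ fires instead of a confusing AttributeError on None.from_pretrained. A self-contained sketch of that resolution order (the helper name and the string stand-ins for the transformers classes are hypothetical):

from typing import Optional

# Stand-ins for the real transformers classes, to keep the sketch self-contained.
MODEL_REGISTRY = {"gpt2": "AutoModelForCausalLM"}
MODEL_GENTYPE_REGISTRY = {
    "text-generation": "AutoModelForCausalLM",
    "text2text-generation": "AutoModelForSeq2SeqLM",
}

def resolve_model_cls(model_name: str, model_type: Optional[str]) -> str:
    """Hypothetical helper: exact names win, otherwise fall back to the generation type."""
    model_cls = MODEL_REGISTRY.get(model_name, MODEL_GENTYPE_REGISTRY.get(model_type, None))
    if model_cls is None:
        raise ValueError(f"{model_name} is not registered; pass --model_generation_type")
    return model_cls

print(resolve_model_cls("gpt2", None))                                      # registered name
print(resolve_model_cls("NinedayWang/PolyCoder-160M", "text-generation"))   # fallback path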

@ -12,6 +12,7 @@ class Model(ABC):
def __init__(
self,
model_name_or_path: str,
model_type: str,
cache_dir: str,
device: int,
use_accelerate: bool,
@ -28,6 +29,7 @@ class Model(ABC):
Args:
model_name_or_path: model name string.
model_type: model type string for when model_name not in registry.
cache_dir: cache directory for model.
device: device to use for model.
use_accelerate: whether to use accelerate for multi-gpu inference.

@ -98,6 +98,7 @@ class ArrayCache:
"offset": self.cur_offset,
"flatten_size": len(arr),
"shape": arr_shape,
"dtype": arr.dtype,
}
self.cur_offset += len(arr)
return
@ -112,4 +113,4 @@ class ArrayCache:
arr = memmap[
file_data["offset"] : file_data["offset"] + file_data["flatten_size"]
]
return arr.reshape(file_data["shape"])
return arr.reshape(file_data["shape"]).astype(file_data["dtype"])
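
For context on the fix itself: the cache stores flattened arrays in a memmapped file, and a memmap has one on-disk dtype, so without recording the original dtype an integer array written through a float64 memmap comes back as float64. A minimal sketch of that round trip, not the library's implementation (the file name and the float64 backing dtype are assumptions):

import numpy as np

arr = np.random.randint(0, 3, size=(10, 10))                  # int64 on most platforms
memmap = np.memmap("example.mmap", dtype="float64", mode="w+", shape=(100,))
memmap[: arr.size] = arr.flatten()                            # values are cast to float64

restored = np.asarray(memmap[: arr.size]).reshape(arr.shape)  # dtype is float64, not int64
restored = restored.astype(arr.dtype)                         # cast back with the stored dtype
assert restored.dtype == arr.dtype
assert np.allclose(restored, arr)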

@ -15,7 +15,7 @@ class ChatGPTClient(Client):
"""ChatGPT Client class."""
# No params for ChatGPT
PARAMS = {}
PARAMS: Dict[str, Tuple[str, Any]] = {}
REQUEST_CLS = LMRequest
def connect(

@ -35,7 +35,7 @@ EXTRAS = {
"diffusers>=0.6.0",
"Flask>=2.1.2",
"accelerate>=0.10.0",
"transformers>=4.20.0",
"transformers>=4.20.0,<4.26.0",
"torch>=1.8.0",
"numpy>=1.20.0",
],

@ -28,6 +28,7 @@ def test_put_get(tmpdir: Path) -> None:
cache.max_memmap_size = 120
cache.put("key", arr)
assert np.allclose(cache.get("key"), arr)
assert cache.get("key").dtype == arr.dtype
assert cache.cur_file_idx == 0
assert cache.cur_offset == 100
assert cache.hash2arrloc["key"] == {
@ -35,11 +36,13 @@ def test_put_get(tmpdir: Path) -> None:
"offset": 0,
"flatten_size": 100,
"shape": (10, 10),
"dtype": np.dtype("float64"),
}
arr2 = np.random.rand(10, 10)
arr2 = np.random.randint(0, 3, size=(10, 10))
cache.put("key2", arr2)
assert np.allclose(cache.get("key2"), arr2)
assert cache.get("key2").dtype == arr2.dtype
assert cache.cur_file_idx == 1
assert cache.cur_offset == 100
assert cache.hash2arrloc["key2"] == {
@ -47,6 +50,7 @@ def test_put_get(tmpdir: Path) -> None:
"offset": 0,
"flatten_size": 100,
"shape": (10, 10),
"dtype": np.dtype("int64"),
}
cache = ArrayCache(tmpdir)
@ -55,12 +59,14 @@ def test_put_get(tmpdir: Path) -> None:
"offset": 0,
"flatten_size": 100,
"shape": (10, 10),
"dtype": np.dtype("float64"),
}
assert cache.hash2arrloc["key2"] == {
"file_idx": 1,
"offset": 0,
"flatten_size": 100,
"shape": (10, 10),
"dtype": np.dtype("int64"),
}
assert np.allclose(cache.get("key"), arr)
assert np.allclose(cache.get("key2"), arr2)

@ -6,7 +6,7 @@ from subprocess import PIPE, Popen
import pytest
from manifest.api.models.huggingface import TextGenerationModel
from manifest.api.models.huggingface import MODEL_REGISTRY, TextGenerationModel
NOCUDA = 0
try:
@ -37,6 +37,17 @@ if NOCUDA == 0:
NOCUDA = 1
def test_load_non_registry_model() -> None:
"""Test load model not in registry."""
model_name = "NinedayWang/PolyCoder-160M"
assert model_name not in MODEL_REGISTRY
model = TextGenerationModel(
model_name_or_path=model_name, model_type="text-generation"
)
result = model.generate("Why is the sky green?", max_tokens=10)
assert result is not None
def test_gpt_generate() -> None:
"""Test pipeline generation from a gpt model."""
model = TextGenerationModel(
@ -67,12 +78,6 @@ def test_gpt_generate() -> None:
assert result[0][0] == "\n\nThe sky is"
assert math.isclose(round(result[0][1], 3), -6.046)
result = model.logits_scoring(inputs, gold_choices=[" blue sky", " green sky"])
assert result is not None
assert len(result) == 1
assert result[0][0] == " blue sky"
assert math.isclose(round(result[0][1], 3), -6.999)
# Truncate max length
model.pipeline.max_length = 5
result = model.generate(inputs, max_tokens=2)
@ -112,12 +117,6 @@ def test_encdec_generate() -> None:
assert result[0][0] == "What is the sky green"
assert math.isclose(round(result[0][1], 3), -5.144)
result = model.logits_scoring(inputs, gold_choices=[" blue sky", " green sky"])
assert result is not None
assert len(result) == 1
assert result[0][0] == " green sky"
assert math.isclose(round(result[0][1], 3), -13.538)
# Truncate max length
model.pipeline.max_length = 5
result = model.generate(inputs, max_tokens=2)
@ -174,16 +173,6 @@ def test_batch_gpt_generate() -> None:
assert result[1][0] == " not the only ones who"
assert math.isclose(round(result[1][1], 3), -9.978)
result = model.logits_scoring(
inputs, gold_choices=[" purple sky", " green sky", " blue sky"]
)
assert result is not None
assert len(result) == 2
assert result[0][0] == " blue sky"
assert math.isclose(round(result[0][1], 3), -6.999)
assert result[1][0] == " blue sky"
assert math.isclose(round(result[1][1], 3), -8.212)
# Truncate max length
model.pipeline.max_length = 5
result = model.generate(inputs, max_tokens=2)
@ -223,18 +212,6 @@ def test_batch_encdec_generate() -> None:
assert result[1][0] == "a great way to"
assert math.isclose(round(result[1][1], 3), -6.353)
result = model.logits_scoring(
inputs, gold_choices=[" purple sky", " green sky", " blue sky"]
)
assert result is not None
assert len(result) == 2
assert result[0][0] == " green sky"
assert math.isclose(round(result[0][1], 3), -13.538)
assert result[1][0] == " blue sky"
assert math.isclose(round(result[1][1], 3), -41.503) or math.isclose(
round(result[1][1], 3), -41.504
)
# Truncate max length
model.pipeline.max_length = 5
result = model.generate(inputs, max_tokens=2)
