import logging
import time
from typing import List
from uuid import uuid4

from fastapi import APIRouter, HTTPException
from fastapi.responses import StreamingResponse
from gpt4all import GPT4All
from pydantic import BaseModel, Field

from api_v1.settings import settings

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)


### This should follow https://github.com/openai/openai-openapi/blob/master/openapi.yaml

class ChatCompletionMessage(BaseModel):
    role: str
    content: str


class ChatCompletionRequest(BaseModel):
    model: str = Field(settings.model, description='The model to generate a completion from.')
    messages: List[ChatCompletionMessage] = Field(..., description='Messages for the chat completion.')
    temperature: float = Field(settings.temp, description='Model temperature')


class ChatCompletionChoice(BaseModel):
    message: ChatCompletionMessage
    index: int
    logprobs: float
    finish_reason: str


class ChatCompletionUsage(BaseModel):
    prompt_tokens: int
    completion_tokens: int
    total_tokens: int


class ChatCompletionResponse(BaseModel):
    id: str
    object: str = 'text_completion'
    created: int
    model: str
    choices: List[ChatCompletionChoice]
    usage: ChatCompletionUsage
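

# For illustration, a ChatCompletionResponse serialized to JSON could look like
# this (all field values are made-up examples, not constants from this module):
#
#   {
#       "id": "<uuid4 string>",
#       "object": "text_completion",
#       "created": 1700000000,
#       "model": "<configured model name>",
#       "choices": [
#           {
#               "message": {"role": "assistant", "content": "..."},
#               "index": 0,
#               "logprobs": -1.0,
#               "finish_reason": "length"
#           }
#       ],
#       "usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}
#   }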


router = APIRouter(prefix="/chat", tags=["Completions Endpoints"])


@router.post("/completions", response_model=ChatCompletionResponse)
async def chat_completion(request: ChatCompletionRequest):
    '''
    Completes a GPT4All model response based on the conversation history in the request.
    '''
    # GPU is not implemented yet
    if settings.inference_mode == "gpu":
        raise HTTPException(status_code=400,
                            detail="Not implemented yet: Can only infer in CPU mode.")

    # we only support the configured model
    if request.model != settings.model:
        raise HTTPException(status_code=400,
                            detail=f"The GPT4All inference server is booted to only infer: `{settings.model}`")

    # run only if we have a message
    if request.messages:
        model = GPT4All(model_name=settings.model, model_path=settings.gpt4all_path)

        # format the system message and conversation history as ChatML turns
        formatted_messages = ""
        for message in request.messages:
            formatted_messages += f"<|im_start|>{message.role}\n{message.content}<|im_end|>\n"

        # the LLM will complete the response of the assistant
        formatted_messages += "<|im_start|>assistant\n"
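
        # For illustration only: with a system message and one user turn, the
        # assembled prompt passed to the model below would look like this
        # (the message contents are made-up examples):
        #
        #   <|im_start|>system
        #   You are a helpful assistant.<|im_end|>
        #   <|im_start|>user
        #   Hello!<|im_end|>
        #   <|im_start|>assistant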
        response = model.generate(
            prompt=formatted_messages,
            temp=request.temperature
        )

        # the LLM may continue to hallucinate the conversation, but we want only the first response
        # so, cut off everything after the first <|im_end|> (keep the whole response if the marker is absent)
        index = response.find("<|im_end|>")
        response_content = response[:index].strip() if index != -1 else response.strip()
    else:
        response_content = "No messages received."

    # Create a chat message for the response
    response_message = ChatCompletionMessage(role="assistant", content=response_content)

    # Create a choice object with the response message
    response_choice = ChatCompletionChoice(
        message=response_message,
        index=0,
        logprobs=-1.0,  # Placeholder value
        finish_reason="length"  # Placeholder value
    )

    # Create the response object
    chat_response = ChatCompletionResponse(
        id=str(uuid4()),
        created=int(time.time()),
        model=request.model,
        choices=[response_choice],
        usage=ChatCompletionUsage(prompt_tokens=0, completion_tokens=0, total_tokens=0),  # Placeholder values
    )

    return chat_response
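

# Example request against this route (a sketch: the localhost:4891 address and the
# /v1 prefix are assumptions about how the app mounts and serves this router, and
# the model name is a placeholder for whatever `settings.model` is configured to):
#
#   curl -X POST http://localhost:4891/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"model": "<configured model>",
#          "messages": [{"role": "user", "content": "Hello!"}],
#          "temperature": 0.5}'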