BREAKING CHANGE: add Experimental function for output streaming

Adds a new output-streaming feature that enables real-time response streaming from the OpenAI server. A minimal sketch of the consumption pattern follows the commit metadata below.
pull/10/head
sean1832 1 year ago
parent e8a077e026
commit 5125811dbd
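
For orientation before the diff: the streaming path added here issues a completions request with `stream: True` and reads the response as server-sent events, so partial text can be rendered as it arrives. Below is a minimal, self-contained sketch of that pattern under the same assumptions as the code in this commit (`requests` plus `sseclient-py`); the key, model name, and prompt are placeholders, not values from this repository.

# Hedged sketch of the SSE streaming pattern this commit adopts; key, model and prompt are placeholders.
import json
import requests
import sseclient

API_KEY = 'sk-...'  # placeholder; the project reads the real key from .user\API-KEYS.txt

def stream_completion(prompt, model='text-davinci-003'):
    # Ask the completions endpoint to stream, keeping the HTTP connection open.
    response = requests.post(
        'https://api.openai.com/v1/completions',
        stream=True,
        headers={'Accept': 'text/event-stream', 'Authorization': 'Bearer ' + API_KEY},
        json={'model': model, 'prompt': prompt, 'max_tokens': 256, 'stream': True},
    )
    # Each server-sent event carries one incremental chunk of generated text.
    for event in sseclient.SSEClient(response).events():
        if event.data == '[DONE]':  # the API terminates the stream with a [DONE] sentinel
            break
        yield json.loads(event.data)['choices'][0]['text']

for token in stream_completion('Say hello.'):
    print(token, end='', flush=True)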

@@ -6,7 +6,9 @@ import modules.utilities as util
import modules.language as language
import GPT
openai.api_key = util.read_file(r'.user\API-KEYS.txt').strip()
API_KEY = util.read_file(r'.user\API-KEYS.txt').strip()
openai.api_key = API_KEY
# if 'SESSION_LANGUAGE' not in st.session_state:
# st.session_state['SESSION_LANGUAGE'] = util.read_json_at('.user/language.json', 'SESSION_LANGUAGE', 'en_US')
@@ -54,6 +56,20 @@ def run_answer(query, model, temp, max_tokens, top_p, freq_penl, pres_penl, chun
    return all_answers


def run_answer_stream(query, model, temp, max_tokens, top_p, freq_penl, pres_penl):
    brain_data = util.read_json(r'.user\brain-data.json')
    results = GPT.toolkit.search_chunks(query, brain_data, count=1)
    for result in results:
        my_info = util.read_file(f'{prompt_dir}/' + _('my-info') + '.txt')
        prompt = util.read_file(f'{prompt_dir}/' + _('question') + '.txt')
        prompt = prompt.replace('<<INFO>>', result['content'])
        prompt = prompt.replace('<<QS>>', query)
        prompt = prompt.replace('<<MY-INFO>>', my_info)
        answer_client = GPT.toolkit.gpt3_stream(API_KEY, prompt, model, temp, max_tokens, top_p, freq_penl, pres_penl)
    return answer_client


def run(query, model, prompt_file, temp, max_tokens, top_p, freq_penl, pres_penl):
    chunks = textwrap.wrap(query, 10000)
    responses = []
@@ -63,3 +79,10 @@ def run(query, model, prompt_file, temp, max_tokens, top_p, freq_penl, pres_penl
        responses.append(response)
    all_response = '\n\n'.join(responses)
    return all_response


def run_stream(query, model, prompt_file, temp, max_tokens, top_p, freq_penl, pres_penl):
    chunk = textwrap.wrap(query, 10000)[0]
    prompt = util.read_file(prompt_file).replace('<<DATA>>', chunk)
    client = GPT.toolkit.gpt3_stream(API_KEY, prompt, model, temp, max_tokens, top_p, freq_penl, pres_penl)
    return client
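
As context for how these helpers are consumed: both run_answer_stream and run_stream return the raw SSE client from GPT.toolkit.gpt3_stream rather than finished text, so the caller iterates the events and stitches the partial tokens together (the Streamlit tools later in this commit do exactly that). A rough usage sketch, in which the model name and prompt-file path are made up for illustration:

# Illustrative caller only; the model name and prompt-file path are placeholders.
import json
import GPT

client = GPT.query.run_stream('Summarise the attached notes.', 'text-davinci-003',
                              'prompts/summary.txt', 0.7, 512, 1.0, 0.0, 0.0)
answer = ''
for event in client.events():
    if event.data == '[DONE]':
        break
    answer += json.loads(event.data)['choices'][0]['text']
print(answer)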

@@ -1,5 +1,8 @@
import openai
import numpy as np
import requests
import sseclient
import json
# this function compares the similarity between two vectors.
@@ -44,3 +47,27 @@ def gpt3(prompt, model, temp, max_tokens, top_p, freq_penl, pres_penl):
    )
    text = response['choices'][0]['text'].strip()
    return text
def gpt3_stream(API_KEY, prompt, model, temp, max_tokens, top_p, freq_penl, pres_penl):
    url = 'https://api.openai.com/v1/completions'
    headers = {
        'Accept': 'text/event-stream',
        'Authorization': 'Bearer ' + API_KEY
    }
    body = {
        'model': model,
        'prompt': prompt,
        'max_tokens': max_tokens,
        'temperature': temp,
        'top_p': top_p,
        'frequency_penalty': freq_penl,
        'presence_penalty': pres_penl,
        'stream': True,
    }
    req = requests.post(url, stream=True, headers=headers, json=body)
    client = sseclient.SSEClient(req)
    return client
    # print(json.loads(event.data)['choices'][0]['text'], end='', flush=True)
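
The commented-out print above hints at how the returned client is meant to be drained from outside this module. A short hedged example of that loop, where the key, prompt, model, and parameter values are made up:

# Illustrative consumption of the client returned by gpt3_stream; all values are placeholders.
import json
import GPT

API_KEY = 'sk-...'  # placeholder key
client = GPT.toolkit.gpt3_stream(API_KEY, 'Write a haiku about rain.', 'text-davinci-003',
                                 temp=0.7, max_tokens=64, top_p=1.0, freq_penl=0.0, pres_penl=0.0)
for event in client.events():
    if event.data == '[DONE]':  # stream terminator
        break
    print(json.loads(event.data)['choices'][0]['text'], end='', flush=True)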

@@ -2,6 +2,7 @@ import os
import time
import streamlit as st
import streamlit_toggle as st_toggle
import modules.INFO as INFO
import modules as mod
@@ -89,10 +90,16 @@ with st.sidebar:
                            help=_("The number of tokens to consider at each step. The larger this is, the more "
                                   "context the model has to work with, but the slower and more expensive "
                                   "generation will be."))
    chunk_count = st.slider(_('Answer count'), 1, 5, value=util.read_json_at(INFO.BRAIN_MEMO, 'chunk_count', 1),
                            help=_("The number of answers to generate. The model will continue to iteratively "
                                   "generate answers until it reaches the answer count."))
    enable_stream = st_toggle.st_toggle_switch(_('Stream (experimental)'),
                                               default_value=util.read_json_at(INFO.BRAIN_MEMO, 'enable_stream', True))
    if not enable_stream:
        chunk_count = st.slider(_('Answer count'), 1, 5, value=util.read_json_at(INFO.BRAIN_MEMO, 'chunk_count', 1),
                                help=_("The number of answers to generate. The model will continue to iteratively "
                                       "generate answers until it reaches the answer count."
                                       "\n\nNote that this function does not support `stream` mode."))
    else:
        chunk_count = 1
    param = GPT.model.param(temp=temp,
                            max_tokens=max_tokens,
                            top_p=top_p,
@@ -136,4 +143,4 @@ with body:
        st_tool.download_as(_("📥download log"))
    # execute brain calculation
    if not question == '' and send:
        st_tool.execute_brain(question, param, op, models, prompt_dictionary, _('question'), SESSION_LANG)
        st_tool.execute_brain(question, param, op, models, prompt_dictionary, _('question'), enable_stream, SESSION_LANG)

@@ -5,3 +5,5 @@ streamlit_tags==1.2.8
streamlit_toggle==0.1.3
streamlit_toggle_switch==1.0.2
streamlit==1.18.1
requests~=2.28.2
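
One dependency note: the new GPT.toolkit code also imports sseclient, which is provided by the sseclient-py package; if it is not already pinned elsewhere in the project, a line such as sseclient-py~=1.7.2 (the exact version is an assumption) would presumably belong here alongside requests.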

@@ -1,5 +1,6 @@ import os
import os
import time
import json
import streamlit as st
import tkinter as tk
from tkinter import filedialog
@@ -228,11 +229,37 @@ def process_response(query, target_model, prompt_file: str, data: GPT.model.para
    log(results, delimiter=f'{file_name.upper()}')


def process_response_stream(query, target_model, prompt_file: str, data: GPT.model.param):
    # check if exclude model is not target model
    file_name = util.get_file_name(prompt_file)
    with st.spinner(_('Thinking on ') + f"{file_name}..."):
        client = GPT.query.run_stream(query, target_model, prompt_file,
                                      data.temp,
                                      data.max_tokens,
                                      data.top_p,
                                      data.frequency_penalty,
                                      data.present_penalty)
    # displaying results
    st.header(f'📃{file_name}')
    response_panel = st.empty()
    previous_chars = ''
    for event in client.events():
        if event.data != '[DONE]':
            char = json.loads(event.data)['choices'][0]['text']
            response = previous_chars + char
            response_panel.info(f'{response}')
            previous_chars += char
    time.sleep(1)
    log(previous_chars, delimiter=f'{file_name.upper()}')


def execute_brain(q, params: GPT.model.param,
                  op: GPT.model.Operation,
                  model: GPT.model.Model,
                  prompt_dictionary: dict,
                  question_prompt: str,
                  stream: bool,
                  session_language,
                  ):
    # log question
@@ -246,28 +273,62 @@ def execute_brain(q, params: GPT.model.param,
        msg.success(_('Brain Updated!'), icon="👍")
        time.sleep(2)
    # thinking on answer
    with st.spinner(_('Thinking on Answer')):
        answer = GPT.query.run_answer(q, model.question_model,
                                      params.temp,
                                      params.max_tokens,
                                      params.top_p,
                                      params.frequency_penalty,
                                      params.present_penalty,
                                      chunk_count=params.chunk_count)
    if util.contains(op.operations, question_prompt):
    # =================stream=================
    if stream:
        previous_chars = ''
        is_question_selected = util.contains(op.operations, question_prompt)
        with st.spinner(_('Thinking on Answer')):
            answer_clients = GPT.query.run_answer_stream(q, model.question_model,
                                                         params.temp,
                                                         params.max_tokens,
                                                         params.top_p,
                                                         params.frequency_penalty,
                                                         params.present_penalty)
        if is_question_selected:
            # displaying results
            st.header(_('💬Answer'))
        st.info(f'{answer}')
        time.sleep(1.5)
        log(answer, delimiter='ANSWER')
    # thinking on other outputs
    if len(op.operations_no_question) > 0:
        for i in range(len(op.operations_no_question)):
            prompt_path = prompt_dictionary[op.operations_no_question[i]]
            other_model = model.other_models[i]
            process_response(answer, other_model, prompt_path, params)
            answer_panel = st.empty()
        for event in answer_clients.events():
            if event.data != '[DONE]':
                char = json.loads(event.data)['choices'][0]['text']
                answer = previous_chars + char
                if is_question_selected:
                    answer_panel.info(f'{answer}')
                previous_chars += char
                time.sleep(0.1)
        log(previous_chars, delimiter='ANSWER')
        if len(op.operations_no_question) > 0:
            for i in range(len(op.operations_no_question)):
                prompt_path = prompt_dictionary[op.operations_no_question[i]]
                other_model = model.other_models[i]
                process_response_stream(previous_chars, other_model, prompt_path, params)
    # =================stream=================
    else:
        # thinking on answer
        with st.spinner(_('Thinking on Answer')):
            answer = GPT.query.run_answer(q, model.question_model,
                                          params.temp,
                                          params.max_tokens,
                                          params.top_p,
                                          params.frequency_penalty,
                                          params.present_penalty,
                                          chunk_count=params.chunk_count)
        if util.contains(op.operations, question_prompt):
            # displaying results
            st.header(_('💬Answer'))
            st.info(f'{answer}')
            time.sleep(1.5)
            log(answer, delimiter='ANSWER')
        # thinking on other outputs
        if len(op.operations_no_question) > 0:
            for i in range(len(op.operations_no_question)):
                prompt_path = prompt_dictionary[op.operations_no_question[i]]
                other_model = model.other_models[i]
                process_response(answer, other_model, prompt_path, params)
    # convert param to dictionary
    param_dict = vars(params)
