Add CLI for chatting with an OpenAI model

- Add CLI functionality for chatting with an OpenAI model
- Implement `configure` function to let users enter their OpenAI API key and model name
- Implement `chat` function to let users chat with the OpenAI model using retrieved documents
- Add `llm` module to handle sending questions to the OpenAI model
- Add `utils` module to load and split text documents, create the retriever, and define the `StreamStdOut` callback class
Commit 226203e4d9 by Saryev Rustam (branch pull/1/head)

.gitignore (vendored): 4 additions

@@ -0,0 +1,4 @@
/.env
/.idea/
/.vscode/
/.venv/

README.md
@@ -0,0 +1,29 @@
# talk-codebase is a powerful tool for chatting with your codebase

<p align="center">
  <img src="https://github.com/rsaryev/talk-codebase/assets/70219513/b0cb4d00-94b6-407e-8545-92e79d442d89" width="800" alt="chat">
</p>

## Description

In the chat, you can ask questions about the codebase. The AI answers them and, where useful, suggests code improvements. This is convenient when you want to find something in the codebase quickly without searching by hand. It also helps when you want to improve a specific function: ask "How can I improve the function {function name}?" and the AI will suggest improvements. The codebase is analyzed using OpenAI.
## Installation

```bash
pip install talk-codebase
```

## Usage

talk-codebase works only with source files in popular programming languages, plus `.txt` files; all other files are ignored.
```bash
# Start chatting with your codebase
talk-codebase chat <directory>
# Configure
talk-codebase configure
# Help
talk-codebase --help
```
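
An example session might look like this (a sketch; the prompts come from the CLI code in this commit, while the key, path, question, and answer are placeholders):

```bash
$ talk-codebase configure
🤖 Enter your OpenAI API key: sk-...
🤖 Enter your model name (default: gpt-3.5-turbo):
$ talk-codebase chat .
👉 How can I improve the function load_files?
🤖 You could skip binary files explicitly and log any paths that fail to load...
👉 exit
```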

poetry.lock (generated): 1316 additions

File diff suppressed because it is too large.

pyproject.toml
@@ -0,0 +1,30 @@
[tool.poetry]
name = "talk-codebase"
version = "0.1.1"
description = "talk-codebase is a powerful tool for querying and analyzing codebases."
authors = ["Saryev Rustam <rustam1997@gmail.com>"]
readme = "README.md"
packages = [{include = "talk_codebase"}]
keywords = ["chatgpt", "openai", "cli"]

[tool.poetry.dependencies]
python = "^3.9"
langchain = "^0.0.180"
fire = "^0.5.0"
openai = "^0.27.7"
tiktoken = "^0.4.0"
faiss-cpu = "^1.7.4"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

[project.urls]
"Source" = "https://github.com/rsaryev/talk-codebase"
"Bug Tracker" = "https://github.com/rsaryev/talk-codebase/issues"

[tool.poetry.scripts]
talk-codebase = "talk_codebase.cli:main"
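
The `[tool.poetry.scripts]` table maps the `talk-codebase` command to `talk_codebase.cli:main`, so installing the package puts the CLI on the PATH. A quick local check (assuming Poetry is installed; these are standard Poetry commands, not part of this commit):

```bash
poetry install
poetry run talk-codebase --help
```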

requirements.txt
@@ -0,0 +1,53 @@
aiohttp==3.8.4
aiosignal==1.3.1
async-timeout==4.0.2
attrs==23.1.0
bleach==6.0.0
certifi==2023.5.7
charset-normalizer==3.1.0
dataclasses-json==0.5.7
docutils==0.20.1
faiss-cpu==1.7.4
fire==0.5.0
frozenlist==1.3.3
idna==3.4
importlib-metadata==6.6.0
jaraco.classes==3.2.3
keyring==23.13.1
langchain==0.0.180
markdown-it-py==2.2.0
marshmallow==3.19.0
marshmallow-enum==1.5.1
mdurl==0.1.2
more-itertools==9.1.0
multidict==6.0.4
mypy-extensions==1.0.0
numexpr==2.8.4
numpy==1.24.3
openai==0.27.7
openapi-schema-pydantic==1.2.4
packaging==23.1
pkginfo==1.9.6
pydantic==1.10.8
Pygments==2.15.1
PyYAML==6.0
readme-renderer==37.3
regex==2023.5.5
requests==2.31.0
requests-toolbelt==1.0.0
rfc3986==2.0.0
rich==13.3.5
six==1.16.0
SQLAlchemy==2.0.15
talk-codebase==0.1.0
tenacity==8.2.2
termcolor==2.3.0
tiktoken==0.4.0
tqdm==4.65.0
twine==4.0.2
typing-inspect==0.9.0
typing_extensions==4.6.2
urllib3==2.0.2
webencodings==0.5.1
yarl==1.9.2
zipp==3.15.0

talk_codebase/cli.py
@@ -0,0 +1,71 @@
import os

import fire
import yaml

from talk_codebase.utils import create_retriever
from talk_codebase.llm import send_question


def get_config():
    home_dir = os.path.expanduser("~")
    config_path = os.path.join(home_dir, ".config.yaml")
    if os.path.exists(config_path):
        with open(config_path, "r") as f:
            config = yaml.safe_load(f)
    else:
        config = {}
    return config


def save_config(config):
    home_dir = os.path.expanduser("~")
    config_path = os.path.join(home_dir, ".config.yaml")
    with open(config_path, "w") as f:
        yaml.dump(config, f)


def configure():
    config = get_config()
    api_key = input("🤖 Enter your OpenAI API key: ")
    model_name = input("🤖 Enter your model name (default: gpt-3.5-turbo): ") or "gpt-3.5-turbo"
    config["api_key"] = api_key
    config["model_name"] = model_name
    save_config(config)


def chat(root_dir):
    try:
        config = get_config()
        api_key = config.get("api_key")
        model_name = config.get("model_name")
        if not (api_key and model_name):
            configure()
            chat(root_dir)
            return  # restart with the freshly saved config instead of falling through with stale values
        retriever = create_retriever(root_dir, api_key)
        while True:
            question = input("👉 ")
            if not question:
                print("🤖 Please enter a question.")
                continue
            if question.lower() in ('exit', 'quit'):
                break
            send_question(question, retriever, api_key, model_name)
    except KeyboardInterrupt:
        print("\n🤖 Bye!")
    except Exception as e:
        if str(e) == "<empty message>":
            print("🤖 Please configure your API key.")
            configure()
            chat(root_dir)
        else:
            print(f"🤖 Error: {e}")


def main():
    fire.Fire({
        "chat": chat,
        "configure": configure,
    })


if __name__ == "__main__":
    main()
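
`get_config` and `save_config` persist settings as YAML in `~/.config.yaml`, with the keys `api_key` and `model_name` written by `configure`. A minimal sketch of inspecting that file (key names come from the code above; the printed values are placeholders):

```python
import os

import yaml

# Path construction mirrors get_config() above.
config_path = os.path.join(os.path.expanduser("~"), ".config.yaml")
with open(config_path) as f:
    print(yaml.safe_load(f))
# e.g. {'api_key': 'sk-...', 'model_name': 'gpt-3.5-turbo'}
```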

talk_codebase/consts.py
@@ -0,0 +1,6 @@
EXCLUDE_DIRS = ['__pycache__', '.venv', '.git', '.idea', 'venv', 'env', 'node_modules', 'dist', 'build', '.vscode',
'.github', '.gitlab']
ALLOW_FILES = ['.txt', '.js', '.mjs', '.ts', '.tsx', '.css', '.scss', '.less', '.html', '.htm', '.json', '.py',
'.java', '.c', '.cpp', '.cs', '.go', '.php', '.rb', '.rs', '.swift', '.kt', '.scala', '.m', '.h',
'.sh', '.pl', '.pm', '.lua', '.sql']
EXCLUDE_FILES = ['requirements.txt', 'package.json', 'package-lock.json', 'yarn.lock']

talk_codebase/llm.py
@@ -0,0 +1,12 @@
from langchain.callbacks.manager import CallbackManager
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI

from talk_codebase.utils import StreamStdOut


def send_question(question, retriever, openai_api_key, model_name):
    model = ChatOpenAI(model_name=model_name, openai_api_key=openai_api_key, streaming=True,
                       callback_manager=CallbackManager([StreamStdOut()]))
    qa = ConversationalRetrievalChain.from_llm(model, retriever=retriever)
    answer = qa({"question": question, "chat_history": []})
    return answer
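
`send_question` streams the reply to stdout through `StreamStdOut` and also returns the chain's result dict. A sketch of calling it outside the CLI loop (the directory path and key are placeholders; `result["answer"]` is the standard output key of `ConversationalRetrievalChain` in this langchain version):

```python
from talk_codebase.llm import send_question
from talk_codebase.utils import create_retriever

api_key = "sk-..."  # placeholder; the CLI normally reads this from ~/.config.yaml
retriever = create_retriever("./my_project", api_key)  # "./my_project" is an example path

# The reply streams to stdout as it is generated; the full result dict is also returned.
result = send_question("What does load_files do?", retriever, api_key, "gpt-3.5-turbo")
print(result["answer"])
```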

talk_codebase/utils.py
@@ -0,0 +1,54 @@
import os
import sys

from langchain import FAISS
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.document_loaders import TextLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter

from talk_codebase.consts import EXCLUDE_DIRS, EXCLUDE_FILES, ALLOW_FILES


class StreamStdOut(StreamingStdOutCallbackHandler):
    def on_llm_new_token(self, token: str, **kwargs) -> None:
        sys.stdout.write(token)
        sys.stdout.flush()

    def on_llm_start(self, serialized, prompts, **kwargs):
        sys.stdout.write("🤖 ")

    def on_llm_end(self, response, **kwargs):
        sys.stdout.write("\n")
        sys.stdout.flush()


def load_files(root_dir):
    docs = []
    for dirpath, dirnames, filenames in os.walk(root_dir):
        if any(exclude_dir in dirpath for exclude_dir in EXCLUDE_DIRS):
            continue
        if not filenames:
            continue
        for file in filenames:
            if any(file.endswith(allow_file) for allow_file in ALLOW_FILES) and not any(
                    file == exclude_file for exclude_file in EXCLUDE_FILES):
                try:
                    loader = TextLoader(os.path.join(dirpath, file), encoding='utf-8')
                    docs.extend(loader.load_and_split())
                except Exception as e:
                    print(f"Error loading file {file}: {e}")
    print(f"🤖 Loaded {len(docs)} documents")
    return docs


def create_retriever(root_dir, openai_api_key):
    docs = load_files(root_dir)
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    texts = text_splitter.split_documents(docs)
    embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
    db = FAISS.from_documents(texts, embeddings)
    retriever = db.as_retriever()
    return retriever
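
Because `create_retriever` returns a plain FAISS retriever, it can also be queried directly to see which chunks would be handed to the model. A small sketch (placeholder path and key; `get_relevant_documents` is the standard retriever method in this langchain version):

```python
from talk_codebase.utils import create_retriever

retriever = create_retriever("./my_project", "sk-...")  # placeholder path and key
docs = retriever.get_relevant_documents("Where is the CLI entry point defined?")
for doc in docs:
    # TextLoader records the originating file path in each chunk's metadata.
    print(doc.metadata.get("source"), len(doc.page_content))
```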