NBs and README

1 year ago · c2159d4b93
parent e531a5c0d6
commit c2159d4b93
14 changed files with 751 additions and 15480 deletions
--- a/README.md
+++ b/README.md
@ -36,11 +36,50 @@ Run the trial
 The logs will be sent to `./root/<run_name>`.

 ### To Run: reasoning (HotPotQA)
-Clone this repo and move to the HotPotQA directory
+We provide a set of notebooks to easily run, explore, and interact with the results of the reasoning experiments. Each experiment consists of a random sample of 100 questions from the HotPotQA distractor dataset. Each question in the sample is attempted by an agent with a specific type and reflexion strategy.
+
+#### Setup
+
+To get started:
+
+1. Clone this repo and move to the HotPotQA directory:
 ```bash
 git clone https://github.com/noahshinn024/reflexion && cd ./hotpotqa_runs
 ```

+2. Install the module dependencies into your environment:
+```bash
+pip install -r requirements.txt
+```
+
+3. Set `OPENAI_API_KEY` environment variable to your OpenAI API key:
+```bash
+export OPENAI_API_KEY=<your key>
+```
+
+#### Agent Types
+
+Agent type is determined by the notebook you choose to run. The available agent types include:
+ - `ReAct` - ReAct Agent
+
+ - `CoT_context` - CoT Agent given supporting context about the question 
+
+ - `CoT_no_context` - CoT Agent given no supporting context about the question
+
+The notebook for each agent type is located in the `./hotpot_runs/notebooks` directory.
+
+#### Reflexion Strategies
+
+Each notebook allows you to specify the reflexion strategy to be used by the agents. The available reflexion strategies, which are defined in an `Enum`, include:
+
+ - `ReflexionStrategy.NONE` - The agent is not given any information about its last attempt. 
+
+ - `ReflexionStrategy.LAST_ATTEMPT` - The agent is given its reasoning trace from its last attempt on the question as context.
+
+ - `ReflexionStrategy.REFLEXION` - The agent is given its self-reflection on the last attempt as context. 
+
+ - `ReflexionStrategy.LAST_ATTEMPT_AND_REFLEXION` -  The agent is given both its reasoning trace and self-reflection on the last attempt as context.
+
 ### Another Note

 Due to the nature of these experiments, it may not be feasible for individual developers to rerun the results as GPT-4 has limited access and significant API charges. All runs from the paper and additional results are logged in `./alfworld_runs/root` for decision-making and `./hotpotqa_runs/root` for reasoning. 
@ -64,4 +103,4 @@ For all questions, contact [noahshinn024@gmail.com](noahshinn024@gmail.com)
  journal={arXiv preprint arXiv:2303.11366},
  year={2023}
 }
-```
+```
--- a/hotpotqa_runs/react_cls.py
+++ b/hotpotqa_runs/react_cls.py
@ -1,6 +1,6 @@
 import re, string, os
 from typing import List, Union, Literal
-
+from enum import Enum
 import tiktoken
 from langchain import OpenAI, Wikipedia
 from langchain.llms.base import BaseLLM
@ -11,6 +11,19 @@ from prompts import reflect_prompt, react_agent_prompt, react_reflect_agent_prom
 from prompts import cot_agent_prompt, cot_reflect_agent_prompt, cot_reflect_prompt, COT_INSTRUCTION, COT_REFLECT_INSTRUCTION
 from fewshots import WEBTHINK_SIMPLE6, REFLECTIONS, COT, COT_REFLECT

+class ReflexionStrategy(Enum):
+    """
+    NONE: No reflection
+    LAST_ATTEMPT: Use last reasoning trace in context 
+    REFLEXION: Apply reflexion to the next reasoning trace 
+    LAST_ATTEMPT_AND_REFLEXION: Use last reasoning trace in context and apply reflexion to the next reasoning trace 
+    """
+    NONE = 'base'
+    LAST_ATTEMPT = 'last_trial' 
+    REFLEXION = 'reflexion'
+    LAST_ATTEMPT_AND_REFLEXION = 'last_trial_and_reflexion'
+
+
 class CoTAgent:
    def __init__(self,
                    question: str,
@ -18,7 +31,6 @@ class CoTAgent:
                    key: str,
                    agent_prompt: PromptTemplate = cot_reflect_agent_prompt,
                    reflect_prompt: PromptTemplate = cot_reflect_prompt,
-                    reflect_header: str = REFLECTION_HEADER,
                    cot_examples: str = COT,
                    reflect_examples: str = COT_REFLECT,
                    self_reflect_llm: BaseLLM = OpenAI(
@ -34,13 +46,11 @@ class CoTAgent:
                                            model_kwargs={"stop": "\n"},
                                            openai_api_key=os.environ['OPENAI_API_KEY']),
                    ) -> None:
-        
        self.question = question
        self.context = context
        self.key = key
        self.agent_prompt = agent_prompt
        self.reflect_prompt = reflect_prompt
-        self.reflect_header = reflect_header
        self.cot_examples = cot_examples 
        self.reflect_examples = reflect_examples
        self.self_reflect_llm = self_reflect_llm
@ -51,12 +61,10 @@ class CoTAgent:
        self.step_n: int = 0
        self.reset()

-    def run(self, reflect: bool = True,
-            reflect_strategy: Union[Literal['last_attempt'],
-                                    Literal['reflexion'],
-                                    Literal['last_attempt + reflexion']] = 'reflexion') -> None:
-        if self.step_n > 0 and not self.is_correct() and reflect:
-            self.reflect(reflect_strategy)
+    def run(self,
+            reflexion_strategy: ReflexionStrategy = ReflexionStrategy.REFLEXION) -> None:
+        if self.step_n > 0 and not self.is_correct() and reflexion_strategy != ReflexionStrategy.NONE:
+            self.reflect(reflexion_strategy)
        self.reset()
        self.step()
        self.step_n += 1
@ -87,17 +95,15 @@ class CoTAgent:
            print('Invalid action type, please try again.')
    
    def reflect(self,
-                strategy: Union[Literal['last_attempt'],
-                                Literal['reflexion'],
-                                Literal['last_attempt + reflexion']]) -> None:
-        print('Reflecting...')
-        if strategy == 'last_attempt':
+                strategy: ReflexionStrategy) -> None:
+        print('Running Reflexion strategy...')
+        if strategy == ReflexionStrategy.LAST_ATTEMPT:
            self.reflections = [self.scratchpad]
            self.reflections_str = format_last_attempt(self.question , self.reflections[0])
-        elif strategy == 'reflexion':
+        elif strategy == ReflexionStrategy.REFLEXION:
            self.reflections += [self.prompt_reflection()]
            self.reflections_str = format_reflections(self.reflections)
-        elif strategy == 'last_attempt + reflexion':
+        elif strategy == ReflexionStrategy.LAST_ATTEMPT_AND_REFLEXION:
            self.reflections_str = format_last_attempt(self.question , self.scratchpad)
            self.reflections = [self.prompt_reflection()]
            self.reflections_str += '\n'+ format_reflections(self.reflections, header = REFLECTION_AFTER_LAST_TRIAL_HEADER)
@ -253,7 +259,6 @@ class ReactReflectAgent(ReactAgent):
                 max_steps: int = 6,
                 agent_prompt: PromptTemplate = react_reflect_agent_prompt,
                 reflect_prompt: PromptTemplate = reflect_prompt,
-                 reflect_header: str = REFLECTION_HEADER,
                 docstore: Docstore = Wikipedia(),
                 react_llm: BaseLLM = OpenAI(
                                             temperature=0,
@ -269,29 +274,28 @@ class ReactReflectAgent(ReactAgent):
                 ) -> None:
        
        super().__init__(question, key, max_steps, agent_prompt, docstore, react_llm)
-        self.reflect_header = reflect_header
        self.reflect_llm = reflect_llm
        self.reflect_prompt = reflect_prompt
        self.reflect_examples = REFLECTIONS
        self.reflections: List[str] = []
        self.reflections_str: str = ''
    
-    def run(self, reset = True, reflect_strategy: Union[Literal['last_attempt'], Literal['reflexion'], Literal['last_attempt + reflexion']] = 'reflexion') -> None:
+    def run(self, reset = True, reflect_strategy: ReflexionStrategy = ReflexionStrategy.REFLEXION) -> None:
        if (self.is_finished() or self.is_halted()) and not self.is_correct():
            self.reflect(reflect_strategy)

        ReactAgent.run(self, reset)
    
    def reflect(self,
-                strategy: Union[Literal['last_attempt'], Literal['reflexion'], Literal['last_attempt + reflexion']]) -> None:
+                strategy: ReflexionStrategy) -> None:
        print('Reflecting...')
-        if strategy == 'last_attempt':
+        if strategy == ReflexionStrategy.LAST_ATTEMPT:
            self.reflections = [self.scratchpad]
            self.reflections_str = format_last_attempt(self.question, self.reflections[0])
-        elif strategy == 'reflexion':
+        elif strategy == ReflexionStrategy.REFLEXION: 
            self.reflections += [self.prompt_reflection()]
            self.reflections_str = format_reflections(self.reflections)
-        elif strategy == 'last_attempt + reflexion':
+        elif strategy == ReflexionStrategy.LAST_ATTEMPT_AND_REFLEXION: 
            self.reflections_str = format_last_attempt(self.question, self.scratchpad)
            self.reflections = [self.prompt_reflection()]
            self.reflections_str += format_reflections(self.reflections, header = REFLECTION_AFTER_LAST_TRIAL_HEADER)
--- a/hotpotqa_runs/notebooks/CoT_context/CotQA.ipynb
+++ b/hotpotqa_runs/notebooks/CoT_context/CotQA.ipynb
--- a/hotpotqa_runs/notebooks/CoT_context/CotQA_base.ipynb
+++ b/hotpotqa_runs/notebooks/CoT_context/CotQA_base.ipynb
--- a/hotpotqa_runs/notebooks/CoT_no_context/CotQA_simple.ipynb
+++ b/hotpotqa_runs/notebooks/CoT_no_context/CotQA_simple.ipynb
--- a/hotpotqa_runs/notebooks/CoT_no_context/CotQA_simple_base.ipynb
+++ b/hotpotqa_runs/notebooks/CoT_no_context/CotQA_simple_base.ipynb
--- a/hotpotqa_runs/notebooks/CotQA_context.ipynb
+++ b/hotpotqa_runs/notebooks/CotQA_context.ipynb
@ -0,0 +1,210 @@
+{
+ "cells": [
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Notebook for running Chain-of-Thought with supporting context experiments "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys, os\n",
+    "sys.path.append('..')\n",
+    "root = '../root/'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import joblib\n",
+    "import numpy as np\n",
+    "from agents import CoTAgent, ReflexionStrategy\n",
+    "from util import summarize_trial, log_trial, save_agents"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Load the HotPotQA Sample"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "hotpot = joblib.load('../data/hotpot-qa-distractor-sample.joblib').reset_index(drop = True)\n",
+    "\n",
+    "hotpot['supporting_paragraphs'] = None\n",
+    "for ind, row in hotpot.iterrows():\n",
+    "    supporting_articles = row['supporting_facts']['title']\n",
+    "    articles = row['context']['title']\n",
+    "    sentences = row['context']['sentences'] \n",
+    "    supporting_paragraphs = []\n",
+    "    for article in supporting_articles:\n",
+    "        supporting_paragraph = ''.join(sentences[np.where(articles == article)][0])\n",
+    "        supporting_paragraphs.append(supporting_paragraph)\n",
+    "    supporting_paragraphs = '\\n\\n'.join(supporting_paragraphs)\n",
+    "    hotpot.at[ind, 'supporting_paragraphs'] = supporting_paragraphs"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Define the Reflexion Strategy"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "    NONE: No reflection\n",
+      "    LAST_ATTEMPT: Use last reasoning trace in context \n",
+      "    REFLEXION: Apply reflexion to the next reasoning trace \n",
+      "    LAST_ATTEMPT_AND_REFLEXION: Use last reasoning trace in context and apply reflexion to the next reasoning trace \n",
+      "    \n"
+     ]
+    }
+   ],
+   "source": [
+    "print(ReflexionStrategy.__doc__)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "strategy: ReflexionStrategy = ReflexionStrategy.REFLEXION"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Initialize a CoTAgent for each question"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from prompts import cot_agent_prompt, cot_reflect_agent_prompt, cot_reflect_prompt\n",
+    "from fewshots import COT, COT_REFLECT\n",
+    "agents = [CoTAgent(row['question'],\n",
+    "                   row['supporting_paragraphs'],\n",
+    "                   row['answer'],\n",
+    "                   agent_prompt=cot_agent_prompt if strategy == ReflexionStrategy.NONE else cot_reflect_agent_prompt,\n",
+    "                   cot_examples=COT,\n",
+    "                   reflect_prompt=cot_reflect_prompt,\n",
+    "                   reflect_examples=COT_REFLECT,\n",
+    "                    ) for _, row in hotpot.iterrows()]"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Run `n` trials"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "n = 5\n",
+    "trial = 0\n",
+    "log = ''"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for i in range(n):\n",
+    "    for agent in [a for a in agents if not a.is_correct()]:\n",
+    "        agent.run(reflexion_strategy = strategy)\n",
+    "        print(f'Answer: {agent.key}')\n",
+    "    trial += 1\n",
+    "    log += log_trial(agents, trial)\n",
+    "    correct, incorrect = summarize_trial(agents)\n",
+    "    print(f'Finished Trial {trial}, Correct: {len(correct)}, Incorrect: {len(incorrect)}')"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Save the result log"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open(os.path.join(root, 'CoT', 'context', strategy.value, f'{len(agents)}_questions_{trial}_trials.txt'), 'w') as f:\n",
+    "    f.write(log)\n",
+    "save_agents(agents, os.path.join(root, 'CoT', 'context', strategy.value, 'agents'))"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "env",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.16"
+  },
+  "orig_nbformat": 4,
+  "vscode": {
+   "interpreter": {
+    "hash": "e23f799cbd2581634725fbf6ce3480ae26192d78438dfafc8efe944acd6490d5"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/hotpotqa_runs/notebooks/CotQA_no_context.ipynb
+++ b/hotpotqa_runs/notebooks/CotQA_no_context.ipynb
@ -0,0 +1,190 @@
+{
+ "cells": [
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Notebook for running Chain-of-Thought with no supporting context experiments"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys, os\n",
+    "sys.path.append('..')\n",
+    "root = '../root/'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from util import summarize_trial, log_trial, save_agents\n",
+    "import joblib\n",
+    "from agents import CoTAgent, ReflexionStrategy"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Load the HotPotQA Sample"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "hotpot = joblib.load('../data/hotpot-qa-distractor-sample.joblib').reset_index(drop = True)"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Define the Reflexion Strategy"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "    NONE: No reflection\n",
+      "    LAST_ATTEMPT: Use last reasoning trace in context \n",
+      "    REFLEXION: Apply reflexion to the next reasoning trace \n",
+      "    LAST_ATTEMPT_AND_REFLEXION: Use last reasoning trace in context and apply reflexion to the next reasoning trace \n",
+      "    \n"
+     ]
+    }
+   ],
+   "source": [
+    "print(ReflexionStrategy.__doc__)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "strategy: ReflexionStrategy = ReflexionStrategy.REFLEXION"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Initialize a CoTAgent for each question"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from prompts import cot_simple_reflect_agent_prompt, cot_simple_reflect_prompt, cot_simple_agent_prompt\n",
+    "from fewshots import COTQA_SIMPLE6, COT_SIMPLE_REFLECTION\n",
+    "\n",
+    "agents = [CoTAgent(question = row['question'],\n",
+    "                   context = '',\n",
+    "                   key = row['answer'],\n",
+    "                   agent_prompt=cot_simple_agent_prompt if strategy == ReflexionStrategy.NONE else cot_simple_reflect_agent_prompt,\n",
+    "                   cot_examples = COTQA_SIMPLE6,\n",
+    "                   reflect_prompt = cot_simple_reflect_prompt,\n",
+    "                   reflect_examples = COT_SIMPLE_REFLECTION,\n",
+    "                      ) for _, row in hotpot.iterrows()]"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Run `n` trials"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "n = 5\n",
+    "trial = 0\n",
+    "log = ''\n",
+    "for i in range(n):\n",
+    "    for agent in [a for a in agents if not a.is_correct()]:\n",
+    "        agent.run(reflexion_strategy = strategy)\n",
+    "        print(f'Answer: {agent.key}')\n",
+    "    trial += 1\n",
+    "    log += log_trial(agents, trial)\n",
+    "    correct, incorrect = summarize_trial(agents)\n",
+    "    print(f'Finished Trial {trial}, Correct: {len(correct)}, Incorrect: {len(incorrect)}')"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Save the result log"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open(os.path.join(root, 'CoT', 'no_context', strategy.value, f'{len(agents)}_questions_{trial}_trials.txt'), 'w') as f:\n",
+    "    f.write(log)\n",
+    "save_agents(agents, os.path.join(root, 'CoT', 'no_context', strategy.value, 'agents'))"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "env",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.16"
+  },
+  "orig_nbformat": 4,
+  "vscode": {
+   "interpreter": {
+    "hash": "e23f799cbd2581634725fbf6ce3480ae26192d78438dfafc8efe944acd6490d5"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/hotpotqa_runs/notebooks/ReAct/ReactQA.ipynb
+++ b/hotpotqa_runs/notebooks/ReAct/ReactQA.ipynb
@ -1,245 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import joblib\n",
-    "from react_cls import ReactAgent\n",
-    "from mocks import DocStoreExplorerMock, LLMMock"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def summarize_trial(agents):\n",
-    "    correct = [a for a in agents if a.is_correct()]\n",
-    "    halted = [a for a in agents if a.is_halted()]\n",
-    "    incorrect = [a for a in agents if a.is_finished() and not a.is_correct()]\n",
-    "    return correct, incorrect, halted\n",
-    "\n",
-    "def log_trial(agents, trial_n):\n",
-    "    correct, incorrect, halted = summarize_trial(agents)\n",
-    "\n",
-    "    log = f\"\"\"\n",
-    "########################################\n",
-    "BEGIN TRIAL {trial_n}\n",
-    "Trial summary: Correct: {len(correct)}, Incorrect: {len(incorrect)}, Halted: {len(halted)}\n",
-    "#######################################\n",
-    "\"\"\"\n",
-    "\n",
-    "    log += '------------- BEGIN CORRECT AGENTS -------------\\n\\n'\n",
-    "    for agent in correct:\n",
-    "        log += f'Question: {agent.question}{agent.scratchpad}\\nCorrect answer: {agent.key}\\n\\n'\n",
-    "\n",
-    "    log += '------------- BEGIN INCORRECT AGENTS -----------\\n\\n'\n",
-    "    for agent in incorrect:\n",
-    "        log += f'Question: {agent.question}{agent.scratchpad}\\nCorrect answer: {agent.key}\\n\\n'\n",
-    "\n",
-    "    log += '------------- BEGIN HALTED AGENTS --------------\\n\\n'\n",
-    "    for agent in halted:\n",
-    "        log += f'Question: {agent.question}{agent.scratchpad}\\nCorrect answer: {agent.key}\\n\\n'\n",
-    "\n",
-    "    return log"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "hotpot = joblib.load('data/hotpot-qa-distractor-sample.joblib').reset_index(drop = True)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "agents = [ReactAgent(row['question'], row['answer']) for _, row in hotpot.iterrows()]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "trial = 0\n",
-    "log = ''"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 21,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "q = 0"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 22,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Trial: 4 (0/66)\n",
-      "Trial: 4 (1/66)\n",
-      "Trial: 4 (2/66)\n",
-      "Trial: 4 (3/66)\n",
-      "Trial: 4 (4/66)\n",
-      "Trial: 4 (5/66)\n",
-      "Trial: 4 (6/66)\n",
-      "Trial: 4 (7/66)\n",
-      "Trial: 4 (8/66)\n",
-      "Trial: 4 (9/66)\n",
-      "Trial: 4 (10/66)\n",
-      "Trial: 4 (11/66)\n",
-      "Trial: 4 (12/66)\n",
-      "Trial: 4 (13/66)\n",
-      "Trial: 4 (14/66)\n",
-      "Trial: 4 (15/66)\n",
-      "Trial: 4 (16/66)\n",
-      "Trial: 4 (17/66)\n",
-      "Trial: 4 (18/66)\n",
-      "Trial: 4 (19/66)\n",
-      "Trial: 4 (20/66)\n",
-      "Trial: 4 (21/66)\n",
-      "Trial: 4 (22/66)\n",
-      "Trial: 4 (23/66)\n",
-      "Trial: 4 (24/66)\n",
-      "Trial: 4 (25/66)\n",
-      "Trial: 4 (26/66)\n",
-      "Trial: 4 (27/66)\n",
-      "Trial: 4 (28/66)\n",
-      "Trial: 4 (29/66)\n",
-      "Trial: 4 (30/66)\n",
-      "Trial: 4 (31/66)\n",
-      "Trial: 4 (32/66)\n",
-      "Trial: 4 (33/66)\n",
-      "Trial: 4 (34/66)\n",
-      "Trial: 4 (35/66)\n",
-      "Trial: 4 (36/66)\n",
-      "Trial: 4 (37/66)\n",
-      "Trial: 4 (38/66)\n",
-      "Trial: 4 (39/66)\n",
-      "Trial: 4 (40/66)\n",
-      "Trial: 4 (41/66)\n",
-      "Trial: 4 (42/66)\n",
-      "Trial: 4 (43/66)\n",
-      "Trial: 4 (44/66)\n",
-      "Trial: 4 (45/66)\n",
-      "Trial: 4 (46/66)\n",
-      "Trial: 4 (47/66)\n",
-      "Trial: 4 (48/66)\n",
-      "Trial: 4 (49/66)\n",
-      "Trial: 4 (50/66)\n",
-      "Trial: 4 (51/66)\n",
-      "Trial: 4 (52/66)\n",
-      "Trial: 4 (53/66)\n",
-      "Trial: 4 (54/66)\n",
-      "Trial: 4 (55/66)\n",
-      "Trial: 4 (56/66)\n",
-      "Trial: 4 (57/66)\n",
-      "Trial: 4 (58/66)\n",
-      "Trial: 4 (59/66)\n",
-      "Trial: 4 (60/66)\n",
-      "Trial: 4 (61/66)\n",
-      "Trial: 4 (62/66)\n",
-      "Trial: 4 (63/66)\n",
-      "Trial: 4 (64/66)\n",
-      "Trial: 4 (65/66)\n",
-      "Finished Trial 5, Correct: 34, Incorrect: 56, Halted: 12\n"
-     ]
-    }
-   ],
-   "source": [
-    "agents_to_run = [a for a in agents if not a.is_correct()]\n",
-    "\n",
-    "while q < len(agents_to_run):\n",
-    "    print(f'Trial: {trial} ({q}/{len(agents_to_run)})')\n",
-    "    agents_to_run[q].run()\n",
-    "    q += 1\n",
-    "\n",
-    "trial += 1\n",
-    "\n",
-    "log += log_trial(agents, trial)\n",
-    "correct, incorrect, halted = summarize_trial(agents)\n",
-    "print(f'Finished Trial {trial}, Correct: {len(correct)}, Incorrect: {len(incorrect)}, Halted: {len(halted)}')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 23,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "with open('output/base_react/100_questions_5_trials.txt', 'w') as f:\n",
-    "    f.write(log)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 26,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "['output/base_react_dicts.joblib']"
-      ]
-     },
-     "execution_count": 26,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "dicts = [dict(a.__dict__) for a in agents]\n",
-    "for d in dicts:\n",
-    "    for k, v in d.items():\n",
-    "        d[k] = str(v)\n",
-    "\n",
-    "joblib.dump(dicts, 'output/base_react_dicts.joblib')"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "env",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.8.9"
-  },
-  "orig_nbformat": 4,
-  "vscode": {
-   "interpreter": {
-    "hash": "e23f799cbd2581634725fbf6ce3480ae26192d78438dfafc8efe944acd6490d5"
-   }
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
--- a/hotpotqa_runs/notebooks/ReAct/ReactReflectQA.ipynb
+++ b/hotpotqa_runs/notebooks/ReAct/ReactReflectQA.ipynb
@ -1,215 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import joblib\n",
-    "from react_cls import ReactReflectAgent, format_reflections\n",
-    "from mocks import DocStoreExplorerMock, LLMMock"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 15,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def summarize_trial(agents):\n",
-    "    correct = [a for a in agents if a.is_correct()]\n",
-    "    incorrect = [a for a in agents if a.is_finished() and not a.is_correct()]\n",
-    "    return correct, incorrect\n",
-    "\n",
-    "def remove_fewshot(prompt: str) -> str:\n",
-    "    prefix = prompt.split('Here are some examples:')[0]\n",
-    "    suffix = prompt.split('(END OF EXAMPLES)')[1]\n",
-    "    return prefix.strip('\\n').strip() +'\\n' +  suffix.strip('\\n').strip()\n",
-    "\n",
-    "def log_trial(agents, trial_n):\n",
-    "    correct, incorrect = summarize_trial(agents)\n",
-    "\n",
-    "    log = f\"\"\"\n",
-    "########################################\n",
-    "BEGIN TRIAL {trial_n}\n",
-    "Trial summary: Correct: {len(correct)}, Incorrect: {len(incorrect)}\n",
-    "#######################################\n",
-    "\"\"\"\n",
-    "\n",
-    "    log += '------------- BEGIN CORRECT AGENTS -------------\\n\\n'\n",
-    "    for agent in correct:\n",
-    "        log += remove_fewshot(agent._build_agent_prompt()) + f'\\nCorrect answer: {agent.key}\\n\\n'\n",
-    "\n",
-    "    log += '------------- BEGIN INCORRECT AGENTS -----------\\n\\n'\n",
-    "    for agent in incorrect:\n",
-    "        log += remove_fewshot(agent._build_agent_prompt()) + f'\\nCorrect answer: {agent.key}\\n\\n'\n",
-    "\n",
-    "    return log\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 11,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "hotpot = joblib.load('data/hotpot-qa-distractor-sample.joblib').reset_index(drop = True)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 12,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "agents = [ReactReflectAgent(row['question'], row['answer']) for _, row in hotpot.iterrows()]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 13,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "trial = 0\n",
-    "log = ''\n",
-    "last_correct = 0 "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "for agent in [a for a in agents if not a.is_correct()]:\n",
-    "        agent.run(reflect_strategy='last_attempt')\n",
-    "        print(f'Answer: {agent.key}')\n",
-    "trial += 1\n",
-    "log += log_trial(agents, trial)\n",
-    "correct, incorrect = summarize_trial(agents)\n",
-    "print(f'Finished Trial {trial}, Correct: {len(correct)}, Incorrect: {len(incorrect)}')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 17,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "['output/last_trial_react/react_incorrect_dicts_trial_0.joblib']"
-      ]
-     },
-     "execution_count": 17,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "dicts = [dict(a.__dict__) for a in incorrect]\n",
-    "for d in dicts:\n",
-    "    for k, v in d.items():\n",
-    "        d[k] = str(v)\n",
-    "\n",
-    "joblib.dump(dicts, 'output/last_trial_react/react_incorrect_dicts_trial_0.joblib')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "while last_correct != correct:\n",
-    "    last_correct, _ = summarize_trial(agents)\n",
-    "    for agent in [a for a in agents if not a.is_correct()]:\n",
-    "        agent.run(reflect_strategy='last_attempt')\n",
-    "        print(f'Answer: {agent.key}')\n",
-    "    trial += 1\n",
-    "    log += log_trial(agents, trial)\n",
-    "    correct, incorrect = summarize_trial(agents)\n",
-    "    print(f'Finished Trial {trial}, Correct: {len(correct)}, Incorrect: {len(incorrect)}')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "for agent in [a for a in agents if not a.is_correct()]:\n",
-    "        agent.run(reflect_strategy='last_attempt + reflexion')\n",
-    "        print(f'Answer: {agent.key}')\n",
-    "trial += 1\n",
-    "log += log_trial(agents, trial)\n",
-    "correct, incorrect = summarize_trial(agents)\n",
-    "print(f'Finished Trial {trial}, Correct: {len(correct)}, Incorrect: {len(incorrect)}')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 18,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "with open('output/last_trial_react/100_questions_5_trials.txt', 'w') as f:\n",
-    "    f.write(log)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 18,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "['output/reflect/react_reflect_50_correct_dicts.joblib']"
-      ]
-     },
-     "execution_count": 18,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "dicts = [dict(a.__dict__) for a in correct]\n",
-    "for d in dicts:\n",
-    "    for k, v in d.items():\n",
-    "        d[k] = str(v)\n",
-    "\n",
-    "joblib.dump(dicts, 'output/reflect/react_reflect_50_correct_dicts.joblib')"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "env",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.8.9"
-  },
-  "orig_nbformat": 4,
-  "vscode": {
-   "interpreter": {
-    "hash": "e23f799cbd2581634725fbf6ce3480ae26192d78438dfafc8efe944acd6490d5"
-   }
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
--- a/hotpotqa_runs/notebooks/ReactQA.ipynb
+++ b/hotpotqa_runs/notebooks/ReactQA.ipynb
@ -0,0 +1,249 @@
+{
+ "cells": [
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Notebook for running React experiments"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys, os\n",
+    "sys.path.append('..')\n",
+    "root  = '../root/'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import joblib\n",
+    "from util import summarize_react_trial, log_react_trial, save_agents\n",
+    "from agents import ReactReflectAgent, ReactAgent, ReflexionStrategy"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Load the HotpotQA Sample"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "hotpot = joblib.load('../data/hotpot-qa-distractor-sample.joblib').reset_index(drop = True)"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Define the Reflexion Strategy"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "    NONE: No reflection\n",
+      "    LAST_ATTEMPT: Use last reasoning trace in context \n",
+      "    REFLEXION: Apply reflexion to the next reasoning trace \n",
+      "    LAST_ATTEMPT_AND_REFLEXION: Use last reasoning trace in context and apply reflexion to the next reasoning trace \n",
+      "    \n"
+     ]
+    }
+   ],
+   "source": [
+    "print(ReflexionStrategy.__doc__)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "strategy: ReflexionStrategy = ReflexionStrategy.REFLEXION"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Initialize a React Agent for each question"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "agent_cls = ReactReflectAgent if strategy != ReflexionStrategy.NONE else ReactAgent\n",
+    "agents = [agent_cls(row['question'], row['answer']) for _, row in hotpot.iterrows()]"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Run `n` trials"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "n = 5\n",
+    "trial = 0\n",
+    "log = ''"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Reflecting...\n",
+      "You have attempted to answer following question before and failed. The following reflection(s) give a plan to avoid failing to answer the question in the same way you did previously. Use them to improve your strategy of correctly answering the given question.\n",
+      "Reflections:\n",
+      "- I got stuck in a loop where I kept trying to search 'VIVA Media AG 2004 name change acronym stands for' but the page could not be found. Instead I should have tried to search the similar results that had a similar name to see if they had the answer I was looking for.\n",
+      "Thought 1: I need to search VIVA Media AG and find what their new acronym stands for after their name change in 2004.\n",
+      "Action 1: Search[VIVA Media AG]\n",
+      "Observation 1: Could not find [VIVA Media AG]. Similar: ['MTV Music (Polish TV channel)', 'Paramount International Networks', 'VIVA Plus', 'Viacom (1952–2006)', 'Vauxhall Viva', 'Sartorius AG', 'GfK Entertainment charts', 'Viva Rapid Transit', 'Aivita Muze', 'Spellbound Entertainment']\n"
+     ]
+    },
+    {
+     "ename": "KeyboardInterrupt",
+     "evalue": "",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[16], line 4\u001b[0m\n\u001b[1;32m      2\u001b[0m \u001b[39mfor\u001b[39;00m agent \u001b[39min\u001b[39;00m [a \u001b[39mfor\u001b[39;00m a \u001b[39min\u001b[39;00m agents \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m a\u001b[39m.\u001b[39mis_correct()]:\n\u001b[1;32m      3\u001b[0m     \u001b[39mif\u001b[39;00m strategy \u001b[39m!=\u001b[39m ReflexionStrategy\u001b[39m.\u001b[39mNONE:\n\u001b[0;32m----> 4\u001b[0m         agent\u001b[39m.\u001b[39;49mrun(reflect_strategy \u001b[39m=\u001b[39;49m strategy)\n\u001b[1;32m      5\u001b[0m     \u001b[39melse\u001b[39;00m:\n\u001b[1;32m      6\u001b[0m         agent\u001b[39m.\u001b[39mrun()\n",
+      "File \u001b[0;32m~/Desktop/Projects/reflexion/hotpotqa_runs/notebooks/../agents.py:287\u001b[0m, in \u001b[0;36mReactReflectAgent.run\u001b[0;34m(self, reset, reflect_strategy)\u001b[0m\n\u001b[1;32m    284\u001b[0m \u001b[39mif\u001b[39;00m (\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mis_finished() \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mis_halted()) \u001b[39mand\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mis_correct():\n\u001b[1;32m    285\u001b[0m     \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mreflect(reflect_strategy)\n\u001b[0;32m--> 287\u001b[0m ReactAgent\u001b[39m.\u001b[39;49mrun(\u001b[39mself\u001b[39;49m, reset)\n",
+      "File \u001b[0;32m~/Desktop/Projects/reflexion/hotpotqa_runs/notebooks/../agents.py:180\u001b[0m, in \u001b[0;36mReactAgent.run\u001b[0;34m(self, reset)\u001b[0m\n\u001b[1;32m    177\u001b[0m     \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m__reset_agent()\n\u001b[1;32m    179\u001b[0m \u001b[39mwhile\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mis_halted() \u001b[39mand\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mis_finished():\n\u001b[0;32m--> 180\u001b[0m     \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mstep()\n",
+      "File \u001b[0;32m~/Desktop/Projects/reflexion/hotpotqa_runs/notebooks/../agents.py:185\u001b[0m, in \u001b[0;36mReactAgent.step\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    182\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mstep\u001b[39m(\u001b[39mself\u001b[39m) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m \u001b[39mNone\u001b[39;00m:\n\u001b[1;32m    183\u001b[0m     \u001b[39m# Think\u001b[39;00m\n\u001b[1;32m    184\u001b[0m     \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mscratchpad \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m \u001b[39mf\u001b[39m\u001b[39m'\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39mThought \u001b[39m\u001b[39m{\u001b[39;00m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mstep_n\u001b[39m}\u001b[39;00m\u001b[39m:\u001b[39m\u001b[39m'\u001b[39m\n\u001b[0;32m--> 185\u001b[0m     \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mscratchpad \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m \u001b[39m'\u001b[39m\u001b[39m \u001b[39m\u001b[39m'\u001b[39m \u001b[39m+\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mprompt_agent()\n\u001b[1;32m    186\u001b[0m     \u001b[39mprint\u001b[39m(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mscratchpad\u001b[39m.\u001b[39msplit(\u001b[39m'\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39m'\u001b[39m)[\u001b[39m-\u001b[39m\u001b[39m1\u001b[39m])\n\u001b[1;32m    188\u001b[0m     \u001b[39m# Act\u001b[39;00m\n",
+      "File \u001b[0;32m~/Desktop/Projects/reflexion/hotpotqa_runs/notebooks/../agents.py:229\u001b[0m, in \u001b[0;36mReactAgent.prompt_agent\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    228\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mprompt_agent\u001b[39m(\u001b[39mself\u001b[39m) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m \u001b[39mstr\u001b[39m:\n\u001b[0;32m--> 229\u001b[0m     \u001b[39mreturn\u001b[39;00m format_step(\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mllm(\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_build_agent_prompt()))\n",
+      "File \u001b[0;32m~/Desktop/Projects/reflexion/.venv/lib/python3.9/site-packages/langchain/llms/base.py:281\u001b[0m, in \u001b[0;36mBaseLLM.__call__\u001b[0;34m(self, prompt, stop, callbacks)\u001b[0m\n\u001b[1;32m    276\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m__call__\u001b[39m(\n\u001b[1;32m    277\u001b[0m     \u001b[39mself\u001b[39m, prompt: \u001b[39mstr\u001b[39m, stop: Optional[List[\u001b[39mstr\u001b[39m]] \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m, callbacks: Callbacks \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m\n\u001b[1;32m    278\u001b[0m ) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m \u001b[39mstr\u001b[39m:\n\u001b[1;32m    279\u001b[0m \u001b[39m    \u001b[39m\u001b[39m\"\"\"Check Cache and run the LLM on the given prompt and input.\"\"\"\u001b[39;00m\n\u001b[1;32m    280\u001b[0m     \u001b[39mreturn\u001b[39;00m (\n\u001b[0;32m--> 281\u001b[0m         \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mgenerate([prompt], stop\u001b[39m=\u001b[39;49mstop, callbacks\u001b[39m=\u001b[39;49mcallbacks)\n\u001b[1;32m    282\u001b[0m         \u001b[39m.\u001b[39mgenerations[\u001b[39m0\u001b[39m][\u001b[39m0\u001b[39m]\n\u001b[1;32m    283\u001b[0m         \u001b[39m.\u001b[39mtext\n\u001b[1;32m    284\u001b[0m     )\n",
+      "File \u001b[0;32m~/Desktop/Projects/reflexion/.venv/lib/python3.9/site-packages/langchain/llms/base.py:176\u001b[0m, in \u001b[0;36mBaseLLM.generate\u001b[0;34m(self, prompts, stop, callbacks)\u001b[0m\n\u001b[1;32m    174\u001b[0m \u001b[39mexcept\u001b[39;00m (\u001b[39mKeyboardInterrupt\u001b[39;00m, \u001b[39mException\u001b[39;00m) \u001b[39mas\u001b[39;00m e:\n\u001b[1;32m    175\u001b[0m     run_manager\u001b[39m.\u001b[39mon_llm_error(e)\n\u001b[0;32m--> 176\u001b[0m     \u001b[39mraise\u001b[39;00m e\n\u001b[1;32m    177\u001b[0m run_manager\u001b[39m.\u001b[39mon_llm_end(output)\n\u001b[1;32m    178\u001b[0m \u001b[39mreturn\u001b[39;00m output\n",
+      "File \u001b[0;32m~/Desktop/Projects/reflexion/.venv/lib/python3.9/site-packages/langchain/llms/base.py:170\u001b[0m, in \u001b[0;36mBaseLLM.generate\u001b[0;34m(self, prompts, stop, callbacks)\u001b[0m\n\u001b[1;32m    165\u001b[0m run_manager \u001b[39m=\u001b[39m callback_manager\u001b[39m.\u001b[39mon_llm_start(\n\u001b[1;32m    166\u001b[0m     {\u001b[39m\"\u001b[39m\u001b[39mname\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m\u001b[39m__class__\u001b[39m\u001b[39m.\u001b[39m\u001b[39m__name__\u001b[39m}, prompts\n\u001b[1;32m    167\u001b[0m )\n\u001b[1;32m    168\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m    169\u001b[0m     output \u001b[39m=\u001b[39m (\n\u001b[0;32m--> 170\u001b[0m         \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_generate(prompts, stop\u001b[39m=\u001b[39;49mstop, run_manager\u001b[39m=\u001b[39;49mrun_manager)\n\u001b[1;32m    171\u001b[0m         \u001b[39mif\u001b[39;00m new_arg_supported\n\u001b[1;32m    172\u001b[0m         \u001b[39melse\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_generate(prompts, stop\u001b[39m=\u001b[39mstop)\n\u001b[1;32m    173\u001b[0m     )\n\u001b[1;32m    174\u001b[0m \u001b[39mexcept\u001b[39;00m (\u001b[39mKeyboardInterrupt\u001b[39;00m, \u001b[39mException\u001b[39;00m) \u001b[39mas\u001b[39;00m e:\n\u001b[1;32m    175\u001b[0m     run_manager\u001b[39m.\u001b[39mon_llm_error(e)\n",
+      "File \u001b[0;32m~/Desktop/Projects/reflexion/.venv/lib/python3.9/site-packages/langchain/llms/openai.py:306\u001b[0m, in \u001b[0;36mBaseOpenAI._generate\u001b[0;34m(self, prompts, stop, run_manager)\u001b[0m\n\u001b[1;32m    304\u001b[0m     choices\u001b[39m.\u001b[39mextend(response[\u001b[39m\"\u001b[39m\u001b[39mchoices\u001b[39m\u001b[39m\"\u001b[39m])\n\u001b[1;32m    305\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m--> 306\u001b[0m     response \u001b[39m=\u001b[39m completion_with_retry(\u001b[39mself\u001b[39;49m, prompt\u001b[39m=\u001b[39;49m_prompts, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mparams)\n\u001b[1;32m    307\u001b[0m     choices\u001b[39m.\u001b[39mextend(response[\u001b[39m\"\u001b[39m\u001b[39mchoices\u001b[39m\u001b[39m\"\u001b[39m])\n\u001b[1;32m    308\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mstreaming:\n\u001b[1;32m    309\u001b[0m     \u001b[39m# Can't update token usage if streaming\u001b[39;00m\n",
+      "File \u001b[0;32m~/Desktop/Projects/reflexion/.venv/lib/python3.9/site-packages/langchain/llms/openai.py:106\u001b[0m, in \u001b[0;36mcompletion_with_retry\u001b[0;34m(llm, **kwargs)\u001b[0m\n\u001b[1;32m    102\u001b[0m \u001b[39m@retry_decorator\u001b[39m\n\u001b[1;32m    103\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m_completion_with_retry\u001b[39m(\u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs: Any) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m Any:\n\u001b[1;32m    104\u001b[0m     \u001b[39mreturn\u001b[39;00m llm\u001b[39m.\u001b[39mclient\u001b[39m.\u001b[39mcreate(\u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs)\n\u001b[0;32m--> 106\u001b[0m \u001b[39mreturn\u001b[39;00m _completion_with_retry(\u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n",
+      "File \u001b[0;32m~/Desktop/Projects/reflexion/.venv/lib/python3.9/site-packages/tenacity/__init__.py:289\u001b[0m, in \u001b[0;36mBaseRetrying.wraps.<locals>.wrapped_f\u001b[0;34m(*args, **kw)\u001b[0m\n\u001b[1;32m    287\u001b[0m \u001b[39m@functools\u001b[39m\u001b[39m.\u001b[39mwraps(f)\n\u001b[1;32m    288\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mwrapped_f\u001b[39m(\u001b[39m*\u001b[39margs: t\u001b[39m.\u001b[39mAny, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkw: t\u001b[39m.\u001b[39mAny) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m t\u001b[39m.\u001b[39mAny:\n\u001b[0;32m--> 289\u001b[0m     \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m(f, \u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkw)\n",
+      "File \u001b[0;32m~/Desktop/Projects/reflexion/.venv/lib/python3.9/site-packages/tenacity/__init__.py:379\u001b[0m, in \u001b[0;36mRetrying.__call__\u001b[0;34m(self, fn, *args, **kwargs)\u001b[0m\n\u001b[1;32m    377\u001b[0m retry_state \u001b[39m=\u001b[39m RetryCallState(retry_object\u001b[39m=\u001b[39m\u001b[39mself\u001b[39m, fn\u001b[39m=\u001b[39mfn, args\u001b[39m=\u001b[39margs, kwargs\u001b[39m=\u001b[39mkwargs)\n\u001b[1;32m    378\u001b[0m \u001b[39mwhile\u001b[39;00m \u001b[39mTrue\u001b[39;00m:\n\u001b[0;32m--> 379\u001b[0m     do \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49miter(retry_state\u001b[39m=\u001b[39;49mretry_state)\n\u001b[1;32m    380\u001b[0m     \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(do, DoAttempt):\n\u001b[1;32m    381\u001b[0m         \u001b[39mtry\u001b[39;00m:\n",
+      "File \u001b[0;32m~/Desktop/Projects/reflexion/.venv/lib/python3.9/site-packages/tenacity/__init__.py:314\u001b[0m, in \u001b[0;36mBaseRetrying.iter\u001b[0;34m(self, retry_state)\u001b[0m\n\u001b[1;32m    312\u001b[0m is_explicit_retry \u001b[39m=\u001b[39m fut\u001b[39m.\u001b[39mfailed \u001b[39mand\u001b[39;00m \u001b[39misinstance\u001b[39m(fut\u001b[39m.\u001b[39mexception(), TryAgain)\n\u001b[1;32m    313\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m (is_explicit_retry \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mretry(retry_state)):\n\u001b[0;32m--> 314\u001b[0m     \u001b[39mreturn\u001b[39;00m fut\u001b[39m.\u001b[39;49mresult()\n\u001b[1;32m    316\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mafter \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[1;32m    317\u001b[0m     \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mafter(retry_state)\n",
+      "File \u001b[0;32m/usr/local/Cellar/python@3.9/3.9.16/Frameworks/Python.framework/Versions/3.9/lib/python3.9/concurrent/futures/_base.py:439\u001b[0m, in \u001b[0;36mFuture.result\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m    437\u001b[0m     \u001b[39mraise\u001b[39;00m CancelledError()\n\u001b[1;32m    438\u001b[0m \u001b[39melif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_state \u001b[39m==\u001b[39m FINISHED:\n\u001b[0;32m--> 439\u001b[0m     \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m__get_result()\n\u001b[1;32m    441\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_condition\u001b[39m.\u001b[39mwait(timeout)\n\u001b[1;32m    443\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_state \u001b[39min\u001b[39;00m [CANCELLED, CANCELLED_AND_NOTIFIED]:\n",
+      "File \u001b[0;32m/usr/local/Cellar/python@3.9/3.9.16/Frameworks/Python.framework/Versions/3.9/lib/python3.9/concurrent/futures/_base.py:391\u001b[0m, in \u001b[0;36mFuture.__get_result\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    389\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_exception:\n\u001b[1;32m    390\u001b[0m     \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m--> 391\u001b[0m         \u001b[39mraise\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_exception\n\u001b[1;32m    392\u001b[0m     \u001b[39mfinally\u001b[39;00m:\n\u001b[1;32m    393\u001b[0m         \u001b[39m# Break a reference cycle with the exception in self._exception\u001b[39;00m\n\u001b[1;32m    394\u001b[0m         \u001b[39mself\u001b[39m \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m\n",
+      "File \u001b[0;32m~/Desktop/Projects/reflexion/.venv/lib/python3.9/site-packages/tenacity/__init__.py:382\u001b[0m, in \u001b[0;36mRetrying.__call__\u001b[0;34m(self, fn, *args, **kwargs)\u001b[0m\n\u001b[1;32m    380\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(do, DoAttempt):\n\u001b[1;32m    381\u001b[0m     \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m--> 382\u001b[0m         result \u001b[39m=\u001b[39m fn(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m    383\u001b[0m     \u001b[39mexcept\u001b[39;00m \u001b[39mBaseException\u001b[39;00m:  \u001b[39m# noqa: B902\u001b[39;00m\n\u001b[1;32m    384\u001b[0m         retry_state\u001b[39m.\u001b[39mset_exception(sys\u001b[39m.\u001b[39mexc_info())  \u001b[39m# type: ignore[arg-type]\u001b[39;00m\n",
+      "File \u001b[0;32m~/Desktop/Projects/reflexion/.venv/lib/python3.9/site-packages/langchain/llms/openai.py:104\u001b[0m, in \u001b[0;36mcompletion_with_retry.<locals>._completion_with_retry\u001b[0;34m(**kwargs)\u001b[0m\n\u001b[1;32m    102\u001b[0m \u001b[39m@retry_decorator\u001b[39m\n\u001b[1;32m    103\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m_completion_with_retry\u001b[39m(\u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs: Any) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m Any:\n\u001b[0;32m--> 104\u001b[0m     \u001b[39mreturn\u001b[39;00m llm\u001b[39m.\u001b[39;49mclient\u001b[39m.\u001b[39;49mcreate(\u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n",
+      "File \u001b[0;32m~/Desktop/Projects/reflexion/.venv/lib/python3.9/site-packages/openai/api_resources/completion.py:25\u001b[0m, in \u001b[0;36mCompletion.create\u001b[0;34m(cls, *args, **kwargs)\u001b[0m\n\u001b[1;32m     23\u001b[0m \u001b[39mwhile\u001b[39;00m \u001b[39mTrue\u001b[39;00m:\n\u001b[1;32m     24\u001b[0m     \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m---> 25\u001b[0m         \u001b[39mreturn\u001b[39;00m \u001b[39msuper\u001b[39;49m()\u001b[39m.\u001b[39;49mcreate(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m     26\u001b[0m     \u001b[39mexcept\u001b[39;00m TryAgain \u001b[39mas\u001b[39;00m e:\n\u001b[1;32m     27\u001b[0m         \u001b[39mif\u001b[39;00m timeout \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39mand\u001b[39;00m time\u001b[39m.\u001b[39mtime() \u001b[39m>\u001b[39m start \u001b[39m+\u001b[39m timeout:\n",
+      "File \u001b[0;32m~/Desktop/Projects/reflexion/.venv/lib/python3.9/site-packages/openai/api_resources/abstract/engine_api_resource.py:153\u001b[0m, in \u001b[0;36mEngineAPIResource.create\u001b[0;34m(cls, api_key, api_base, api_type, request_id, api_version, organization, **params)\u001b[0m\n\u001b[1;32m    127\u001b[0m \u001b[39m@classmethod\u001b[39m\n\u001b[1;32m    128\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mcreate\u001b[39m(\n\u001b[1;32m    129\u001b[0m     \u001b[39mcls\u001b[39m,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    136\u001b[0m     \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mparams,\n\u001b[1;32m    137\u001b[0m ):\n\u001b[1;32m    138\u001b[0m     (\n\u001b[1;32m    139\u001b[0m         deployment_id,\n\u001b[1;32m    140\u001b[0m         engine,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    150\u001b[0m         api_key, api_base, api_type, api_version, organization, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mparams\n\u001b[1;32m    151\u001b[0m     )\n\u001b[0;32m--> 153\u001b[0m     response, _, api_key \u001b[39m=\u001b[39m requestor\u001b[39m.\u001b[39;49mrequest(\n\u001b[1;32m    154\u001b[0m         \u001b[39m\"\u001b[39;49m\u001b[39mpost\u001b[39;49m\u001b[39m\"\u001b[39;49m,\n\u001b[1;32m    155\u001b[0m         url,\n\u001b[1;32m    156\u001b[0m         params\u001b[39m=\u001b[39;49mparams,\n\u001b[1;32m    157\u001b[0m         headers\u001b[39m=\u001b[39;49mheaders,\n\u001b[1;32m    158\u001b[0m         stream\u001b[39m=\u001b[39;49mstream,\n\u001b[1;32m    159\u001b[0m         request_id\u001b[39m=\u001b[39;49mrequest_id,\n\u001b[1;32m    160\u001b[0m         request_timeout\u001b[39m=\u001b[39;49mrequest_timeout,\n\u001b[1;32m    161\u001b[0m     )\n\u001b[1;32m    163\u001b[0m     \u001b[39mif\u001b[39;00m stream:\n\u001b[1;32m    164\u001b[0m         \u001b[39m# must be an iterator\u001b[39;00m\n\u001b[1;32m    165\u001b[0m         \u001b[39massert\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39misinstance\u001b[39m(response, OpenAIResponse)\n",
+      "File \u001b[0;32m~/Desktop/Projects/reflexion/.venv/lib/python3.9/site-packages/openai/api_requestor.py:216\u001b[0m, in \u001b[0;36mAPIRequestor.request\u001b[0;34m(self, method, url, params, headers, files, stream, request_id, request_timeout)\u001b[0m\n\u001b[1;32m    205\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mrequest\u001b[39m(\n\u001b[1;32m    206\u001b[0m     \u001b[39mself\u001b[39m,\n\u001b[1;32m    207\u001b[0m     method,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    214\u001b[0m     request_timeout: Optional[Union[\u001b[39mfloat\u001b[39m, Tuple[\u001b[39mfloat\u001b[39m, \u001b[39mfloat\u001b[39m]]] \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m,\n\u001b[1;32m    215\u001b[0m ) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m Tuple[Union[OpenAIResponse, Iterator[OpenAIResponse]], \u001b[39mbool\u001b[39m, \u001b[39mstr\u001b[39m]:\n\u001b[0;32m--> 216\u001b[0m     result \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mrequest_raw(\n\u001b[1;32m    217\u001b[0m         method\u001b[39m.\u001b[39;49mlower(),\n\u001b[1;32m    218\u001b[0m         url,\n\u001b[1;32m    219\u001b[0m         params\u001b[39m=\u001b[39;49mparams,\n\u001b[1;32m    220\u001b[0m         supplied_headers\u001b[39m=\u001b[39;49mheaders,\n\u001b[1;32m    221\u001b[0m         files\u001b[39m=\u001b[39;49mfiles,\n\u001b[1;32m    222\u001b[0m         stream\u001b[39m=\u001b[39;49mstream,\n\u001b[1;32m    223\u001b[0m         request_id\u001b[39m=\u001b[39;49mrequest_id,\n\u001b[1;32m    224\u001b[0m         request_timeout\u001b[39m=\u001b[39;49mrequest_timeout,\n\u001b[1;32m    225\u001b[0m     )\n\u001b[1;32m    226\u001b[0m     resp, got_stream \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_interpret_response(result, stream)\n\u001b[1;32m    227\u001b[0m     \u001b[39mreturn\u001b[39;00m resp, got_stream, \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mapi_key\n",
+      "File \u001b[0;32m~/Desktop/Projects/reflexion/.venv/lib/python3.9/site-packages/openai/api_requestor.py:516\u001b[0m, in \u001b[0;36mAPIRequestor.request_raw\u001b[0;34m(self, method, url, params, supplied_headers, files, stream, request_id, request_timeout)\u001b[0m\n\u001b[1;32m    514\u001b[0m     _thread_context\u001b[39m.\u001b[39msession \u001b[39m=\u001b[39m _make_session()\n\u001b[1;32m    515\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m--> 516\u001b[0m     result \u001b[39m=\u001b[39m _thread_context\u001b[39m.\u001b[39;49msession\u001b[39m.\u001b[39;49mrequest(\n\u001b[1;32m    517\u001b[0m         method,\n\u001b[1;32m    518\u001b[0m         abs_url,\n\u001b[1;32m    519\u001b[0m         headers\u001b[39m=\u001b[39;49mheaders,\n\u001b[1;32m    520\u001b[0m         data\u001b[39m=\u001b[39;49mdata,\n\u001b[1;32m    521\u001b[0m         files\u001b[39m=\u001b[39;49mfiles,\n\u001b[1;32m    522\u001b[0m         stream\u001b[39m=\u001b[39;49mstream,\n\u001b[1;32m    523\u001b[0m         timeout\u001b[39m=\u001b[39;49mrequest_timeout \u001b[39mif\u001b[39;49;00m request_timeout \u001b[39melse\u001b[39;49;00m TIMEOUT_SECS,\n\u001b[1;32m    524\u001b[0m         proxies\u001b[39m=\u001b[39;49m_thread_context\u001b[39m.\u001b[39;49msession\u001b[39m.\u001b[39;49mproxies,\n\u001b[1;32m    525\u001b[0m     )\n\u001b[1;32m    526\u001b[0m \u001b[39mexcept\u001b[39;00m requests\u001b[39m.\u001b[39mexceptions\u001b[39m.\u001b[39mTimeout \u001b[39mas\u001b[39;00m e:\n\u001b[1;32m    527\u001b[0m     \u001b[39mraise\u001b[39;00m error\u001b[39m.\u001b[39mTimeout(\u001b[39m\"\u001b[39m\u001b[39mRequest timed out: \u001b[39m\u001b[39m{}\u001b[39;00m\u001b[39m\"\u001b[39m\u001b[39m.\u001b[39mformat(e)) \u001b[39mfrom\u001b[39;00m \u001b[39me\u001b[39;00m\n",
+      "File \u001b[0;32m~/Desktop/Projects/reflexion/.venv/lib/python3.9/site-packages/requests/sessions.py:587\u001b[0m, in \u001b[0;36mSession.request\u001b[0;34m(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)\u001b[0m\n\u001b[1;32m    582\u001b[0m send_kwargs \u001b[39m=\u001b[39m {\n\u001b[1;32m    583\u001b[0m     \u001b[39m\"\u001b[39m\u001b[39mtimeout\u001b[39m\u001b[39m\"\u001b[39m: timeout,\n\u001b[1;32m    584\u001b[0m     \u001b[39m\"\u001b[39m\u001b[39mallow_redirects\u001b[39m\u001b[39m\"\u001b[39m: allow_redirects,\n\u001b[1;32m    585\u001b[0m }\n\u001b[1;32m    586\u001b[0m send_kwargs\u001b[39m.\u001b[39mupdate(settings)\n\u001b[0;32m--> 587\u001b[0m resp \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49msend(prep, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49msend_kwargs)\n\u001b[1;32m    589\u001b[0m \u001b[39mreturn\u001b[39;00m resp\n",
+      "File \u001b[0;32m~/Desktop/Projects/reflexion/.venv/lib/python3.9/site-packages/requests/sessions.py:701\u001b[0m, in \u001b[0;36mSession.send\u001b[0;34m(self, request, **kwargs)\u001b[0m\n\u001b[1;32m    698\u001b[0m start \u001b[39m=\u001b[39m preferred_clock()\n\u001b[1;32m    700\u001b[0m \u001b[39m# Send the request\u001b[39;00m\n\u001b[0;32m--> 701\u001b[0m r \u001b[39m=\u001b[39m adapter\u001b[39m.\u001b[39;49msend(request, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m    703\u001b[0m \u001b[39m# Total elapsed time of the request (approximately)\u001b[39;00m\n\u001b[1;32m    704\u001b[0m elapsed \u001b[39m=\u001b[39m preferred_clock() \u001b[39m-\u001b[39m start\n",
+      "File \u001b[0;32m~/Desktop/Projects/reflexion/.venv/lib/python3.9/site-packages/requests/adapters.py:486\u001b[0m, in \u001b[0;36mHTTPAdapter.send\u001b[0;34m(self, request, stream, timeout, verify, cert, proxies)\u001b[0m\n\u001b[1;32m    483\u001b[0m     timeout \u001b[39m=\u001b[39m TimeoutSauce(connect\u001b[39m=\u001b[39mtimeout, read\u001b[39m=\u001b[39mtimeout)\n\u001b[1;32m    485\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m--> 486\u001b[0m     resp \u001b[39m=\u001b[39m conn\u001b[39m.\u001b[39;49murlopen(\n\u001b[1;32m    487\u001b[0m         method\u001b[39m=\u001b[39;49mrequest\u001b[39m.\u001b[39;49mmethod,\n\u001b[1;32m    488\u001b[0m         url\u001b[39m=\u001b[39;49murl,\n\u001b[1;32m    489\u001b[0m         body\u001b[39m=\u001b[39;49mrequest\u001b[39m.\u001b[39;49mbody,\n\u001b[1;32m    490\u001b[0m         headers\u001b[39m=\u001b[39;49mrequest\u001b[39m.\u001b[39;49mheaders,\n\u001b[1;32m    491\u001b[0m         redirect\u001b[39m=\u001b[39;49m\u001b[39mFalse\u001b[39;49;00m,\n\u001b[1;32m    492\u001b[0m         assert_same_host\u001b[39m=\u001b[39;49m\u001b[39mFalse\u001b[39;49;00m,\n\u001b[1;32m    493\u001b[0m         preload_content\u001b[39m=\u001b[39;49m\u001b[39mFalse\u001b[39;49;00m,\n\u001b[1;32m    494\u001b[0m         decode_content\u001b[39m=\u001b[39;49m\u001b[39mFalse\u001b[39;49;00m,\n\u001b[1;32m    495\u001b[0m         retries\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mmax_retries,\n\u001b[1;32m    496\u001b[0m         timeout\u001b[39m=\u001b[39;49mtimeout,\n\u001b[1;32m    497\u001b[0m         chunked\u001b[39m=\u001b[39;49mchunked,\n\u001b[1;32m    498\u001b[0m     )\n\u001b[1;32m    500\u001b[0m \u001b[39mexcept\u001b[39;00m (ProtocolError, \u001b[39mOSError\u001b[39;00m) \u001b[39mas\u001b[39;00m err:\n\u001b[1;32m    501\u001b[0m     \u001b[39mraise\u001b[39;00m \u001b[39mConnectionError\u001b[39;00m(err, request\u001b[39m=\u001b[39mrequest)\n",
+      "File \u001b[0;32m~/Desktop/Projects/reflexion/.venv/lib/python3.9/site-packages/urllib3/connectionpool.py:790\u001b[0m, in \u001b[0;36mHTTPConnectionPool.urlopen\u001b[0;34m(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, preload_content, decode_content, **response_kw)\u001b[0m\n\u001b[1;32m    787\u001b[0m response_conn \u001b[39m=\u001b[39m conn \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m release_conn \u001b[39melse\u001b[39;00m \u001b[39mNone\u001b[39;00m\n\u001b[1;32m    789\u001b[0m \u001b[39m# Make the request on the HTTPConnection object\u001b[39;00m\n\u001b[0;32m--> 790\u001b[0m response \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_make_request(\n\u001b[1;32m    791\u001b[0m     conn,\n\u001b[1;32m    792\u001b[0m     method,\n\u001b[1;32m    793\u001b[0m     url,\n\u001b[1;32m    794\u001b[0m     timeout\u001b[39m=\u001b[39;49mtimeout_obj,\n\u001b[1;32m    795\u001b[0m     body\u001b[39m=\u001b[39;49mbody,\n\u001b[1;32m    796\u001b[0m     headers\u001b[39m=\u001b[39;49mheaders,\n\u001b[1;32m    797\u001b[0m     chunked\u001b[39m=\u001b[39;49mchunked,\n\u001b[1;32m    798\u001b[0m     retries\u001b[39m=\u001b[39;49mretries,\n\u001b[1;32m    799\u001b[0m     response_conn\u001b[39m=\u001b[39;49mresponse_conn,\n\u001b[1;32m    800\u001b[0m     preload_content\u001b[39m=\u001b[39;49mpreload_content,\n\u001b[1;32m    801\u001b[0m     decode_content\u001b[39m=\u001b[39;49mdecode_content,\n\u001b[1;32m    802\u001b[0m     \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mresponse_kw,\n\u001b[1;32m    803\u001b[0m )\n\u001b[1;32m    805\u001b[0m \u001b[39m# Everything went great!\u001b[39;00m\n\u001b[1;32m    806\u001b[0m clean_exit \u001b[39m=\u001b[39m \u001b[39mTrue\u001b[39;00m\n",
+      "File \u001b[0;32m~/Desktop/Projects/reflexion/.venv/lib/python3.9/site-packages/urllib3/connectionpool.py:536\u001b[0m, in \u001b[0;36mHTTPConnectionPool._make_request\u001b[0;34m(self, conn, method, url, body, headers, retries, timeout, chunked, response_conn, preload_content, decode_content, enforce_content_length)\u001b[0m\n\u001b[1;32m    534\u001b[0m \u001b[39m# Receive the response from the server\u001b[39;00m\n\u001b[1;32m    535\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m--> 536\u001b[0m     response \u001b[39m=\u001b[39m conn\u001b[39m.\u001b[39;49mgetresponse()\n\u001b[1;32m    537\u001b[0m \u001b[39mexcept\u001b[39;00m (BaseSSLError, \u001b[39mOSError\u001b[39;00m) \u001b[39mas\u001b[39;00m e:\n\u001b[1;32m    538\u001b[0m     \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_raise_timeout(err\u001b[39m=\u001b[39me, url\u001b[39m=\u001b[39murl, timeout_value\u001b[39m=\u001b[39mread_timeout)\n",
+      "File \u001b[0;32m~/Desktop/Projects/reflexion/.venv/lib/python3.9/site-packages/urllib3/connection.py:454\u001b[0m, in \u001b[0;36mHTTPConnection.getresponse\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    451\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39mresponse\u001b[39;00m \u001b[39mimport\u001b[39;00m HTTPResponse\n\u001b[1;32m    453\u001b[0m \u001b[39m# Get the response from http.client.HTTPConnection\u001b[39;00m\n\u001b[0;32m--> 454\u001b[0m httplib_response \u001b[39m=\u001b[39m \u001b[39msuper\u001b[39;49m()\u001b[39m.\u001b[39;49mgetresponse()\n\u001b[1;32m    456\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m    457\u001b[0m     assert_header_parsing(httplib_response\u001b[39m.\u001b[39mmsg)\n",
+      "File \u001b[0;32m/usr/local/Cellar/python@3.9/3.9.16/Frameworks/Python.framework/Versions/3.9/lib/python3.9/http/client.py:1377\u001b[0m, in \u001b[0;36mHTTPConnection.getresponse\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m   1375\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m   1376\u001b[0m     \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m-> 1377\u001b[0m         response\u001b[39m.\u001b[39;49mbegin()\n\u001b[1;32m   1378\u001b[0m     \u001b[39mexcept\u001b[39;00m \u001b[39mConnectionError\u001b[39;00m:\n\u001b[1;32m   1379\u001b[0m         \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mclose()\n",
+      "File \u001b[0;32m/usr/local/Cellar/python@3.9/3.9.16/Frameworks/Python.framework/Versions/3.9/lib/python3.9/http/client.py:320\u001b[0m, in \u001b[0;36mHTTPResponse.begin\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    318\u001b[0m \u001b[39m# read until we get a non-100 response\u001b[39;00m\n\u001b[1;32m    319\u001b[0m \u001b[39mwhile\u001b[39;00m \u001b[39mTrue\u001b[39;00m:\n\u001b[0;32m--> 320\u001b[0m     version, status, reason \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_read_status()\n\u001b[1;32m    321\u001b[0m     \u001b[39mif\u001b[39;00m status \u001b[39m!=\u001b[39m CONTINUE:\n\u001b[1;32m    322\u001b[0m         \u001b[39mbreak\u001b[39;00m\n",
+      "File \u001b[0;32m/usr/local/Cellar/python@3.9/3.9.16/Frameworks/Python.framework/Versions/3.9/lib/python3.9/http/client.py:281\u001b[0m, in \u001b[0;36mHTTPResponse._read_status\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    280\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m_read_status\u001b[39m(\u001b[39mself\u001b[39m):\n\u001b[0;32m--> 281\u001b[0m     line \u001b[39m=\u001b[39m \u001b[39mstr\u001b[39m(\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mfp\u001b[39m.\u001b[39;49mreadline(_MAXLINE \u001b[39m+\u001b[39;49m \u001b[39m1\u001b[39;49m), \u001b[39m\"\u001b[39m\u001b[39miso-8859-1\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[1;32m    282\u001b[0m     \u001b[39mif\u001b[39;00m \u001b[39mlen\u001b[39m(line) \u001b[39m>\u001b[39m _MAXLINE:\n\u001b[1;32m    283\u001b[0m         \u001b[39mraise\u001b[39;00m LineTooLong(\u001b[39m\"\u001b[39m\u001b[39mstatus line\u001b[39m\u001b[39m\"\u001b[39m)\n",
+      "File \u001b[0;32m/usr/local/Cellar/python@3.9/3.9.16/Frameworks/Python.framework/Versions/3.9/lib/python3.9/socket.py:704\u001b[0m, in \u001b[0;36mSocketIO.readinto\u001b[0;34m(self, b)\u001b[0m\n\u001b[1;32m    702\u001b[0m \u001b[39mwhile\u001b[39;00m \u001b[39mTrue\u001b[39;00m:\n\u001b[1;32m    703\u001b[0m     \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m--> 704\u001b[0m         \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_sock\u001b[39m.\u001b[39;49mrecv_into(b)\n\u001b[1;32m    705\u001b[0m     \u001b[39mexcept\u001b[39;00m timeout:\n\u001b[1;32m    706\u001b[0m         \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_timeout_occurred \u001b[39m=\u001b[39m \u001b[39mTrue\u001b[39;00m\n",
+      "File \u001b[0;32m/usr/local/Cellar/python@3.9/3.9.16/Frameworks/Python.framework/Versions/3.9/lib/python3.9/ssl.py:1242\u001b[0m, in \u001b[0;36mSSLSocket.recv_into\u001b[0;34m(self, buffer, nbytes, flags)\u001b[0m\n\u001b[1;32m   1238\u001b[0m     \u001b[39mif\u001b[39;00m flags \u001b[39m!=\u001b[39m \u001b[39m0\u001b[39m:\n\u001b[1;32m   1239\u001b[0m         \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\n\u001b[1;32m   1240\u001b[0m           \u001b[39m\"\u001b[39m\u001b[39mnon-zero flags not allowed in calls to recv_into() on \u001b[39m\u001b[39m%s\u001b[39;00m\u001b[39m\"\u001b[39m \u001b[39m%\u001b[39m\n\u001b[1;32m   1241\u001b[0m           \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m\u001b[39m__class__\u001b[39m)\n\u001b[0;32m-> 1242\u001b[0m     \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mread(nbytes, buffer)\n\u001b[1;32m   1243\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m   1244\u001b[0m     \u001b[39mreturn\u001b[39;00m \u001b[39msuper\u001b[39m()\u001b[39m.\u001b[39mrecv_into(buffer, nbytes, flags)\n",
+      "File \u001b[0;32m/usr/local/Cellar/python@3.9/3.9.16/Frameworks/Python.framework/Versions/3.9/lib/python3.9/ssl.py:1100\u001b[0m, in \u001b[0;36mSSLSocket.read\u001b[0;34m(self, len, buffer)\u001b[0m\n\u001b[1;32m   1098\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m   1099\u001b[0m     \u001b[39mif\u001b[39;00m buffer \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[0;32m-> 1100\u001b[0m         \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_sslobj\u001b[39m.\u001b[39;49mread(\u001b[39mlen\u001b[39;49m, buffer)\n\u001b[1;32m   1101\u001b[0m     \u001b[39melse\u001b[39;00m:\n\u001b[1;32m   1102\u001b[0m         \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_sslobj\u001b[39m.\u001b[39mread(\u001b[39mlen\u001b[39m)\n",
+      "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
+     ]
+    }
+   ],
+   "source": [
+    "for i in range(n):\n",
+    "    for agent in [a for a in agents if not a.is_correct()]:\n",
+    "        if strategy != ReflexionStrategy.NONE:\n",
+    "            agent.run(reflect_strategy = strategy)\n",
+    "        else:\n",
+    "            agent.run()\n",
+    "        print(f'Answer: {agent.key}')\n",
+    "    trial += 1\n",
+    "    log += log_react_trial(agents, trial)\n",
+    "    correct, incorrect, halted = summarize_react_trial(agents)\n",
+    "    print(f'Finished Trial {trial}, Correct: {len(correct)}, Incorrect: {len(incorrect)}, Halted: {len(halted)}')"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Save the result log"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open(os.path.join(root, 'ReAct', strategy.value, f'{len(agents)}_questions_{trial}_trials.txt'), 'w') as f:\n",
+    "    f.write(log)\n",
+    "save_agents(agents, os.path.join('ReAct', strategy.value, 'agents'))"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "env",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.16"
+  },
+  "orig_nbformat": 4,
+  "vscode": {
+   "interpreter": {
+    "hash": "e23f799cbd2581634725fbf6ce3480ae26192d78438dfafc8efe944acd6490d5"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/hotpotqa_runs/react.py
+++ b/hotpotqa_runs/react.py
--- a/hotpotqa_runs/requirements.txt
+++ b/hotpotqa_runs/requirements.txt
@ -1,4 +1,3 @@
-EdgeGPT==0.3.6
 gym==0.26.2
 joblib==1.2.0
 langchain==0.0.162
@ -9,4 +8,5 @@ tenacity==8.2.2
 tiktoken==0.4.0
 transformers==4.28.1
 pandas==1.5.3
-scikit-learn
+scikit-learn
+wikipedia
--- a/hotpotqa_runs/util.py
+++ b/hotpotqa_runs/util.py
@ -31,7 +31,37 @@ Trial summary: Correct: {len(correct)}, Incorrect: {len(incorrect)}

    return log

+def summarize_react_trial(agents):
+    correct = [a for a in agents if a.is_correct()]
+    halted = [a for a in agents if a.is_halted()]
+    incorrect = [a for a in agents if a.is_finished() and not a.is_correct()]
+    return correct, incorrect, halted
+
+def log_react_trial(agents, trial_n):
+    correct, incorrect, halted = summarize_react_trial(agents)
+
+    log = f"""
+########################################
+BEGIN TRIAL {trial_n}
+Trial summary: Correct: {len(correct)}, Incorrect: {len(incorrect)}, Halted: {len(halted)}
+#######################################
+"""
+
+    log += '------------- BEGIN CORRECT AGENTS -------------\n\n'
+    for agent in correct:
+        log += remove_fewshot(agent._build_agent_prompt()) + f'\nCorrect answer: {agent.key}\n\n'
+
+    log += '------------- BEGIN INCORRECT AGENTS -----------\n\n'
+    for agent in incorrect:
+        log += remove_fewshot(agent._build_agent_prompt()) + f'\nCorrect answer: {agent.key}\n\n'
+
+    log += '------------- BEGIN HALTED AGENTS -----------\n\n'
+    for agent in halted:
+        log += remove_fewshot(agent._build_agent_prompt()) + f'\nCorrect answer: {agent.key}\n\n'
+
+    return log
+
 def save_agents(agents, dir: str):
    os.makedirs(dir, exist_ok=True)
    for i, agent in enumerate(agents):
-        joblib.dump(agent, f'{dir}/{i}.joblib')
+        joblib.dump(agent, os.path.join(dir, f'{i}.joblib'))