diff --git a/flaml/autogen/agent/__init__.py b/flaml/autogen/agent/__init__.py index d9cd8933ed..e20314c15f 100644 --- a/flaml/autogen/agent/__init__.py +++ b/flaml/autogen/agent/__init__.py @@ -1,6 +1,9 @@ from .agent import Agent from .assistant_agent import AssistantAgent from .user_proxy_agent import UserProxyAgent +from .teaching_agent import TeachingAgent +from .learning_agent import LearningAgent + from .math_user_proxy_agent import MathUserProxyAgent -__all__ = ["Agent", "AssistantAgent", "UserProxyAgent", "MathUserProxyAgent"] +__all__ = ["Agent", "AssistantAgent", "UserProxyAgent", "MathUserProxyAgent", "TeachingAgent", "LearningAgent"] diff --git a/flaml/autogen/agent/agent.py b/flaml/autogen/agent/agent.py index 6c3078d272..3c7c32879f 100644 --- a/flaml/autogen/agent/agent.py +++ b/flaml/autogen/agent/agent.py @@ -1,5 +1,6 @@ from collections import defaultdict from typing import Dict, Union +import asyncio class Agent: @@ -63,11 +64,11 @@ def _append_oai_message(self, message: Union[Dict, str], role, conversation_id): oai_message["role"] = "function" if message.get("role") == "function" else role self._oai_conversations[conversation_id].append(oai_message) - def _send(self, message: Union[Dict, str], recipient): + async def _send(self, message: Union[Dict, str], recipient): """Send a message to another agent.""" # When the agent composes and sends the message, the role of the message is "assistant". (If 'role' exists and is 'function', it will remain unchanged.) self._append_oai_message(message, "assistant", recipient.name) - recipient.receive(message, self) + await recipient.receive(message, self) def _receive(self, message: Union[Dict, str], sender): """Receive a message from another agent. @@ -102,6 +103,13 @@ def _receive(self, message: Union[Dict, str], sender): sep="", ) print("*" * len(func_print), flush=True) + + # print("message = ", message, flush=True, sep="") + # format the printing of the message + print("Message content:", flush=True, sep="") + for key, value in message.items(): + print(f"{key}: {value}", flush=True, sep="\n") + print("\n", "-" * 80, flush=True, sep="") # When the agent receives a message, the role of the message is "user". (If 'role' exists and is 'function', it will remain unchanged.) diff --git a/flaml/autogen/agent/learning_agent.py b/flaml/autogen/agent/learning_agent.py new file mode 100644 index 0000000000..363671b5d9 --- /dev/null +++ b/flaml/autogen/agent/learning_agent.py @@ -0,0 +1,121 @@ +from .assistant_agent import AssistantAgent +from flaml.autogen.code_utils import DEFAULT_MODEL +from flaml import oai +import asyncio + + +class LearningAgent(AssistantAgent): + """(Experimental) A learning agent.""" + + DEFAULT_SYSTEM_MESSAGE = """You are a helpful AI assistant. + In the following cases, suggest python code (in a python coding block) or shell script (in a sh coding block) for the user to execute. You must indicate the script type in the code block. + 1. When you need to ask the user for some info, use the code to output the info you need, for example, browse or search the web, download/read a file. + 2. When you need to perform some task with code, use the code to perform the task and output the result. Finish the task smartly. Solve the task step by step if you need to. + If you want the user to save the code in a file before executing it, put # filename: inside the code block as the first line. Don't include multiple code blocks in one response. Do not ask users to copy and paste the result. 
Instead, use 'print' function for the output when relevant. Check the execution result returned by the user. + If the result indicates there is an error, fix the error and output the code again. Suggest the full code instead of partial code or code changes. + Reply "TERMINATE" in the end when everything is done. + """ + + DEFAULT_CONFIG = { + "model": DEFAULT_MODEL, + } + + def __init__(self, name, system_message=DEFAULT_SYSTEM_MESSAGE, **config): + """ + Args: + name (str): agent name. + system_message (str): system message to be sent to the agent. + **config (dict): other configurations allowed in + [oai.Completion.create](../oai/Completion#create). + These configurations will be used when invoking LLM. + """ + super().__init__(name, system_message, **config) + self._system_message_learning = """You are a helpful AI assistant.""" + self._learning_objectives = "" + self._can_handle_data_volume = lambda *args: True + + def _generate_task_prompt(self, learning_results, learning_data): + """ + Compose the learning task prompt from the learning objectives, the latest data entry, and the current learning results. + """ + task_prompt = f""" + {self._learning_objectives}. + This is the latest data entry: {learning_data}. + Renew the current result: + {learning_results} + You can try to condense the current result and add a new bullet point to the result. + """ + return task_prompt + + @staticmethod + def is_total_token_count_within_threshold(learning_results, learning_data): + """ + Check if the total token count of learning data and learning results + is within a specified threshold. + """ + + def _token_counter(input_string): + from transformers import GPT2Tokenizer + + # Load a pre-trained tokenizer + tokenizer = GPT2Tokenizer.from_pretrained("gpt2") + # Tokenize the string + tokens = tokenizer.tokenize(input_string) + return len(tokens) + + oai_max_token_size = 4096 + return _token_counter(learning_results) + _token_counter(learning_data) < oai_max_token_size * 0.8 + + def _validate_learning_constraints(self, learning_constraints): + # check if the learning constraints are satisfied + # do nothing for now + return True + + async def receive(self, message, sender): + """Receive a message from another agent.""" + content = message.get("content", None) if isinstance(message, dict) else message + self._receive(message, sender) + # NOTE: content and learning settings are mutually exclusive + if content is not None: + # if content is provided, perform the default receive function + super().receive(content, sender) + else: + # perform learning based on the learning settings + learning_func = message.get("learning_func", None) + learning_objectives = message.get("learning_objectives", None) + learning_constraints = message.get("learning_constraints", None) + learning_results = message.get("learning_results", None) + data4learning = message.get("data4learning", None) + if learning_objectives: + self._learning_objectives = learning_objectives + # when data is available, perform the learning task when learning_constraints are satisfied + if data4learning and self._validate_learning_constraints(learning_constraints): + # perform learning + if learning_func: + # assumption: learning_func is a function that takes learning_results and learning_data as input and returns new_learning_results and can_handle_data_volume + # when learning_data is None, the learning_func should work as well, outputting the input learning_results as the + # new_learning_results and can_handle_data_volume function + new_learning_results, self._can_handle_data_volume = learning_func(learning_results, data4learning) + 
else: + self._can_handle_data_volume = self.is_total_token_count_within_threshold + if data4learning: + task_prompt = self._generate_task_prompt(learning_results, data4learning) + learning_msg = [ + # {"content": self._system_message_learning, "role": "system"}, + {"role": "user", "content": task_prompt}, + ] + responses = oai.ChatCompletion.create(messages=learning_msg, **self._config) + new_learning_results = oai.ChatCompletion.extract_text(responses)[0] + else: + new_learning_results = learning_results + print("*********Current learning results of the learner*********\n", new_learning_results, flush=True) + print("*" * 50, flush=True) + await self._send( + {"learning_results": new_learning_results, "can_handle_data_volume": self._can_handle_data_volume}, + sender, + ) + else: + await self._send( + {"learning_results": learning_results, "can_handle_data_volume": self._can_handle_data_volume}, + sender, + ) diff --git a/flaml/autogen/agent/teaching_agent.py b/flaml/autogen/agent/teaching_agent.py new file mode 100644 index 0000000000..b212ee15cb --- /dev/null +++ b/flaml/autogen/agent/teaching_agent.py @@ -0,0 +1,137 @@ +from .user_proxy_agent import UserProxyAgent +from typing import Optional, Callable +from transformers import AutoTokenizer +import asyncio + + +class TeachingAgent(UserProxyAgent): + """(Experimental) A teaching agent.""" + + def __init__( + self, + name, + system_message="", + work_dir=None, + human_input_mode="ALWAYS", + max_consecutive_auto_reply=None, + is_termination_msg=None, + use_docker=True, + **config, + ): + """ + Args: + name (str): name of the agent + system_message (str): system message to be sent to the agent + work_dir (str): working directory for the agent + human_input_mode (str): whether to ask for human inputs every time a message is received. + Possible values are "ALWAYS", "TERMINATE", "NEVER". + (1) When "ALWAYS", the agent prompts for human input every time a message is received. + Under this mode, the conversation stops when the human input is "exit", + or when is_termination_msg is True and there is no human input. + (2) When "TERMINATE", the agent prompts for human input only when a termination message is received or + the number of auto reply reaches the max_consecutive_auto_reply. + (3) When "NEVER", the agent will never prompt for human input. Under this mode, the conversation stops + when the number of auto reply reaches the max_consecutive_auto_reply or when is_termination_msg is True. + max_consecutive_auto_reply (int): the maximum number of consecutive auto replies. + defaults to None (no limit provided, class attribute MAX_CONSECUTIVE_AUTO_REPLY will be used as the limit in this case). + The limit only plays a role when human_input_mode is not "ALWAYS". + is_termination_msg (function): a function that takes a message and returns a boolean value. + This function is used to determine if a received message is a termination message. + use_docker (bool): whether to use docker to execute the code. + **config (dict): other configurations. 
+ """ + super().__init__( + name, + system_message, + work_dir=work_dir, + human_input_mode=human_input_mode, + max_consecutive_auto_reply=max_consecutive_auto_reply, + is_termination_msg=is_termination_msg, + use_docker=use_docker, + **config, + ) + self._data4learning = [] + self._learning_constraints = None + self._learning_objectives = None + self._learning_results = None + self._learning_func = None + self._can_handle_data_volume = lambda *args: True + self._data_available_event = asyncio.Event() + + def setup_learning( + self, + learning_func: Optional[Callable] = None, + learning_objectives: Optional[str] = None, + learning_constraints: Optional[dict] = None, + learning_results: Optional[str] = "", + ): + """ + Args: + learning_func (Optional, Callable): the learning function to be executed. + The learning function should take the following arguments as inputs: + (1) data4learning: the data for learning. + (2) learning_results: old learning results. + The learning function should return the new learning results. + learning_objectives (Optional, str): the learning objectives in natural language. + learning_constraints (Optional, dict): the learning constraints. + learning_results (Optional, str): the learning results in natural language. + #TODO: learning_results could be other types of data, e.g., a list of data. + Either learning_func or learning_objectives should be provided. + """ + self._learning_constraints = learning_constraints + self._learning_objectives = learning_objectives # already reflected in the learning_func + self._learning_results = learning_results + self._learning_func = learning_func + assert ( + self._learning_func is not None or self._learning_objectives is not None + ), "learning_func or learning_objectives should be provided" + + self._learning_settings = { + "learning_func": self._learning_func, + "learning_objectives": self._learning_objectives, + "learning_constraints": self._learning_constraints, + "learning_results": self._learning_results, + "data4learning": [], + } + + def generate_init_prompt(self): + """ + When generating the init prompt, we need to distinguish the two cases where learning_func or learning_objectives is provided. 
+ """ + self._init_prompt = self._learning_settings.copy() + + return self._init_prompt + + async def add_data(self, data4learning): + """Add data for learning.""" + self._data4learning += data4learning + print(f"{len(data4learning)} data entries added for learning!") + self._data_available_event.set() + + async def auto_reply(self, message, sender, default_reply=""): + """ + Need to distinguish if the sender is requesting for learning data or not + """ + learning_results = message.get("learning_results", "") + can_handle_data_volume = message.get("can_handle_data_volume") or self._can_handle_data_volume + current_data4learning = [] + # Wait here if no data is available + while not self._data4learning: + print("waiting for data...") + await self._data_available_event.wait() + # Reset the event as we are going to consume data + self._data_available_event.clear() + while self._data4learning: + combined_data_str = "\n".join(current_data4learning + [self._data4learning[0]]) + if can_handle_data_volume(learning_results, combined_data_str): + current_data4learning.append(self._data4learning.pop(0)) + else: + break + if current_data4learning: + response = { + "learning_results": learning_results, + "data4learning": current_data4learning, + } + await self._send(response, sender) + else: + print("no data for learning and thus terminate the conversation") diff --git a/flaml/autogen/agent/user_proxy_agent.py b/flaml/autogen/agent/user_proxy_agent.py index 009cc8e7ee..3ca3e97737 100644 --- a/flaml/autogen/agent/user_proxy_agent.py +++ b/flaml/autogen/agent/user_proxy_agent.py @@ -3,6 +3,7 @@ from collections import defaultdict import json from typing import Dict, Union +import asyncio class UserProxyAgent(Agent): @@ -173,24 +174,24 @@ def _execute_function(self, func_call): "content": str(content), } - def auto_reply(self, message: dict, sender, default_reply=""): + async def auto_reply(self, message: dict, sender, default_reply=""): """Generate an auto reply.""" if "function_call" in message: is_exec_success, func_return = self._execute_function(message["function_call"]) - self._send(func_return, sender) + await self._send(func_return, sender) return code_blocks = extract_code(message["content"]) if len(code_blocks) == 1 and code_blocks[0][0] == UNKNOWN: # no code block is found, lang should be `UNKNOWN` - self._send(default_reply, sender) + await self._send(default_reply, sender) else: # try to execute the code exitcode, logs = self._execute_code(code_blocks) exitcode2str = "execution succeeded" if exitcode == 0 else "execution failed" - self._send(f"exitcode: {exitcode} ({exitcode2str})\nCode output: {logs}", sender) + await self._send(f"exitcode: {exitcode} ({exitcode2str})\nCode output: {logs}", sender) - def receive(self, message: Union[Dict, str], sender): + async def receive(self, message: Union[Dict, str], sender): """Receive a message from the sender agent. Once a message is received, this function sends a reply to the sender or simply stop. The reply can be generated automatically or entered manually by a human. @@ -221,9 +222,9 @@ def receive(self, message: Union[Dict, str], sender): if reply: # reset the consecutive_auto_reply_counter self._consecutive_auto_reply_counter[sender.name] = 0 - self._send(reply, sender) + await self._send(reply, sender) return self._consecutive_auto_reply_counter[sender.name] += 1 print("\n>>>>>>>> NO HUMAN INPUT RECEIVED. 
USING AUTO REPLY FOR THE USER...", flush=True) - self.auto_reply(message, sender, default_reply=reply) + await self.auto_reply(message, sender, default_reply=reply) diff --git a/notebook/autogen_agent_continual_summarization.ipynb b/notebook/autogen_agent_continual_summarization.ipynb new file mode 100644 index 0000000000..965391e910 --- /dev/null +++ b/notebook/autogen_agent_continual_summarization.ipynb @@ -0,0 +1,693 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\"Open" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "# Interactive LLM Agent for Continual Summarization\n", + "\n", + "## Requirements\n", + "\n", + "FLAML requires `Python>=3.7`. To run this notebook example, please install flaml with the [openai] option, and feedparser:\n", + "```bash\n", + "pip install flaml[autogen] feedparser\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": { + "iopub.execute_input": "2023-02-13T23:40:52.317406Z", + "iopub.status.busy": "2023-02-13T23:40:52.316561Z", + "iopub.status.idle": "2023-02-13T23:40:52.321193Z", + "shell.execute_reply": "2023-02-13T23:40:52.320628Z" + } + }, + "outputs": [], + "source": [ + "# %pip install flaml[autogen]==2.0.0rc1 feedparser" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set your API Endpoint\n", + "\n", + "\n", + "The [`config_list_gpt4_gpt35`](https://microsoft.github.io/FLAML/docs/reference/autogen/oai/openai_utils#config_list_gpt4_gpt35) function tries to create a list of gpt-4 and gpt-3.5 configurations using Azure OpenAI endpoints and OpenAI endpoints. It assumes the api keys and api bases are stored in the corresponding environment variables or local txt files:\n", + "\n", + "- OpenAI API key: os.environ[\"OPENAI_API_KEY\"] or `openai_api_key_file=\"key_openai.txt\"`.\n", + "- Azure OpenAI API key: os.environ[\"AZURE_OPENAI_API_KEY\"] or `aoai_api_key_file=\"key_aoai.txt\"`. Multiple keys can be stored, one per line.\n", + "- Azure OpenAI API base: os.environ[\"AZURE_OPENAI_API_BASE\"] or `aoai_api_base_file=\"base_aoai.txt\"`. Multiple bases can be stored, one per line.\n", + "\n", + "It's OK to have only the OpenAI API key, or only the Azure OpenAI API key + base." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from flaml import oai\n", + "config_list = oai.config_list_from_models(model_list=[\"gpt-3.5-turbo-0613\"])" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Continual Research Digest via `LearningAgent` and `TeachingAgent`\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "5 data entries added for learning!\n", + "research_teacher (to research_learner):\n", + "\n", + "Message content:\n", + "learning_func: None\n", + "learning_objectives: Condense the provided data, which consists of titles and abstracts of research papers, into a research digest.\n", + " Create a single bullet point for each entry, ensuring clarity and coherence.\n", + " \n", + "learning_constraints: {'learning_trigger': True, 'cpu': 1}\n", + "learning_results: \n", + "data4learning: []\n", + "\n", + "--------------------------------------------------------------------------------\n", + "research_learner (to research_teacher):\n", + "\n", + "Message content:\n", + "learning_results: \n", + "can_handle_data_volume: <function TeachingAgent.__init__.<locals>.<lambda> at 0x7f98faa05f70>\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\n", + ">>>>>>>> NO HUMAN INPUT RECEIVED. USING AUTO REPLY FOR THE USER...\n", + "research_teacher (to research_learner):\n", + "\n", + "Message content:\n", + "learning_results: \n", + "data4learning: ['Title: Still No Lie Detector for Language Models: Probing Empirical and Conceptual Roadblocks. (arXiv:2307.00175v1 [cs.CL]). \\n Abstract:

We consider the questions of whether or not large language models (LLMs) have\\nbeliefs, and, if they do, how we might measure them. First, we evaluate two\\nexisting approaches, one due to Azaria and Mitchell (2023) and the other to\\nBurns et al. (2022). We provide empirical results that show that these methods\\nfail to generalize in very basic ways. We then argue that, even if LLMs have\\nbeliefs, these methods are unlikely to be successful for conceptual reasons.\\nThus, there is still no lie-detector for LLMs. After describing our empirical\\nresults we take a step back and consider whether or not we should expect LLMs\\nto have something like beliefs in the first place. We consider some recent\\narguments aiming to show that LLMs cannot have beliefs. We show that these\\narguments are misguided. We provide a more productive framing of questions\\nsurrounding the status of beliefs in LLMs, and highlight the empirical nature\\nof the problem. We conclude by suggesting some concrete paths for future work.\\n

', 'Title: How to Index Item IDs for Recommendation Foundation Models. (arXiv:2305.06569v3 [cs.IR] UPDATED). \\n Abstract:

Recommendation foundation model utilizes large language models (LLM) for\\nrecommendation by converting recommendation tasks into natural language tasks.\\nIt enables generative recommendation which directly generates the item(s) to\\nrecommend rather than calculating a ranking score for each and every candidate\\nitem in traditional recommendation models, simplifying the recommendation\\npipeline from multi-stage filtering to single-stage filtering. To avoid\\ngenerating excessively long text when deciding which item(s) to recommend,\\ncreating LLM-compatible item IDs is essential for recommendation foundation\\nmodels. In this study, we systematically examine the item indexing problem for\\nrecommendation foundation models, using P5 as the representative backbone model\\nand replicating its results with various indexing methods. To emphasize the\\nimportance of item indexing, we first discuss the issues of several trivial\\nitem indexing methods, such as independent indexing, title indexing, and random\\nindexing. We then propose four simple yet effective solutions, including\\nsequential indexing, collaborative indexing, semantic (content-based) indexing,\\nand hybrid indexing. Our reproducibility study of P5 highlights the significant\\ninfluence of item indexing methods on the model performance, and our results on\\nreal-world datasets validate the effectiveness of our proposed solutions.\\n

', 'Title: LLM-Blender: Ensembling Large Language Models with Pairwise Ranking and Generative Fusion. (arXiv:2306.02561v3 [cs.CL] UPDATED). \\n Abstract:

We present LLM-Blender, an ensembling framework designed to attain\\nconsistently superior performance by leveraging the diverse strengths of\\nmultiple open-source large language models (LLMs). Our framework consists of\\ntwo modules: PairRanker and GenFuser, addressing the observation that optimal\\nLLMs for different examples can significantly vary. PairRanker employs a\\nspecialized pairwise comparison method to distinguish subtle differences\\nbetween candidate outputs. It jointly encodes the input text and a pair of\\ncandidates, using cross-attention encoders to determine the superior one. Our\\nresults demonstrate that PairRanker exhibits the highest correlation with\\nChatGPT-based ranking. Then, GenFuser aims to merge the top-ranked candidates,\\ngenerating an improved output by capitalizing on their strengths and mitigating\\ntheir weaknesses. To facilitate large-scale evaluation, we introduce a\\nbenchmark dataset, MixInstruct, which is a mixture of multiple instruction\\ndatasets featuring oracle pairwise comparisons. Our LLM-Blender significantly\\noutperform individual LLMs and baseline methods across various metrics,\\nestablishing a substantial performance gap.\\n

', \"Title: On the Reliability of Watermarks for Large Language Models. (arXiv:2306.04634v3 [cs.LG] UPDATED). \\n Abstract:

As LLMs become commonplace, machine-generated text has the potential to flood\\nthe internet with spam, social media bots, and valueless content. Watermarking\\nis a simple and effective strategy for mitigating such harms by enabling the\\ndetection and documentation of LLM-generated text. Yet a crucial question\\nremains: How reliable is watermarking in realistic settings in the wild? There,\\nwatermarked text may be modified to suit a user's needs, or entirely rewritten\\nto avoid detection.\\n

\\n

We study the robustness of watermarked text after it is re-written by humans,\\nparaphrased by a non-watermarked LLM, or mixed into a longer hand-written\\ndocument. We find that watermarks remain detectable even after human and\\nmachine paraphrasing. While these attacks dilute the strength of the watermark,\\nparaphrases are statistically likely to leak n-grams or even longer fragments\\nof the original text, resulting in high-confidence detections when enough\\ntokens are observed. For example, after strong human paraphrasing the watermark\\nis detectable after observing 800 tokens on average, when setting a 1e-5 false\\npositive rate. We also consider a range of new detection schemes that are\\nsensitive to short spans of watermarked text embedded inside a large document,\\nand we compare the robustness of watermarking to other kinds of detectors.\\n

\", \"Title: SparseOptimizer: Sparsify Language Models through Moreau-Yosida Regularization and Accelerate via Compiler Co-design. (arXiv:2306.15656v2 [cs.LG] UPDATED). \\n Abstract:

This paper introduces SparseOptimizer, a novel deep learning optimizer that\\nexploits Moreau-Yosida regularization to naturally induce sparsity in large\\nlanguage models such as BERT, ALBERT and GPT. Key to the design of\\nSparseOptimizer is an embedded shrinkage operator, which imparts sparsity\\ndirectly within the optimization process. This operator, backed by a sound\\ntheoretical framework, includes an analytical solution, thereby reinforcing the\\noptimizer's robustness and efficacy. Crucially, SparseOptimizer's plug-and-play\\nfunctionality eradicates the need for code modifications, making it a\\nuniversally adaptable tool for a wide array of large language models. Empirical\\nevaluations on benchmark datasets such as GLUE, RACE, SQuAD1, and SQuAD2\\nconfirm that SparseBERT and SparseALBERT, when sparsified using\\nSparseOptimizer, achieve performance comparable to their dense counterparts,\\nBERT and ALBERT, while significantly reducing their parameter count. Further,\\nthis work proposes an innovative optimizer-compiler co-design strategy,\\ndemonstrating the potential of inference acceleration (\\\\textbf{3.37x},\\n\\\\textbf{6.30x}, and \\\\textbf{7.15x} in comparison with Pytorch, TensorFlow, and\\nLLVM generic compile, respectively) in SparseBERT when paired with an\\nappropriately designed compiler. This study represents a significant step\\nforward in the evolution of efficient, scalable, and high-performing large\\nlanguage models, setting a precedent for future exploration and optimization in\\nthis domain. The SparseOptimizer code and SparseALBERT model will be publicly\\navailable upon paper acceptance.\\n

\"]\n", + "\n", + "--------------------------------------------------------------------------------\n", + "*********Current learning results of the learner*********\n", + " - There is still no effective lie detector for large language models (LLMs), as existing methods fail to generalize and are unlikely to be successful for conceptual reasons.\n", + "- The creation of LLM-compatible item IDs is essential for recommendation foundation models to avoid generating excessively long text and simplify the recommendation pipeline. Four simple yet effective item indexing methods are proposed to enhance model performance.\n", + "- LLM-Blender is an ensembling framework that leverages the strengths of multiple LLMs to achieve superior performance in ranking and generating outputs for various examples. It outperforms individual LLMs and baseline methods.\n", + "- Watermarking is a reliable strategy for detecting and documenting LLM-generated text, even after human or machine paraphrasing. Paraphrases are likely to leak fragments of the original text, allowing for high-confidence detections. Other detection schemes are also explored.\n", + "- SparseOptimizer is a deep learning optimizer that induces sparsity in large language models. It achieves performance comparable to dense counterparts while significantly reducing parameter count. An optimizer-compiler co-design strategy demonstrates potential inference acceleration.\n", + "**************************************************\n", + "research_learner (to research_teacher):\n", + "\n", + "Message content:\n", + "learning_results: - There is still no effective lie detector for large language models (LLMs), as existing methods fail to generalize and are unlikely to be successful for conceptual reasons.\n", + "- The creation of LLM-compatible item IDs is essential for recommendation foundation models to avoid generating excessively long text and simplify the recommendation pipeline. Four simple yet effective item indexing methods are proposed to enhance model performance.\n", + "- LLM-Blender is an ensembling framework that leverages the strengths of multiple LLMs to achieve superior performance in ranking and generating outputs for various examples. It outperforms individual LLMs and baseline methods.\n", + "- Watermarking is a reliable strategy for detecting and documenting LLM-generated text, even after human or machine paraphrasing. Paraphrases are likely to leak fragments of the original text, allowing for high-confidence detections. Other detection schemes are also explored.\n", + "- SparseOptimizer is a deep learning optimizer that induces sparsity in large language models. It achieves performance comparable to dense counterparts while significantly reducing parameter count. An optimizer-compiler co-design strategy demonstrates potential inference acceleration.\n", + "can_handle_data_volume: \n", + "\n", + "--------------------------------------------------------------------------------\n", + "\n", + ">>>>>>>> NO HUMAN INPUT RECEIVED. 
USING AUTO REPLY FOR THE USER...\n", + "waiting for data...\n", + "adding AI data...\n", + "12 data entries added for learning!\n", + "research_teacher (to research_learner):\n", + "\n", + "Message content:\n", + "learning_results: - There is still no effective lie detector for large language models (LLMs), as existing methods fail to generalize and are unlikely to be successful for conceptual reasons.\n", + "- The creation of LLM-compatible item IDs is essential for recommendation foundation models to avoid generating excessively long text and simplify the recommendation pipeline. Four simple yet effective item indexing methods are proposed to enhance model performance.\n", + "- LLM-Blender is an ensembling framework that leverages the strengths of multiple LLMs to achieve superior performance in ranking and generating outputs for various examples. It outperforms individual LLMs and baseline methods.\n", + "- Watermarking is a reliable strategy for detecting and documenting LLM-generated text, even after human or machine paraphrasing. Paraphrases are likely to leak fragments of the original text, allowing for high-confidence detections. Other detection schemes are also explored.\n", + "- SparseOptimizer is a deep learning optimizer that induces sparsity in large language models. It achieves performance comparable to dense counterparts while significantly reducing parameter count. An optimizer-compiler co-design strategy demonstrates potential inference acceleration.\n", + "data4learning: ['Title: Queer People are People First: Deconstructing Sexual Identity Stereotypes in Large Language Models. (arXiv:2307.00101v1 [cs.CL]). \\n Abstract:

Large Language Models (LLMs) are trained primarily on minimally processed web\\ntext, which exhibits the same wide range of social biases held by the humans\\nwho created that content. Consequently, text generated by LLMs can\\ninadvertently perpetuate stereotypes towards marginalized groups, like the\\nLGBTQIA+ community. In this paper, we perform a comparative study of how LLMs\\ngenerate text describing people with different sexual identities. Analyzing\\nbias in the text generated by an LLM using regard score shows measurable bias\\nagainst queer people. We then show that a post-hoc method based on\\nchain-of-thought prompting using SHAP analysis can increase the regard of the\\nsentence, representing a promising approach towards debiasing the output of\\nLLMs in this setting.\\n

', \"Title: Performance of ChatGPT on USMLE: Unlocking the Potential of Large Language Models for AI-Assisted Medical Education. (arXiv:2307.00112v1 [cs.CY]). \\n Abstract:

Artificial intelligence is gaining traction in more ways than ever before.\\nThe popularity of language models and AI-based businesses has soared since\\nChatGPT was made available to the general public via OpenAI. It is becoming\\nincreasingly common for people to use ChatGPT both professionally and\\npersonally. Considering the widespread use of ChatGPT and the reliance people\\nplace on it, this study determined how reliable ChatGPT can be for answering\\ncomplex medical and clinical questions. Harvard University gross anatomy along\\nwith the United States Medical Licensing Examination (USMLE) questionnaire were\\nused to accomplish the objective. The paper evaluated the obtained results\\nusing a 2-way ANOVA and posthoc analysis. Both showed systematic covariation\\nbetween format and prompt. Furthermore, the physician adjudicators\\nindependently rated the outcome's accuracy, concordance, and insight. As a\\nresult of the analysis, ChatGPT-generated answers were found to be more\\ncontext-oriented and represented a better model for deductive reasoning than\\nregular Google search results. Furthermore, ChatGPT obtained 58.8% on logical\\nquestions and 60% on ethical questions. This means that the ChatGPT is\\napproaching the passing range for logical questions and has crossed the\\nthreshold for ethical questions. The paper believes ChatGPT and other language\\nlearning models can be invaluable tools for e-learners; however, the study\\nsuggests that there is still room to improve their accuracy. In order to\\nimprove ChatGPT's performance in the future, further research is needed to\\nbetter understand how it can answer different types of questions.\\n

\", \"Title: Large Language Models (GPT) for automating feedback on programming assignments. (arXiv:2307.00150v1 [cs.HC]). \\n Abstract:

Addressing the challenge of generating personalized feedback for programming\\nassignments is demanding due to several factors, like the complexity of code\\nsyntax or different ways to correctly solve a task. In this experimental study,\\nwe automated the process of feedback generation by employing OpenAI's GPT-3.5\\nmodel to generate personalized hints for students solving programming\\nassignments on an automated assessment platform. Students rated the usefulness\\nof GPT-generated hints positively. The experimental group (with GPT hints\\nenabled) relied less on the platform's regular feedback but performed better in\\nterms of percentage of successful submissions across consecutive attempts for\\ntasks, where GPT hints were enabled. For tasks where the GPT feedback was made\\nunavailable, the experimental group needed significantly less time to solve\\nassignments. Furthermore, when GPT hints were unavailable, students in the\\nexperimental condition were initially less likely to solve the assignment\\ncorrectly. This suggests potential over-reliance on GPT-generated feedback.\\nHowever, students in the experimental condition were able to correct reasonably\\nrapidly, reaching the same percentage correct after seven submission attempts.\\nThe availability of GPT hints did not significantly impact students' affective\\nstate.\\n

\", 'Title: Personality Traits in Large Language Models. (arXiv:2307.00184v1 [cs.CL]). \\n Abstract:

The advent of large language models (LLMs) has revolutionized natural\\nlanguage processing, enabling the generation of coherent and contextually\\nrelevant text. As LLMs increasingly power conversational agents, the\\nsynthesized personality embedded in these models by virtue of their training on\\nlarge amounts of human-generated data draws attention. Since personality is an\\nimportant factor determining the effectiveness of communication, we present a\\ncomprehensive method for administering validated psychometric tests and\\nquantifying, analyzing, and shaping personality traits exhibited in text\\ngenerated from widely-used LLMs. We find that: 1) personality simulated in the\\noutputs of some LLMs (under specific prompting configurations) is reliable and\\nvalid; 2) evidence of reliability and validity of LLM-simulated personality is\\nstronger for larger and instruction fine-tuned models; and 3) personality in\\nLLM outputs can be shaped along desired dimensions to mimic specific\\npersonality profiles. We also discuss potential applications and ethical\\nimplications of our measurement and shaping framework, especially regarding\\nresponsible use of LLMs.\\n

', 'Title: InstructEval: Systematic Evaluation of Instruction Selection Methods. (arXiv:2307.00259v1 [cs.CL]). \\n Abstract:

In-context learning (ICL) performs tasks by prompting a large language model\\n(LLM) using an instruction and a small set of annotated examples called\\ndemonstrations. Recent work has shown that the precise details of the inputs\\nused in the prompt significantly impacts ICL, which has incentivized\\ninstruction selection algorithms. The effect of instruction-choice however is\\nseverely underexplored, with existing analyses being restricted to shallow\\nsubsets of models and tasks, which limits the generalizability of their\\ninsights. We develop an ICL evaluation suite to conduct a thorough assessment\\nof these techniques. The suite includes 13 open-sourced LLMs of varying scales\\nfrom 4 distinct model families and covers 9 different tasks, representing a\\nrange of task types across 3 categories. In this work, we evaluate the relative\\nperformance of 7 popular instruction selection methods using our benchmark over\\nfive desiderata relevant to ICL. We discover that using curated\\nmanually-written instructions and simple instructions without any task-specific\\ndescriptions often elicits superior ICL performance than that of automatic\\ninstruction-induction methods, pointing to a lack of generalizability among the\\nlatter. We release our evaluation suite for benchmarking instruction selection\\napproaches, and call for more rigorous and generalizable methods in this space.\\n

', 'Title: Zero-Shot Cross-Lingual Summarization via Large Language Models. (arXiv:2302.14229v3 [cs.CL] UPDATED). \\n Abstract:

Given a document in a source language, cross-lingual summarization (CLS) aims\\nto generate a summary in a different target language. Recently, the emergence\\nof Large Language Models (LLMs), such as GPT-3.5, ChatGPT and GPT-4, has\\nattracted wide attention from the computational linguistics community. However,\\nit is not yet known the performance of LLMs on CLS. In this report, we\\nempirically use various prompts to guide LLMs to perform zero-shot CLS from\\ndifferent paradigms (i.e., end-to-end and pipeline), and provide a preliminary\\nevaluation on the generated summaries. We find that ChatGPT and GPT-4\\noriginally prefer to produce lengthy summaries with detailed information. These\\ntwo LLMs can further balance informativeness and conciseness with the help of\\nan interactive prompt, significantly improving their CLS performance.\\nExperimental results on three widely-used CLS datasets show that GPT-4 achieves\\nstate-of-the-art zero-shot CLS performance, and performs competitively compared\\nwith the fine-tuned mBART-50. Moreover, we also find some multi-lingual and\\nbilingual LLMs (i.e., BLOOMZ, ChatGLM-6B, Vicuna-13B and ChatYuan) have limited\\nzero-shot CLS ability. Due to the composite nature of CLS, which requires\\nmodels to perform summarization and translation simultaneously, accomplishing\\nthis task in a zero-shot manner is even a challenge for LLMs. Therefore, we\\nsincerely hope and recommend future LLM research could use CLS as a testbed.\\n

', 'Title: How to Index Item IDs for Recommendation Foundation Models. (arXiv:2305.06569v3 [cs.IR] UPDATED). \\n Abstract:

Recommendation foundation model utilizes large language models (LLM) for\\nrecommendation by converting recommendation tasks into natural language tasks.\\nIt enables generative recommendation which directly generates the item(s) to\\nrecommend rather than calculating a ranking score for each and every candidate\\nitem in traditional recommendation models, simplifying the recommendation\\npipeline from multi-stage filtering to single-stage filtering. To avoid\\ngenerating excessively long text when deciding which item(s) to recommend,\\ncreating LLM-compatible item IDs is essential for recommendation foundation\\nmodels. In this study, we systematically examine the item indexing problem for\\nrecommendation foundation models, using P5 as the representative backbone model\\nand replicating its results with various indexing methods. To emphasize the\\nimportance of item indexing, we first discuss the issues of several trivial\\nitem indexing methods, such as independent indexing, title indexing, and random\\nindexing. We then propose four simple yet effective solutions, including\\nsequential indexing, collaborative indexing, semantic (content-based) indexing,\\nand hybrid indexing. Our reproducibility study of P5 highlights the significant\\ninfluence of item indexing methods on the model performance, and our results on\\nreal-world datasets validate the effectiveness of our proposed solutions.\\n

', 'Title: Practical PCG Through Large Language Models. (arXiv:2305.18243v3 [cs.CL] UPDATED). \\n Abstract:

Large Language Models (LLMs) have proven to be useful tools in various\\ndomains outside of the field of their inception, which was natural language\\nprocessing. In this study, we provide practical directions on how to use LLMs\\nto generate 2D-game rooms for an under-development game, named Metavoidal. Our\\ntechnique can harness the power of GPT-3 by Human-in-the-loop fine-tuning which\\nallows our method to create 37% Playable-Novel levels from as scarce data as\\nonly 60 hand-designed rooms under a scenario of the non-trivial game, with\\nrespect to (Procedural Content Generation) PCG, that has a good amount of local\\nand global constraints.\\n

', 'Title: LLM-Blender: Ensembling Large Language Models with Pairwise Ranking and Generative Fusion. (arXiv:2306.02561v3 [cs.CL] UPDATED). \\n Abstract:

We present LLM-Blender, an ensembling framework designed to attain\\nconsistently superior performance by leveraging the diverse strengths of\\nmultiple open-source large language models (LLMs). Our framework consists of\\ntwo modules: PairRanker and GenFuser, addressing the observation that optimal\\nLLMs for different examples can significantly vary. PairRanker employs a\\nspecialized pairwise comparison method to distinguish subtle differences\\nbetween candidate outputs. It jointly encodes the input text and a pair of\\ncandidates, using cross-attention encoders to determine the superior one. Our\\nresults demonstrate that PairRanker exhibits the highest correlation with\\nChatGPT-based ranking. Then, GenFuser aims to merge the top-ranked candidates,\\ngenerating an improved output by capitalizing on their strengths and mitigating\\ntheir weaknesses. To facilitate large-scale evaluation, we introduce a\\nbenchmark dataset, MixInstruct, which is a mixture of multiple instruction\\ndatasets featuring oracle pairwise comparisons. Our LLM-Blender significantly\\noutperform individual LLMs and baseline methods across various metrics,\\nestablishing a substantial performance gap.\\n

', 'Title: Artificial General Intelligence for Medical Imaging. (arXiv:2306.05480v2 [cs.AI] UPDATED). \\n Abstract:

In this review, we explore the potential applications of Artificial General\\nIntelligence (AGI) models in healthcare, focusing on foundational Large\\nLanguage Models (LLMs), Large Vision Models, and Large Multimodal Models. We\\nemphasize the importance of integrating clinical expertise, domain knowledge,\\nand multimodal capabilities into AGI models. In addition, we lay out key\\nroadmaps that guide the development and deployment of healthcare AGI models.\\nThroughout the review, we provide critical perspectives on the potential\\nchallenges and pitfalls associated with deploying large-scale AGI models in the\\nmedical field. This comprehensive review aims to offer insights into the future\\nimplications of AGI in medical imaging, healthcare and beyond.\\n

']\n", + "\n", + "--------------------------------------------------------------------------------\n", + "*********Current learning results of the learner*********\n", + " - Deconstructing sexual identity stereotypes in large language models (LLMs): LLMs can inadvertently perpetuate stereotypes towards marginalized groups, like the LGBTQIA+ community. Post-hoc methods using SHAP analysis show promise in debiasing LLM outputs.\n", + "- Unlocking the potential of LLMs for AI-assisted medical education: ChatGPT performs well in answering complex medical and clinical questions, approaching passing range for logical and ethical questions. Further research is needed to improve accuracy.\n", + "- Automating feedback on programming assignments using LLMs: GPT-3.5 generates personalized hints for programming assignments, improving performance and reducing time to solve assignments. Over-reliance on GPT-generated feedback is observed.\n", + "- Analyzing personality traits in LLMs: LLMs simulate reliable and valid personality traits, with larger and instruction fine-tuned models showing stronger evidence. Personality in LLM outputs can be shaped to mimic specific profiles.\n", + "- Evaluation of instruction selection methods for in-context learning (ICL): Curated manually-written instructions and simple instructions without task-specific descriptions outperform automatic instruction-induction methods. More rigorous and generalizable methods are needed.\n", + "- Zero-shot cross-lingual summarization using LLMs: LLMs like GPT-4 perform well in generating summaries in different target languages. Some multi-lingual and bilingual LLMs have limited zero-shot CLS ability, highlighting the challenge of simultaneous summarization and translation.\n", + "- Item indexing for recommendation foundation models: Effective item indexing methods, such as sequential, collaborative, semantic, and hybrid indexing, significantly influence model performance. The choice of indexing method is crucial for generating concise item recommendations.\n", + "- Practical generation of 2D-game rooms using LLMs: LLMs, with human-in-the-loop fine-tuning, can generate playable-novel levels for games with local and global constraints.\n", + "- Ensembling LLMs with pairwise ranking and generative fusion: LLM-Blender outperforms individual LLMs and baseline methods in generating improved outputs by leveraging the strengths of multiple LLMs.\n", + "- Potential applications of artificial general intelligence (AGI) in medical imaging: AGI models, including LLMs, large vision models, and multimodal models, offer promising possibilities in healthcare. Integration of clinical expertise, domain knowledge, and multimodal capabilities is crucial for successful deployment.\n", + "- Watermarking as a reliable strategy for detecting LLM-generated text: Watermarking can detect and document LLM-generated text, even after paraphrasing. Paraphrases are likely to leak fragments of the original text, allowing for high-confidence detections. Other detection schemes are also explored.\n", + "- SparseOptimizer for inducing sparsity in LLMs: SparseOptimizer reduces parameter count while maintaining performance comparable to dense counterparts. 
Optimizer-compiler co-design shows potential for inference acceleration.\n", + "**************************************************\n", + "research_learner (to research_teacher):\n", + "\n", + "Message content:\n", + "learning_results: - Deconstructing sexual identity stereotypes in large language models (LLMs): LLMs can inadvertently perpetuate stereotypes towards marginalized groups, like the LGBTQIA+ community. Post-hoc methods using SHAP analysis show promise in debiasing LLM outputs.\n", + "- Unlocking the potential of LLMs for AI-assisted medical education: ChatGPT performs well in answering complex medical and clinical questions, approaching passing range for logical and ethical questions. Further research is needed to improve accuracy.\n", + "- Automating feedback on programming assignments using LLMs: GPT-3.5 generates personalized hints for programming assignments, improving performance and reducing time to solve assignments. Over-reliance on GPT-generated feedback is observed.\n", + "- Analyzing personality traits in LLMs: LLMs simulate reliable and valid personality traits, with larger and instruction fine-tuned models showing stronger evidence. Personality in LLM outputs can be shaped to mimic specific profiles.\n", + "- Evaluation of instruction selection methods for in-context learning (ICL): Curated manually-written instructions and simple instructions without task-specific descriptions outperform automatic instruction-induction methods. More rigorous and generalizable methods are needed.\n", + "- Zero-shot cross-lingual summarization using LLMs: LLMs like GPT-4 perform well in generating summaries in different target languages. Some multi-lingual and bilingual LLMs have limited zero-shot CLS ability, highlighting the challenge of simultaneous summarization and translation.\n", + "- Item indexing for recommendation foundation models: Effective item indexing methods, such as sequential, collaborative, semantic, and hybrid indexing, significantly influence model performance. The choice of indexing method is crucial for generating concise item recommendations.\n", + "- Practical generation of 2D-game rooms using LLMs: LLMs, with human-in-the-loop fine-tuning, can generate playable-novel levels for games with local and global constraints.\n", + "- Ensembling LLMs with pairwise ranking and generative fusion: LLM-Blender outperforms individual LLMs and baseline methods in generating improved outputs by leveraging the strengths of multiple LLMs.\n", + "- Potential applications of artificial general intelligence (AGI) in medical imaging: AGI models, including LLMs, large vision models, and multimodal models, offer promising possibilities in healthcare. Integration of clinical expertise, domain knowledge, and multimodal capabilities is crucial for successful deployment.\n", + "- Watermarking as a reliable strategy for detecting LLM-generated text: Watermarking can detect and document LLM-generated text, even after paraphrasing. Paraphrases are likely to leak fragments of the original text, allowing for high-confidence detections. Other detection schemes are also explored.\n", + "- SparseOptimizer for inducing sparsity in LLMs: SparseOptimizer reduces parameter count while maintaining performance comparable to dense counterparts. Optimizer-compiler co-design shows potential for inference acceleration.\n", + "can_handle_data_volume: \n", + "\n", + "--------------------------------------------------------------------------------\n", + "\n", + ">>>>>>>> NO HUMAN INPUT RECEIVED. 
USING AUTO REPLY FOR THE USER...\n", + "research_teacher (to research_learner):\n", + "\n", + "Message content:\n", + "learning_results: - Deconstructing sexual identity stereotypes in large language models (LLMs): LLMs can inadvertently perpetuate stereotypes towards marginalized groups, like the LGBTQIA+ community. Post-hoc methods using SHAP analysis show promise in debiasing LLM outputs.\n", + "- Unlocking the potential of LLMs for AI-assisted medical education: ChatGPT performs well in answering complex medical and clinical questions, approaching passing range for logical and ethical questions. Further research is needed to improve accuracy.\n", + "- Automating feedback on programming assignments using LLMs: GPT-3.5 generates personalized hints for programming assignments, improving performance and reducing time to solve assignments. Over-reliance on GPT-generated feedback is observed.\n", + "- Analyzing personality traits in LLMs: LLMs simulate reliable and valid personality traits, with larger and instruction fine-tuned models showing stronger evidence. Personality in LLM outputs can be shaped to mimic specific profiles.\n", + "- Evaluation of instruction selection methods for in-context learning (ICL): Curated manually-written instructions and simple instructions without task-specific descriptions outperform automatic instruction-induction methods. More rigorous and generalizable methods are needed.\n", + "- Zero-shot cross-lingual summarization using LLMs: LLMs like GPT-4 perform well in generating summaries in different target languages. Some multi-lingual and bilingual LLMs have limited zero-shot CLS ability, highlighting the challenge of simultaneous summarization and translation.\n", + "- Item indexing for recommendation foundation models: Effective item indexing methods, such as sequential, collaborative, semantic, and hybrid indexing, significantly influence model performance. The choice of indexing method is crucial for generating concise item recommendations.\n", + "- Practical generation of 2D-game rooms using LLMs: LLMs, with human-in-the-loop fine-tuning, can generate playable-novel levels for games with local and global constraints.\n", + "- Ensembling LLMs with pairwise ranking and generative fusion: LLM-Blender outperforms individual LLMs and baseline methods in generating improved outputs by leveraging the strengths of multiple LLMs.\n", + "- Potential applications of artificial general intelligence (AGI) in medical imaging: AGI models, including LLMs, large vision models, and multimodal models, offer promising possibilities in healthcare. Integration of clinical expertise, domain knowledge, and multimodal capabilities is crucial for successful deployment.\n", + "- Watermarking as a reliable strategy for detecting LLM-generated text: Watermarking can detect and document LLM-generated text, even after paraphrasing. Paraphrases are likely to leak fragments of the original text, allowing for high-confidence detections. Other detection schemes are also explored.\n", + "- SparseOptimizer for inducing sparsity in LLMs: SparseOptimizer reduces parameter count while maintaining performance comparable to dense counterparts. Optimizer-compiler co-design shows potential for inference acceleration.\n", + "data4learning: ['Title: Can We Trust AI-Generated Educational Content? Comparative Analysis of Human and AI-Generated Learning Resources. (arXiv:2306.10509v2 [cs.HC] UPDATED). \\n Abstract:

As an increasing number of students move to online learning platforms that\\ndeliver personalized learning experiences, there is a great need for the\\nproduction of high-quality educational content. Large language models (LLMs)\\nappear to offer a promising solution to the rapid creation of learning\\nmaterials at scale, reducing the burden on instructors. In this study, we\\ninvestigated the potential for LLMs to produce learning resources in an\\nintroductory programming context, by comparing the quality of the resources\\ngenerated by an LLM with those created by students as part of a learnersourcing\\nactivity. Using a blind evaluation, students rated the correctness and\\nhelpfulness of resources generated by AI and their peers, after both were\\ninitially provided with identical exemplars. Our results show that the quality\\nof AI-generated resources, as perceived by students, is equivalent to the\\nquality of resources generated by their peers. This suggests that AI-generated\\nresources may serve as viable supplementary material in certain contexts.\\nResources generated by LLMs tend to closely mirror the given exemplars, whereas\\nstudent-generated resources exhibit greater variety in terms of content length\\nand specific syntax features used. The study highlights the need for further\\nresearch exploring different types of learning resources and a broader range of\\nsubject areas, and understanding the long-term impact of AI-generated resources\\non learning outcomes.\\n

', \"Title: SparseOptimizer: Sparsify Language Models through Moreau-Yosida Regularization and Accelerate via Compiler Co-design. (arXiv:2306.15656v2 [cs.LG] UPDATED). \\n Abstract:

This paper introduces SparseOptimizer, a novel deep learning optimizer that\\nexploits Moreau-Yosida regularization to naturally induce sparsity in large\\nlanguage models such as BERT, ALBERT and GPT. Key to the design of\\nSparseOptimizer is an embedded shrinkage operator, which imparts sparsity\\ndirectly within the optimization process. This operator, backed by a sound\\ntheoretical framework, includes an analytical solution, thereby reinforcing the\\noptimizer's robustness and efficacy. Crucially, SparseOptimizer's plug-and-play\\nfunctionality eradicates the need for code modifications, making it a\\nuniversally adaptable tool for a wide array of large language models. Empirical\\nevaluations on benchmark datasets such as GLUE, RACE, SQuAD1, and SQuAD2\\nconfirm that SparseBERT and SparseALBERT, when sparsified using\\nSparseOptimizer, achieve performance comparable to their dense counterparts,\\nBERT and ALBERT, while significantly reducing their parameter count. Further,\\nthis work proposes an innovative optimizer-compiler co-design strategy,\\ndemonstrating the potential of inference acceleration (\\\\textbf{3.37x},\\n\\\\textbf{6.30x}, and \\\\textbf{7.15x} in comparison with Pytorch, TensorFlow, and\\nLLVM generic compile, respectively) in SparseBERT when paired with an\\nappropriately designed compiler. This study represents a significant step\\nforward in the evolution of efficient, scalable, and high-performing large\\nlanguage models, setting a precedent for future exploration and optimization in\\nthis domain. The SparseOptimizer code and SparseALBERT model will be publicly\\navailable upon paper acceptance.\\n

\"]\n", + "\n", + "--------------------------------------------------------------------------------\n", + "*********Current learning results of the learner*********\n", + " - Large language models (LLMs) can inadvertently perpetuate stereotypes towards marginalized groups, such as the LGBTQIA+ community, but post-hoc methods using SHAP analysis show promise in debiasing LLM outputs.\n", + "- LLMs like ChatGPT perform well in answering complex medical and clinical questions, but further research is needed to improve accuracy.\n", + "- GPT-3.5 can generate personalized hints for programming assignments, improving performance and reducing time to solve assignments, but over-reliance on GPT-generated feedback is observed.\n", + "- LLMs simulate reliable and valid personality traits, with larger and instruction fine-tuned models showing stronger evidence, and personality in LLM outputs can be shaped to mimic specific profiles.\n", + "- Curated manually-written instructions and simple instructions without task-specific descriptions outperform automatic instruction-induction methods for in-context learning, but more rigorous and generalizable methods are needed.\n", + "- LLMs like GPT-4 perform well in generating summaries in different target languages, but there are challenges in simultaneous summarization and translation for multi-lingual and bilingual LLMs.\n", + "- Effective item indexing methods significantly influence model performance in recommendation systems, and the choice of indexing method is crucial for generating concise item recommendations.\n", + "- LLMs, with human-in-the-loop fine-tuning, can generate playable-novel levels for 2D games with local and global constraints.\n", + "- Ensembling LLMs using pairwise ranking and generative fusion outperforms individual LLMs and baseline methods in generating improved outputs.\n", + "- AGI models, including LLMs, offer promising possibilities in medical imaging but require integration of clinical expertise, domain knowledge, and multimodal capabilities for successful deployment.\n", + "- Watermarking can reliably detect LLM-generated text, even after paraphrasing, and other detection schemes are explored.\n", + "- SparseOptimizer reduces parameter count in LLMs while maintaining performance comparable to dense counterparts, and optimizer-compiler co-design shows potential for inference acceleration.\n", + "- AI-generated learning resources, as perceived by students, are equivalent in quality to those generated by their peers, suggesting that AI-generated resources can serve as supplementary material in certain contexts. 
Further research is needed to explore different types of resources and their long-term impact on learning outcomes.\n", + "**************************************************\n", + "research_learner (to research_teacher):\n", + "\n", + "Message content:\n", + "learning_results: - Large language models (LLMs) can inadvertently perpetuate stereotypes towards marginalized groups, such as the LGBTQIA+ community, but post-hoc methods using SHAP analysis show promise in debiasing LLM outputs.\n", + "- LLMs like ChatGPT perform well in answering complex medical and clinical questions, but further research is needed to improve accuracy.\n", + "- GPT-3.5 can generate personalized hints for programming assignments, improving performance and reducing time to solve assignments, but over-reliance on GPT-generated feedback is observed.\n", + "- LLMs simulate reliable and valid personality traits, with larger and instruction fine-tuned models showing stronger evidence, and personality in LLM outputs can be shaped to mimic specific profiles.\n", + "- Curated manually-written instructions and simple instructions without task-specific descriptions outperform automatic instruction-induction methods for in-context learning, but more rigorous and generalizable methods are needed.\n", + "- LLMs like GPT-4 perform well in generating summaries in different target languages, but there are challenges in simultaneous summarization and translation for multi-lingual and bilingual LLMs.\n", + "- Effective item indexing methods significantly influence model performance in recommendation systems, and the choice of indexing method is crucial for generating concise item recommendations.\n", + "- LLMs, with human-in-the-loop fine-tuning, can generate playable-novel levels for 2D games with local and global constraints.\n", + "- Ensembling LLMs using pairwise ranking and generative fusion outperforms individual LLMs and baseline methods in generating improved outputs.\n", + "- AGI models, including LLMs, offer promising possibilities in medical imaging but require integration of clinical expertise, domain knowledge, and multimodal capabilities for successful deployment.\n", + "- Watermarking can reliably detect LLM-generated text, even after paraphrasing, and other detection schemes are explored.\n", + "- SparseOptimizer reduces parameter count in LLMs while maintaining performance comparable to dense counterparts, and optimizer-compiler co-design shows potential for inference acceleration.\n", + "- AI-generated learning resources, as perceived by students, are equivalent in quality to those generated by their peers, suggesting that AI-generated resources can serve as supplementary material in certain contexts. Further research is needed to explore different types of resources and their long-term impact on learning outcomes.\n", + "can_handle_data_volume: \n", + "\n", + "--------------------------------------------------------------------------------\n", + "\n", + ">>>>>>>> NO HUMAN INPUT RECEIVED. 
USING AUTO REPLY FOR THE USER...\n", + "waiting for data...\n" + ] + } + ], + "source": [ + "from flaml import oai\n", + "from flaml.autogen.agent import LearningAgent, TeachingAgent\n", + "import asyncio\n", + "\n", + "KEY_LOC = \"\"\n", + "async def test_continual_summarization():\n", + " def LLM_related(input_string):\n", + " if \"Large Language Models\" in input_string or \"LLM\" in input_string or \"GPT\" in input_string:\n", + " return True\n", + " else:\n", + " return False\n", + "\n", + " import feedparser\n", + "\n", + " research_teacher = TeachingAgent(name=\"research_teacher\", human_input_mode=\"NEVER\")\n", + " research_teacher.setup_learning(\n", + " learning_constraints={\"learning_trigger\": True, \"cpu\": 1},\n", + " learning_objectives=\"\"\"Condense the provided data, which consists of titles and abstracts of research papers, into a research digest.\n", + " Create a single bullet point for each entry, ensuring clarity and coherence.\n", + " \"\"\",\n", + " learning_results=\" \",\n", + " # learning_func=oai.summarize,\n", + " )\n", + "\n", + " # get data from ml arxiv feed\n", + " ml_feed = feedparser.parse(\"http://export.arxiv.org/rss/cs.LG\")\n", + " ml_data = []\n", + " for entry in ml_feed.entries:\n", + " title_and_abstract = f\"Title: {entry.title}. \\n Abstract: {entry.summary}\"\n", + " if LLM_related(title_and_abstract):\n", + " ml_data.append(title_and_abstract)\n", + " await research_teacher.add_data(ml_data)\n", + " \n", + " config_list = oai.config_list_from_models(key_file_path=KEY_LOC, model_list=[\"gpt-3.5-turbo-0613\"], exclude=\"aoai\")\n", + " research_learner = LearningAgent(name=\"research_learner\", model=\"gpt-3.5-turbo-0613\", config_list=config_list)\n", + " asyncio.create_task(research_learner.receive(research_teacher.generate_init_prompt(), research_teacher))\n", + " \n", + " # get data from ai arxiv feed\n", + " await asyncio.sleep(5)\n", + " ai_feed = feedparser.parse(\"http://export.arxiv.org/rss/cs.AI\")\n", + " ai_data = []\n", + " for entry in ai_feed.entries:\n", + " title_and_abstract = f\"Title: {entry.title}. 
\\n Abstract: {entry.summary}\"\n", + " if LLM_related(title_and_abstract):\n", + " ai_data.append(title_and_abstract)\n", + " print(\"adding AI data...\")\n", + " await research_teacher.add_data(ai_data)\n", + "\n", + "\n", + "await test_continual_summarization()\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + }, + "vscode": { + "interpreter": { + "hash": "949777d72b0d2535278d3dc13498b2535136f6dfe0678499012e853ee9abcab1" + } + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": { + "2d910cfd2d2a4fc49fc30fbbdc5576a7": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "2.0.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border_bottom": null, + "border_left": null, + "border_right": null, + "border_top": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "454146d0f7224f038689031002906e6f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "2.0.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_e4ae2b6f5a974fd4bafb6abb9d12ff26", + "IPY_MODEL_577e1e3cc4db4942b0883577b3b52755", + "IPY_MODEL_b40bdfb1ac1d4cffb7cefcb870c64d45" + ], + "layout": "IPY_MODEL_dc83c7bff2f241309537a8119dfc7555", + "tabbable": null, + "tooltip": null + } + }, + "577e1e3cc4db4942b0883577b3b52755": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "2.0.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_allow_html": false, + "layout": "IPY_MODEL_2d910cfd2d2a4fc49fc30fbbdc5576a7", + "max": 1, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_74a6ba0c3cbc4051be0a83e152fe1e62", + 
"tabbable": null, + "tooltip": null, + "value": 1 + } + }, + "6086462a12d54bafa59d3c4566f06cb2": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "2.0.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border_bottom": null, + "border_left": null, + "border_right": null, + "border_top": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "74a6ba0c3cbc4051be0a83e152fe1e62": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "7d3f3d9e15894d05a4d188ff4f466554": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "HTMLStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "HTMLStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "StyleView", + "background": null, + "description_width": "", + "font_size": null, + "text_color": null + } + }, + "b40bdfb1ac1d4cffb7cefcb870c64d45": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "2.0.0", + "_view_name": "HTMLView", + "description": "", + "description_allow_html": false, + "layout": "IPY_MODEL_f1355871cc6f4dd4b50d9df5af20e5c8", + "placeholder": "​", + "style": "IPY_MODEL_ca245376fd9f4354af6b2befe4af4466", + "tabbable": null, + "tooltip": null, + "value": " 1/1 [00:00<00:00, 44.69it/s]" + } + }, + "ca245376fd9f4354af6b2befe4af4466": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "HTMLStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "HTMLStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "StyleView", + "background": null, + "description_width": "", + "font_size": null, + 
"text_color": null + } + }, + "dc83c7bff2f241309537a8119dfc7555": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "2.0.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border_bottom": null, + "border_left": null, + "border_right": null, + "border_top": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e4ae2b6f5a974fd4bafb6abb9d12ff26": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "2.0.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "2.0.0", + "_view_name": "HTMLView", + "description": "", + "description_allow_html": false, + "layout": "IPY_MODEL_6086462a12d54bafa59d3c4566f06cb2", + "placeholder": "​", + "style": "IPY_MODEL_7d3f3d9e15894d05a4d188ff4f466554", + "tabbable": null, + "tooltip": null, + "value": "100%" + } + }, + "f1355871cc6f4dd4b50d9df5af20e5c8": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "2.0.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border_bottom": null, + "border_left": null, + "border_right": null, + "border_top": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + } + }, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/test/autogen/test_continual_summarization.py b/test/autogen/test_continual_summarization.py new file mode 100644 index 0000000000..45a9ffff8e --- /dev/null +++ 
b/test/autogen/test_continual_summarization.py
@@ -0,0 +1,59 @@
+try:
+    import openai
+except ImportError:
+    openai = None
+import pytest
+from flaml import oai
+from flaml.autogen.agent import LearningAgent, TeachingAgent
+import asyncio
+
+KEY_LOC = "test/autogen/"
+
+
+@pytest.mark.skipif(openai is None, reason="openai not installed")
+async def test_continual_summarization():
+    def LLM_related(input_string):
+        if "Large Language Models" in input_string or "LLM" in input_string or "GPT" in input_string:
+            return True
+        else:
+            return False
+
+    import feedparser
+
+    research_teacher = TeachingAgent(name="research_teacher", human_input_mode="NEVER")
+    research_teacher.setup_learning(
+        learning_constraints={"learning_trigger": True, "cpu": 1},
+        learning_objectives="""Condense the provided data, which consists of titles and abstracts of research papers, into a research digest.
+        Create a single bullet point for each entry, ensuring clarity and coherence.
+        """,
+        learning_results=" ",
+        # learning_func=oai.summarize,
+    )
+
+    # get data from ml arxiv feed
+    ml_feed = feedparser.parse("http://export.arxiv.org/rss/cs.LG")
+    ml_data = []
+    for entry in ml_feed.entries:
+        title_and_abstract = f"Title: {entry.title}. \n Abstract: {entry.summary}"
+        if LLM_related(title_and_abstract):
+            ml_data.append(title_and_abstract)
+    await research_teacher.add_data(ml_data)
+
+    config_list = oai.config_list_from_models(key_file_path=KEY_LOC, model_list=["gpt-3.5-turbo-0613"], exclude="aoai")
+    research_learner = LearningAgent(name="research_learner", config_list=config_list)
+    asyncio.create_task(research_learner.receive(research_teacher.generate_init_prompt(), research_teacher))
+
+    # get data from ai arxiv feed
+    await asyncio.sleep(5)
+    ai_feed = feedparser.parse("http://export.arxiv.org/rss/cs.AI")
+    ai_data = []
+    for entry in ai_feed.entries:
+        title_and_abstract = f"Title: {entry.title}. \n Abstract: {entry.summary}"
+        if LLM_related(title_and_abstract):
+            ai_data.append(title_and_abstract)
+    print("adding AI data...")
+    await research_teacher.add_data(ai_data)
+
+
+if __name__ == "__main__":
+    asyncio.run(test_continual_summarization())