index.html

<!-- -&#45;&#45;-->
<!--permalink: /cocon/-->
<!-- -&#45;&#45;-->
<!DOCTYPE html>
<!-- lang declares the language of the page text for screen readers and translation tools. -->
<html lang="en">
<head>
  <!-- Document metadata. -->
  <meta charset="utf-8">
  <meta name="description"
        content="Evaluating Very Long-Term Conversational Memory of LLM Agents">
  <meta name="keywords" content="dialog, memory, locomo, llm-agent, conversational agent">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>Evaluating Very Long-Term Conversational Memory of LLM Agents</title>

<!--  &lt;!&ndash; Global site tag (gtag.js) - Google Analytics &ndash;&gt;-->
<!--  <script async src="https://www.googletagmanager.com/gtag/js?id=G-PYVRSFMDRL"></script>-->
<!--  <script>-->
<!--    window.dataLayer = window.dataLayer || [];-->

<!--    function gtag() {-->
<!--      dataLayer.push(arguments);-->
<!--    }-->

<!--    gtag('js', new Date());-->

<!--    gtag('config', 'G-PYVRSFMDRL');-->
<!--  </script>-->

  <!-- Web fonts. -->
  <link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro"
        rel="stylesheet">

  <!-- Stylesheets: Bulma and its plugins, icon fonts, then the page-specific index.css. -->
  <link rel="stylesheet" href="./static/css/bulma.min.css">
  <link rel="stylesheet" href="./static/css/bulma-carousel.min.css">
  <link rel="stylesheet" href="./static/css/bulma-slider.min.css">
  <link rel="stylesheet" href="./static/css/fontawesome.all.min.css">
  <link rel="stylesheet"
        href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
  <link rel="stylesheet" href="./static/css/index.css">
  <link rel="icon" href="./static/images/favicon.svg">

  <!-- jQuery is loaded before the Bulma plugins and index.js.
       NOTE(review): presumably those scripts depend on it; confirm before adding defer/async. -->
  <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
  <script defer src="./static/js/fontawesome.all.min.js"></script>
  <script src="./static/js/bulma-carousel.min.js"></script>
  <script src="./static/js/bulma-slider.min.js"></script>
  <script src="./static/js/index.js"></script>
</head>
<body>


<!-- Title block: paper title, authors, affiliations and resource links. -->
<section class="hero">
  <div class="hero-body">
    <div class="container is-max-desktop">
      <div class="columns is-centered">
        <div class="column has-text-centered">
          <!-- The page title is the document's single h1; its size still comes from the Bulma "is-2" class. -->
          <h1 class="title is-2 publication-title">Evaluating Very Long-Term Conversational Memory of LLM Agents</h1>
          <!-- <h3 class="title is-3">Accepted to <a href="https://jmlr.org/tmlr/">TMLR</a> 02/2024</h3> -->
          <div class="is-size-5 publication-authors">
            <span class="author-block">
              <a href="https://adymaharana.github.io/">Adyasha Maharana</a><sup>1</sup>,</span>
            <span class="author-block">
              <a href="https://www.danny-lee.info/">Dong-Ho Lee</a><sup>2</sup>,</span>
            <span class="author-block">
              <a href="http://www.stulyakov.com/">Sergey Tulyakov</a><sup>3</sup>,
            </span>
            <span class="author-block">
              <a href="https://www.cs.unc.edu/~mbansal/">Mohit Bansal</a><sup>1&dagger;</sup>,
            </span>
            <!-- NOTE(review): no affiliation index on this author; confirm whether one is missing. -->
            <span class="author-block">
              <a href="https://fvancesco.github.io/">Francesco Barbieri</a><sup>&dagger;</sup>,
            </span>
            <span class="author-block">
              <a href="https://yuwfan.github.io/">Yuwei Fang</a><sup>3&dagger;</sup>
            </span>
          </div>

          <div class="is-size-5 publication-authors">
            <span class="author-block"><sup>1</sup>UNC Chapel Hill</span>
            <span class="author-block"><sup>2</sup>University of Southern California</span>
            <span class="author-block"><sup>3</sup>Snap Inc.</span>
            <span class="author-block"><sup>&dagger;</sup>Equal advising</span>
          </div>

          <div class="column has-text-centered">
            <div class="publication-links">
              <!-- PDF Link. -->
              <!-- <span class="link-block">
                <a href="https://openreview.net/forum?id=ue9igTDLN2"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                      <i class="fas fa-file-pdf"></i>
                  </span>
                  <span>Open Review</span>
                </a>
              </span> -->
              <!-- arXiv Link. -->
              <span class="link-block">
                <a href="https://arxiv.org/abs/2402.17753"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                      <i class="ai ai-arxiv"></i>
                  </span>
                  <span>arXiv</span>
                </a>
              </span>
<!--              &lt;!&ndash; Video Link. &ndash;&gt;-->
<!--              <span class="link-block">-->
<!--                <a href="https://www.youtube.com/watch?v=MrKrnHhk8IA"-->
<!--                   class="external-link button is-normal is-rounded is-dark">-->
<!--                  <span class="icon">-->
<!--                      <i class="fab fa-youtube"></i>-->
<!--                  </span>-->
<!--                  <span>Video</span>-->
<!--                </a>-->
<!--              </span>-->
              <!-- Code Link. -->
              <span class="link-block">
                <a href="https://github.com/snap-research/LoCoMo"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                      <i class="fab fa-github"></i>
                  </span>
                  <span>Code</span>
                </a>
              </span>
              <!-- Dataset Link. -->
              <span class="link-block">
                <a href="https://github.com/snap-research/locomo/tree/main/data"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                      <i class="fas fa-clock"></i>
                  </span>
                  <span>Data</span>
                </a>
              </span>
            </div>

          </div>
        </div>
      </div>
    </div>
  </div>
</section>


<!-- Teaser figure: evaluation framework overview, with its caption. -->
<section class="hero teaser">
  <div class="container is-max-desktop">
    <div class="hero-body">
<!--        <iframe src="./static/images/main_figure_arxiv.pdf" style="width: 100%;height: 100%;border: none;"></iframe>-->
      <!-- alt summarises the figure for readers who cannot see it. -->
      <img src="./static/images/evaluation_framework.svg"
           alt="Overview of the evaluation framework: question answering, event summarization and multimodal dialog generation tasks"
           style="display:block;border: none;"/>
<!--      <video id="teaser" autoplay muted loop playsinline height="100%">-->
<!--        <source src="./static/images/main_figure.svg"-->
<!--                type="image/svg+xml">-->
<!--      </video>-->
      <h2 class="subtitle has-text-centered">
        <b>Overview of our evaluation framework.</b> We propose three tasks: question answering, event summarization and multimodal dialog generation
        to evaluate models' comprehension in very long-term dialogues.
      </h2>
    </div>
  </div>
</section>


<!-- Main text: motivation, then how the conversations are generated. -->
<section class="section">
  <div class="container is-max-desktop">
    <!-- Motivation. -->
    <div class="columns is-centered has-text-centered">
      <div class="column is-four-fifths">
        <h2 class="title is-3">Motivation</h2>
        <div class="content has-text-justified">
          <p>
            Existing works on long-term open-domain dialogues focus on evaluating model responses
            within contexts spanning no more than five
            chat sessions.
          </p>
          <p>
            We introduce a machine-human
            pipeline to generate high-quality, very long-term dialogues by leveraging LLM-based agent
            architectures and grounding their dialogues on personas and temporal event graphs. Moreover,
            we equip each agent with the capability of sharing and reacting to images.
            The generated conversations are verified and edited by human annotators for long-range consistency and grounding to the event graphs.
            Using this pipeline, we collect LoCoMo, a dataset of very long-term conversations, each encompassing 300
            turns and 9K tokens on avg., over up to 35 sessions.
          </p>
          <p>
            Based on LoCoMo, we present a comprehensive evaluation benchmark to measure long-term memory in models, encompassing
            question answering, event summarization, and multi-modal dialogue generation tasks.
            <!-- Our experimental results indicate that LLMs exhibit -->
            <!-- challenges in understanding lengthy conversations and comprehending long-range temporal -->
            <!-- and causal dynamics within dialogues. -->
          </p>
        </div>
      </div>
    </div>
    <!--/ Motivation. -->

    <!-- Conversation generation. -->
    <div class="columns is-centered has-text-centered">
      <div class="column is-four-fifths">
        <h2 class="title is-3">How do we generate <i>very</i> long-term conversations?</h2>
        <div class="intro-figure">
          <img src="./static/images/intro_figure_conv_only_v2.svg"
               alt="Example of a very long-term conversation in LoCoMo"
               style="float: right; margin: 8px;"/>
<!--          <iframe src="https://www.youtube.com/embed/MrKrnHhk8IA?rel=0&amp;showinfo=0"-->
<!--                  frameborder="0" allow="autoplay; encrypted-media" allowfullscreen></iframe>-->
        </div>
        <div class="content has-text-justified">
          <p>
            An example of a conversation in LoCoMo is shown to the right.
          </p>
          <!-- The list is a sibling of the paragraph: a <ul> cannot be nested inside a <p>. -->
          <ul>
            <li>We create two <b>virtual agents</b>, each initialized with an LLM.</li>
            <li>To start, unique <b>persona statements</b> are assigned to each agent, ensuring the integration of distinct personalities into their dialogues.</li>
            <li>To mirror real-life experiences, we create a <b>temporal event graph</b> for each agent, which illustrates a realistic sequence of life events.</li>
            <li>The LLM <b>agent architecture</b> is utilized for each agent, enabling them to effectively memorize and reflect conversation history into ongoing dialogues.</li>
            <li>Further, each agent can share coherent images, thereby enhancing the <b>multi-modal dialogue</b> aspect.</li>
            <li>Finally, human annotators are tasked with <b>manually filtering and refining</b> the generated data.</li>
          </ul>
        </div>
      </div>
    </div>
    <!--/ Conversation generation. -->


  </div>
</section>


<!-- Figure: generative pipeline overview, with its caption. -->
<section class="hero teaser">
  <div class="container is-max-desktop">
    <div class="hero-body">
<!--        <iframe src="./static/images/main_figure_arxiv.pdf" style="width: 100%;height: 100%;border: none;"></iframe>-->
      <img src="./static/images/main_v4.svg"
           alt="Overview of the generative pipeline for LoCoMo"
           style="margin-top:50px;"/>
<!--      <video id="teaser" autoplay muted loop playsinline height="100%">-->
<!--        <source src="./static/images/main_figure.svg"-->
<!--                type="image/svg+xml">-->
<!--      </video>-->
      <h2 class="subtitle has-text-centered">
        <b>Overview of the generative pipeline for LoCoMo.</b> Each LLM agent is assigned a distinct persona and
            a timeline of causally connected events in their life. The agent is equipped with a memory and reflection module to
            retrieve relevant history for dialog generation and is also enabled for image-sharing and image-reaction behaviors
            (left). The generated conversations are edited by human annotators to maintain long-range consistency (right).
      </h2>
    </div>
  </div>
</section>


<!-- Figure: temporal event graph creation, with its caption. -->
<section class="hero teaser">
  <div class="container is-max-desktop">
    <div class="hero-body">
<!--        <iframe src="./static/images/main_figure_arxiv.pdf" style="width: 100%;height: 100%;border: none;"></iframe>-->
      <img src="./static/images/events.svg"
           alt="Temporal event graph creation: events generated from a persona, with causal connections between them"/>
<!--      <video id="teaser" autoplay muted loop playsinline height="100%">-->
<!--        <source src="./static/images/main_figure.svg"-->
<!--                type="image/svg+xml">-->
<!--      </video>-->
      <h2 class="subtitle has-text-centered">
        <b>Temporal Event Graph Creation.</b> Each event is generated in accordance with the specified persona p,
        and causal connections l between events are depicted to illustrate the causal relationships among them.
      </h2>
    </div>
  </div>
</section>


<!-- Evaluation framework: the three benchmark tasks. -->
<section class="section">
  <div class="container is-max-desktop">
    <div class="columns is-centered has-text-centered">
      <div class="column is-four-fifths">
        <h2 class="title is-3">Evaluation Framework</h2>
        <div class="content has-text-justified">
          <p>
            In this study, we present a holistic evaluation framework to assess an agent’s proficiency in managing and responding within long-term contexts.
          </p>
          <!-- The list is a sibling of the paragraph: a <ul> cannot be nested inside a <p>. -->
          <ul>
            <li><b>Question Answering.</b> Agents need to “recall” past
              context correctly to integrate relevant information
              into future responses. We present a direct examination of their memory via a question answering
              task. We classify questions into five distinct
              reasoning types to evaluate memory from multiple perspectives: <i>single-hop</i>, <i>multi-hop</i>, <i>temporal</i>,
              <i>commonsense or world knowledge</i>, and <i>adversarial</i>.</li>
            <li><b>Event Graph Summarization.</b> Agents also need to recognize long-range causal and temporal connections in the dialogues to generate empathetic and relevant responses. We
              propose a measurement of their causal and temporal understanding with an event graph summarization task where the event graphs
              linked to each LLM speaker serve as the correct
              answers, and models are tasked with extracting this
              information from the conversation history.</li>
            <li><b>Multi-modal Dialog Generation.</b> Conversational agents need to utilize relevant context recalled from past conversations to generate
              responses that are consistent with the ongoing narrative. We assess this ability via the multi-modal
              dialog generation task.</li>
          </ul>
        </div>
      </div>
    </div>
  </div>
</section>


<!-- Findings: headline experimental results. -->
<section class="section">
  <div class="container is-max-desktop">
    <div class="columns is-centered has-text-centered">
      <div class="column is-four-fifths">
        <h2 class="title is-3">Findings</h2>
        <div class="content has-text-justified">
          <p>
            We present extensive experimental results on the LoCoMo benchmark using instruction-based LLMs, long-context LLMs, and RAG techniques. Our findings include:
          </p>
          <!-- The list is a sibling of the paragraph: a <ul> cannot be nested inside a <p>. -->
          <ul>
            <li><b>Long-context LLMs and RAG demonstrate effectiveness in QA tasks</b>, improving ‘memory’ capabilities of LLMs (with improvements ranging
              from 22-66%), but still significantly lag behind human levels (by 56%), especially in temporal reasoning (by 73%);
            </li>
            <li><b>Long-context LLMs demonstrate significant hallucinations</b>, leading to difficulty with adversarial questions in the QA task and in the event graph summarization task.</li>
            <li><b>RAG offers a balanced compromise</b>, combining the accuracy of short-context LLMs with the
              extensive comprehension of wide-context LLMs, and does particularly well when dialogues are transformed into a database of assertions (observations)
              about each speaker’s life and persona.
            </li>
          </ul>
        </div>
      </div>
    </div>
  </div>
</section>


<!-- Result figures: QA results table and multimodal dialog generation results. -->
<section class="hero teaser">
  <div class="container is-max-desktop">

    <div class="hero-body">
<!--        <iframe src="./static/images/main_figure_arxiv.pdf" style="width: 100%;height: 100%;border: none;"></iframe>-->
      <img src="./static/images/tables.svg"
           alt="Table of question answering F1 scores for Base, Long-context and RAG models"/>
<!--      <video id="teaser" autoplay muted loop playsinline height="100%">-->
<!--        <source src="./static/images/main_figure.svg"-->
<!--                type="image/svg+xml">-->
<!--      </video>-->
      <h2 class="subtitle has-text-centered">
        <b>Question answering performance of Base, Long-context and RAG models.</b> Optimal performance is in bold.
        Results are based on F1-score for answer prediction; higher is better.
      </h2>
    </div>

    <div class="hero-body">
<!--        <iframe src="./static/images/main_figure_arxiv.pdf" style="width: 100%;height: 100%;border: none;"></iframe>-->
      <img src="./static/images/minigpt5_results.svg"
           alt="Multimodal dialog generation results for MiniGPT-5"/>
<!--      <video id="teaser" autoplay muted loop playsinline height="100%">-->
<!--        <source src="./static/images/main_figure.svg"-->
<!--                type="image/svg+xml">-->
<!--      </video>-->
      <h2 class="subtitle has-text-centered">
        <b>Multimodal dialog generation performance of MiniGPT-5.</b> (A) an example of multimodal dialog
        predicted using MiniGPT-5 with and without observation as retrieved context, (B) variation of MM-Relevance score
        with length of dialog history, and (C) comparison of RAG-based MiniGPT-5 methods.
      </h2>
    </div>
  </div>
</section>


<!-- Citation entry; the arXiv identifier matches the arXiv link in the page header. -->
<section class="section" id="BibTeX">
  <div class="container is-max-desktop content">
    <h2 class="title">BibTeX</h2>
    <pre><code>@article{maharana2024locomo,
  author    = {Maharana, Adyasha and Lee, Dong-Ho and Tulyakov, Sergey and Bansal, Mohit and Barbieri, Francesco and Fang, Yuwei},
  title     = {Evaluating Very Long-Term Conversational Memory of LLM Agents},
  journal   = {arXiv preprint arXiv:2402.17753},
  year      = {2024},
}</code></pre>
  </div>
</section>


<!-- Page footer: icon links to the paper and the author's GitHub profile, plus template credit. -->
<footer class="footer">
  <div class="container">
    <div class="content has-text-centered">
      <!-- Icon-only links carry an aria-label so they have an accessible name. -->
      <a class="icon-link"
         href="https://arxiv.org/abs/2402.17753"
         aria-label="Paper on arXiv">
        <i class="fas fa-file-pdf"></i>
      </a>
      <!-- The duplicate class attribute (ignored by the parser) and the "disabled" attribute
           (not valid on <a>) were dropped. -->
      <a class="icon-link" href="https://github.com/adymaharana" aria-label="Adyasha Maharana on GitHub">
        <i class="fab fa-github"></i>
      </a>
    </div>
    <div class="columns is-centered">
      <div class="column is-8">
        <div class="content">
          <p>
            Website design borrowed from <a
              href="https://github.com/nerfies/nerfies.github.io">nerfies</a>.
          </p>
        </div>
      </div>
    </div>
  </div>
</footer>

</body>
</html>