From c8813b97f39f5706e4c3743a820dbf5a14199b3b Mon Sep 17 00:00:00 2001 From: eric Date: Sat, 9 Aug 2025 04:06:01 +0000 Subject: [PATCH] deploy: c9ed800d9fce2a440a321cc022c16bbfbc939c18 --- 404.html | 2 +- about/index.html | 2 +- categories/index.html | 2 +- index.html | 2 +- index.xml | 3 ++- .../index.html | 2 +- .../index.html | 2 +- .../index.html | 21 +++++++++++++++++++ posts/index.html | 5 +++-- posts/index.xml | 3 ++- .../index.html | 2 +- posts/supabase-deep-dive/index.html | 2 +- .../index.html | 2 +- posts/useful/index.html | 2 +- sitemap.xml | 2 +- tags/index.html | 2 +- 16 files changed, 40 insertions(+), 16 deletions(-) create mode 100644 posts/how-rvq-teaches-llms-to-see-and-hear/index.html diff --git a/404.html b/404.html index faef608..142bdf9 100644 --- a/404.html +++ b/404.html @@ -4,4 +4,4 @@ 2016 - 2025 Eric X. Liu -[c25cd89] \ No newline at end of file +[c9ed800] \ No newline at end of file diff --git a/about/index.html b/about/index.html index ec32479..740abc2 100644 --- a/about/index.html +++ b/about/index.html @@ -4,4 +4,4 @@ 2016 - 2025 Eric X. Liu -[c25cd89] \ No newline at end of file +[c9ed800] \ No newline at end of file diff --git a/categories/index.html b/categories/index.html index ac1cac6..0a5af57 100644 --- a/categories/index.html +++ b/categories/index.html @@ -4,4 +4,4 @@ 2016 - 2025 Eric X. Liu -[c25cd89] \ No newline at end of file +[c9ed800] \ No newline at end of file diff --git a/index.html b/index.html index ae1a4e5..d06cd49 100644 --- a/index.html +++ b/index.html @@ -4,4 +4,4 @@ 2016 - 2025 Eric X. Liu -[c25cd89] \ No newline at end of file +[c9ed800] \ No newline at end of file diff --git a/index.xml b/index.xml index f48f163..5fd3f5e 100644 --- a/index.xml +++ b/index.xml @@ -1,4 +1,5 @@ -Eric X. Liu's Personal Page/Recent content on Eric X. Liu's Personal PageHugoenMon, 04 Aug 2025 03:59:37 +0000Supabase Deep Dive: It's Not Magic, It's Just Postgres/posts/supabase-deep-dive/Sun, 03 Aug 2025 00:00:00 +0000/posts/supabase-deep-dive/<p>In the world of Backend-as-a-Service (BaaS), platforms are often treated as magic boxes. You push data in, you get data out, and you hope the magic inside scales. While this simplicity is powerful, it can obscure the underlying mechanics, leaving developers wondering what&rsquo;s really going on.</p> +Eric X. Liu's Personal Page/Recent content on Eric X. Liu's Personal PageHugoenFri, 08 Aug 2025 17:36:52 +0000Beyond Words: How RVQ Teaches LLMs to See and Hear/posts/how-rvq-teaches-llms-to-see-and-hear/Thu, 07 Aug 2025 00:00:00 +0000/posts/how-rvq-teaches-llms-to-see-and-hear/<p>Large Language Models (LLMs) are masters of text, but the world is not made of text alone. It’s a symphony of sights, sounds, and experiences. The ultimate goal for AI is to understand this rich, multi-modal world as we do. But how do you teach a model that thinks in words to understand a picture of a sunset or the melody of a song?</p> +<p>The answer lies in creating a universal language—a bridge between the continuous, messy world of pixels and audio waves and the discrete, structured world of language tokens. One of the most elegant and powerful tools for building this bridge is <strong>Residual Vector Quantization (RVQ)</strong>.</p>Supabase Deep Dive: It's Not Magic, It's Just Postgres/posts/supabase-deep-dive/Sun, 03 Aug 2025 00:00:00 +0000/posts/supabase-deep-dive/<p>In the world of Backend-as-a-Service (BaaS), platforms are often treated as magic boxes. You push data in, you get data out, and you hope the magic inside scales. 
While this simplicity is powerful, it can obscure the underlying mechanics, leaving developers wondering what&rsquo;s really going on.</p> <p>Supabase enters this space with a radically different philosophy: <strong>transparency</strong>. It provides the convenience of a BaaS, but it’s built on the world&rsquo;s most trusted relational database: PostgreSQL. The &ldquo;magic&rdquo; isn&rsquo;t a proprietary black box; it&rsquo;s a carefully assembled suite of open-source tools that enhance Postgres, not hide it.</p>A Deep Dive into PPO for Language Models/posts/a-deep-dive-into-ppo-for-language-models/Sat, 02 Aug 2025 00:00:00 +0000/posts/a-deep-dive-into-ppo-for-language-models/<p>Large Language Models (LLMs) have demonstrated astonishing capabilities, but out-of-the-box, they are simply powerful text predictors. They don&rsquo;t inherently understand what makes a response helpful, harmless, or aligned with human values. The technique that has proven most effective at bridging this gap is Reinforcement Learning from Human Feedback (RLHF), and at its heart lies a powerful algorithm: Proximal Policy Optimization (PPO).</p> <p>You may have seen diagrams like the one below, which outlines the RLHF training process. It can look intimidating, with a web of interconnected models, losses, and data flows.</p>Mixture-of-Experts (MoE) Models Challenges & Solutions in Practice/posts/mixture-of-experts-moe-models-challenges-solutions-in-practice/Wed, 02 Jul 2025 00:00:00 +0000/posts/mixture-of-experts-moe-models-challenges-solutions-in-practice/<p>Mixture-of-Experts (MoEs) are neural network architectures that allow different parts of the model (called &ldquo;experts&rdquo;) to specialize in different types of inputs. A &ldquo;gating network&rdquo; or &ldquo;router&rdquo; learns to dispatch each input (or &ldquo;token&rdquo;) to a subset of these experts. While powerful for scaling models, MoEs introduce several practical challenges.</p> <h3 id="1-challenge-non-differentiability-of-routing-functions"> diff --git a/posts/a-deep-dive-into-ppo-for-language-models/index.html b/posts/a-deep-dive-into-ppo-for-language-models/index.html index 2ea933e..22c1c0d 100644 --- a/posts/a-deep-dive-into-ppo-for-language-models/index.html +++ b/posts/a-deep-dive-into-ppo-for-language-models/index.html @@ -23,4 +23,4 @@ where δ_t = r_t + γV(s_{t+1}) - V(s_t)

  • γ (gam 2016 - 2025 Eric X. Liu -[c25cd89] \ No newline at end of file +[c9ed800] \ No newline at end of file diff --git a/posts/espresso-theory-application-a-guide-for-the-breville-barista-pro/index.html b/posts/espresso-theory-application-a-guide-for-the-breville-barista-pro/index.html index fb1e0cf..b8af4f7 100644 --- a/posts/espresso-theory-application-a-guide-for-the-breville-barista-pro/index.html +++ b/posts/espresso-theory-application-a-guide-for-the-breville-barista-pro/index.html @@ -20,4 +20,4 @@ Our overarching philosophy is simple: isolate and change only one variable at a 2016 - 2025 Eric X. Liu -[c25cd89] \ No newline at end of file +[c9ed800] \ No newline at end of file diff --git a/posts/how-rvq-teaches-llms-to-see-and-hear/index.html b/posts/how-rvq-teaches-llms-to-see-and-hear/index.html new file mode 100644 index 0000000..24353e2 --- /dev/null +++ b/posts/how-rvq-teaches-llms-to-see-and-hear/index.html @@ -0,0 +1,21 @@ +Beyond Words: How RVQ Teaches LLMs to See and Hear · Eric X. Liu's Personal Page

    Beyond Words: How RVQ Teaches LLMs to See and Hear

    Large Language Models (LLMs) are masters of text, but the world is not made of text alone. It’s a symphony of sights, sounds, and experiences. The ultimate goal for AI is to understand this rich, multi-modal world as we do. But how do you teach a model that thinks in words to understand a picture of a sunset or the melody of a song?

    The answer lies in creating a universal language—a bridge between the continuous, messy world of pixels and audio waves and the discrete, structured world of language tokens. One of the most elegant and powerful tools for building this bridge is Residual Vector Quantization (RVQ).

    This article dives deep into RVQ, exploring how it turns raw data into meaningful semantic IDs and how these IDs, in turn, unlock multi-modal understanding in LLMs.

    What is Residual Vector Quantization? The Art of Smart Compression

    At its core, Vector Quantization (VQ) is a compression technique. It maps a high-dimensional vector (like a data embedding) to the single closest vector in a predefined dictionary, called a codebook. You then only need to store the index of that chosen vector. The problem? To represent complex data accurately, you’d need a codebook with an astronomical number of entries, which is computationally infeasible.

    This is where Residual Vector Quantization shines. Instead of one giant codebook, RVQ uses a series of smaller codebooks in stages.

    1. Stage 1 (Coarse Quantization): The input vector is quantized by the first codebook. This finds the broadest, most general category for the data.
    2. Calculate the Residual: The system calculates the error, or “residual,” between the original vector and its quantized version from Stage 1. This residual vector represents the information that was lost in the first coarse approximation.
    3. Stage 2 (Refinement): This residual vector is then quantized by the second codebook. This stage doesn’t re-evaluate the whole vector, but only focuses on correcting the error from the previous stage.
    4. Iterate: This process repeats for several stages, with each subsequent codebook quantizing the residual error from the previous one, adding a finer and finer layer of detail.

    The final compressed representation is simply the sequence of indices chosen at each stage, producing an ID like [8, 5, 4, 1]. The magic of this approach is that the ID is hierarchical. The first index (8) might represent “Sports,” the next (5) refines it to “Court Sports,” (4) to “Beach Volleyball,” and the final (1) distinguishes a specific match. Videos with similar content will naturally share a longer prefix in their Semantic ID.
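
    To make this concrete, here is a minimal NumPy sketch of the encode/decode loop. It assumes the codebooks already exist (random placeholders here; in practice they are learned, as the next section describes):

        import numpy as np

        def rvq_encode(x, codebooks):
            """Quantize x into one index per stage (a Semantic ID)."""
            indices = []
            residual = x.copy()
            for cb in codebooks:  # cb: (K, D) array of codebook entries
                # Pick the entry closest to the current residual.
                idx = int(np.argmin(np.linalg.norm(cb - residual, axis=1)))
                indices.append(idx)
                # Subtract it; later stages only refine this error.
                residual = residual - cb[idx]
            return indices

        def rvq_decode(indices, codebooks):
            # Reconstruction is the sum of the selected entries.
            return sum(cb[i] for cb, i in zip(codebooks, indices))

        # Toy demo: 4 stages of 16 entries each, for 8-dim embeddings.
        rng = np.random.default_rng(0)
        codebooks = [rng.normal(size=(16, 8)) for _ in range(4)]
        x = rng.normal(size=8)
        ids = rvq_encode(x, codebooks)     # a 4-number Semantic ID
        error = np.linalg.norm(x - rvq_decode(ids, codebooks))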

    Learning What Matters: The Trainable VQ-Autoencoder

    A key insight is that RVQ is not a fixed algorithm but a trainable neural network component. Its codebooks are not predefined; they are learned. This learning happens within a Vector-Quantized Autoencoder (VQ-AE) architecture.

    1. Encoder: A powerful neural network (e.g., a Transformer or CNN) takes the raw data (like video frames and audio) and converts it into a continuous semantic embedding.
    2. RVQ Bottleneck: This embedding is fed into the RVQ module, which quantizes it into the sequence of discrete IDs.
    3. Decoder: The decoder takes these discrete IDs, looks up the corresponding codebook vectors, sums them up to get a reconstructed embedding, and attempts to rebuild the original video/audio.

    The entire system is trained end-to-end. The reconstruction loss (the difference between the original and reconstructed data) is used to update the parameters of the Encoder, the Decoder, and, most importantly, the codebook vectors within the RVQ module. Initially random, the codebook vectors are gradually pushed to become meaningful “anchors” for the core concepts present in the training data.
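
    One detail the pipeline above leaves implicit: picking the nearest codebook entry (an argmin) is not differentiable, so VQ-VAE-style models typically train the quantizer with a straight-through estimator plus commitment/codebook losses. The PyTorch sketch below illustrates that common recipe; it is a simplified toy under those assumptions, not the exact bottleneck of any particular production model:

        import torch
        import torch.nn as nn

        class RVQBottleneck(nn.Module):
            def __init__(self, num_stages=4, codebook_size=16, dim=8):
                super().__init__()
                self.codebooks = nn.ModuleList(
                    [nn.Embedding(codebook_size, dim) for _ in range(num_stages)]
                )

            def forward(self, z):                              # z: (batch, dim)
                residual = z
                quantized = torch.zeros_like(z)
                ids, aux_loss = [], 0.0
                for cb in self.codebooks:
                    dists = torch.cdist(residual, cb.weight)   # (batch, K)
                    idx = dists.argmin(dim=1)
                    q = cb(idx)                                # (batch, dim)
                    ids.append(idx)
                    # Codebook + commitment terms pull the codebook entries
                    # and the encoder outputs toward each other.
                    aux_loss = aux_loss + ((q - residual.detach()) ** 2).mean() \
                                        + ((q.detach() - residual) ** 2).mean()
                    quantized = quantized + q
                    residual = residual - q.detach()
                # Straight-through: use quantized values in the forward pass,
                # but copy gradients back to the encoder output z.
                quantized = z + (quantized - z).detach()
                return quantized, torch.stack(ids, dim=1), aux_loss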

    From Implicit to Explicit: Controlling Semantics with Contrastive Learning

    A standard VQ-AE learns implicit semantics. It gets good at reconstruction, but we can’t control what concepts it learns. To make the Semantic IDs truly meaningful and aligned with human language, we introduce contrastive learning.

    The architecture is enhanced with a parallel text encoder (like BERT or CLIP’s). The model is then trained with a joint loss function:

    L_total = L_reconstruction + λ * L_contrastive

    • Reconstruction Loss ensures the RVQ codes contain enough information to rebuild the input.
    • Contrastive Loss forces the media embedding (from the video/audio encoder) to be mathematically “close” to the text embedding of its description, and “far” from the embeddings of unrelated text descriptions.

    This dual goal forces the model to organize its embedding space according to the semantics of human language. The codebook vectors now learn to represent concepts that are not just useful for reconstruction, but are also tied to explicit textual descriptions.
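
    As a sketch of how this joint objective might look in code, the snippet below pairs an MSE reconstruction term with a CLIP-style symmetric InfoNCE contrastive term. The particular loss choices and the temperature are illustrative assumptions; the article only fixes the general form of L_total:

        import torch
        import torch.nn.functional as F

        def joint_loss(x, x_hat, media_emb, text_emb, lam=0.5, temp=0.07):
            # Reconstruction: how well the decoder rebuilt the input.
            recon = F.mse_loss(x_hat, x)
            # Contrastive: the i-th media embedding should match the i-th
            # text embedding and no other in the batch (InfoNCE).
            media = F.normalize(media_emb, dim=-1)
            text = F.normalize(text_emb, dim=-1)
            logits = media @ text.t() / temp                  # (batch, batch)
            targets = torch.arange(logits.size(0), device=logits.device)
            contrastive = (F.cross_entropy(logits, targets)
                           + F.cross_entropy(logits.t(), targets)) / 2
            return recon + lam * contrastive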

    Integrating with LLMs: Two Powerful Paths to Multi-Modality

    Once we have a contrastively-trained VQ-AE, we can use its output to give LLMs the ability to see and hear. There are two primary strategies for this.

    Path 1: The Tokenizer Approach - Teaching the LLM a New Language

    This path treats the RVQ IDs as a new vocabulary. It’s a two-stage process ideal for high-fidelity content generation.

    1. Create a Neural Codec: The trained VQ-AE serves as a powerful “codec.” You can take any piece of media (e.g., a song) and use the codec to compress it into a sequence of discrete RVQ tokens (e.g., [8, 5, 4, 1, 8, 5, 9, 2, ...]).
    2. Train a Generative LLM: A new Transformer model is trained auto-regressively on a massive dataset of these media-derived tokens. Its sole purpose is to learn the patterns and predict the next token in a sequence.

    Use Case: This is the architecture behind models like Meta’s MusicGen. A user provides a text prompt, which conditions the Transformer to generate a new sequence of RVQ tokens. These tokens are then fed to the VQ-AE’s decoder to synthesize the final audio waveform.
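
    The generation loop itself is ordinary autoregressive sampling over RVQ tokens. In the sketch below, lm and codec are hypothetical stand-ins for the trained Transformer (returning next-token logits) and the VQ-AE codec:

        import torch

        @torch.no_grad()
        def generate_media(lm, codec, prompt_ids, num_tokens=256, temp=1.0):
            tokens = prompt_ids                        # (1, T) conditioning
            for _ in range(num_tokens):
                logits = lm(tokens)[:, -1, :] / temp   # logits for next token
                probs = logits.softmax(dim=-1)
                next_tok = torch.multinomial(probs, num_samples=1)
                tokens = torch.cat([tokens, next_tok], dim=1)
            return codec.decode(tokens)                # RVQ tokens -> waveform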

    Path 2: The Adapter Approach - Translating for a Language Expert

    This path is used to augment a powerful, pre-trained, text-only LLM without the astronomical cost of retraining it.

    1. Freeze the LLM: A massive, pre-trained LLM (like LLaMA) is frozen. Its deep language understanding is preserved.
    2. Use the Pre-Quantized Embedding: Instead of using the discrete RVQ tokens, we take the rich, continuous embedding vector produced by our media encoder just before it enters the RVQ module.
    3. Train a Small Adapter: A small, lightweight projection layer (or “adapter”) is trained. Its only job is to translate the media embedding into a vector that has the same format and structure as the LLM’s own word embeddings. It learns to map visual concepts to their corresponding “word” concepts in the LLM’s latent space.

    Use Case: This is the principle behind models like DeepMind’s Flamingo. To answer a question about an image, the image is passed through the media encoder and adapter. The resulting “vision-as-a-word” vector is inserted into the prompt sequence alongside the text tokens. The frozen LLM can now “reason” about the visual input because it has been translated into a format it already understands.
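
    In code, the adapter can be as small as a two-layer projection; only its parameters are trained while the LLM and media encoder stay frozen. The sketch below is an illustration under those assumptions, not Flamingo’s actual architecture:

        import torch
        import torch.nn as nn

        class MediaAdapter(nn.Module):
            """Maps a media embedding into the LLM's word-embedding space."""
            def __init__(self, media_dim, llm_dim):
                super().__init__()
                self.proj = nn.Sequential(
                    nn.Linear(media_dim, llm_dim),
                    nn.GELU(),
                    nn.Linear(llm_dim, llm_dim),
                )

            def forward(self, media_emb):               # (batch, media_dim)
                return self.proj(media_emb)              # (batch, llm_dim)

        def splice_into_prompt(adapter, media_emb, text_token_embs):
            # Insert the "vision-as-a-word" vector ahead of the text tokens;
            # the frozen LLM then attends over the combined sequence.
            media_as_word = adapter(media_emb).unsqueeze(1)   # (B, 1, llm_dim)
            return torch.cat([media_as_word, text_token_embs], dim=1)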

    \ No newline at end of file diff --git a/posts/index.html b/posts/index.html index 1ebd185..27554ca 100644 --- a/posts/index.html +++ b/posts/index.html @@ -1,6 +1,7 @@ Posts · Eric X. Liu's Personal Page
    \ No newline at end of file +[c9ed800] \ No newline at end of file diff --git a/posts/index.xml b/posts/index.xml index 9bd04c8..4f6b254 100644 --- a/posts/index.xml +++ b/posts/index.xml @@ -1,4 +1,5 @@ -Posts on Eric X. Liu's Personal Page/posts/Recent content in Posts on Eric X. Liu's Personal PageHugoenMon, 04 Aug 2025 03:59:37 +0000Supabase Deep Dive: It's Not Magic, It's Just Postgres/posts/supabase-deep-dive/Sun, 03 Aug 2025 00:00:00 +0000/posts/supabase-deep-dive/<p>In the world of Backend-as-a-Service (BaaS), platforms are often treated as magic boxes. You push data in, you get data out, and you hope the magic inside scales. While this simplicity is powerful, it can obscure the underlying mechanics, leaving developers wondering what&rsquo;s really going on.</p> +Posts on Eric X. Liu's Personal Page/posts/Recent content in Posts on Eric X. Liu's Personal PageHugoenFri, 08 Aug 2025 17:36:52 +0000Beyond Words: How RVQ Teaches LLMs to See and Hear/posts/how-rvq-teaches-llms-to-see-and-hear/Thu, 07 Aug 2025 00:00:00 +0000/posts/how-rvq-teaches-llms-to-see-and-hear/<p>Large Language Models (LLMs) are masters of text, but the world is not made of text alone. It’s a symphony of sights, sounds, and experiences. The ultimate goal for AI is to understand this rich, multi-modal world as we do. But how do you teach a model that thinks in words to understand a picture of a sunset or the melody of a song?</p> +<p>The answer lies in creating a universal language—a bridge between the continuous, messy world of pixels and audio waves and the discrete, structured world of language tokens. One of the most elegant and powerful tools for building this bridge is <strong>Residual Vector Quantization (RVQ)</strong>.</p>Supabase Deep Dive: It's Not Magic, It's Just Postgres/posts/supabase-deep-dive/Sun, 03 Aug 2025 00:00:00 +0000/posts/supabase-deep-dive/<p>In the world of Backend-as-a-Service (BaaS), platforms are often treated as magic boxes. You push data in, you get data out, and you hope the magic inside scales. While this simplicity is powerful, it can obscure the underlying mechanics, leaving developers wondering what&rsquo;s really going on.</p> <p>Supabase enters this space with a radically different philosophy: <strong>transparency</strong>. It provides the convenience of a BaaS, but it’s built on the world&rsquo;s most trusted relational database: PostgreSQL. The &ldquo;magic&rdquo; isn&rsquo;t a proprietary black box; it&rsquo;s a carefully assembled suite of open-source tools that enhance Postgres, not hide it.</p>A Deep Dive into PPO for Language Models/posts/a-deep-dive-into-ppo-for-language-models/Sat, 02 Aug 2025 00:00:00 +0000/posts/a-deep-dive-into-ppo-for-language-models/<p>Large Language Models (LLMs) have demonstrated astonishing capabilities, but out-of-the-box, they are simply powerful text predictors. They don&rsquo;t inherently understand what makes a response helpful, harmless, or aligned with human values. The technique that has proven most effective at bridging this gap is Reinforcement Learning from Human Feedback (RLHF), and at its heart lies a powerful algorithm: Proximal Policy Optimization (PPO).</p> <p>You may have seen diagrams like the one below, which outlines the RLHF training process. 
It can look intimidating, with a web of interconnected models, losses, and data flows.</p>Mixture-of-Experts (MoE) Models Challenges & Solutions in Practice/posts/mixture-of-experts-moe-models-challenges-solutions-in-practice/Wed, 02 Jul 2025 00:00:00 +0000/posts/mixture-of-experts-moe-models-challenges-solutions-in-practice/<p>Mixture-of-Experts (MoEs) are neural network architectures that allow different parts of the model (called &ldquo;experts&rdquo;) to specialize in different types of inputs. A &ldquo;gating network&rdquo; or &ldquo;router&rdquo; learns to dispatch each input (or &ldquo;token&rdquo;) to a subset of these experts. While powerful for scaling models, MoEs introduce several practical challenges.</p> <h3 id="1-challenge-non-differentiability-of-routing-functions"> diff --git a/posts/mixture-of-experts-moe-models-challenges-solutions-in-practice/index.html b/posts/mixture-of-experts-moe-models-challenges-solutions-in-practice/index.html index 4a52517..1f5aeb7 100644 --- a/posts/mixture-of-experts-moe-models-challenges-solutions-in-practice/index.html +++ b/posts/mixture-of-experts-moe-models-challenges-solutions-in-practice/index.html @@ -44,4 +44,4 @@ The Top-K routing mechanism, as illustrated in the provided ima 2016 - 2025 Eric X. Liu -[c25cd89] \ No newline at end of file +[c9ed800] \ No newline at end of file diff --git a/posts/supabase-deep-dive/index.html b/posts/supabase-deep-dive/index.html index 0fa251f..891faac 100644 --- a/posts/supabase-deep-dive/index.html +++ b/posts/supabase-deep-dive/index.html @@ -90,4 +90,4 @@ Supabase enters this space with a radically different philosophy: transparency. 2016 - 2025 Eric X. Liu -[c25cd89] \ No newline at end of file +[c9ed800] \ No newline at end of file diff --git a/posts/t5-the-transformer-that-zigged-when-others-zagged-an-architectural-deep-dive/index.html b/posts/t5-the-transformer-that-zigged-when-others-zagged-an-architectural-deep-dive/index.html index 57dc12a..414888f 100644 --- a/posts/t5-the-transformer-that-zigged-when-others-zagged-an-architectural-deep-dive/index.html +++ b/posts/t5-the-transformer-that-zigged-when-others-zagged-an-architectural-deep-dive/index.html @@ -30,4 +30,4 @@ But to truly understand the field, we must look at the pivotal models that explo 2016 - 2025 Eric X. Liu -[c25cd89] \ No newline at end of file +[c9ed800] \ No newline at end of file diff --git a/posts/useful/index.html b/posts/useful/index.html index e3f6752..aaf65f8 100644 --- a/posts/useful/index.html +++ b/posts/useful/index.html @@ -9,4 +9,4 @@ One-minute read
    • [c25cd89] \ No newline at end of file +[c9ed800] \ No newline at end of file diff --git a/sitemap.xml b/sitemap.xml index 79d93ac..5c9c927 100644 --- a/sitemap.xml +++ b/sitemap.xml @@ -1 +1 @@ -/2025-08-04T03:59:37+00:00weekly0.5/posts/2025-08-04T03:59:37+00:00weekly0.5/posts/supabase-deep-dive/2025-08-04T03:59:37+00:00weekly0.5/posts/a-deep-dive-into-ppo-for-language-models/2025-08-03T03:28:39+00:00weekly0.5/posts/mixture-of-experts-moe-models-challenges-solutions-in-practice/2025-08-03T06:02:48+00:00weekly0.5/posts/t5-the-transformer-that-zigged-when-others-zagged-an-architectural-deep-dive/2025-08-03T03:41:10+00:00weekly0.5/posts/espresso-theory-application-a-guide-for-the-breville-barista-pro/2025-08-03T04:20:20+00:00weekly0.5/posts/useful/2025-08-03T08:37:28-07:00weekly0.5/about/2020-06-16T23:30:17-07:00weekly0.5/categories/weekly0.5/tags/weekly0.5 \ No newline at end of file +/posts/how-rvq-teaches-llms-to-see-and-hear/2025-08-08T17:36:52+00:00weekly0.5/2025-08-08T17:36:52+00:00weekly0.5/posts/2025-08-08T17:36:52+00:00weekly0.5/posts/supabase-deep-dive/2025-08-04T03:59:37+00:00weekly0.5/posts/a-deep-dive-into-ppo-for-language-models/2025-08-03T03:28:39+00:00weekly0.5/posts/mixture-of-experts-moe-models-challenges-solutions-in-practice/2025-08-03T06:02:48+00:00weekly0.5/posts/t5-the-transformer-that-zigged-when-others-zagged-an-architectural-deep-dive/2025-08-03T03:41:10+00:00weekly0.5/posts/espresso-theory-application-a-guide-for-the-breville-barista-pro/2025-08-03T04:20:20+00:00weekly0.5/posts/useful/2025-08-03T08:37:28-07:00weekly0.5/about/2020-06-16T23:30:17-07:00weekly0.5/categories/weekly0.5/tags/weekly0.5 \ No newline at end of file diff --git a/tags/index.html b/tags/index.html index 99c2c27..00fb3a6 100644 --- a/tags/index.html +++ b/tags/index.html @@ -4,4 +4,4 @@ 2016 - 2025 Eric X. Liu -[c25cd89] \ No newline at end of file +[c9ed800] \ No newline at end of file