diff --git a/404.html b/404.html
index 5f4f793..20e0bf2 100644
--- a/404.html
+++ b/404.html
@@ -4,4 +4,4 @@
2016 - 2025 Eric X. Liu
-[84b3c20]
\ No newline at end of file
+[e15a7ff]
\ No newline at end of file
diff --git a/about/index.html b/about/index.html
index 0b0cd05..d1a9853 100644
--- a/about/index.html
+++ b/about/index.html
@@ -4,4 +4,4 @@
2016 - 2025 Eric X. Liu
-[84b3c20]
\ No newline at end of file
+[e15a7ff]
\ No newline at end of file
diff --git a/categories/index.html b/categories/index.html
index ed3b24a..41ac209 100644
--- a/categories/index.html
+++ b/categories/index.html
@@ -4,4 +4,4 @@
2016 - 2025 Eric X. Liu
-[84b3c20]
\ No newline at end of file
+[e15a7ff]
\ No newline at end of file
diff --git a/index.html b/index.html
index 95e5f1f..fd50c59 100644
--- a/index.html
+++ b/index.html
@@ -4,4 +4,4 @@
2016 - 2025 Eric X. Liu
-[84b3c20]
\ No newline at end of file
+[e15a7ff]
\ No newline at end of file
diff --git a/index.xml b/index.xml
index 1a01f80..e6c9b0b 100644
--- a/index.xml
+++ b/index.xml
@@ -1,5 +1,5 @@
-Eric X. Liu's Personal Page/Recent content on Eric X. Liu's Personal PageHugoenSun, 03 Aug 2025 03:15:05 +0000A Deep Dive into PPO for Language Models/posts/a-deep-dive-into-ppo-for-language-models/Sun, 03 Aug 2025 03:14:20 +0000/posts/a-deep-dive-into-ppo-for-language-models/<p>Large Language Models (LLMs) have demonstrated astonishing capabilities, but out-of-the-box, they are simply powerful text predictors. They don&rsquo;t inherently understand what makes a response helpful, harmless, or aligned with human values. The technique that has proven most effective at bridging this gap is Reinforcement Learning from Human Feedback (RLHF), and at its heart lies a powerful algorithm: Proximal Policy Optimization (PPO).</p>
-<p>You may have seen diagrams like the one below, which outlines the RLHF training process. It can look intimidating, with a web of interconnected models, losses, and data flows.</p>Mixture-of-Experts (MoE) Models Challenges & Solutions in Practice/posts/mixture-of-experts-moe-models-challenges-solutions-in-practice/Sun, 03 Aug 2025 03:14:20 +0000/posts/mixture-of-experts-moe-models-challenges-solutions-in-practice/<p>Mixture-of-Experts (MoEs) are neural network architectures that allow different parts of the model (called &ldquo;experts&rdquo;) to specialize in different types of inputs. A &ldquo;gating network&rdquo; or &ldquo;router&rdquo; learns to dispatch each input (or &ldquo;token&rdquo;) to a subset of these experts. While powerful for scaling models, MoEs introduce several practical challenges.</p>
+Eric X. Liu's Personal Page/Recent content on Eric X. Liu's Personal PageHugoenSun, 03 Aug 2025 03:17:01 +0000A Deep Dive into PPO for Language Models/posts/a-deep-dive-into-ppo-for-language-models/Sun, 03 Aug 2025 03:14:23 +0000/posts/a-deep-dive-into-ppo-for-language-models/<p>Large Language Models (LLMs) have demonstrated astonishing capabilities, but out-of-the-box, they are simply powerful text predictors. They don&rsquo;t inherently understand what makes a response helpful, harmless, or aligned with human values. The technique that has proven most effective at bridging this gap is Reinforcement Learning from Human Feedback (RLHF), and at its heart lies a powerful algorithm: Proximal Policy Optimization (PPO).</p>
+<p>You may have seen diagrams like the one below, which outlines the RLHF training process. It can look intimidating, with a web of interconnected models, losses, and data flows.</p>Mixture-of-Experts (MoE) Models Challenges & Solutions in Practice/posts/mixture-of-experts-moe-models-challenges-solutions-in-practice/Sun, 03 Aug 2025 03:14:23 +0000/posts/mixture-of-experts-moe-models-challenges-solutions-in-practice/<p>Mixture-of-Experts (MoEs) are neural network architectures that allow different parts of the model (called &ldquo;experts&rdquo;) to specialize in different types of inputs. A &ldquo;gating network&rdquo; or &ldquo;router&rdquo; learns to dispatch each input (or &ldquo;token&rdquo;) to a subset of these experts. While powerful for scaling models, MoEs introduce several practical challenges.</p>
<h3 id="1-challenge-non-differentiability-of-routing-functions">
1. Challenge: Non-Differentiability of Routing Functions
<a class="heading-link" href="#1-challenge-non-differentiability-of-routing-functions">
@@ -8,7 +8,7 @@
</a>
</h3>
<p><strong>The Problem:</strong>
-Many routing mechanisms, especially &ldquo;Top-K routing,&rdquo; involve a discrete, hard selection process. A common function is <code>KeepTopK(v, k)</code>, which selects the top <code>k</code> scoring elements from a vector <code>v</code> and sets others to $-\infty$ or $0$.</p>T5 - The Transformer That Zigged When Others Zagged - An Architectural Deep Dive/posts/t5-the-transformer-that-zigged-when-others-zagged-an-architectural-deep-dive/Sun, 03 Aug 2025 03:14:20 +0000/posts/t5-the-transformer-that-zigged-when-others-zagged-an-architectural-deep-dive/<p>In the rapidly evolving landscape of Large Language Models, a few key architectures define the dominant paradigms. Today, the &ldquo;decoder-only&rdquo; model, popularized by the GPT series and its successors like LLaMA and Mistral, reigns supreme. These models are scaled to incredible sizes and excel at in-context learning.</p>
+Many routing mechanisms, especially &ldquo;Top-K routing,&rdquo; involve a discrete, hard selection process. A common function is <code>KeepTopK(v, k)</code>, which selects the top <code>k</code> scoring elements from a vector <code>v</code> and sets others to $-\infty$ or $0$.</p>T5 - The Transformer That Zigged When Others Zagged - An Architectural Deep Dive/posts/t5-the-transformer-that-zigged-when-others-zagged-an-architectural-deep-dive/Sun, 03 Aug 2025 03:14:23 +0000/posts/t5-the-transformer-that-zigged-when-others-zagged-an-architectural-deep-dive/<p>In the rapidly evolving landscape of Large Language Models, a few key architectures define the dominant paradigms. Today, the &ldquo;decoder-only&rdquo; model, popularized by the GPT series and its successors like LLaMA and Mistral, reigns supreme. These models are scaled to incredible sizes and excel at in-context learning.</p>
<p>But to truly understand the field, we must look at the pivotal models that explored different paths. Google&rsquo;s T5, or <strong>Text-to-Text Transfer Transformer</strong>, stands out as one of the most influential. It didn&rsquo;t just introduce a new model; it proposed a new philosophy. This article dives deep into the architecture of T5, how it fundamentally differs from modern LLMs, and the lasting legacy of its unique design choices.</p>Some useful files/posts/useful/Mon, 26 Oct 2020 04:14:43 +0000/posts/useful/<ul>
<li><a href="https://ericxliu.me/rootCA.pem" class="external-link" target="_blank" rel="noopener">rootCA.pem</a></li>
<li><a href="https://ericxliu.me/vpnclient.ovpn" class="external-link" target="_blank" rel="noopener">vpnclient.ovpn</a></li>
diff --git a/posts/a-deep-dive-into-ppo-for-language-models/index.html b/posts/a-deep-dive-into-ppo-for-language-models/index.html
index b9284d0..02b073c 100644
--- a/posts/a-deep-dive-into-ppo-for-language-models/index.html
+++ b/posts/a-deep-dive-into-ppo-for-language-models/index.html
@@ -1,10 +1,10 @@
A Deep Dive into PPO for Language Models · Eric X. Liu's Personal Page
\ No newline at end of file
diff --git a/posts/index.html b/posts/index.html
index bf0b567..9656577 100644
--- a/posts/index.html
+++ b/posts/index.html
@@ -8,4 +8,4 @@
2016 - 2025 Eric X. Liu
-[84b3c20]
\ No newline at end of file
+[e15a7ff]
\ No newline at end of file
diff --git a/posts/index.xml b/posts/index.xml
index 163f28a..9bfbb08 100644
--- a/posts/index.xml
+++ b/posts/index.xml
@@ -1,5 +1,5 @@
-Posts on Eric X. Liu's Personal Page/posts/Recent content in Posts on Eric X. Liu's Personal PageHugoenSun, 03 Aug 2025 03:15:05 +0000A Deep Dive into PPO for Language Models/posts/a-deep-dive-into-ppo-for-language-models/Sun, 03 Aug 2025 03:14:20 +0000/posts/a-deep-dive-into-ppo-for-language-models/<p>Large Language Models (LLMs) have demonstrated astonishing capabilities, but out-of-the-box, they are simply powerful text predictors. They don&rsquo;t inherently understand what makes a response helpful, harmless, or aligned with human values. The technique that has proven most effective at bridging this gap is Reinforcement Learning from Human Feedback (RLHF), and at its heart lies a powerful algorithm: Proximal Policy Optimization (PPO).</p>
-<p>You may have seen diagrams like the one below, which outlines the RLHF training process. It can look intimidating, with a web of interconnected models, losses, and data flows.</p>Mixture-of-Experts (MoE) Models Challenges & Solutions in Practice/posts/mixture-of-experts-moe-models-challenges-solutions-in-practice/Sun, 03 Aug 2025 03:14:20 +0000/posts/mixture-of-experts-moe-models-challenges-solutions-in-practice/<p>Mixture-of-Experts (MoEs) are neural network architectures that allow different parts of the model (called &ldquo;experts&rdquo;) to specialize in different types of inputs. A &ldquo;gating network&rdquo; or &ldquo;router&rdquo; learns to dispatch each input (or &ldquo;token&rdquo;) to a subset of these experts. While powerful for scaling models, MoEs introduce several practical challenges.</p>
+Posts on Eric X. Liu's Personal Page/posts/Recent content in Posts on Eric X. Liu's Personal PageHugoenSun, 03 Aug 2025 03:17:01 +0000A Deep Dive into PPO for Language Models/posts/a-deep-dive-into-ppo-for-language-models/Sun, 03 Aug 2025 03:14:23 +0000/posts/a-deep-dive-into-ppo-for-language-models/<p>Large Language Models (LLMs) have demonstrated astonishing capabilities, but out-of-the-box, they are simply powerful text predictors. They don&rsquo;t inherently understand what makes a response helpful, harmless, or aligned with human values. The technique that has proven most effective at bridging this gap is Reinforcement Learning from Human Feedback (RLHF), and at its heart lies a powerful algorithm: Proximal Policy Optimization (PPO).</p>
+<p>You may have seen diagrams like the one below, which outlines the RLHF training process. It can look intimidating, with a web of interconnected models, losses, and data flows.</p>Mixture-of-Experts (MoE) Models Challenges & Solutions in Practice/posts/mixture-of-experts-moe-models-challenges-solutions-in-practice/Sun, 03 Aug 2025 03:14:23 +0000/posts/mixture-of-experts-moe-models-challenges-solutions-in-practice/<p>Mixture-of-Experts (MoEs) are neural network architectures that allow different parts of the model (called &ldquo;experts&rdquo;) to specialize in different types of inputs. A &ldquo;gating network&rdquo; or &ldquo;router&rdquo; learns to dispatch each input (or &ldquo;token&rdquo;) to a subset of these experts. While powerful for scaling models, MoEs introduce several practical challenges.</p>
<h3 id="1-challenge-non-differentiability-of-routing-functions">
1. Challenge: Non-Differentiability of Routing Functions
<a class="heading-link" href="#1-challenge-non-differentiability-of-routing-functions">
@@ -8,7 +8,7 @@
</a>
</h3>
<p><strong>The Problem:</strong>
-Many routing mechanisms, especially &ldquo;Top-K routing,&rdquo; involve a discrete, hard selection process. A common function is <code>KeepTopK(v, k)</code>, which selects the top <code>k</code> scoring elements from a vector <code>v</code> and sets others to $-\infty$ or $0$.</p>T5 - The Transformer That Zigged When Others Zagged - An Architectural Deep Dive/posts/t5-the-transformer-that-zigged-when-others-zagged-an-architectural-deep-dive/Sun, 03 Aug 2025 03:14:20 +0000/posts/t5-the-transformer-that-zigged-when-others-zagged-an-architectural-deep-dive/<p>In the rapidly evolving landscape of Large Language Models, a few key architectures define the dominant paradigms. Today, the &ldquo;decoder-only&rdquo; model, popularized by the GPT series and its successors like LLaMA and Mistral, reigns supreme. These models are scaled to incredible sizes and excel at in-context learning.</p>
+Many routing mechanisms, especially &ldquo;Top-K routing,&rdquo; involve a discrete, hard selection process. A common function is <code>KeepTopK(v, k)</code>, which selects the top <code>k</code> scoring elements from a vector <code>v</code> and sets others to $-\infty$ or $0$.</p>T5 - The Transformer That Zigged When Others Zagged - An Architectural Deep Dive/posts/t5-the-transformer-that-zigged-when-others-zagged-an-architectural-deep-dive/Sun, 03 Aug 2025 03:14:23 +0000/posts/t5-the-transformer-that-zigged-when-others-zagged-an-architectural-deep-dive/<p>In the rapidly evolving landscape of Large Language Models, a few key architectures define the dominant paradigms. Today, the &ldquo;decoder-only&rdquo; model, popularized by the GPT series and its successors like LLaMA and Mistral, reigns supreme. These models are scaled to incredible sizes and excel at in-context learning.</p>
<p>But to truly understand the field, we must look at the pivotal models that explored different paths. Google&rsquo;s T5, or <strong>Text-to-Text Transfer Transformer</strong>, stands out as one of the most influential. It didn&rsquo;t just introduce a new model; it proposed a new philosophy. This article dives deep into the architecture of T5, how it fundamentally differs from modern LLMs, and the lasting legacy of its unique design choices.</p>Some useful files/posts/useful/Mon, 26 Oct 2020 04:14:43 +0000/posts/useful/<ul>
<li><a href="https://ericxliu.me/rootCA.pem" class="external-link" target="_blank" rel="noopener">rootCA.pem</a></li>
<li><a href="https://ericxliu.me/vpnclient.ovpn" class="external-link" target="_blank" rel="noopener">vpnclient.ovpn</a></li>
diff --git a/posts/mixture-of-experts-moe-models-challenges-solutions-in-practice/index.html b/posts/mixture-of-experts-moe-models-challenges-solutions-in-practice/index.html
index 77ac991..9b5e4b2 100644
--- a/posts/mixture-of-experts-moe-models-challenges-solutions-in-practice/index.html
+++ b/posts/mixture-of-experts-moe-models-challenges-solutions-in-practice/index.html
@@ -9,10 +9,10 @@
The Problem: Many routing mechanisms, especially “Top-K routing,” involve a discrete, hard selection process. A common function is KeepTopK(v, k), which selects the top k scoring elements from a vector v and sets others to $-\infty$ or $0$.">
\ No newline at end of file
diff --git a/posts/t5-the-transformer-that-zigged-when-others-zagged-an-architectural-deep-dive/index.html b/posts/t5-the-transformer-that-zigged-when-others-zagged-an-architectural-deep-dive/index.html
index 7fadae3..2f39c66 100644
--- a/posts/t5-the-transformer-that-zigged-when-others-zagged-an-architectural-deep-dive/index.html
+++ b/posts/t5-the-transformer-that-zigged-when-others-zagged-an-architectural-deep-dive/index.html
@@ -1,10 +1,10 @@
T5 - The Transformer That Zigged When Others Zagged - An Architectural Deep Dive · Eric X. Liu's Personal Page
\ No newline at end of file
diff --git a/posts/useful/index.html b/posts/useful/index.html
index 6665160..aaa4bd8 100644
--- a/posts/useful/index.html
+++ b/posts/useful/index.html
@@ -10,4 +10,4 @@
One-minute read
  • [84b3c20]
\ No newline at end of file
+[e15a7ff]
\ No newline at end of file
diff --git a/sitemap.xml b/sitemap.xml
index 0f30bd6..7b09a31 100644
--- a/sitemap.xml
+++ b/sitemap.xml
@@ -1 +1 @@
-/posts/a-deep-dive-into-ppo-for-language-models/2025-08-03T03:15:05+00:00weekly0.5/2025-08-03T03:15:05+00:00weekly0.5/posts/mixture-of-experts-moe-models-challenges-solutions-in-practice/2025-08-03T03:15:05+00:00weekly0.5/posts/2025-08-03T03:15:05+00:00weekly0.5/posts/t5-the-transformer-that-zigged-when-others-zagged-an-architectural-deep-dive/2025-08-03T03:15:05+00:00weekly0.5/posts/useful/2020-10-26T04:47:36+00:00weekly0.5/about/2020-06-16T23:30:17-07:00weekly0.5/categories/weekly0.5/tags/weekly0.5
\ No newline at end of file
+/posts/a-deep-dive-into-ppo-for-language-models/2025-08-03T03:17:01+00:00weekly0.5/2025-08-03T03:17:01+00:00weekly0.5/posts/mixture-of-experts-moe-models-challenges-solutions-in-practice/2025-08-03T03:17:01+00:00weekly0.5/posts/2025-08-03T03:17:01+00:00weekly0.5/posts/t5-the-transformer-that-zigged-when-others-zagged-an-architectural-deep-dive/2025-08-03T03:17:01+00:00weekly0.5/posts/useful/2020-10-26T04:47:36+00:00weekly0.5/about/2020-06-16T23:30:17-07:00weekly0.5/categories/weekly0.5/tags/weekly0.5
\ No newline at end of file
diff --git a/tags/index.html b/tags/index.html
index 05b1bd1..3428fa8 100644
--- a/tags/index.html
+++ b/tags/index.html
@@ -4,4 +4,4 @@
2016 - 2025 Eric X. Liu
-[84b3c20]
\ No newline at end of file
+[e15a7ff]
\ No newline at end of file