diff --git a/404.html b/404.html index 91b4f21..e9fbd70 100644 --- a/404.html +++ b/404.html @@ -4,4 +4,4 @@ 2016 - 2025 Eric X. Liu -[5706ff7] \ No newline at end of file +[eba296f] \ No newline at end of file diff --git a/about/index.html b/about/index.html index 7a63b22..4097fbb 100644 --- a/about/index.html +++ b/about/index.html @@ -1,7 +1,7 @@ About · Eric X. Liu's Personal Page
\ No newline at end of file +[eba296f] \ No newline at end of file diff --git a/categories/index.html b/categories/index.html index f13e225..a25e4bd 100644 --- a/categories/index.html +++ b/categories/index.html @@ -4,4 +4,4 @@ 2016 - 2025 Eric X. Liu -[5706ff7] \ No newline at end of file +[eba296f] \ No newline at end of file diff --git a/index.html b/index.html index 4034f50..7ec884a 100644 --- a/index.html +++ b/index.html @@ -4,4 +4,4 @@ 2016 - 2025 Eric X. Liu -[5706ff7] \ No newline at end of file +[eba296f] \ No newline at end of file diff --git a/index.xml b/index.xml index d8dec86..680d753 100644 --- a/index.xml +++ b/index.xml @@ -1,4 +1,4 @@ -Eric X. Liu's Personal Page/Recent content on Eric X. Liu's Personal PageHugoenSun, 03 Aug 2025 04:20:20 +0000A Deep Dive into PPO for Language Models/posts/a-deep-dive-into-ppo-for-language-models/Sat, 02 Aug 2025 00:00:00 +0000/posts/a-deep-dive-into-ppo-for-language-models/<p>Large Language Models (LLMs) have demonstrated astonishing capabilities, but out-of-the-box, they are simply powerful text predictors. They don&rsquo;t inherently understand what makes a response helpful, harmless, or aligned with human values. The technique that has proven most effective at bridging this gap is Reinforcement Learning from Human Feedback (RLHF), and at its heart lies a powerful algorithm: Proximal Policy Optimization (PPO).</p> +Eric X. Liu's Personal Page/Recent content on Eric X. Liu's Personal PageHugoenSun, 03 Aug 2025 06:02:48 +0000A Deep Dive into PPO for Language Models/posts/a-deep-dive-into-ppo-for-language-models/Sat, 02 Aug 2025 00:00:00 +0000/posts/a-deep-dive-into-ppo-for-language-models/<p>Large Language Models (LLMs) have demonstrated astonishing capabilities, but out-of-the-box, they are simply powerful text predictors. They don&rsquo;t inherently understand what makes a response helpful, harmless, or aligned with human values. The technique that has proven most effective at bridging this gap is Reinforcement Learning from Human Feedback (RLHF), and at its heart lies a powerful algorithm: Proximal Policy Optimization (PPO).</p> <p>You may have seen diagrams like the one below, which outlines the RLHF training process. It can look intimidating, with a web of interconnected models, losses, and data flows.</p>Mixture-of-Experts (MoE) Models Challenges & Solutions in Practice/posts/mixture-of-experts-moe-models-challenges-solutions-in-practice/Wed, 02 Jul 2025 00:00:00 +0000/posts/mixture-of-experts-moe-models-challenges-solutions-in-practice/<p>Mixture-of-Experts (MoEs) are neural network architectures that allow different parts of the model (called &ldquo;experts&rdquo;) to specialize in different types of inputs. A &ldquo;gating network&rdquo; or &ldquo;router&rdquo; learns to dispatch each input (or &ldquo;token&rdquo;) to a subset of these experts. While powerful for scaling models, MoEs introduce several practical challenges.</p> <h3 id="1-challenge-non-differentiability-of-routing-functions"> 1. Challenge: Non-Differentiability of Routing Functions @@ -8,7 +8,7 @@ </a> </h3> <p><strong>The Problem:</strong> -Many routing mechanisms, especially &ldquo;Top-K routing,&rdquo; involve a discrete, hard selection process. 
A common function is <code>KeepTopK(v, k)</code>, which selects the top <code>k</code> scoring elements from a vector <code>v</code> and sets others to (-\infty) or (0).</p>An Architectural Deep Dive of T5/posts/t5-the-transformer-that-zigged-when-others-zagged-an-architectural-deep-dive/Sun, 01 Jun 2025 00:00:00 +0000/posts/t5-the-transformer-that-zigged-when-others-zagged-an-architectural-deep-dive/<p>In the rapidly evolving landscape of Large Language Models, a few key architectures define the dominant paradigms. Today, the &ldquo;decoder-only&rdquo; model, popularized by the GPT series and its successors like LLaMA and Mistral, reigns supreme. These models are scaled to incredible sizes and excel at in-context learning.</p> +Many routing mechanisms, especially &ldquo;Top-K routing,&rdquo; involve a discrete, hard selection process. A common function is <code>KeepTopK(v, k)</code>, which selects the top <code>k</code> scoring elements from a vector <code>v</code> and sets others to $-\infty$ or $0$.</p>An Architectural Deep Dive of T5/posts/t5-the-transformer-that-zigged-when-others-zagged-an-architectural-deep-dive/Sun, 01 Jun 2025 00:00:00 +0000/posts/t5-the-transformer-that-zigged-when-others-zagged-an-architectural-deep-dive/<p>In the rapidly evolving landscape of Large Language Models, a few key architectures define the dominant paradigms. Today, the &ldquo;decoder-only&rdquo; model, popularized by the GPT series and its successors like LLaMA and Mistral, reigns supreme. These models are scaled to incredible sizes and excel at in-context learning.</p> <p>But to truly understand the field, we must look at the pivotal models that explored different paths. Google&rsquo;s T5, or <strong>Text-to-Text Transfer Transformer</strong>, stands out as one of the most influential. It didn&rsquo;t just introduce a new model; it proposed a new philosophy. This article dives deep into the architecture of T5, how it fundamentally differs from modern LLMs, and the lasting legacy of its unique design choices.</p>Mastering Your Breville Barista Pro: The Ultimate Guide to Dialing In Espresso/posts/espresso-theory-application-a-guide-for-the-breville-barista-pro/Thu, 01 May 2025 00:00:00 +0000/posts/espresso-theory-application-a-guide-for-the-breville-barista-pro/<p>Are you ready to transform your home espresso game from good to genuinely great? The Breville Barista Pro is a fantastic machine, but unlocking its full potential requires understanding a few key principles. This guide will walk you through the systematic process of dialing in your espresso, ensuring every shot is delicious and repeatable.</p> <p>Our overarching philosophy is simple: <strong>isolate and change only one variable at a time.</strong> While numbers are crucial, your palate is the ultimate judge. Dose, ratio, and time are interconnected, but your <strong>grind size</strong> is your most powerful lever.</p>Some useful files/posts/useful/Mon, 26 Oct 2020 04:14:43 +0000/posts/useful/<ul> <li><a href="https://ericxliu.me/rootCA.pem" class="external-link" target="_blank" rel="noopener">rootCA.pem</a></li> diff --git a/posts/a-deep-dive-into-ppo-for-language-models/index.html b/posts/a-deep-dive-into-ppo-for-language-models/index.html index 91fe7b7..e3449b7 100644 --- a/posts/a-deep-dive-into-ppo-for-language-models/index.html +++ b/posts/a-deep-dive-into-ppo-for-language-models/index.html @@ -19,8 +19,8 @@ where δ_t = r_t + γV(s_{t+1}) - V(s_t)

  • γ (gam Link to heading

    There’s one final problem. If we only optimize for the PPO loss, the model might learn to “hack” the reward model by generating repetitive or nonsensical text that gets a high score. In doing so, it could suffer from catastrophic forgetting, losing its fundamental grasp of grammar and facts.

    To prevent this, we introduce a second loss term. As seen in the diagram, we mix in data from the original Pretraining Data (or the dataset used for Supervised Fine-Tuning). We calculate a standard next-token prediction loss (LM Loss) on this high-quality data.

    The final loss for the Actor is a combination of both objectives:

    Total Loss = Loss_PPO + λ_ptx * Loss_LM

    This brilliantly balances two goals:

    1. The Loss_PPO pushes the model towards behaviors that align with human preferences.
    2. The Loss_LM acts as a regularizer, pulling the model back towards its core language capabilities and preventing it from drifting into gibberish.
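    A minimal sketch of this combination in code, assuming the clipped PPO surrogate loss and the next-token LM loss have already been computed elsewhere; the function and the `ptx_coef` weight are illustrative names, not a reference implementation:

```python
import torch

def combined_actor_loss(ppo_clip_loss: torch.Tensor,
                        lm_loss: torch.Tensor,
                        ptx_coef: float = 0.05) -> torch.Tensor:
    """Mix the PPO objective with a pretraining-style next-token loss.

    ppo_clip_loss : clipped surrogate loss from the RLHF rollout batch
    lm_loss       : cross-entropy on a batch of pretraining / SFT text
    ptx_coef      : the lambda_ptx weight from the formula above (illustrative value)
    """
    return ppo_clip_loss + ptx_coef * lm_loss

# Usage sketch: loss = combined_actor_loss(loss_ppo, loss_lm); loss.backward()
```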

    The Full Training Loop -Link to heading

    Now, we can assemble the entire process into a clear, iterative loop:

    1. Collect: The current Actor policy π_k generates responses to a batch of prompts. These experiences—(state, action, probability, reward, value)—are stored in an Experience Buffer.
    2. Calculate: Once the buffer is full, we use the collected data to compute the advantage estimates Â_t for every single token-generation step.
    3. Optimize: For a few epochs, we repeatedly sample mini-batches from the buffer and update the Actor and Critic models. The Actor is updated using the combined PPO-clip Loss and LM Loss. The Critic is updated to improve its value predictions.
    4. Flush and Repeat: After the optimization phase, the entire experience buffer is discarded. The data is now “stale” because our policy has changed. The newly updated policy π_{k+1} becomes the new Actor, and we return to step 1 to collect fresh data.
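    Step 2, the advantage calculation, is the part that is easiest to get wrong in code. Below is a small, self-contained sketch of Generalized Advantage Estimation over a single response, assuming per-token rewards and critic values have already been collected; names, shapes, and default hyperparameters are illustrative:

```python
import torch

def compute_gae(rewards: torch.Tensor,
                values: torch.Tensor,
                gamma: float = 1.0,
                lam: float = 0.95) -> torch.Tensor:
    """Generalized Advantage Estimation over one response of T tokens.

    rewards : shape (T,)   per-token rewards (often zero except at the final token)
    values  : shape (T+1,) critic values V(s_t), with a bootstrap value appended
    Returns advantages of shape (T,), computed from
        delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
        A_t     = delta_t + gamma * lam * A_{t+1}
    """
    T = rewards.shape[0]
    advantages = torch.zeros(T)
    last_adv = 0.0
    for t in reversed(range(T)):
        delta = rewards[t] + gamma * values[t + 1] - values[t]
        last_adv = delta + gamma * lam * last_adv
        advantages[t] = last_adv
    return advantages

# Example: compute_gae(torch.tensor([0., 0., 1.0]), torch.tensor([0.2, 0.3, 0.5, 0.0]))
```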

    This cycle of collection and optimization allows the language model to gradually and safely steer its behavior towards human-defined goals, creating the helpful and aligned AI assistants we interact with today.


    References:

    1. Schulman, J., Wolski, F., Dhariwal, P., Radford, A., & Klimov, O. (2017). Proximal Policy Optimization Algorithms. arXiv preprint arXiv:1707.06347.
    2. Schulman, J., Moritz, P., Levine, S., Jordan, M., & Abbeel, P. (2015). High-Dimensional Continuous Control Using Generalized Advantage Estimation. arXiv preprint arXiv:1506.02438.
    3. Ouyang, L., et al. (2022). Training language models to follow instructions with human feedback. Advances in Neural Information Processing Systems 35.
    \ No newline at end of file +[eba296f]
    \ No newline at end of file diff --git a/posts/espresso-theory-application-a-guide-for-the-breville-barista-pro/index.html b/posts/espresso-theory-application-a-guide-for-the-breville-barista-pro/index.html index d33254c..4e53faa 100644 --- a/posts/espresso-theory-application-a-guide-for-the-breville-barista-pro/index.html +++ b/posts/espresso-theory-application-a-guide-for-the-breville-barista-pro/index.html @@ -16,8 +16,8 @@ Our overarching philosophy is simple: isolate and change only one variable at a Link to heading

    This is where the magic (and sometimes frustration) happens. Grind size is your main tool for controlling the resistance of the coffee puck, which directly dictates your brew time.

    The Dual Impact of Grinding Finer:

    1. Increases surface area: Allows for more efficient flavor extraction.
    2. Increases resistance: Slows down water flow and increases contact time.

    The Risk of Grinding Too Fine (Channeling):

    If the grind is too fine, the puck becomes so dense that high-pressure water can’t flow evenly. Instead, it “breaks” the puck and punches an easy path (a channel) through a weak spot. This results in a disastrous shot that is simultaneously:

    • Under-extracted: Most of the coffee is bypassed.
    • Over-extracted: The water that does flow blasts through the channel, extracting harsh, bitter compounds.
    • The Taste: A channeled shot tastes hollow, weak, sour, and bitter all at once.

    The Goal: You want to grind as fine as you possibly can without causing significant channeling. This is the sweet spot for maximizing surface area and resistance for high, even extraction.

    Grind Retention (Purging): Most grinders retain some old grounds. When you change your grind setting, always purge a few grams of coffee to ensure your dose is entirely at the new setting.

    Application for Your Breville Barista Pro:

    • Grinder Mechanism: The “Grind Amount” dial controls the TIME the grinder runs, not the weight. When you adjust the fineness, you must re-adjust the grind time to ensure you are still getting your target 18g dose.
    • Tackling Channeling: The Barista Pro is prone to channeling. To fight this, focus on excellent puck prep: use a WDT (Weiss Distribution Technique) tool to break up clumps and evenly distribute the grounds before tamping levelly.

    The Complete Dialing-In Workflow -Link to heading

    This systematic process will get you to a delicious shot from your Breville Barista Pro efficiently:

    1. Set Your Constants:
      • Dose: 18g.
      • Ratio: 1:2 (meaning a Yield of 36g).
      • Pre-infusion: Use a consistent method (e.g., manual 8-second hold).
    2. Make an Initial Grind:
      • Set the grinder to a starting point of 15.
      • Adjust the grind time until the grinder dispenses exactly 18g.
    3. Pull the First Shot:
      • Brew manually, stopping at 36g of liquid in the cup. Note the total brew time.
    4. Taste and Diagnose:
      • Fast & Sour? (<25s): Grind is too coarse.
      • Slow & Bitter? (>32s): Grind is too fine.
    5. Make ONE Adjustment - THE GRIND SIZE:
      • If fast/sour, adjust the grind finer (e.g., from 15 down to 13).
      • If slow/bitter, adjust the grind coarser (e.g., from 15 up to 17).
    6. Re-adjust and Repeat:
      • After changing the grind setting, purge a small amount of coffee.
      • Re-weigh your next dose and adjust the grind time to get back to exactly 18g.
      • Pull another 36g shot. Repeat this process until your shot tastes balanced and the time falls roughly between 25 and 32 seconds.
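    If it helps to see the diagnosis rules from steps 4 and 5 in one place, here is a tiny, purely illustrative sketch; the thresholds and targets simply mirror the numbers above, and your palate remains the final judge:

```python
def diagnose_shot(brew_time_s: float, dose_g: float = 18.0, ratio: float = 2.0) -> str:
    """Suggest the next adjustment from brew time alone (illustrative, not a substitute for taste)."""
    target_yield_g = dose_g * ratio  # 1:2 ratio -> 18 g in, 36 g out
    if brew_time_s < 25:
        return "Fast and likely sour: grind finer, purge, re-weigh the dose, pull again."
    if brew_time_s > 32:
        return "Slow and likely bitter: grind coarser, purge, re-weigh the dose, pull again."
    return f"{brew_time_s:.0f} s for {target_yield_g:.0f} g out is in the window: adjust by taste only."

# diagnose_shot(22) -> "Fast and likely sour: ..."
```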

    Happy brewing! With patience and this systematic approach, you’ll be pulling consistently delicious espresso shots from your Breville Barista Pro in no time.


    \ No newline at end of file +[eba296f]
    \ No newline at end of file diff --git a/posts/index.html b/posts/index.html index e9e4c7f..faa45e3 100644 --- a/posts/index.html +++ b/posts/index.html @@ -9,4 +9,4 @@ 2016 - 2025 Eric X. Liu -[5706ff7] \ No newline at end of file +[eba296f] \ No newline at end of file diff --git a/posts/index.xml b/posts/index.xml index 1876b5e..d435e64 100644 --- a/posts/index.xml +++ b/posts/index.xml @@ -1,4 +1,4 @@ -Posts on Eric X. Liu's Personal Page/posts/Recent content in Posts on Eric X. Liu's Personal PageHugoenSun, 03 Aug 2025 04:20:20 +0000A Deep Dive into PPO for Language Models/posts/a-deep-dive-into-ppo-for-language-models/Sat, 02 Aug 2025 00:00:00 +0000/posts/a-deep-dive-into-ppo-for-language-models/<p>Large Language Models (LLMs) have demonstrated astonishing capabilities, but out-of-the-box, they are simply powerful text predictors. They don&rsquo;t inherently understand what makes a response helpful, harmless, or aligned with human values. The technique that has proven most effective at bridging this gap is Reinforcement Learning from Human Feedback (RLHF), and at its heart lies a powerful algorithm: Proximal Policy Optimization (PPO).</p> +Posts on Eric X. Liu's Personal Page/posts/Recent content in Posts on Eric X. Liu's Personal PageHugoenSun, 03 Aug 2025 06:02:48 +0000A Deep Dive into PPO for Language Models/posts/a-deep-dive-into-ppo-for-language-models/Sat, 02 Aug 2025 00:00:00 +0000/posts/a-deep-dive-into-ppo-for-language-models/<p>Large Language Models (LLMs) have demonstrated astonishing capabilities, but out-of-the-box, they are simply powerful text predictors. They don&rsquo;t inherently understand what makes a response helpful, harmless, or aligned with human values. The technique that has proven most effective at bridging this gap is Reinforcement Learning from Human Feedback (RLHF), and at its heart lies a powerful algorithm: Proximal Policy Optimization (PPO).</p> <p>You may have seen diagrams like the one below, which outlines the RLHF training process. It can look intimidating, with a web of interconnected models, losses, and data flows.</p>Mixture-of-Experts (MoE) Models Challenges & Solutions in Practice/posts/mixture-of-experts-moe-models-challenges-solutions-in-practice/Wed, 02 Jul 2025 00:00:00 +0000/posts/mixture-of-experts-moe-models-challenges-solutions-in-practice/<p>Mixture-of-Experts (MoEs) are neural network architectures that allow different parts of the model (called &ldquo;experts&rdquo;) to specialize in different types of inputs. A &ldquo;gating network&rdquo; or &ldquo;router&rdquo; learns to dispatch each input (or &ldquo;token&rdquo;) to a subset of these experts. While powerful for scaling models, MoEs introduce several practical challenges.</p> <h3 id="1-challenge-non-differentiability-of-routing-functions"> 1. Challenge: Non-Differentiability of Routing Functions @@ -8,7 +8,7 @@ </a> </h3> <p><strong>The Problem:</strong> -Many routing mechanisms, especially &ldquo;Top-K routing,&rdquo; involve a discrete, hard selection process. A common function is <code>KeepTopK(v, k)</code>, which selects the top <code>k</code> scoring elements from a vector <code>v</code> and sets others to (-\infty) or (0).</p>An Architectural Deep Dive of T5/posts/t5-the-transformer-that-zigged-when-others-zagged-an-architectural-deep-dive/Sun, 01 Jun 2025 00:00:00 +0000/posts/t5-the-transformer-that-zigged-when-others-zagged-an-architectural-deep-dive/<p>In the rapidly evolving landscape of Large Language Models, a few key architectures define the dominant paradigms. 
Today, the &ldquo;decoder-only&rdquo; model, popularized by the GPT series and its successors like LLaMA and Mistral, reigns supreme. These models are scaled to incredible sizes and excel at in-context learning.</p> +Many routing mechanisms, especially &ldquo;Top-K routing,&rdquo; involve a discrete, hard selection process. A common function is <code>KeepTopK(v, k)</code>, which selects the top <code>k</code> scoring elements from a vector <code>v</code> and sets others to $-\infty$ or $0$.</p>An Architectural Deep Dive of T5/posts/t5-the-transformer-that-zigged-when-others-zagged-an-architectural-deep-dive/Sun, 01 Jun 2025 00:00:00 +0000/posts/t5-the-transformer-that-zigged-when-others-zagged-an-architectural-deep-dive/<p>In the rapidly evolving landscape of Large Language Models, a few key architectures define the dominant paradigms. Today, the &ldquo;decoder-only&rdquo; model, popularized by the GPT series and its successors like LLaMA and Mistral, reigns supreme. These models are scaled to incredible sizes and excel at in-context learning.</p> <p>But to truly understand the field, we must look at the pivotal models that explored different paths. Google&rsquo;s T5, or <strong>Text-to-Text Transfer Transformer</strong>, stands out as one of the most influential. It didn&rsquo;t just introduce a new model; it proposed a new philosophy. This article dives deep into the architecture of T5, how it fundamentally differs from modern LLMs, and the lasting legacy of its unique design choices.</p>Mastering Your Breville Barista Pro: The Ultimate Guide to Dialing In Espresso/posts/espresso-theory-application-a-guide-for-the-breville-barista-pro/Thu, 01 May 2025 00:00:00 +0000/posts/espresso-theory-application-a-guide-for-the-breville-barista-pro/<p>Are you ready to transform your home espresso game from good to genuinely great? The Breville Barista Pro is a fantastic machine, but unlocking its full potential requires understanding a few key principles. This guide will walk you through the systematic process of dialing in your espresso, ensuring every shot is delicious and repeatable.</p> <p>Our overarching philosophy is simple: <strong>isolate and change only one variable at a time.</strong> While numbers are crucial, your palate is the ultimate judge. Dose, ratio, and time are interconnected, but your <strong>grind size</strong> is your most powerful lever.</p>Some useful files/posts/useful/Mon, 26 Oct 2020 04:14:43 +0000/posts/useful/<ul> <li><a href="https://ericxliu.me/rootCA.pem" class="external-link" target="_blank" rel="noopener">rootCA.pem</a></li> diff --git a/posts/mixture-of-experts-moe-models-challenges-solutions-in-practice/index.html b/posts/mixture-of-experts-moe-models-challenges-solutions-in-practice/index.html index b5183f2..6f59f07 100644 --- a/posts/mixture-of-experts-moe-models-challenges-solutions-in-practice/index.html +++ b/posts/mixture-of-experts-moe-models-challenges-solutions-in-practice/index.html @@ -7,9 +7,9 @@ The Problem: -Many routing mechanisms, especially “Top-K routing,” involve a discrete, hard selection process. A common function is KeepTopK(v, k), which selects the top k scoring elements from a vector v and sets others to (-\infty) or (0).">

2. Challenge: Uneven Expert Utilization (Balancing Loss) Link to heading

The Problem: Left unchecked, the gating network might learn to heavily favor a few experts, leaving others underutilized. This leads to:

  • System Inefficiency: Overloaded experts become bottlenecks, while underutilized experts waste computational resources.
  • Suboptimal Learning: Experts might not specialize effectively if they don’t receive diverse data.

Solution: Heuristic Balancing Losses (e.g., from Switch Transformer, Fedus et al. 2022) -An auxiliary loss is added to the total model loss during training to encourage more even expert usage.

(( \text{loss}{\text{auxiliary}} = \alpha \cdot N \cdot \sum{i=1}^{N} f_i \cdot P_i ))

Where:

  • (\alpha): A hyperparameter controlling the strength of the auxiliary loss.
  • (N): Total number of experts.
  • (f_i): The fraction of tokens actually dispatched to expert (i) in the current batch (B). -(( f_i = \frac{1}{T} \sum_{x \in B} \mathbf{1}{\text{argmax } p(x) = i} )) -((p(x)) here refers to the output of the gating network, which could be (s_{i,t}) in the DeepSeek/classic router. The (\text{argmax}) means it counts hard assignments to expert (i).)
  • (P_i): The fraction of the router probability mass allocated to expert (i) in the current batch (B). -(( P_i = \frac{1}{T} \sum_{x \in B} p_i(x) )) -((p_i(x)) is the learned probability (or soft score) from the gating network for token (x) and expert (i).)

How it works: -The loss aims to minimize the product (f_i \cdot P_i) when (f_i) and (P_i) are small, effectively pushing them to be larger (closer to (1/N)). If an expert (i) is overused (high (f_i) and (P_i)), its term in the sum contributes significantly to the loss. The derivative with respect to (p_i(x)) reveals that “more frequent use = stronger downweighting,” meaning the gating network is penalized for sending too much traffic to an already busy expert.

Relationship to Gating Network:

  • (p_i(x)) (or (s_{i,t})): This is the output of the learned gating network (e.g., from a linear layer followed by Softmax). The gating network’s parameters are updated via gradient descent, influenced by this auxiliary loss.
  • (P_i): This is calculated from the outputs of the learned gating network for the current batch. It’s not a pre-defined value.

Limitation (“Second Best” Scenario): -Even with this loss, an expert can remain imbalanced if it’s consistently the “second best” option (high (P_i)) but never the absolute top choice that gets counted in (f_i) (especially if (K=1)). This is because (f_i) strictly counts hard assignments based on argmax. This limitation highlights why “soft” routing or “softmax after TopK” approaches can be more effective for truly even distribution.

+An auxiliary loss is added to the total model loss during training to encourage more even expert usage.

$$ \text{loss}_{\text{auxiliary}} = \alpha \cdot N \cdot \sum_{i=1}^{N} f_i \cdot P_i $$

Where:

  • $\alpha$: A hyperparameter controlling the strength of the auxiliary loss.
  • $N$: Total number of experts.
  • $f_i$: The fraction of tokens actually dispatched to expert $i$ in the current batch $B$. +$$ f_i = \frac{1}{T} \sum_{x \in B} \mathbf{1}\{\text{argmax } p(x) = i\} $$ +($p(x)$ here refers to the output of the gating network, which could be $s_{i,t}$ in the DeepSeek/classic router. The $\text{argmax}$ means it counts hard assignments to expert $i$.)
  • $P_i$: The fraction of the router probability mass allocated to expert $i$ in the current batch $B$. +$$ P_i = \frac{1}{T} \sum_{x \in B} p_i(x) $$ +($p_i(x)$ is the learned probability (or soft score) from the gating network for token $x$ and expert $i$.)

How it works: +The loss aims to minimize the product $f_i \cdot P_i$ when $f_i$ and $P_i$ are small, effectively pushing them to be larger (closer to $1/N$). If an expert $i$ is overused (high $f_i$ and $P_i$), its term in the sum contributes significantly to the loss. The derivative with respect to $p_i(x)$ reveals that “more frequent use = stronger downweighting,” meaning the gating network is penalized for sending too much traffic to an already busy expert.

Relationship to Gating Network:

  • $p_i(x)$ (or $s_{i,t}$): This is the output of the learned gating network (e.g., from a linear layer followed by Softmax). The gating network’s parameters are updated via gradient descent, influenced by this auxiliary loss.
  • $P_i$: This is calculated from the outputs of the learned gating network for the current batch. It’s not a pre-defined value.

Limitation (“Second Best” Scenario): +Even with this loss, an expert can remain imbalanced if it’s consistently the “second best” option (high $P_i$) but never the absolute top choice that gets counted in $f_i$ (especially if $K=1$). This is because $f_i$ strictly counts hard assignments based on argmax. This limitation highlights why “soft” routing or “softmax after TopK” approaches can be more effective for truly even distribution.
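A minimal sketch of this auxiliary loss, assuming the gating network's softmax outputs for a batch of T tokens over N experts are available as a single tensor; the function name follows the Switch Transformer style of the formula above, the hard assignment corresponds to K=1 (argmax), and the default α is illustrative:

```python
import torch

def switch_balance_loss(router_probs: torch.Tensor, alpha: float = 0.01) -> torch.Tensor:
    """Auxiliary load-balancing loss in the style of the formula above.

    router_probs : shape (T, N) softmax outputs of the gating network
                   for T tokens and N experts (illustrative sketch).
    """
    T, N = router_probs.shape
    # f_i: fraction of tokens whose hard (argmax, K=1) assignment is expert i.
    hard_assignment = router_probs.argmax(dim=-1)                   # (T,)
    f = torch.bincount(hard_assignment, minlength=N).float() / T    # (N,)
    # P_i: mean router probability mass given to expert i over the batch.
    P = router_probs.mean(dim=0)                                    # (N,)
    return alpha * N * torch.sum(f * P)

# Perfectly uniform routing gives the minimum: alpha * N * sum_i (1/N)*(1/N) = alpha.
```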

3. Challenge: Overfitting during Fine-tuning Link to heading

The Problem: Sparse MoE models, despite only activating a few experts per token, possess a very large total number of parameters. When fine-tuning these models on smaller datasets, they are highly prone to overfitting. The model’s vast capacity allows it to memorize the limited fine-tuning data, leading to poor generalization performance on unseen validation data. This is evident when training loss continues to decrease, but validation loss stagnates or increases.

Solutions:

  • Zoph et al. Solution – Fine-tune non-MoE MLPs:

    • This strategy involves freezing a portion of the MoE model’s parameters during fine-tuning, specifically the large expert weights.
    • Instead, only the “non-MoE” parameters (e.g., attention layers, adapter layers, or the gating network itself) are updated.
    • This reduces the effective number of trainable parameters during fine-tuning, thereby mitigating the risk of overfitting on small datasets. It assumes the experts are already well-pre-trained for general tasks.
  • DeepSeek Solution – Use Lots of Data (1.4M SFT):

    • This approach tackles the problem by providing the model with a very large and diverse dataset for Supervised Fine-Tuning (SFT).
    • With abundant data (e.g., 1.4 million examples covering a wide range of tasks and languages), the model’s large capacity can be effectively utilized for specialized learning rather than memorization. The diversity and volume of data prevent individual experts from overfitting to specific examples.
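A minimal sketch of the first strategy (freezing expert weights so only non-MoE parameters are fine-tuned). It assumes expert parameters can be recognized by an "experts" substring in their names, which depends entirely on how the model is implemented:

```python
import torch.nn as nn

def freeze_expert_weights(model: nn.Module) -> None:
    """Freeze MoE expert parameters so only non-expert weights train during fine-tuning.

    Illustrative sketch: the 'experts' name filter is an assumption about the
    model's parameter naming, not a universal convention.
    """
    for name, param in model.named_parameters():
        if "experts" in name:
            param.requires_grad = False

# Afterwards, build the optimizer only from the remaining trainable parameters, e.g.
# optimizer = torch.optim.AdamW(p for p in model.parameters() if p.requires_grad)
```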

Conclusion: MoE models offer significant advantages in terms of model capacity and computational efficiency, but their unique sparse activation pattern introduces challenges in training and fine-tuning. Overcoming non-differentiability in routing and ensuring balanced expert utilization are crucial for effective pre-training. During fine-tuning, managing the model’s vast parameter count to prevent overfitting on smaller datasets requires either strategic parameter freezing or access to very large and diverse fine-tuning data. -The Top-K routing mechanism, as illustrated in the provided image, is a core component in many modern Mixture-of-Experts (MoE) models. It involves selecting a fixed number (K) of experts for each input based on relevance scores.


Traditional Top-K (Deterministic Selection):

  • How it works:
    1. Calculate relevance scores $s_{i,t}$ for each expert $i$ and input $t$.
    2. Identify the K experts with the highest scores.
    3. Experts within the Top-K are assigned their scores ($g_{i,t} = s_{i,t}$).
    4. Experts outside the Top-K are assigned a score of 0 ($g_{i,t} = 0$).
    5. The output is a weighted sum of the selected experts’ outputs.
  • Pros: Predictable, deterministic, selects the “best” experts based on current scores.
  • Cons: Can lead to expert imbalance, where a few popular experts are always chosen, starving others of training.

Alternative: Sampling from Softmax (Probabilistic Selection):

  • How it works:
    1. Calculate relevance scores $s_{i,t}$ which are treated as probabilities (after softmax).
    2. Randomly sample K unique expert indices from the distribution defined by these probabilities.
    3. Selected experts contribute; unselected experts do not.
  • Why it’s suggested:
    • Load Balancing: Prevents expert collapse by ensuring all experts get a chance to be selected, even those with slightly lower scores. This promotes more even training across the entire expert pool.
    • Diversity & Exploration: Introduces randomness, potentially leading to better generalization and robustness by exploring different expert combinations.
  • Pros: Better load balancing, prevents expert starvation, encourages exploration.
  • Cons: Stochastic (non-deterministic routing), can make debugging harder, might not pick the absolute “best” expert in a single instance (but better for long-term training).

Key Takeaway: While deterministic Top-K is simpler and directly picks the “highest-scoring” experts, sampling from the softmax offers a more robust training dynamic by ensuring that all experts receive training data, thereby preventing some experts from becoming unused (“dead experts”).
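A minimal sketch of both variants for a single token, assuming raw router logits are available. This version applies the softmax first and keeps the selected probabilities as gate weights, which is one common formulation rather than the only one:

```python
import torch

def top_k_gate(scores: torch.Tensor, k: int, sample: bool = False) -> torch.Tensor:
    """Return gate weights over experts for one token (illustrative sketch).

    scores : shape (N,) raw router logits for N experts.
    sample=False -> deterministic Top-K: keep the K highest-probability experts.
    sample=True  -> probabilistic: sample K distinct experts from softmax(scores).
    Unselected experts get weight 0 in both cases.
    """
    probs = torch.softmax(scores, dim=-1)
    if sample:
        idx = torch.multinomial(probs, num_samples=k, replacement=False)
    else:
        idx = torch.topk(probs, k).indices
    gates = torch.zeros_like(probs)
    gates[idx] = probs[idx]
    return gates

# top_k_gate(torch.tensor([2.0, 0.5, 1.0, -1.0]), k=2)               # always experts 0 and 2
# top_k_gate(torch.tensor([2.0, 0.5, 1.0, -1.0]), k=2, sample=True)  # varies run to run
```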



\ No newline at end of file +[eba296f]
\ No newline at end of file diff --git a/posts/t5-the-transformer-that-zigged-when-others-zagged-an-architectural-deep-dive/index.html b/posts/t5-the-transformer-that-zigged-when-others-zagged-an-architectural-deep-dive/index.html index f33df47..11dddda 100644 --- a/posts/t5-the-transformer-that-zigged-when-others-zagged-an-architectural-deep-dive/index.html +++ b/posts/t5-the-transformer-that-zigged-when-others-zagged-an-architectural-deep-dive/index.html @@ -26,8 +26,8 @@ But to truly understand the field, we must look at the pivotal models that explo Link to heading

Despite its differences, the “T5 v1.1” variant pioneered several techniques that are now standard practice in the most advanced LLMs:

  • RMSNorm: It was one of the first major models to adopt Root Mean Square Normalization instead of LayerNorm, a choice now used by LLaMA, Mistral, and others for its efficiency and stability.
  • Pre-Normalization: T5 applies the normalization layer before the attention and FFN blocks, a critical technique for enabling stable training of very deep networks.
  • No Bias Terms: T5 v1.1 removed the bias parameters from its normalization and FFN layers, a small but important optimization for memory and stability that modern models follow.
  • Gated Activations (GeGLU): While the original T5 used ReLU, T5 v1.1 adopted a Gated Linear Unit (GeGLU), presaging the move to GLU-family activations (like SwiGLU) that is now ubiquitous.
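To make these choices concrete, here is a compact sketch of a pre-normalized feed-forward block that combines RMSNorm, the absence of bias terms, and a GeGLU gate; dimensions and names are illustrative and not taken from T5's actual implementation:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class RMSNorm(nn.Module):
    """Root Mean Square normalization: scale only, no mean-centering, no bias."""
    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(dim))
        self.eps = eps

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        rms = torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.eps)
        return self.weight * x * rms

class GeGLUFFN(nn.Module):
    """Pre-normalized feed-forward block with a GeGLU gate and no bias terms."""
    def __init__(self, dim: int, hidden: int):
        super().__init__()
        self.norm = RMSNorm(dim)
        self.wi_gate = nn.Linear(dim, hidden, bias=False)
        self.wi_up = nn.Linear(dim, hidden, bias=False)
        self.wo = nn.Linear(hidden, dim, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        h = self.norm(x)                                 # pre-normalization
        h = F.gelu(self.wi_gate(h)) * self.wi_up(h)      # GeGLU gating
        return x + self.wo(h)                            # residual connection
```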

Conclusion: The Lasting Legacy -Link to heading

T5 represents a different evolutionary branch in the Transformer family tree. While the field has largely converged on the decoder-only architecture for its scalability in general-purpose models, T5’s design remains a masterclass in purpose-built engineering.

Its text-to-text framework was revolutionary, its encoder-decoder structure is still a go-to for tasks like translation, and its refined T5 v1.1 architecture laid the groundwork for many of the stability and efficiency tricks we see in today’s state-of-the-art models. T5 is more than just a model; it’s a crucial case study in the architectural trade-offs that continue to shape the future of artificial intelligence.


\ No newline at end of file +[eba296f]
\ No newline at end of file diff --git a/posts/useful/index.html b/posts/useful/index.html index f6e030c..9e12775 100644 --- a/posts/useful/index.html +++ b/posts/useful/index.html @@ -6,8 +6,8 @@ vpnclient.ovpn

Some useful files

© +One-minute read
\ No newline at end of file +[eba296f]
\ No newline at end of file diff --git a/sitemap.xml b/sitemap.xml index 97059f8..661d8e9 100644 --- a/sitemap.xml +++ b/sitemap.xml @@ -1 +1 @@ -/posts/a-deep-dive-into-ppo-for-language-models/2025-08-03T03:28:39+00:00weekly0.5/2025-08-03T04:20:20+00:00weekly0.5/posts/2025-08-03T04:20:20+00:00weekly0.5/posts/mixture-of-experts-moe-models-challenges-solutions-in-practice/2025-08-03T03:49:59+00:00weekly0.5/posts/t5-the-transformer-that-zigged-when-others-zagged-an-architectural-deep-dive/2025-08-03T03:41:10+00:00weekly0.5/posts/espresso-theory-application-a-guide-for-the-breville-barista-pro/2025-08-03T04:20:20+00:00weekly0.5/posts/useful/2020-10-26T04:47:36+00:00weekly0.5/about/2020-06-16T23:30:17-07:00weekly0.5/categories/weekly0.5/tags/weekly0.5 \ No newline at end of file +/posts/a-deep-dive-into-ppo-for-language-models/2025-08-03T03:28:39+00:00weekly0.5/2025-08-03T06:02:48+00:00weekly0.5/posts/2025-08-03T06:02:48+00:00weekly0.5/posts/mixture-of-experts-moe-models-challenges-solutions-in-practice/2025-08-03T06:02:48+00:00weekly0.5/posts/t5-the-transformer-that-zigged-when-others-zagged-an-architectural-deep-dive/2025-08-03T03:41:10+00:00weekly0.5/posts/espresso-theory-application-a-guide-for-the-breville-barista-pro/2025-08-03T04:20:20+00:00weekly0.5/posts/useful/2020-10-26T04:47:36+00:00weekly0.5/about/2020-06-16T23:30:17-07:00weekly0.5/categories/weekly0.5/tags/weekly0.5 \ No newline at end of file diff --git a/tags/index.html b/tags/index.html index 6da50ea..74a50ea 100644 --- a/tags/index.html +++ b/tags/index.html @@ -4,4 +4,4 @@ 2016 - 2025 Eric X. Liu -[5706ff7] \ No newline at end of file +[eba296f] \ No newline at end of file