diff --git a/404.html b/404.html index 815489c..8296236 100644 --- a/404.html +++ b/404.html @@ -4,4 +4,4 @@ 2016 - 2025 Eric X. Liu -[c06e978] \ No newline at end of file +[0945204] \ No newline at end of file diff --git a/about/index.html b/about/index.html index d8443cb..e553ccd 100644 --- a/about/index.html +++ b/about/index.html @@ -4,4 +4,4 @@ 2016 - 2025 Eric X. Liu -[c06e978] \ No newline at end of file +[0945204] \ No newline at end of file diff --git a/categories/index.html b/categories/index.html index ded53da..fdf123f 100644 --- a/categories/index.html +++ b/categories/index.html @@ -4,4 +4,4 @@ 2016 - 2025 Eric X. Liu -[c06e978] \ No newline at end of file +[0945204] \ No newline at end of file diff --git a/images/a-technical-deep-dive-into-the-transformer-s-core-mechanics/.png b/images/a-technical-deep-dive-into-the-transformer-s-core-mechanics/.png new file mode 100644 index 0000000..a7d5e5d Binary files /dev/null and b/images/a-technical-deep-dive-into-the-transformer-s-core-mechanics/.png differ diff --git a/index.html b/index.html index ad76cc7..4d22342 100644 --- a/index.html +++ b/index.html @@ -4,4 +4,4 @@ 2016 - 2025 Eric X. Liu -[c06e978] \ No newline at end of file +[0945204] \ No newline at end of file diff --git a/index.xml b/index.xml index 0be3bae..7ea126a 100644 --- a/index.xml +++ b/index.xml @@ -1,4 +1,4 @@ -Eric X. Liu's Personal Page/Recent content on Eric X. Liu's Personal PageHugoenWed, 20 Aug 2025 04:16:22 +0000A Comprehensive Guide to Breville Barista Pro Maintenance/posts/a-comprehensive-guide-to-breville-barista-pro-maintenance/Wed, 20 Aug 2025 04:16:13 +0000/posts/a-comprehensive-guide-to-breville-barista-pro-maintenance/<p>Proper maintenance is critical for the longevity and performance of a Breville Barista Pro espresso machine. Consistent cleaning not only ensures the machine functions correctly but also directly impacts the quality of the espresso produced. 
This guide provides a detailed, technical breakdown of the essential maintenance routines, from automated cycles to daily upkeep.</p> +Eric X. Liu's Personal Page/Recent content on Eric X. Liu's Personal PageHugoenWed, 20 Aug 2025 04:32:59 +0000A Comprehensive Guide to Breville Barista Pro Maintenance/posts/a-comprehensive-guide-to-breville-barista-pro-maintenance/Wed, 20 Aug 2025 04:32:52 +0000/posts/a-comprehensive-guide-to-breville-barista-pro-maintenance/<p>Proper maintenance is critical for the longevity and performance of a Breville Barista Pro espresso machine. Consistent cleaning not only ensures the machine functions correctly but also directly impacts the quality of the espresso produced. This guide provides a detailed, technical breakdown of the essential maintenance routines, from automated cycles to daily upkeep.</p> <h4 id="understanding-the-two-primary-maintenance-cycles"> <strong>Understanding the Two Primary Maintenance Cycles</strong> <a class="heading-link" href="#understanding-the-two-primary-maintenance-cycles"> @@ -6,7 +6,15 @@ <span class="sr-only">Link to heading</span> </a> </h4> -<p>The Breville Barista Pro has two distinct, automated maintenance procedures: the <strong>Cleaning (Flush) Cycle</strong> and the <strong>Descale Cycle</strong>. It is important to understand that these are not interchangeable, as they address different types of buildup within the machine.</p>Fixing GPU Operator Pods Stuck in Init: Secure Boot, DKMS, and MOK on Proxmox + Debian/posts/secure-boot-dkms-and-mok-on-proxmox-debian/Sat, 09 Aug 2025 00:00:00 +0000/posts/secure-boot-dkms-and-mok-on-proxmox-debian/<p>I hit an issue where all GPU Operator pods on one node were stuck in Init after migrating from Legacy BIOS to UEFI. 
The common error was NVIDIA components waiting for “toolkit-ready,” while the toolkit init container looped with:</p> +<p>The Breville Barista Pro has two distinct, automated maintenance procedures: the <strong>Cleaning (Flush) Cycle</strong> and the <strong>Descale Cycle</strong>. It is important to understand that these are not interchangeable, as they address different types of buildup within the machine.</p>A Technical Deep Dive into the Transformer's Core Mechanics/posts/a-technical-deep-dive-into-the-transformer-s-core-mechanics/Wed, 20 Aug 2025 04:32:52 +0000/posts/a-technical-deep-dive-into-the-transformer-s-core-mechanics/<p>The Transformer architecture is the bedrock of modern Large Language Models (LLMs). While its high-level success is widely known, a deeper understanding requires dissecting its core components. This article provides a detailed, technical breakdown of the fundamental concepts within a Transformer block, from the notion of &ldquo;channels&rdquo; to the intricate workings of the attention mechanism and its relationship with other advanced architectures like Mixture of Experts.</p> +<h3 id="1-the-channel-a-foundational-view-of-d_model"> + 1. The &ldquo;Channel&rdquo;: A Foundational View of <code>d_model</code> + <a class="heading-link" href="#1-the-channel-a-foundational-view-of-d_model"> + <i class="fa-solid fa-link" aria-hidden="true" title="Link to heading"></i> + <span class="sr-only">Link to heading</span> + </a> +</h3> +<p>In deep learning, a &ldquo;channel&rdquo; can be thought of as a feature dimension. 
While this term is common in Convolutional Neural Networks for images (e.g., Red, Green, Blue channels), in LLMs, the analogous concept is the model&rsquo;s primary embedding dimension, commonly referred to as <code>d_model</code>.</p>Fixing GPU Operator Pods Stuck in Init: Secure Boot, DKMS, and MOK on Proxmox + Debian/posts/secure-boot-dkms-and-mok-on-proxmox-debian/Sat, 09 Aug 2025 00:00:00 +0000/posts/secure-boot-dkms-and-mok-on-proxmox-debian/<p>I hit an issue where all GPU Operator pods on one node were stuck in Init after migrating from Legacy BIOS to UEFI. The common error was NVIDIA components waiting for “toolkit-ready,” while the toolkit init container looped with:</p> <ul> <li>nvidia-smi failed to communicate with the NVIDIA driver</li> <li>modprobe nvidia → “Key was rejected by service”</li> diff --git a/posts/a-comprehensive-guide-to-breville-barista-pro-maintenance/index.html b/posts/a-comprehensive-guide-to-breville-barista-pro-maintenance/index.html index 7996f3c..22515c1 100644 --- a/posts/a-comprehensive-guide-to-breville-barista-pro-maintenance/index.html +++ b/posts/a-comprehensive-guide-to-breville-barista-pro-maintenance/index.html @@ -8,10 +8,10 @@ The Breville Barista Pro has two distinct, automated maintenance procedures: the Cleaning (Flush) Cycle and the Descale Cycle. It is important to understand that these are not interchangeable, as they address different types of buildup within the machine.">
\ No newline at end of file diff --git a/posts/a-deep-dive-into-ppo-for-language-models/index.html b/posts/a-deep-dive-into-ppo-for-language-models/index.html index 71797ab..c54c6d2 100644 --- a/posts/a-deep-dive-into-ppo-for-language-models/index.html +++ b/posts/a-deep-dive-into-ppo-for-language-models/index.html @@ -23,4 +23,4 @@ where δ_t = r_t + γV(s_{t+1}) - V(s_t)

  • γ (gam 2016 - 2025 Eric X. Liu -[c06e978] \ No newline at end of file +[0945204] \ No newline at end of file diff --git a/posts/a-technical-deep-dive-into-the-transformer-s-core-mechanics/index.html b/posts/a-technical-deep-dive-into-the-transformer-s-core-mechanics/index.html new file mode 100644 index 0000000..5def962 --- /dev/null +++ b/posts/a-technical-deep-dive-into-the-transformer-s-core-mechanics/index.html @@ -0,0 +1,39 @@ +A Technical Deep Dive into the Transformer's Core Mechanics · Eric X. Liu's Personal Page

    A Technical Deep Dive into the Transformer's Core Mechanics

    The Transformer architecture is the bedrock of modern Large Language Models (LLMs). While its high-level success is widely known, a deeper understanding requires dissecting its core components. This article provides a detailed, technical breakdown of the fundamental concepts within a Transformer block, from the notion of “channels” to the intricate workings of the attention mechanism and its relationship with other advanced architectures like Mixture of Experts.

    1. The “Channel”: A Foundational View of d_model

    In deep learning, a “channel” can be thought of as a feature dimension. While this term is common in Convolutional Neural Networks for images (e.g., Red, Green, Blue channels), in LLMs, the analogous concept is the model’s primary embedding dimension, commonly referred to as d_model.

    An input text is first tokenized, and each token is mapped to a vector of size d_model (e.g., 4096). Each of the 4096 dimensions in this vector can be considered a “channel,” representing a different semantic or syntactic feature of the token.

    As this data, represented by a tensor of shape [batch_size, sequence_length, d_model], progresses through the layers of the Transformer, these channels are continuously transformed. However, a critical design choice is that the output dimension of every main sub-layer (like the attention block or the FFN block) is also d_model. This consistency is essential for enabling residual connections, where the input to a block is added to its output (output = input + SubLayer(input)). This technique is vital for training the extremely deep networks common today.
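    The shape invariant described above can be sketched in a few lines of NumPy. The `sub_layer` stand-in here is hypothetical; in a real Transformer it would be an attention or FFN block, but the point is the same: the residual addition only works because every sub-layer maps `d_model` back to `d_model`.

```python
import numpy as np

batch_size, seq_len, d_model = 2, 8, 64
rng = np.random.default_rng(0)
x = rng.normal(size=(batch_size, seq_len, d_model))

# Stand-in sub-layer (hypothetical): any map that preserves the
# trailing d_model dimension, as attention and FFN blocks do.
W = rng.normal(size=(d_model, d_model)) / np.sqrt(d_model)
def sub_layer(t):
    return t @ W

# The residual connection only type-checks because output dim == input dim.
out = x + sub_layer(x)
assert out.shape == (batch_size, seq_len, d_model)
```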

    2. The Building Blocks: Dimensions of Key Layers

    A Transformer layer is primarily composed of two sub-layers: a Multi-Head Attention block and a position-wise Feed-Forward Network (FFN). The parameters for these are stored in several key weight matrices. Understanding their dimensions is crucial.

    Let’s define our variables:

    • d_model: The core embedding dimension.
    • d_ff: The inner dimension of the FFN, typically 4 * d_model.
    • h: The number of attention heads.
    • d_head: The dimension of each attention head, where d_model = h * d_head.

    The dimensions of the weight matrices are as follows:

    Layer                        Weight Matrix   Input Vector Shape   Output Vector Shape   Weight Matrix Dimension
    Attention Projections
      Query                      W_Q             d_model              d_model               [d_model, d_model]
      Key                        W_K             d_model              d_model               [d_model, d_model]
      Value                      W_V             d_model              d_model               [d_model, d_model]
      Output                     W_O             d_model              d_model               [d_model, d_model]
    Feed-Forward Network
      Layer 1 (Up-projection)    W_ff1           d_model              d_ff                  [d_model, d_ff]
      Layer 2 (Down-projection)  W_ff2           d_ff                 d_model               [d_ff, d_model]
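    These dimensions can be verified directly. A minimal sketch, using zero-initialized matrices purely as shape placeholders and a ReLU FFN (the choice of non-linearity varies by model):

```python
import numpy as np

d_model = 4096
d_ff = 4 * d_model            # 16384
x = np.zeros((10, d_model))   # [seq_len, d_model]

# Attention projections: all square, [d_model, d_model]
W_Q = W_K = W_V = W_O = np.zeros((d_model, d_model))

# FFN: up-project to d_ff, apply non-linearity, down-project back.
W_ff1 = np.zeros((d_model, d_ff))
W_ff2 = np.zeros((d_ff, d_model))
ffn_out = np.maximum(x @ W_ff1, 0) @ W_ff2   # ReLU FFN
assert ffn_out.shape == x.shape              # back to [seq_len, d_model]

# Per-layer parameter count: 4*d_model^2 (attention) + 2*d_model*d_ff (FFN)
params = 4 * d_model**2 + 2 * d_model * d_ff
assert params == 12 * d_model**2
```

    A useful consequence: with `d_ff = 4 * d_model`, each layer holds roughly `12 * d_model^2` weights, and the FFN accounts for two-thirds of them.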

    3. Deconstructing Multi-Head Attention (MHA)

    The core innovation of the Transformer is Multi-Head Attention. It allows the model to weigh the importance of different tokens in the sequence from multiple perspectives simultaneously.

    3.1. The “Why”: Beyond a Single Attention

    A single attention mechanism would force the model to average all types of linguistic relationships into one pattern. MHA avoids this by creating h parallel subspaces. Each “head” can specialize, with one head learning syntactic dependencies, another tracking semantic similarity, and so on. This creates a much richer representation.

    3.2. An Encoding/Decoding Analogy

    A powerful way to conceptualize the attention calculation is as a two-stage process:

    1. Encoding Relationships: The first part of the calculation, softmax(Q @ K.T), can be seen as an encoding step. It does not use the actual “content” of the tokens (the V vectors). Instead, it uses the Queries and Keys to build a dynamic “relationship map” between tokens in the sequence. This map, a matrix of attention scores, answers the question: “For each token, how important is every other token right now?”
    2. Decoding via Information Retrieval: The second part, scores @ V, acts as a decoding step. It uses the relationship map to retrieve and synthesize information. For each token, it creates a new vector by taking a weighted sum of all the V vectors in the sequence, using the scores as the precise mixing recipe. It decodes the relational structure into a new, context-aware representation.
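    The two stages above can be made concrete for a single head. Note that the "relationship map" is computed entirely from Q and K; the V vectors only enter at the retrieval step:

```python
import numpy as np

def softmax(z, axis=-1):
    z = z - z.max(axis=axis, keepdims=True)  # numerical stability
    e = np.exp(z)
    return e / e.sum(axis=axis, keepdims=True)

seq_len, d_head = 5, 16
rng = np.random.default_rng(0)
Q = rng.normal(size=(seq_len, d_head))
K = rng.normal(size=(seq_len, d_head))
V = rng.normal(size=(seq_len, d_head))

# Stage 1 - "encoding": relationship map built from Q and K only.
scores = softmax(Q @ K.T / np.sqrt(d_head))   # [seq_len, seq_len]
assert np.allclose(scores.sum(axis=-1), 1.0)  # each row is a mixing recipe

# Stage 2 - "decoding": weighted retrieval of the V contents.
out = scores @ V                              # [seq_len, d_head]
```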

    3.3. The “How”: A Step-by-Step Flow

    The MHA process is designed for maximum computational efficiency.

    1. Initial Projections: The input vectors (shape [seq_len, d_model]) are multiplied by W_Q, W_K, and W_V. These matrices are all [d_model, d_model] not to create one large query, but to efficiently compute the vectors for all h heads at once. The single large output vector is then reshaped into h separate vectors, each of size d_head.
    2. Attention Score Calculation: For each head i, a score matrix is calculated: scores_i = softmax( (Q_i @ K_i.T) / sqrt(d_head) ). Note that Q_i and K_i have dimensions [seq_len, d_head], so the resulting scores_i matrix has a dimension of [seq_len, seq_len].
    3. Weighted Value Calculation: The scores are used to create a weighted sum of the Value vectors for each head: output_i = scores_i @ V_i. Since scores_i is [seq_len, seq_len] and V_i is [seq_len, d_head], the resulting output_i has a dimension of [seq_len, d_head]. This is the final output of a single head.
    4. Concatenation and Final Projection: The outputs of all h heads are concatenated along the last dimension. This produces a single large matrix of shape [seq_len, h * d_head], which is equivalent to [seq_len, d_model]. This matrix is then passed through the final output projection layer, W_O (shape [d_model, d_model]), to produce the attention block’s final output. The W_O matrix learns the optimal way to mix the information from all the specialized heads into a single, unified representation.
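    The four steps above can be sketched end to end. This is a shape-faithful NumPy sketch, not a production implementation (no masking, no batching):

```python
import numpy as np

def mha(x, W_Q, W_K, W_V, W_O, h):
    """Multi-head attention over x: [seq_len, d_model]."""
    seq_len, d_model = x.shape
    d_head = d_model // h

    def split(t):  # [seq_len, d_model] -> [h, seq_len, d_head]
        return t.reshape(seq_len, h, d_head).transpose(1, 0, 2)

    # 1. One big projection each, then reshape into h heads.
    Q, K, V = split(x @ W_Q), split(x @ W_K), split(x @ W_V)

    # 2. Per-head scaled dot-product scores: [h, seq_len, seq_len]
    s = Q @ K.transpose(0, 2, 1) / np.sqrt(d_head)
    s = np.exp(s - s.max(-1, keepdims=True))
    scores = s / s.sum(-1, keepdims=True)

    # 3. Weighted values per head: [h, seq_len, d_head]
    heads = scores @ V

    # 4. Concatenate heads back to [seq_len, d_model], then mix with W_O.
    concat = heads.transpose(1, 0, 2).reshape(seq_len, d_model)
    return concat @ W_O

rng = np.random.default_rng(0)
d_model, h, seq_len = 64, 4, 10
Ws = [rng.normal(size=(d_model, d_model)) / np.sqrt(d_model) for _ in range(4)]
out = mha(rng.normal(size=(seq_len, d_model)), *Ws, h=h)
assert out.shape == (seq_len, d_model)
```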

    4. Optimizing Attention: GQA and MQA

    During inference, storing the Key and Value vectors for all previous tokens (the KV Cache) is a major memory bottleneck. Grouped-Query Attention (GQA) and Multi-Query Attention (MQA) are architectural modifications that address this by allowing multiple Query heads to share the same Key and Value heads.

    Let’s use a concrete example, similar to Llama 2 7B:

    • d_model = 4096
    • h = 32 Q heads
    • d_head = 128
    • g = 8 KV head groups for GQA

    The key insight is that only the dimensions of the W_K and W_V matrices change, which in turn reduces the size of the KV cache. The W_Q and W_O matrices remain [4096, 4096].

    Attention Type      No. of Q Heads   No. of KV Heads   W_K & W_V Dimension             Relative KV Cache Size
    MHA (Multi-Head)    32               32                [4096, 32*128] = [4096, 4096]   1x (Baseline)
    GQA (Grouped)       32               8                 [4096, 8*128]  = [4096, 1024]   1/4x
    MQA (Multi-Query)   32               1                 [4096, 1*128]  = [4096, 128]    1/32x

    GQA provides a robust balance, significantly reducing the memory and bandwidth requirements for the KV cache with negligible impact on model performance, making it a popular choice in modern LLMs.
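    The cache savings follow directly from the head counts. A back-of-the-envelope sketch, assuming a 32-layer model in fp16 (Llama-2-7B-like numbers) at a 4096-token context:

```python
d_head, n_layers, seq_len, bytes_fp16 = 128, 32, 4096, 2

def kv_cache_bytes(n_kv_heads):
    # 2 tensors (K and V), each [seq_len, n_kv_heads * d_head], per layer
    return 2 * seq_len * n_kv_heads * d_head * n_layers * bytes_fp16

mha = kv_cache_bytes(32)   # full multi-head
gqa = kv_cache_bytes(8)    # 8 KV groups
mqa = kv_cache_bytes(1)    # single shared KV head
assert mha == gqa * 4 == mqa * 32
print(mha / 2**30, "GiB")  # MHA: 2.0 GiB; GQA cuts this to 0.5 GiB
```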

    5. MHA vs. Mixture of Experts (MoE): A Clarification

    While both MHA and MoE use the concept of “experts,” they are functionally and architecturally distinct.

    • MHA: The “experts” are the attention heads. All heads are active for every token to build a rich representation within the attention layer. This is akin to a board meeting where every member analyzes and contributes to every decision.
    • MoE: The “experts” are full Feed-Forward Networks. A routing network selects a small subset of these FFNs for each token. This is a scaling strategy to increase a model’s parameter count for greater capacity while keeping the computational cost fixed. It replaces the standard FFN block, whereas MHA is the attention block.
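    The MoE routing idea can be sketched as follows. For brevity this uses single linear maps as stand-in experts (real experts are full two-layer FFNs) and a plain top-k router; the function names are illustrative, not from any library:

```python
import numpy as np

def moe_ffn(x, expert_weights, router_w, k=2):
    """Sparse MoE layer sketch. x: [seq_len, d_model];
    expert_weights: list of [d_model, d_model] stand-in expert FFNs;
    router_w: [d_model, n_experts]."""
    logits = x @ router_w                          # [seq_len, n_experts]
    top_k = np.argsort(logits, axis=-1)[:, -k:]    # indices of chosen experts
    picked = np.take_along_axis(logits, top_k, axis=-1)
    e = np.exp(picked - picked.max(-1, keepdims=True))
    gates = e / e.sum(-1, keepdims=True)           # renormalize over top-k only
    out = np.zeros_like(x)
    for t in range(x.shape[0]):                    # only k experts run per token
        for j, idx in enumerate(top_k[t]):
            out[t] += gates[t, j] * (x[t] @ expert_weights[idx])
    return out

rng = np.random.default_rng(0)
d, n_experts, seq = 16, 4, 6
experts = [rng.normal(size=(d, d)) for _ in range(n_experts)]
out = moe_ffn(rng.normal(size=(seq, d)), experts, rng.normal(size=(d, n_experts)))
assert out.shape == (seq, d)
```

    The key property is visible in the inner loop: each token touches only `k` of the `n_experts` expert networks, so parameter count grows with `n_experts` while per-token compute does not.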

    By understanding these technical details, from the basic concept of a channel to the sophisticated interplay of heads and experts, one can build a more complete and accurate mental model of how LLMs truly operate.


    References

    1. Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A. N., … & Polosukhin, I. (2017). Attention is all you need. Advances in neural information processing systems, 30.
    2. Shazeer, N., Mirhoseini, A., Maziarz, K., Davis, A., Le, Q., Hinton, G., & Dean, J. (2017). Outrageously large neural networks: The sparsely-gated mixture-of-experts layer. arXiv preprint arXiv:1701.06538.
    3. Ainslie, J., Lee-Thorp, J., de Jong, M., Zemlyanskiy, Y., Lebrón, F., & Sanghai, S. (2023). GQA: Training generalized multi-query transformer models from multi-head checkpoints. arXiv preprint arXiv:2305.13245.
    \ No newline at end of file diff --git a/posts/espresso-theory-application-a-guide-for-the-breville-barista-pro/index.html b/posts/espresso-theory-application-a-guide-for-the-breville-barista-pro/index.html index 5001709..f2ce188 100644 --- a/posts/espresso-theory-application-a-guide-for-the-breville-barista-pro/index.html +++ b/posts/espresso-theory-application-a-guide-for-the-breville-barista-pro/index.html @@ -20,4 +20,4 @@ Our overarching philosophy is simple: isolate and change only one variable at a 2016 - 2025 Eric X. Liu -[c06e978] \ No newline at end of file +[0945204] \ No newline at end of file diff --git a/posts/how-rvq-teaches-llms-to-see-and-hear/index.html b/posts/how-rvq-teaches-llms-to-see-and-hear/index.html index f63a350..b9ede7c 100644 --- a/posts/how-rvq-teaches-llms-to-see-and-hear/index.html +++ b/posts/how-rvq-teaches-llms-to-see-and-hear/index.html @@ -18,4 +18,4 @@ The answer lies in creating a universal language—a bridge between the continuo 2016 - 2025 Eric X. Liu -[c06e978] \ No newline at end of file +[0945204] \ No newline at end of file diff --git a/posts/index.html b/posts/index.html index 79ed627..a552d3e 100644 --- a/posts/index.html +++ b/posts/index.html @@ -1,7 +1,8 @@ Posts · Eric X. Liu's Personal Page
    \ No newline at end of file +[0945204] \ No newline at end of file diff --git a/posts/index.xml b/posts/index.xml index 435aafa..6812c99 100644 --- a/posts/index.xml +++ b/posts/index.xml @@ -1,4 +1,4 @@ -Posts on Eric X. Liu's Personal Page/posts/Recent content in Posts on Eric X. Liu's Personal PageHugoenWed, 20 Aug 2025 04:16:22 +0000A Comprehensive Guide to Breville Barista Pro Maintenance/posts/a-comprehensive-guide-to-breville-barista-pro-maintenance/Wed, 20 Aug 2025 04:16:13 +0000/posts/a-comprehensive-guide-to-breville-barista-pro-maintenance/<p>Proper maintenance is critical for the longevity and performance of a Breville Barista Pro espresso machine. Consistent cleaning not only ensures the machine functions correctly but also directly impacts the quality of the espresso produced. This guide provides a detailed, technical breakdown of the essential maintenance routines, from automated cycles to daily upkeep.</p> +Posts on Eric X. Liu's Personal Page/posts/Recent content in Posts on Eric X. Liu's Personal PageHugoenWed, 20 Aug 2025 04:32:59 +0000A Comprehensive Guide to Breville Barista Pro Maintenance/posts/a-comprehensive-guide-to-breville-barista-pro-maintenance/Wed, 20 Aug 2025 04:32:52 +0000/posts/a-comprehensive-guide-to-breville-barista-pro-maintenance/<p>Proper maintenance is critical for the longevity and performance of a Breville Barista Pro espresso machine. Consistent cleaning not only ensures the machine functions correctly but also directly impacts the quality of the espresso produced. 
This guide provides a detailed, technical breakdown of the essential maintenance routines, from automated cycles to daily upkeep.</p> <h4 id="understanding-the-two-primary-maintenance-cycles"> <strong>Understanding the Two Primary Maintenance Cycles</strong> <a class="heading-link" href="#understanding-the-two-primary-maintenance-cycles"> @@ -6,7 +6,15 @@ <span class="sr-only">Link to heading</span> </a> </h4> -<p>The Breville Barista Pro has two distinct, automated maintenance procedures: the <strong>Cleaning (Flush) Cycle</strong> and the <strong>Descale Cycle</strong>. It is important to understand that these are not interchangeable, as they address different types of buildup within the machine.</p>Fixing GPU Operator Pods Stuck in Init: Secure Boot, DKMS, and MOK on Proxmox + Debian/posts/secure-boot-dkms-and-mok-on-proxmox-debian/Sat, 09 Aug 2025 00:00:00 +0000/posts/secure-boot-dkms-and-mok-on-proxmox-debian/<p>I hit an issue where all GPU Operator pods on one node were stuck in Init after migrating from Legacy BIOS to UEFI. The common error was NVIDIA components waiting for “toolkit-ready,” while the toolkit init container looped with:</p> +<p>The Breville Barista Pro has two distinct, automated maintenance procedures: the <strong>Cleaning (Flush) Cycle</strong> and the <strong>Descale Cycle</strong>. It is important to understand that these are not interchangeable, as they address different types of buildup within the machine.</p>A Technical Deep Dive into the Transformer's Core Mechanics/posts/a-technical-deep-dive-into-the-transformer-s-core-mechanics/Wed, 20 Aug 2025 04:32:52 +0000/posts/a-technical-deep-dive-into-the-transformer-s-core-mechanics/<p>The Transformer architecture is the bedrock of modern Large Language Models (LLMs). While its high-level success is widely known, a deeper understanding requires dissecting its core components. 
This article provides a detailed, technical breakdown of the fundamental concepts within a Transformer block, from the notion of &ldquo;channels&rdquo; to the intricate workings of the attention mechanism and its relationship with other advanced architectures like Mixture of Experts.</p> +<h3 id="1-the-channel-a-foundational-view-of-d_model"> + 1. The &ldquo;Channel&rdquo;: A Foundational View of <code>d_model</code> + <a class="heading-link" href="#1-the-channel-a-foundational-view-of-d_model"> + <i class="fa-solid fa-link" aria-hidden="true" title="Link to heading"></i> + <span class="sr-only">Link to heading</span> + </a> +</h3> +<p>In deep learning, a &ldquo;channel&rdquo; can be thought of as a feature dimension. While this term is common in Convolutional Neural Networks for images (e.g., Red, Green, Blue channels), in LLMs, the analogous concept is the model&rsquo;s primary embedding dimension, commonly referred to as <code>d_model</code>.</p>Fixing GPU Operator Pods Stuck in Init: Secure Boot, DKMS, and MOK on Proxmox + Debian/posts/secure-boot-dkms-and-mok-on-proxmox-debian/Sat, 09 Aug 2025 00:00:00 +0000/posts/secure-boot-dkms-and-mok-on-proxmox-debian/<p>I hit an issue where all GPU Operator pods on one node were stuck in Init after migrating from Legacy BIOS to UEFI. 
The common error was NVIDIA components waiting for “toolkit-ready,” while the toolkit init container looped with:</p> <ul> <li>nvidia-smi failed to communicate with the NVIDIA driver</li> <li>modprobe nvidia → “Key was rejected by service”</li> diff --git a/posts/mixture-of-experts-moe-models-challenges-solutions-in-practice/index.html b/posts/mixture-of-experts-moe-models-challenges-solutions-in-practice/index.html index 66b5034..8f70bdb 100644 --- a/posts/mixture-of-experts-moe-models-challenges-solutions-in-practice/index.html +++ b/posts/mixture-of-experts-moe-models-challenges-solutions-in-practice/index.html @@ -44,4 +44,4 @@ The Top-K routing mechanism, as illustrated in the provided ima 2016 - 2025 Eric X. Liu -[c06e978] \ No newline at end of file +[0945204] \ No newline at end of file diff --git a/posts/secure-boot-dkms-and-mok-on-proxmox-debian/index.html b/posts/secure-boot-dkms-and-mok-on-proxmox-debian/index.html index 13e11d3..56a0ae6 100644 --- a/posts/secure-boot-dkms-and-mok-on-proxmox-debian/index.html +++ b/posts/secure-boot-dkms-and-mok-on-proxmox-debian/index.html @@ -59,4 +59,4 @@ nvidia-smi failed to communicate with the NVIDIA driver modprobe nvidia → “K 2016 - 2025 Eric X. Liu -[c06e978] \ No newline at end of file +[0945204] \ No newline at end of file diff --git a/posts/supabase-deep-dive/index.html b/posts/supabase-deep-dive/index.html index 6fec890..d4b271c 100644 --- a/posts/supabase-deep-dive/index.html +++ b/posts/supabase-deep-dive/index.html @@ -90,4 +90,4 @@ Supabase enters this space with a radically different philosophy: transparency. 2016 - 2025 Eric X. 
Liu -[c06e978] \ No newline at end of file +[0945204] \ No newline at end of file diff --git a/posts/t5-the-transformer-that-zigged-when-others-zagged-an-architectural-deep-dive/index.html b/posts/t5-the-transformer-that-zigged-when-others-zagged-an-architectural-deep-dive/index.html index 55ebf8a..6bba40d 100644 --- a/posts/t5-the-transformer-that-zigged-when-others-zagged-an-architectural-deep-dive/index.html +++ b/posts/t5-the-transformer-that-zigged-when-others-zagged-an-architectural-deep-dive/index.html @@ -30,4 +30,4 @@ But to truly understand the field, we must look at the pivotal models that explo 2016 - 2025 Eric X. Liu -[c06e978] \ No newline at end of file +[0945204] \ No newline at end of file diff --git a/posts/useful/index.html b/posts/useful/index.html index ddd3c28..de320ed 100644 --- a/posts/useful/index.html +++ b/posts/useful/index.html @@ -9,4 +9,4 @@ One-minute read
    • [c06e978] \ No newline at end of file +[0945204] \ No newline at end of file diff --git a/sitemap.xml b/sitemap.xml index d382b1e..f5929cb 100644 --- a/sitemap.xml +++ b/sitemap.xml @@ -1 +1 @@ -/posts/a-comprehensive-guide-to-breville-barista-pro-maintenance/2025-08-20T04:16:22+00:00weekly0.5/2025-08-20T04:16:22+00:00weekly0.5/posts/2025-08-20T04:16:22+00:00weekly0.5/posts/secure-boot-dkms-and-mok-on-proxmox-debian/2025-08-14T06:50:22+00:00weekly0.5/posts/how-rvq-teaches-llms-to-see-and-hear/2025-08-08T17:36:52+00:00weekly0.5/posts/supabase-deep-dive/2025-08-04T03:59:37+00:00weekly0.5/posts/a-deep-dive-into-ppo-for-language-models/2025-08-16T21:13:18+00:00weekly0.5/posts/mixture-of-experts-moe-models-challenges-solutions-in-practice/2025-08-03T06:02:48+00:00weekly0.5/posts/t5-the-transformer-that-zigged-when-others-zagged-an-architectural-deep-dive/2025-08-03T03:41:10+00:00weekly0.5/posts/espresso-theory-application-a-guide-for-the-breville-barista-pro/2025-08-03T04:20:20+00:00weekly0.5/posts/useful/2025-08-03T08:37:28-07:00weekly0.5/about/2020-06-16T23:30:17-07:00weekly0.5/categories/weekly0.5/tags/weekly0.5 \ No newline at end of file 
+/posts/a-comprehensive-guide-to-breville-barista-pro-maintenance/2025-08-20T04:32:59+00:00weekly0.5/posts/a-technical-deep-dive-into-the-transformer-s-core-mechanics/2025-08-20T04:32:59+00:00weekly0.5/2025-08-20T04:32:59+00:00weekly0.5/posts/2025-08-20T04:32:59+00:00weekly0.5/posts/secure-boot-dkms-and-mok-on-proxmox-debian/2025-08-14T06:50:22+00:00weekly0.5/posts/how-rvq-teaches-llms-to-see-and-hear/2025-08-08T17:36:52+00:00weekly0.5/posts/supabase-deep-dive/2025-08-04T03:59:37+00:00weekly0.5/posts/a-deep-dive-into-ppo-for-language-models/2025-08-16T21:13:18+00:00weekly0.5/posts/mixture-of-experts-moe-models-challenges-solutions-in-practice/2025-08-03T06:02:48+00:00weekly0.5/posts/t5-the-transformer-that-zigged-when-others-zagged-an-architectural-deep-dive/2025-08-03T03:41:10+00:00weekly0.5/posts/espresso-theory-application-a-guide-for-the-breville-barista-pro/2025-08-03T04:20:20+00:00weekly0.5/posts/useful/2025-08-03T08:37:28-07:00weekly0.5/about/2020-06-16T23:30:17-07:00weekly0.5/categories/weekly0.5/tags/weekly0.5 \ No newline at end of file diff --git a/tags/index.html b/tags/index.html index 51d4fe0..a692fb4 100644 --- a/tags/index.html +++ b/tags/index.html @@ -4,4 +4,4 @@ 2016 - 2025 Eric X. Liu -[c06e978] \ No newline at end of file +[0945204] \ No newline at end of file