# Why Your Jetson Orin Nano's 40 TOPS Goes Unused (And What That Means for Edge AI)

## Introduction
NVIDIA’s Jetson Orin Nano promises impressive specs: 1024 CUDA cores, 32 Tensor Cores, and 40 TOPS of INT8 compute performance packed into a compact, power-efficient edge device. On paper, it looks like a capable platform for running Large Language Models locally. But there’s a catch—one that reveals a fundamental tension in modern edge AI hardware design.

After running 66 inference tests across seven different language models ranging from 0.5B to 5.4B parameters, I discovered something counterintuitive: the device’s computational muscle sits largely idle during single-stream LLM inference. The bottleneck isn’t computation—it’s memory bandwidth. This isn’t just a quirk of one device; it’s a fundamental characteristic of single-user, autoregressive token generation on edge hardware—a reality that shapes how we should approach local LLM deployment.
## The Hardware: What We’re Working With
The NVIDIA Jetson Orin Nano 8GB I tested features:

- **GPU**: NVIDIA Ampere architecture with 1024 CUDA cores and 32 Tensor Cores
- **Compute Performance**: 40 TOPS (INT8), 10 TFLOPS (FP16), 5 TFLOPS (FP32)
- **Memory**: 8GB LPDDR5 unified memory with 68 GB/s bandwidth
- **Available VRAM**: Approximately 5.2GB after OS overhead
- **CPU**: 6-core ARM Cortex-A78AE (ARMv8.2, 64-bit)
- **TDP**: 7-25W configurable

The unified memory architecture is a double-edged sword: CPU and GPU share the same physical memory pool, which eliminates PCIe transfer overhead but also means you’re working with just 5.2GB of usable VRAM after the OS takes its share. This constraint shapes everything about LLM deployment on this device.
## Testing Methodology
### The Models
I tested seven models ranging from 0.5B to 5.4B parameters—essentially the entire practical deployment range for this hardware. The selection covered two inference backends (Ollama and vLLM) and various quantization strategies:

**Ollama-served models (with quantization):**

- Gemma 3 1B (Q4_K_M, 815MB)
- Gemma 3n E2B (Q4_K_M, 3.5GB, 5.44B total params, 2B effective)
- Qwen 2.5 0.5B (Q4_K_M, 350MB)
- Qwen 3 0.6B (FP8, 600MB)

**vLLM-served models (minimal/no quantization):**

- google/gemma-3-1b-it (FP16, 2GB)
- Qwen/Qwen2.5-0.5B-Instruct (FP16, 1GB)
- Qwen/Qwen3-0.6B-FP8 (FP8, 600MB)
### The Testing Process
Each model faced 10-12 prompts of varying complexity—from simple arithmetic to technical explanations about LLMs themselves. All tests ran with batch size = 1, simulating a single user interacting with a local chatbot—the typical edge deployment scenario. Out of 84 planned tests, 66 completed successfully (78.6% success rate). The failures? Mostly out-of-memory crashes on larger models and occasional inference engine instability.
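For anyone who wants to reproduce this kind of measurement, the sketch below shows one way to capture time-to-first-token and decode throughput against a local Ollama server. It is a minimal illustration rather than the exact harness behind these numbers; it relies on Ollama's streaming `/api/generate` endpoint, whose final message reports `eval_count` and `eval_duration`, and the model tag in the example is arbitrary.

```python
"""Minimal single-stream benchmark sketch (illustrative, not the exact harness used here)."""
import json
import time

import requests

OLLAMA_URL = "http://localhost:11434/api/generate"  # Ollama's default local endpoint


def benchmark(model: str, prompt: str) -> dict:
    """Measure client-side time-to-first-token and server-reported decode speed."""
    start = time.perf_counter()
    first_token_at = None
    payload = {"model": model, "prompt": prompt, "stream": True}
    with requests.post(OLLAMA_URL, json=payload, stream=True, timeout=300) as resp:
        resp.raise_for_status()
        for line in resp.iter_lines():
            if not line:
                continue
            chunk = json.loads(line)
            if first_token_at is None and chunk.get("response"):
                first_token_at = time.perf_counter()
            if chunk.get("done"):
                # eval_count / eval_duration (nanoseconds) describe the decode phase
                decode_tps = chunk["eval_count"] / (chunk["eval_duration"] / 1e9)
                return {
                    "model": model,
                    "ttft_s": round(first_token_at - start, 3),
                    "decode_tokens_per_s": round(decode_tps, 2),
                }
    raise RuntimeError("stream ended without a final 'done' message")


if __name__ == "__main__":
    # Example invocation; the model tag is one of the Ollama models listed above.
    print(benchmark("qwen3:0.6b", "Explain what a KV cache is in two sentences."))
```

The vLLM runs can be timed the same way from the client side through its OpenAI-compatible streaming endpoint.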
### Understanding the Limits: Roofline Analysis
To understand where performance hits its ceiling, I applied roofline analysis—a method that reveals whether a workload is compute-bound (limited by processing power) or memory-bound (limited by data transfer speed). For each model, I calculated:

- **FLOPs per token**: Approximately 2 × total_parameters (accounting for matrix multiplications in forward pass)
- **Bytes per token**: model_size × 1.1 (including 10% overhead for activations and KV cache)
- **Operational Intensity (OI)**: FLOPs per token / Bytes per token
- **Theoretical performance**: min(compute_limit, bandwidth_limit)

The roofline model works by comparing a workload’s operational intensity (how many calculations you do per byte of data moved) against the device’s balance point. If your operational intensity is too low, you’re bottlenecked by memory bandwidth—and as we’ll see, that’s exactly what happens with LLM inference.

![S3 File](/images/benchmarking-llms-on-jetson-orin-nano/16d64bdc9cf14b05b7c40c4718b8091b.png)
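These formulas are simple enough to evaluate directly. The sketch below applies them with the device figures from the hardware section (68 GB/s, 10 TFLOPS FP16) and the per-parameter sizes implied by each quantization (4.5 bits for Q4_K_M, 8 bits for FP8, 16 bits for FP16); treating weight bytes as params × bits/8 is my simplification, but the resulting theoretical throughputs line up with the results table later in the post.

```python
# Roofline sketch for single-stream decoding, following the formulas above:
#   FLOPs/token ≈ 2 × active params, bytes/token ≈ weight bytes × 1.1,
#   theoretical t/s = min(compute limit, bandwidth limit).
BANDWIDTH = 68e9      # bytes/s (LPDDR5, per the spec sheet)
PEAK_FLOPS = 10e12    # FP16 FLOP/s

MODELS = {
    # name: (active parameters, bytes per parameter)
    "gemma3:1b (Q4_K_M)":          (1.0e9, 4.5 / 8),
    "qwen3:0.6b (FP8)":            (0.6e9, 1.0),
    "qwen2.5:0.5b (Q4_K_M)":       (0.5e9, 4.5 / 8),
    "gemma3n:e2b (Q4_K_M)":        (2.0e9, 4.5 / 8),   # 2B effective params per token
    "google/gemma-3-1b-it (FP16)": (1.0e9, 2.0),
}

for name, (params, bytes_per_param) in MODELS.items():
    flops_per_token = 2 * params
    bytes_per_token = params * bytes_per_param * 1.1   # +10% activations / KV cache
    oi = flops_per_token / bytes_per_token              # operational intensity
    tps_compute = PEAK_FLOPS / flops_per_token
    tps_bandwidth = BANDWIDTH / bytes_per_token
    bound = "memory" if tps_bandwidth < tps_compute else "compute"
    print(f"{name:30s} OI={oi:4.2f}  "
          f"theoretical={min(tps_compute, tps_bandwidth):7.2f} t/s  ({bound}-bound)")
```

Running this reproduces the OI and theoretical-throughput columns of the table in the results section.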
## The Results: Speed and Efficiency
### Responsiveness: First Token Latency
The time to generate the first output token—a critical metric for interactive applications—varied significantly:

- qwen3:0.6b (Ollama): 0.522 seconds
- gemma3:1b (Ollama): 1.000 seconds
- qwen2.5:0.5b (Ollama): 1.415 seconds
- gemma3n:e2b (Ollama): 1.998 seconds

Smaller, quantized models get to that first token faster—exactly what you want for a chatbot or interactive assistant where perceived responsiveness matters as much as raw throughput.
### The Memory Bottleneck Revealed
When I compared actual performance against theoretical limits, the results were striking:

| Model | Theoretical (t/s) | Actual (t/s) | Efficiency | Bottleneck | OI (FLOPs/byte) |
|-------|-------------------|--------------|------------|------------|-----------------|
| gemma3:1b | 109.90 | 26.33 | 24.0% | Memory | 3.23 |
| qwen3:0.6b | 103.03 | 38.84 | 37.7% | Memory | 1.82 |
| qwen2.5:0.5b | 219.80 | 35.24 | 16.0% | Memory | 3.23 |
| gemma3n:e2b | 54.95 | 8.98 | 16.3% | Memory | 3.23 |
| google/gemma-3-1b-it | 30.91 | 4.59 | 14.9% | Memory | 0.91 |
| Qwen/Qwen3-0.6B-FP8 | 103.03 | 12.81 | 12.4% | Memory | 1.82 |
| Qwen/Qwen2.5-0.5B-Instruct | 61.82 | 15.18 | 24.6% | Memory | 0.91 |

Every single model is memory-bound in this single-stream inference scenario. Average hardware efficiency sits at just 20.8%—meaning the computational units spend most of their time waiting for data rather than crunching numbers. That advertised 40 TOPS? Largely untapped when generating one token at a time for a single user.
![S3 File](/images/benchmarking-llms-on-jetson-orin-nano/ee04876d75d247f9b27a647462555777.png)
## What This Actually Means
### Why Memory Bandwidth Dominates (in Single-Stream Inference)
The roofline numbers tell a clear story: operational intensity ranges from 0.91 to 3.23 FLOPs/byte across all tested models during single-token generation (batch size = 1). To actually saturate those 1024 CUDA cores and hit compute-bound operation, you’d need an operational intensity around 147 FLOPs/byte at the device’s 68 GB/s memory bandwidth.

In practice, for a model to actually become compute-bound on this device during single-stream inference, it would need an operational intensity exceeding:

```
OI_threshold = Peak_Compute / Memory_Bandwidth
```
Single-stream autoregressive decoding falls 100-600× short of this threshold because each token generation requires loading the entire model from memory (matrix-vector multiplication) while performing only ~2 FLOPs per parameter. The compute units are idle most of the time, simply waiting for model weights and activations to arrive from memory.

Note: Production LLM serving with large batch sizes (32-256 requests) changes this dynamic dramatically—batching transforms matrix-vector operations into matrix-matrix multiplications, increasing operational intensity by 30-250× and making workloads compute-bound. However, edge devices serving single users cannot exploit this optimization.

The largest model tested—gemma3n:e2b at 3.5GB quantized (5.44B total parameters, 2B effective)—shows only 16.3% efficiency, similar to other quantized models. Despite being the largest model, Q4_K_M quantization keeps its memory footprint manageable, resulting in similar operational intensity (3.23 FLOPs/byte) to the other INT4-quantized models. Its MatFormer architecture with selective parameter activation (only 2B of 5.44B params active per token) actually helps reduce memory traffic, though this benefit is partially offset by the overhead of routing logic.
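To make the batching note above concrete, the sketch below scales operational intensity with batch size under a deliberately simplified model: weights are read from DRAM once per decode step while FLOPs grow with the number of concurrent sequences, and per-sequence KV-cache traffic is ignored, so the crossover batch size is only an order-of-magnitude estimate.

```python
# How batching raises operational intensity during decoding (simplified model):
# weights are read from DRAM once per step regardless of batch size, FLOPs grow
# linearly with the batch, and per-sequence KV-cache traffic is ignored.
PEAK_FLOPS = 10e12               # FP16
BANDWIDTH = 68e9                 # bytes/s
RIDGE = PEAK_FLOPS / BANDWIDTH   # ~147 FLOPs/byte needed to become compute-bound

params = 1.0e9
weight_bytes = params * (4.5 / 8) * 1.1   # Q4_K_M weights + 10% overhead

for batch in (1, 8, 32, 128, 256):
    flops_per_step = 2 * params * batch   # matrix-vector turns into matrix-matrix
    oi = flops_per_step / weight_bytes
    label = "compute-bound" if oi >= RIDGE else "memory-bound"
    print(f"batch={batch:3d}  OI={oi:7.1f} FLOPs/byte  {label} (ridge ≈ {RIDGE:.0f})")
```

Using the INT8 peak (40 TOPS) instead of the FP16 figure pushes the ridge to roughly 590 FLOPs/byte, which makes the single-stream gap even larger.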
### What This Means for Edge Deployment
The performance gap between Ollama and vLLM (2.3-5.7×) tells us something important about optimization priorities for single-user edge devices:

- **Qwen 2.5 0.5B:** Ollama (Q4_K_M, 350MB) at 35.24 t/s vs vLLM (FP16, 1GB) at 15.18 t/s—2.32× faster
- **Qwen 3 0.6B:** Ollama (FP8) at 38.84 t/s vs vLLM (FP8) at 12.81 t/s—3.03× faster despite identical quantization
- **Gemma 3 1B:** Ollama (Q4_K_M, 815MB) at 26.33 t/s vs vLLM (FP16, 2GB) at 4.59 t/s—5.74× faster

In single-stream scenarios, quantization delivers near-linear performance gains by directly attacking the memory bandwidth bottleneck. Q4_K_M quantization (4.5 bits/parameter) hits a sweet spot between model quality and speed. Going lower to INT2 might help further, but you’ll need to carefully evaluate output quality.

The real insight: Ollama’s edge-first design philosophy (GGUF format, streamlined execution, optimized kernels from llama.cpp) is fundamentally better aligned with single-stream, memory-constrained edge scenarios. vLLM’s datacenter features—continuous batching, PagedAttention, tensor parallelism—add overhead without providing benefits when serving individual users on unified memory architectures. These features shine in multi-user production serving where batching can be exploited, but hurt performance in the single-stream case.

**What you should actually do**: Stick with Ollama or TensorRT-LLM using Q4_K_M/INT4 quantized models in GGUF format. Target the 0.5-1B parameter range (under 3GB) to leave headroom for KV cache. Focus your optimization efforts on memory access patterns and bandwidth reduction. Watch for emerging techniques like INT4 AWQ, sparse attention, and quantized KV caches.
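As a rough sanity check on that guidance (weights under ~3GB, headroom left for KV cache), here is a back-of-the-envelope fit test against the roughly 5.2GB of usable unified memory. The layer and head dimensions and the runtime-overhead allowance are hypothetical placeholders, not values measured in this post.

```python
# Back-of-the-envelope fit test against ~5.2 GB of usable unified memory.
# The layer/head dimensions and overhead allowance are hypothetical placeholders
# for a ~1B-parameter model, not values measured in this post.
USABLE_BYTES = 5.2e9


def weight_bytes(params: float, bits_per_param: float) -> float:
    return params * bits_per_param / 8


def kv_cache_bytes(layers: int, kv_heads: int, head_dim: int,
                   context_tokens: int, bytes_per_value: int = 2) -> float:
    # K and V: 2 values per (layer, KV head, head_dim) per cached token
    return 2 * layers * kv_heads * head_dim * context_tokens * bytes_per_value


weights = weight_bytes(1.0e9, 4.5)                        # ~1B model at Q4_K_M
kv = kv_cache_bytes(layers=26, kv_heads=4, head_dim=128,  # hypothetical dimensions
                    context_tokens=8192)                   # FP16 KV cache
overhead = 0.5e9                                           # assumed runtime buffers

total = weights + kv + overhead
print(f"weights={weights/1e9:.2f} GB  kv_cache={kv/1e9:.2f} GB  "
      f"total≈{total/1e9:.2f} GB  fits={total <= USABLE_BYTES}")
```

Real GGUF files run somewhat larger than the params × bits estimate (the post lists 815MB for the 1B Q4_K_M model), so keep extra margin.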
### Room for Improvement
The 20.8% average efficiency might sound terrible, but it’s actually typical for edge AI devices running single-stream inference. Datacenter GPUs hit 60-80% efficiency on optimized workloads—but that’s typically with large batch sizes that increase operational intensity. In comparable single-stream scenarios, even high-end GPUs see similar efficiency drops. Edge devices commonly land in the 15-40% range due to architectural tradeoffs and memory bandwidth constraints relative to their compute capability.

Three factors explain the gap:

1. **Architecture**: Unified memory sacrifices bandwidth for integration simplicity. The 4MB L2 cache and 7-15W TDP limit further constrain performance.
2. **Software maturity**: Edge inference frameworks lag behind their datacenter counterparts in optimization.
3. **Runtime overhead**: Quantization/dequantization operations, Python abstractions, and non-optimized kernels all add up.

The consistent 16-24% efficiency across most models suggests there’s room for 2-3× speedups through better software optimization—particularly in memory access patterns and kernel implementations. But fundamental performance leaps will require hardware changes—specifically, prioritizing memory bandwidth (200+ GB/s) over raw compute capability in future edge AI chips.
## Where to Go From Here
### Software Optimizations Worth Pursuing
- Optimize memory access patterns in attention and MLP kernels
- Implement quantized KV cache (8-bit or lower)
- Tune for small batch sizes (2-4) to improve memory bus utilization
- Overlap CPU-GPU pipeline operations to hide latency
### Research Directions
- Architectures with higher operational intensity (fewer memory accesses per compute operation)
- Sparse attention patterns to reduce memory movement
- On-device LoRA fine-tuning with frozen, quantized base weights
- Multi-model serving with shared base model weights
### What Edge AI Hardware Designers Should Focus On
Future edge AI devices optimized for local, single-user LLM inference need a fundamental shift in priorities: memory bandwidth over raw compute capability. Specifically:

- 200+ GB/s memory bandwidth (3× current Jetson Orin Nano)
- HBM integration for higher bandwidth density
- 16GB+ capacity to support 7B+ parameter models
- Purpose-built INT4/INT8 accelerators with larger on-chip caches to reduce DRAM traffic