deploy: 88cbb7efd5

2025-08-02 22:46:40 +00:00
parent 20c1888f78
commit d801fe9307
11 changed files with 42 additions and 13 deletions
--- a/404.html
+++ b/404.html
@@ -4,4 +4,4 @@
 2016 -
 2025
 Eric X. Liu
-<a href="https://git.ericxliu.me/eric/ericxliu-me/commit/e6c8ccf">[e6c8ccf]</a></section></footer></main><script src=/js/coder.min.6ae284be93d2d19dad1f02b0039508d9aab3180a12a06dcc71b0b0ef7825a317.js integrity="sha256-auKEvpPS0Z2tHwKwA5UI2aqzGAoSoG3McbCw73gloxc="></script></body></html>
+<a href="https://git.ericxliu.me/eric/ericxliu-me/commit/88cbb7e">[88cbb7e]</a></section></footer></main><script src=/js/coder.min.6ae284be93d2d19dad1f02b0039508d9aab3180a12a06dcc71b0b0ef7825a317.js integrity="sha256-auKEvpPS0Z2tHwKwA5UI2aqzGAoSoG3McbCw73gloxc="></script></body></html>
--- a/about/index.html
+++ b/about/index.html
@@ -4,4 +4,4 @@
 2016 -
 2025
 Eric X. Liu
-<a href="https://git.ericxliu.me/eric/ericxliu-me/commit/e6c8ccf">[e6c8ccf]</a></section></footer></main><script src=/js/coder.min.6ae284be93d2d19dad1f02b0039508d9aab3180a12a06dcc71b0b0ef7825a317.js integrity="sha256-auKEvpPS0Z2tHwKwA5UI2aqzGAoSoG3McbCw73gloxc="></script></body></html>
+<a href="https://git.ericxliu.me/eric/ericxliu-me/commit/88cbb7e">[88cbb7e]</a></section></footer></main><script src=/js/coder.min.6ae284be93d2d19dad1f02b0039508d9aab3180a12a06dcc71b0b0ef7825a317.js integrity="sha256-auKEvpPS0Z2tHwKwA5UI2aqzGAoSoG3McbCw73gloxc="></script></body></html>
--- a/categories/index.html
+++ b/categories/index.html
@@ -4,4 +4,4 @@
 2016 -
 2025
 Eric X. Liu
-<a href="https://git.ericxliu.me/eric/ericxliu-me/commit/e6c8ccf">[e6c8ccf]</a></section></footer></main><script src=/js/coder.min.6ae284be93d2d19dad1f02b0039508d9aab3180a12a06dcc71b0b0ef7825a317.js integrity="sha256-auKEvpPS0Z2tHwKwA5UI2aqzGAoSoG3McbCw73gloxc="></script></body></html>
+<a href="https://git.ericxliu.me/eric/ericxliu-me/commit/88cbb7e">[88cbb7e]</a></section></footer></main><script src=/js/coder.min.6ae284be93d2d19dad1f02b0039508d9aab3180a12a06dcc71b0b0ef7825a317.js integrity="sha256-auKEvpPS0Z2tHwKwA5UI2aqzGAoSoG3McbCw73gloxc="></script></body></html>
--- a/index.html
+++ b/index.html
@@ -4,4 +4,4 @@
 2016 -
 2025
 Eric X. Liu
-<a href="https://git.ericxliu.me/eric/ericxliu-me/commit/e6c8ccf">[e6c8ccf]</a></section></footer></main><script src=/js/coder.min.6ae284be93d2d19dad1f02b0039508d9aab3180a12a06dcc71b0b0ef7825a317.js integrity="sha256-auKEvpPS0Z2tHwKwA5UI2aqzGAoSoG3McbCw73gloxc="></script></body></html>
+<a href="https://git.ericxliu.me/eric/ericxliu-me/commit/88cbb7e">[88cbb7e]</a></section></footer></main><script src=/js/coder.min.6ae284be93d2d19dad1f02b0039508d9aab3180a12a06dcc71b0b0ef7825a317.js integrity="sha256-auKEvpPS0Z2tHwKwA5UI2aqzGAoSoG3McbCw73gloxc="></script></body></html>
--- a/index.xml
+++ b/index.xml
@@ -1,4 +1,5 @@
-<?xml version="1.0" encoding="utf-8" standalone="yes"?><rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom"><channel><title>Eric X. Liu's Personal Page</title><link>/</link><description>Recent content on Eric X. Liu's Personal Page</description><generator>Hugo</generator><language>en</language><lastBuildDate>Mon, 26 Oct 2020 04:47:36 +0000</lastBuildDate><atom:link href="/index.xml" rel="self" type="application/rss+xml"/><item><title>Some useful files</title><link>/posts/useful/</link><pubDate>Mon, 26 Oct 2020 04:14:43 +0000</pubDate><guid>/posts/useful/</guid><description>&lt;ul>
+<?xml version="1.0" encoding="utf-8" standalone="yes"?><rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom"><channel><title>Eric X. Liu's Personal Page</title><link>/</link><description>Recent content on Eric X. Liu's Personal Page</description><generator>Hugo</generator><language>en</language><lastBuildDate>Sat, 02 Aug 2025 15:46:24 -0700</lastBuildDate><atom:link href="/index.xml" rel="self" type="application/rss+xml"/><item><title>Some useful files</title><link>/posts/useful/</link><pubDate>Mon, 26 Oct 2020 04:14:43 +0000</pubDate><guid>/posts/useful/</guid><description>&lt;ul>
 &lt;li>&lt;a href="https://ericxliu.me/rootCA.pem" class="external-link" target="_blank" rel="noopener">rootCA.pem&lt;/a>&lt;/li>
 &lt;li>&lt;a href="https://ericxliu.me/vpnclient.ovpn" class="external-link" target="_blank" rel="noopener">vpnclient.ovpn&lt;/a>&lt;/li>
-&lt;/ul></description></item><item><title>About</title><link>/about/</link><pubDate>Fri, 01 Jun 2018 07:13:52 +0000</pubDate><guid>/about/</guid><description/></item></channel></rss>
+&lt;/ul></description></item><item><title>About</title><link>/about/</link><pubDate>Fri, 01 Jun 2018 07:13:52 +0000</pubDate><guid>/about/</guid><description/></item><item><title/><link>/posts/a-deep-dive-into-ppo-for-language-models/</link><pubDate>Mon, 01 Jan 0001 00:00:00 +0000</pubDate><guid>/posts/a-deep-dive-into-ppo-for-language-models/</guid><description>&lt;p>Large Language Models (LLMs) have demonstrated astonishing capabilities, but out-of-the-box, they are simply powerful text predictors. They don&amp;rsquo;t inherently understand what makes a response helpful, harmless, or aligned with human values. The technique that has proven most effective at bridging this gap is Reinforcement Learning from Human Feedback (RLHF), and at its heart lies a powerful algorithm: Proximal Policy Optimization (PPO).&lt;/p>
+&lt;p>You may have seen diagrams like the one below, which outlines the RLHF training process. It can look intimidating, with a web of interconnected models, losses, and data flows.&lt;/p></description></item></channel></rss>
--- a/posts/a-deep-dive-into-ppo-for-language-models/index.html
+++ b/posts/a-deep-dive-into-ppo-for-language-models/index.html
@@ -0,0 +1,26 @@
+<!doctype html><html lang=en><head><title>· Eric X. Liu's Personal Page</title><meta charset=utf-8><meta name=viewport content="width=device-width,initial-scale=1"><meta name=color-scheme content="light dark"><meta name=author content="Eric X. Liu"><meta name=description content="Large Language Models (LLMs) have demonstrated astonishing capabilities, but out-of-the-box, they are simply powerful text predictors. They don&rsquo;t inherently understand what makes a response helpful, harmless, or aligned with human values. The technique that has proven most effective at bridging this gap is Reinforcement Learning from Human Feedback (RLHF), and at its heart lies a powerful algorithm: Proximal Policy Optimization (PPO).
+You may have seen diagrams like the one below, which outlines the RLHF training process. It can look intimidating, with a web of interconnected models, losses, and data flows."><meta name=keywords content="software engineer,performance engineering,Google engineer,tech blog,software development,performance optimization,Eric Liu,engineering blog,mountain biking,Jeep enthusiast,overlanding,camping,outdoor adventures"><meta name=fediverse:creator content><meta name=twitter:card content="summary"><meta name=twitter:title content="Eric X. Liu's Personal Page"><meta name=twitter:description content="Large Language Models (LLMs) have demonstrated astonishing capabilities, but out-of-the-box, they are simply powerful text predictors. They don’t inherently understand what makes a response helpful, harmless, or aligned with human values. The technique that has proven most effective at bridging this gap is Reinforcement Learning from Human Feedback (RLHF), and at its heart lies a powerful algorithm: Proximal Policy Optimization (PPO).
+You may have seen diagrams like the one below, which outlines the RLHF training process. It can look intimidating, with a web of interconnected models, losses, and data flows."><meta property="og:url" content="/posts/a-deep-dive-into-ppo-for-language-models/"><meta property="og:site_name" content="Eric X. Liu's Personal Page"><meta property="og:title" content="Eric X. Liu's Personal Page"><meta property="og:description" content="Large Language Models (LLMs) have demonstrated astonishing capabilities, but out-of-the-box, they are simply powerful text predictors. They don’t inherently understand what makes a response helpful, harmless, or aligned with human values. The technique that has proven most effective at bridging this gap is Reinforcement Learning from Human Feedback (RLHF), and at its heart lies a powerful algorithm: Proximal Policy Optimization (PPO).
+You may have seen diagrams like the one below, which outlines the RLHF training process. It can look intimidating, with a web of interconnected models, losses, and data flows."><meta property="og:locale" content="en"><meta property="og:type" content="article"><meta property="article:section" content="posts"><meta property="article:modified_time" content="2025-08-02T15:46:24-07:00"><link rel=canonical href=/posts/a-deep-dive-into-ppo-for-language-models/><link rel=preload href=/fonts/fa-brands-400.woff2 as=font type=font/woff2 crossorigin><link rel=preload href=/fonts/fa-regular-400.woff2 as=font type=font/woff2 crossorigin><link rel=preload href=/fonts/fa-solid-900.woff2 as=font type=font/woff2 crossorigin><link rel=stylesheet href=/css/coder.min.60f552a2c0452fcc0254c54f21c3e0728460c1ae85f97a9c35833a222ef8b884.css integrity="sha256-YPVSosBFL8wCVMVPIcPgcoRgwa6F+XqcNYM6Ii74uIQ=" crossorigin=anonymous media=screen><link rel=stylesheet href=/css/coder-dark.min.a00e6364bacbc8266ad1cc81230774a1397198f8cfb7bcba29b7d6fcb54ce57f.css integrity="sha256-oA5jZLrLyCZq0cyBIwd0oTlxmPjPt7y6KbfW/LVM5X8=" crossorigin=anonymous media=screen><link rel=icon type=image/svg+xml href=/images/favicon.svg sizes=any><link rel=icon type=image/png href=/images/favicon-32x32.png sizes=32x32><link rel=icon type=image/png href=/images/favicon-16x16.png sizes=16x16><link rel=apple-touch-icon href=/images/apple-touch-icon.png><link rel=apple-touch-icon sizes=180x180 href=/images/apple-touch-icon.png><link rel=manifest href=/site.webmanifest><link rel=mask-icon href=/images/safari-pinned-tab.svg color=#5bbad5></head><body class="preload-transitions colorscheme-auto"><div class=float-container><a id=dark-mode-toggle class=colorscheme-toggle><i class="fa-solid fa-adjust fa-fw" aria-hidden=true></i></a></div><main class=wrapper><nav class=navigation><section class=container><a class=navigation-title href=/>Eric X. Liu's Personal Page
+</a><input type=checkbox id=menu-toggle>
+<label class="menu-button float-right" for=menu-toggle><i class="fa-solid fa-bars fa-fw" aria-hidden=true></i></label><ul class=navigation-list><li class=navigation-item><a class=navigation-link href=/posts/>Posts</a></li><li class=navigation-item><a class=navigation-link href=https://chat.ericxliu.me>Chat</a></li><li class=navigation-item><a class=navigation-link href=https://git.ericxliu.me/user/oauth2/Authenitk>Git</a></li><li class=navigation-item><a class=navigation-link href=https://coder.ericxliu.me/api/v2/users/oidc/callback>Coder</a></li><li class=navigation-item><a class=navigation-link href=https://rss.ericxliu.me/oauth2/oidc/redirect>RSS</a></li><li class=navigation-item><a class=navigation-link href=/>|</a></li><li class=navigation-item><a class=navigation-link href=https://sso.ericxliu.me>Sign in</a></li></ul></section></nav><div class=content><section class="container post"><article><header><div class=post-title><h1 class=title><a class=title-link href=/posts/a-deep-dive-into-ppo-for-language-models/></a></h1></div><div class=post-meta><div class=date><span class=posted-on><i class="fa-solid fa-calendar" aria-hidden=true></i>
+<time datetime=0001-01-01T00:00:00Z>January 1, 0001
+</time></span><span class=reading-time><i class="fa-solid fa-clock" aria-hidden=true></i>
+7-minute read</span></div></div></header><div class=post-content><p>Large Language Models (LLMs) have demonstrated astonishing capabilities, but out-of-the-box, they are simply powerful text predictors. They don&rsquo;t inherently understand what makes a response helpful, harmless, or aligned with human values. The technique that has proven most effective at bridging this gap is Reinforcement Learning from Human Feedback (RLHF), and at its heart lies a powerful algorithm: Proximal Policy Optimization (PPO).</p><p>You may have seen diagrams like the one below, which outlines the RLHF training process. It can look intimidating, with a web of interconnected models, losses, and data flows.</p><p>![[Pasted image 20250730232756.png]]</p><p>This post will decode that diagram, piece by piece. We&rsquo;ll explore the &ldquo;why&rdquo; behind each component, moving from high-level concepts to the deep technical reasoning that makes this process work.</p><h3 id=translating-rl-to-a-conversation>Translating RL to a Conversation
+<a class=heading-link href=#translating-rl-to-a-conversation><i class="fa-solid fa-link" aria-hidden=true title="Link to heading"></i>
+<span class=sr-only>Link to heading</span></a></h3><p>The first step is to understand how the traditional language of reinforcement learning maps to the world of text generation.</p><ul><li><strong>State (<code>s_t</code>)</strong>: In a chat setting, the &ldquo;state&rdquo; is the context of the conversation so far. It&rsquo;s the initial prompt (<code>x</code>) plus all the text the model has generated up to the current moment (<code>y₁, ..., y_{t-1}</code>).</li><li><strong>Action (<code>a_t</code>)</strong>: The &ldquo;action&rdquo; is the model&rsquo;s decision at each step. For an LLM, this means generating the very next token (<code>y_t</code>). A full response is a sequence of these actions.blob:https://aistudio.google.com/872e746f-88c1-40ec-8e45-fa0efce97299</li><li><strong>Reward (<code>r</code>)</strong>: The &ldquo;reward&rdquo; is a numeric score that tells the model how good its full response (<code>y</code>) was. This score comes from a separate <strong>Reward Model</strong>, which has been trained on a large dataset of human preference comparisons (e.g., humans rating which of two responses is better). This reward is often only awarded at the end of the entire generated sequence.</li></ul><p>Let&rsquo;s make this concrete. If a user provides the prompt <strong>(x)</strong>: <em>&ldquo;The best thing about AI is&rdquo;</em>, and the model generates the response <strong>(y)</strong>: <em>&ldquo;its potential to solve problems.&rdquo;</em>, here is how it&rsquo;s broken down for training:</p><ul><li><strong>State 1</strong>: &ldquo;The best thing about AI is&rdquo;<ul><li><strong>Action 1</strong>: &ldquo;its&rdquo;</li></ul></li><li><strong>State 2</strong>: &ldquo;The best thing about AI is its&rdquo;<ul><li><strong>Action 2</strong>: " potential"</li></ul></li><li><strong>State 3</strong>: &ldquo;The best thing about AI is its potential&rdquo;<ul><li><strong>Action 3</strong>: " to"</li></ul></li><li>&mldr;and so on for every generated token.</li></ul><p>This breakdown transforms a single prompt-response pair into a rich trajectory of state-action pairs, which becomes the raw data for our learning algorithm.</p><h3 id=the-cast-of-models-an-actor-critic-ensemble>The Cast of Models: An Actor-Critic Ensemble
+<a class=heading-link href=#the-cast-of-models-an-actor-critic-ensemble><i class="fa-solid fa-link" aria-hidden=true title="Link to heading"></i>
+<span class=sr-only>Link to heading</span></a></h3><p>The PPO process doesn&rsquo;t rely on a single model but an ensemble where each member has a distinct role.</p><ol><li><strong>The Actor (Policy LM)</strong>: This is the star of the show—the LLM we are actively fine-tuning. Its role is to take a state (the current text) and decide on an action (the next token). We refer to its decision-making process as its &ldquo;policy&rdquo; (<code>π</code>).</li><li><strong>The Critic (Value Model)</strong>: This is the Actor&rsquo;s coach. The Critic doesn&rsquo;t generate text. Instead, it observes a state and estimates the <em>potential future reward</em> the Actor is likely to receive from that point onward. This estimate is called the &ldquo;value&rdquo; (<code>V(s_t)</code>). The Critic&rsquo;s feedback helps the Actor understand whether it&rsquo;s in a promising or a dead-end situation, which is a much more immediate learning signal than waiting for the final reward.</li><li><strong>The Reward Model</strong>: This is the ultimate judge. As mentioned, it&rsquo;s a separate model trained on human preference data that provides the final score for a complete generation. Its judgment is treated as the ground truth for training both the Actor and the Critic.</li></ol><h3 id=the-challenge-of-credit-assignment-generalized-advantage-estimation-gae>The Challenge of Credit Assignment: Generalized Advantage Estimation (GAE)
+<a class=heading-link href=#the-challenge-of-credit-assignment-generalized-advantage-estimation-gae><i class="fa-solid fa-link" aria-hidden=true title="Link to heading"></i>
+<span class=sr-only>Link to heading</span></a></h3><p>A key problem in RL is assigning credit. If a 20-token response gets a high reward, was it because of the first token, the last one, or all of them? The Critic helps solve this. By comparing the reward at each step with the Critic&rsquo;s value estimate, we can calculate the <strong>Advantage (<code>Â</code>)</strong>.</p><p>A simple advantage calculation might be: <code>Advantage = reward + Value_of_next_state - Value_of_current_state</code>.</p><p>However, this can be noisy. PPO uses a more sophisticated technique called <strong>Generalized Advantage Estimation (GAE)</strong>. The formula looks complex, but the idea is intuitive:</p><p><code>Â(s_t, a_t) = Σ(γλ)^l * δ_{t+l}</code>
+where <code>δ_t = r_t + γV(s_{t+1}) - V(s_t)</code></p><ul><li><strong>γ (gamma)</strong> is a discount factor (e.g., 0.99), which values immediate rewards slightly more than distant ones.</li><li><strong>λ (lambda)</strong> is a smoothing parameter that balances the trade-off between bias and variance. It creates a weighted average of advantages over multiple future time steps.</li></ul><p>In essence, GAE provides a more stable and accurate estimate of how much better a specific action was compared to the policy&rsquo;s average behavior in that state.</p><h3 id=the-heart-of-ppo-the-quest-for-stable-updates>The Heart of PPO: The Quest for Stable Updates
+<a class=heading-link href=#the-heart-of-ppo-the-quest-for-stable-updates><i class="fa-solid fa-link" aria-hidden=true title="Link to heading"></i>
+<span class=sr-only>Link to heading</span></a></h3><p>Now we arrive at the core innovation of PPO. We want to update our Actor model to take actions with higher advantages. The naive way to do this is to re-weight our training objective by an <strong>importance sampling ratio</strong>: <code>(π_new / π_old)</code>. This corrects for the fact that the data we are learning from was generated by a slightly older version of our policy.</p><p>However, this ratio is incredibly dangerous. If the new policy becomes very different from the old one, the ratio can explode, leading to massive, unstable gradient updates that destroy the model.</p><p>PPO solves this with its signature <strong>Clipped Surrogate Objective</strong>. The PPO loss function is:</p><p><code>L_CLIP(θ) = Ê_t [ min( r_t(θ)Â_t, clip(r_t(θ), 1 - ε, 1 + ε)Â_t ) ]</code></p><p>Let&rsquo;s translate this from math to English:</p><ul><li><code>r_t(θ)</code> is the probability ratio <code>π_new(a_t|s_t) / π_old(a_t|s_t)</code>.</li><li>The goal is to increase the objective by an amount proportional to the advantage <code>Â_t</code>.</li><li><strong>The <code>clip</code> function is the crucial safeguard.</strong> It forbids the probability ratio from moving outside a small window (e.g., <code>[0.8, 1.2]</code>).</li></ul><p>This means the algorithm says: &ldquo;Let&rsquo;s update our policy to favor this good action. But if the required update would change the policy too drastically from the old one, we&rsquo;ll &lsquo;clip&rsquo; the update to a more modest size.&rdquo; This creates a &ldquo;trust region,&rdquo; ensuring stable, incremental improvements.</p><h3 id=avoiding-amnesia-the-pretraining-loss>Avoiding Amnesia: The Pretraining Loss
+<a class=heading-link href=#avoiding-amnesia-the-pretraining-loss><i class="fa-solid fa-link" aria-hidden=true title="Link to heading"></i>
+<span class=sr-only>Link to heading</span></a></h3><p>There&rsquo;s one final problem. If we only optimize for the PPO loss, the model might learn to &ldquo;hack&rdquo; the reward model by generating repetitive or nonsensical text that gets a high score. In doing so, it could suffer from <strong>catastrophic forgetting</strong>, losing its fundamental grasp of grammar and facts.</p><p>To prevent this, we introduce a second loss term. As seen in the diagram, we mix in data from the original <strong>Pretraining Data</strong> (or the dataset used for Supervised Fine-Tuning). We calculate a standard next-token prediction loss (<code>LM Loss</code>) on this high-quality data.</p><p>The final loss for the Actor is a combination of both objectives:</p><p><strong>Total Loss = Loss_PPO + <code>λ_ptx</code> * Loss_LM</strong></p><p>This brilliantly balances two goals:</p><ol><li>The <code>Loss_PPO</code> pushes the model towards behaviors that align with human preferences.</li><li>The <code>Loss_LM</code> acts as a regularizer, pulling the model back towards its core language capabilities and preventing it from drifting into gibberish.</li></ol><h3 id=the-full-training-loop>The Full Training Loop
+<a class=heading-link href=#the-full-training-loop><i class="fa-solid fa-link" aria-hidden=true title="Link to heading"></i>
+<span class=sr-only>Link to heading</span></a></h3><p>Now, we can assemble the entire process into a clear, iterative loop:</p><ol><li><strong>Collect</strong>: The current Actor policy <code>π_k</code> generates responses to a batch of prompts. These experiences—<code>(state, action, probability, reward, value)</code>—are stored in an <strong>Experience Buffer</strong>.</li><li><strong>Calculate</strong>: Once the buffer is full, we use the collected data to compute the advantage estimates <code>Â_t</code> for every single token-generation step.</li><li><strong>Optimize</strong>: For a few epochs, we repeatedly sample mini-batches from the buffer and update the Actor and Critic models. The Actor is updated using the combined <code>PPO-clip Loss</code> and <code>LM Loss</code>. The Critic is updated to improve its value predictions.</li><li><strong>Flush and Repeat</strong>: After the optimization phase, the entire experience buffer is discarded. The data is now &ldquo;stale&rdquo; because our policy has changed. The newly updated policy <code>π_{k+1}</code> becomes the new Actor, and we return to step 1 to collect fresh data.</li></ol><p>This cycle of collection and optimization allows the language model to gradually and safely steer its behavior towards human-defined goals, creating the helpful and aligned AI assistants we interact with today.</p><hr><p><strong>References:</strong></p><ol><li>Schulman, J., Wolski, F., Dhariwal, P., Radford, A., & Klimov, O. (2017). <em>Proximal Policy Optimization Algorithms</em>. arXiv preprint arXiv:1707.06347.</li><li>Schulman, J., Moritz, P., Levine, S., Jordan, M., & Abbeel, P. (2015). <em>High-Dimensional Continuous Control Using Generalized Advantage Estimation</em>. arXiv preprint arXiv:1506.02438.</li><li>Ouyang, L., et al. (2022). <em>Training language models to follow instructions with human feedback</em>. Advances in Neural Information Processing Systems 35.</li></ol></div><footer></footer></article></section></div><footer class=footer><section class=container>©
+2016 -
+2025
+Eric X. Liu
+<a href="https://git.ericxliu.me/eric/ericxliu-me/commit/88cbb7e">[88cbb7e]</a></section></footer></main><script src=/js/coder.min.6ae284be93d2d19dad1f02b0039508d9aab3180a12a06dcc71b0b0ef7825a317.js integrity="sha256-auKEvpPS0Z2tHwKwA5UI2aqzGAoSoG3McbCw73gloxc="></script></body></html>
--- a/posts/index.html
+++ b/posts/index.html
@@ -1,8 +1,9 @@
 <!doctype html><html lang=en><head><title>Posts · Eric X. Liu's Personal Page</title><meta charset=utf-8><meta name=viewport content="width=device-width,initial-scale=1"><meta name=color-scheme content="light dark"><meta name=author content="Eric X. Liu"><meta name=description content="Eric X. Liu - Software & Performance Engineer at Google. Sharing insights about software engineering, performance optimization, tech industry experiences, mountain biking adventures, Jeep overlanding, and outdoor activities."><meta name=keywords content="software engineer,performance engineering,Google engineer,tech blog,software development,performance optimization,Eric Liu,engineering blog,mountain biking,Jeep enthusiast,overlanding,camping,outdoor adventures"><meta name=fediverse:creator content><meta name=twitter:card content="summary"><meta name=twitter:title content="Posts"><meta name=twitter:description content="Eric X. Liu - Software & Performance Engineer at Google. Sharing insights about software engineering, performance optimization, tech industry experiences, mountain biking adventures, Jeep overlanding, and outdoor activities."><meta property="og:url" content="/posts/"><meta property="og:site_name" content="Eric X. Liu's Personal Page"><meta property="og:title" content="Posts"><meta property="og:description" content="Eric X. Liu - Software & Performance Engineer at Google. Sharing insights about software engineering, performance optimization, tech industry experiences, mountain biking adventures, Jeep overlanding, and outdoor activities."><meta property="og:locale" content="en"><meta property="og:type" content="website"><link rel=canonical href=/posts/><link rel=preload href=/fonts/fa-brands-400.woff2 as=font type=font/woff2 crossorigin><link rel=preload href=/fonts/fa-regular-400.woff2 as=font type=font/woff2 crossorigin><link rel=preload href=/fonts/fa-solid-900.woff2 as=font type=font/woff2 crossorigin><link rel=stylesheet href=/css/coder.min.60f552a2c0452fcc0254c54f21c3e0728460c1ae85f97a9c35833a222ef8b884.css integrity="sha256-YPVSosBFL8wCVMVPIcPgcoRgwa6F+XqcNYM6Ii74uIQ=" crossorigin=anonymous media=screen><link rel=stylesheet href=/css/coder-dark.min.a00e6364bacbc8266ad1cc81230774a1397198f8cfb7bcba29b7d6fcb54ce57f.css integrity="sha256-oA5jZLrLyCZq0cyBIwd0oTlxmPjPt7y6KbfW/LVM5X8=" crossorigin=anonymous media=screen><link rel=icon type=image/svg+xml href=/images/favicon.svg sizes=any><link rel=icon type=image/png href=/images/favicon-32x32.png sizes=32x32><link rel=icon type=image/png href=/images/favicon-16x16.png sizes=16x16><link rel=apple-touch-icon href=/images/apple-touch-icon.png><link rel=apple-touch-icon sizes=180x180 href=/images/apple-touch-icon.png><link rel=manifest href=/site.webmanifest><link rel=mask-icon href=/images/safari-pinned-tab.svg color=#5bbad5><link rel=alternate type=application/rss+xml href=/posts/index.xml title="Eric X. Liu's Personal Page"></head><body class="preload-transitions colorscheme-auto"><div class=float-container><a id=dark-mode-toggle class=colorscheme-toggle><i class="fa-solid fa-adjust fa-fw" aria-hidden=true></i></a></div><main class=wrapper><nav class=navigation><section class=container><a class=navigation-title href=/>Eric X. Liu's Personal Page
 </a><input type=checkbox id=menu-toggle>
 <label class="menu-button float-right" for=menu-toggle><i class="fa-solid fa-bars fa-fw" aria-hidden=true></i></label><ul class=navigation-list><li class=navigation-item><a class=navigation-link href=/posts/>Posts</a></li><li class=navigation-item><a class=navigation-link href=https://chat.ericxliu.me>Chat</a></li><li class=navigation-item><a class=navigation-link href=https://git.ericxliu.me/user/oauth2/Authenitk>Git</a></li><li class=navigation-item><a class=navigation-link href=https://coder.ericxliu.me/api/v2/users/oidc/callback>Coder</a></li><li class=navigation-item><a class=navigation-link href=https://rss.ericxliu.me/oauth2/oidc/redirect>RSS</a></li><li class=navigation-item><a class=navigation-link href=/>|</a></li><li class=navigation-item><a class=navigation-link href=https://sso.ericxliu.me>Sign in</a></li></ul></section></nav><div class=content><section class="container list"><header><h1 class=title><a class=title-link href=/posts/>Posts</a></h1></header><ul><li><span class=date>October 26, 2020</span>
-<a class=title href=/posts/useful/>Some useful files</a></li></ul></section></div><footer class=footer><section class=container>©
+<a class=title href=/posts/useful/>Some useful files</a></li><li><span class=date>January 1, 0001</span>
+<a class=title href=/posts/a-deep-dive-into-ppo-for-language-models/></a></li></ul></section></div><footer class=footer><section class=container>©
 2016 -
 2025
 Eric X. Liu
-<a href="https://git.ericxliu.me/eric/ericxliu-me/commit/e6c8ccf">[e6c8ccf]</a></section></footer></main><script src=/js/coder.min.6ae284be93d2d19dad1f02b0039508d9aab3180a12a06dcc71b0b0ef7825a317.js integrity="sha256-auKEvpPS0Z2tHwKwA5UI2aqzGAoSoG3McbCw73gloxc="></script></body></html>
+<a href="https://git.ericxliu.me/eric/ericxliu-me/commit/88cbb7e">[88cbb7e]</a></section></footer></main><script src=/js/coder.min.6ae284be93d2d19dad1f02b0039508d9aab3180a12a06dcc71b0b0ef7825a317.js integrity="sha256-auKEvpPS0Z2tHwKwA5UI2aqzGAoSoG3McbCw73gloxc="></script></body></html>
--- a/posts/index.xml
+++ b/posts/index.xml
@@ -1,4 +1,5 @@
-<?xml version="1.0" encoding="utf-8" standalone="yes"?><rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom"><channel><title>Posts on Eric X. Liu's Personal Page</title><link>/posts/</link><description>Recent content in Posts on Eric X. Liu's Personal Page</description><generator>Hugo</generator><language>en</language><lastBuildDate>Mon, 26 Oct 2020 04:47:36 +0000</lastBuildDate><atom:link href="/posts/index.xml" rel="self" type="application/rss+xml"/><item><title>Some useful files</title><link>/posts/useful/</link><pubDate>Mon, 26 Oct 2020 04:14:43 +0000</pubDate><guid>/posts/useful/</guid><description>&lt;ul>
+<?xml version="1.0" encoding="utf-8" standalone="yes"?><rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom"><channel><title>Posts on Eric X. Liu's Personal Page</title><link>/posts/</link><description>Recent content in Posts on Eric X. Liu's Personal Page</description><generator>Hugo</generator><language>en</language><lastBuildDate>Sat, 02 Aug 2025 15:46:24 -0700</lastBuildDate><atom:link href="/posts/index.xml" rel="self" type="application/rss+xml"/><item><title>Some useful files</title><link>/posts/useful/</link><pubDate>Mon, 26 Oct 2020 04:14:43 +0000</pubDate><guid>/posts/useful/</guid><description>&lt;ul>
 &lt;li>&lt;a href="https://ericxliu.me/rootCA.pem" class="external-link" target="_blank" rel="noopener">rootCA.pem&lt;/a>&lt;/li>
 &lt;li>&lt;a href="https://ericxliu.me/vpnclient.ovpn" class="external-link" target="_blank" rel="noopener">vpnclient.ovpn&lt;/a>&lt;/li>
-&lt;/ul></description></item></channel></rss>
+&lt;/ul></description></item><item><title/><link>/posts/a-deep-dive-into-ppo-for-language-models/</link><pubDate>Mon, 01 Jan 0001 00:00:00 +0000</pubDate><guid>/posts/a-deep-dive-into-ppo-for-language-models/</guid><description>&lt;p>Large Language Models (LLMs) have demonstrated astonishing capabilities, but out-of-the-box, they are simply powerful text predictors. They don&amp;rsquo;t inherently understand what makes a response helpful, harmless, or aligned with human values. The technique that has proven most effective at bridging this gap is Reinforcement Learning from Human Feedback (RLHF), and at its heart lies a powerful algorithm: Proximal Policy Optimization (PPO).&lt;/p>
+&lt;p>You may have seen diagrams like the one below, which outlines the RLHF training process. It can look intimidating, with a web of interconnected models, losses, and data flows.&lt;/p></description></item></channel></rss>
--- a/posts/useful/index.html
+++ b/posts/useful/index.html
@@ -10,4 +10,4 @@ One-minute read</span></div></div></header><div class=post-content><ul><li><a hr
 2016 -
 2025
 Eric X. Liu
-<a href="https://git.ericxliu.me/eric/ericxliu-me/commit/e6c8ccf">[e6c8ccf]</a></section></footer></main><script src=/js/coder.min.6ae284be93d2d19dad1f02b0039508d9aab3180a12a06dcc71b0b0ef7825a317.js integrity="sha256-auKEvpPS0Z2tHwKwA5UI2aqzGAoSoG3McbCw73gloxc="></script></body></html>
+<a href="https://git.ericxliu.me/eric/ericxliu-me/commit/88cbb7e">[88cbb7e]</a></section></footer></main><script src=/js/coder.min.6ae284be93d2d19dad1f02b0039508d9aab3180a12a06dcc71b0b0ef7825a317.js integrity="sha256-auKEvpPS0Z2tHwKwA5UI2aqzGAoSoG3McbCw73gloxc="></script></body></html>
--- a/sitemap.xml
+++ b/sitemap.xml
@@ -1 +1 @@
-<?xml version="1.0" encoding="utf-8" standalone="yes"?><urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:xhtml="http://www.w3.org/1999/xhtml"><url><loc>/</loc><lastmod>2020-10-26T04:47:36+00:00</lastmod><changefreq>weekly</changefreq><priority>0.5</priority></url><url><loc>/posts/</loc><lastmod>2020-10-26T04:47:36+00:00</lastmod><changefreq>weekly</changefreq><priority>0.5</priority></url><url><loc>/posts/useful/</loc><lastmod>2020-10-26T04:47:36+00:00</lastmod><changefreq>weekly</changefreq><priority>0.5</priority></url><url><loc>/about/</loc><lastmod>2020-06-16T23:30:17-07:00</lastmod><changefreq>weekly</changefreq><priority>0.5</priority></url><url><loc>/categories/</loc><changefreq>weekly</changefreq><priority>0.5</priority></url><url><loc>/tags/</loc><changefreq>weekly</changefreq><priority>0.5</priority></url></urlset>
+<?xml version="1.0" encoding="utf-8" standalone="yes"?><urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:xhtml="http://www.w3.org/1999/xhtml"><url><loc>/</loc><lastmod>2025-08-02T15:46:24-07:00</lastmod><changefreq>weekly</changefreq><priority>0.5</priority></url><url><loc>/posts/</loc><lastmod>2025-08-02T15:46:24-07:00</lastmod><changefreq>weekly</changefreq><priority>0.5</priority></url><url><loc>/posts/useful/</loc><lastmod>2020-10-26T04:47:36+00:00</lastmod><changefreq>weekly</changefreq><priority>0.5</priority></url><url><loc>/about/</loc><lastmod>2020-06-16T23:30:17-07:00</lastmod><changefreq>weekly</changefreq><priority>0.5</priority></url><url><loc>/posts/a-deep-dive-into-ppo-for-language-models/</loc><lastmod>2025-08-02T15:46:24-07:00</lastmod><changefreq>weekly</changefreq><priority>0.5</priority></url><url><loc>/categories/</loc><changefreq>weekly</changefreq><priority>0.5</priority></url><url><loc>/tags/</loc><changefreq>weekly</changefreq><priority>0.5</priority></url></urlset>
--- a/tags/index.html
+++ b/tags/index.html
@@ -4,4 +4,4 @@
 2016 -
 2025
 Eric X. Liu
-<a href="https://git.ericxliu.me/eric/ericxliu-me/commit/e6c8ccf">[e6c8ccf]</a></section></footer></main><script src=/js/coder.min.6ae284be93d2d19dad1f02b0039508d9aab3180a12a06dcc71b0b0ef7825a317.js integrity="sha256-auKEvpPS0Z2tHwKwA5UI2aqzGAoSoG3McbCw73gloxc="></script></body></html>
+<a href="https://git.ericxliu.me/eric/ericxliu-me/commit/88cbb7e">[88cbb7e]</a></section></footer></main><script src=/js/coder.min.6ae284be93d2d19dad1f02b0039508d9aab3180a12a06dcc71b0b0ef7825a317.js integrity="sha256-auKEvpPS0Z2tHwKwA5UI2aqzGAoSoG3McbCw73gloxc="></script></body></html>