[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"post-v2-\u002Fblog\u002Fagent-ops-full-postmortem":3},{"id":4,"title":5,"body":6,"date":1857,"description":1858,"draft":1859,"extension":1860,"meta":1861,"navigation":494,"path":1863,"seo":1864,"stem":1865,"tags":1866,"__hash__":1881},"blog\u002Fblog\u002Fagent-ops-full-postmortem.md","AI Ops 翻车全复盘：v4→v5→v4 的 100 次部署与一次 OOM",{"type":7,"value":8,"toc":1820},"minimark",[9,13,20,23,26,31,34,179,182,185,187,191,194,243,246,249,251,255,258,263,368,376,382,385,387,391,394,397,419,422,425,427,431,435,438,741,744,768,771,774,827,833,923,926,928,932,935,943,993,996,1026,1029,1035,1038,1042,1045,1051,1054,1142,1145,1149,1152,1158,1160,1164,1168,1171,1191,1195,1198,1204,1207,1210,1213,1215,1219,1222,1225,1332,1335,1339,1342,1347,1389,1411,1422,1424,1428,1530,1533,1535,1539,1542,1647,1650,1652,1656,1660,1667,1671,1685,1689,1692,1696,1702,1705,1709,1716,1720,1723,1727,1733,1735,1739,1796,1798,1801,1804,1807,1810,1813,1816],[10,11,5],"h1",{"id":12},"ai-ops-翻车全复盘v4v5v4-的-100-次部署与一次-oom",[14,15,16],"blockquote",{},[17,18,19],"p",{},"TL;DR — 用了 9 个 AI Agent、140 个 Kanban 任务、5 版设计稿、无数次部署，最后回到最初的版本。总\"净产出\"为零，但学到了所有东西。网站挂了 55 分钟，AI 还把 SSH 封了。",[17,21,22],{},"这不是成功故事。这是一份诚实的翻车全复盘。",[24,25],"hr",{},[27,28,30],"h2",{"id":29},"一系统架构谁是肇事者","一、系统架构：谁是\"肇事者\"",[17,32,33],{},"在开始之前，先认识一下这支\"AI 工程团队\"的 9 名成员：",[35,36,37,56],"table",{},[38,39,40],"thead",{},[41,42,43,47,50,53],"tr",{},[44,45,46],"th",{},"角色",[44,48,49],{},"Profile",[44,51,52],{},"职责",[44,54,55],{},"模型",[57,58,59,74,88,101,114,127,140,153,166],"tbody",{},[41,60,61,65,68,71],{},[62,63,64],"td",{},"项目经理",[62,66,67],{},"Manager",[62,69,70],{},"拆需求、分配任务、跟踪进度",[62,72,73],{},"DeepSeek",[41,75,76,79,82,85],{},[62,77,78],{},"设计师",[62,80,81],{},"Designer",[62,83,84],{},"出效果图、UI 
差异分析",[62,86,87],{},"MiniMax",[41,89,90,93,96,99],{},[62,91,92],{},"架构师",[62,94,95],{},"Architect",[62,97,98],{},"技术选型、系统设计",[62,100,87],{},[41,102,103,106,109,112],{},[62,104,105],{},"开发者",[62,107,108],{},"Coder",[62,110,111],{},"写代码",[62,113,87],{},[41,115,116,119,122,125],{},[62,117,118],{},"全栈",[62,120,121],{},"FullStack",[62,123,124],{},"前后端串联的功能",[62,126,87],{},[41,128,129,132,135,138],{},[62,130,131],{},"审查者",[62,133,134],{},"Reviewer",[62,136,137],{},"代码审查",[62,139,87],{},[41,141,142,145,148,151],{},[62,143,144],{},"测试者",[62,146,147],{},"Tester",[62,149,150],{},"质量扫描+自动化测试",[62,152,87],{},[41,154,155,158,161,164],{},[62,156,157],{},"运维",[62,159,160],{},"Ops",[62,162,163],{},"SSH 部署、预览上线",[62,165,87],{},[41,167,168,171,174,177],{},[62,169,170],{},"主审查",[62,172,173],{},"Default",[62,175,176],{},"部署后线上验证",[62,178,73],{},[17,180,181],{},"它们通过一个 SQLite 的 Kanban 系统协作。Manager 创建任务，Dispatcher 自动调度，Worker 抢任务干活。理论上，这是一条完美的流水线。",[17,183,184],{},"实际上……",[24,186],{},[27,188,190],{"id":189},"二v4来之不易的稳定版本","二、v4：来之不易的稳定版本",[17,192,193],{},"网站首页经历了五个设计版本：",[35,195,196,208],{},[38,197,198],{},[41,199,200,203,205],{},[44,201,202],{},"版本",[44,204,78],{},[44,206,207],{},"风格",[57,209,210,221,232],{},[41,211,212,215,218],{},[62,213,214],{},"v1-v3",[62,216,217],{},"AI Designer（迭代）",[62,219,220],{},"赛博朋克风，Three.js 3D 背景，粒子效果",[41,222,223,226,229],{},[62,224,225],{},"v4",[62,227,228],{},"AI + 用户反馈",[62,230,231],{},"IcosahedronGeometry 球体 + 双环面 + 900 粒子 + Canvas 神经网络",[41,233,234,237,240],{},[62,235,236],{},"v5",[62,238,239],{},"AI Designer 自主优化",[62,241,242],{},"极度精简，移除大量 3D 效果",[17,244,245],{},"v4 是经过用户反复确认的版本。Three.js 的 IcosahedronGeometry 球体 + 双环面 + 900 个粒子的 Canvas 神经网络背景，效果相当惊艳。",[17,247,248],{},"但 AI Designer 觉得\"还能更好\"。",[24,250],{},[27,252,254],{"id":253},"三v5ai-自主优化的产物","三、v5：AI 自主优化的产物",[17,256,257],{},"在一个没有用户指令的空闲时段，Designer 产出了 v5 
设计稿。它的\"优化\"逻辑是：",[14,259,260],{},[17,261,262],{},"\"赛博朋克风格已经过时了。简约克制的设计更符合现代审美。\"",[35,264,265,278],{},[38,266,267],{},[41,268,269,272,275],{},[44,270,271],{},"维度",[44,273,274],{},"v4（赛博朋克）",[44,276,277],{},"v5（精简克制）",[57,279,280,291,302,313,324,335,346,357],{},[41,281,282,285,288],{},[62,283,284],{},"球体",[62,286,287],{},"IcosahedronGeometry",[62,289,290],{},"简化 SphereGeometry",[41,292,293,296,299],{},[62,294,295],{},"环面",[62,297,298],{},"双环面",[62,300,301],{},"移除",[41,303,304,307,310],{},[62,305,306],{},"粒子",[62,308,309],{},"900 个 + Canvas 纹理",[62,311,312],{},"300 个纯色",[41,314,315,318,321],{},[62,316,317],{},"背景",[62,319,320],{},"深色渐变 + 神经网络",[62,322,323],{},"纯色 #020617",[41,325,326,329,332],{},[62,327,328],{},"导航栏",[62,330,331],{},"毛玻璃半透明",[62,333,334],{},"完全不透明",[41,336,337,340,343],{},[62,338,339],{},"按钮",[62,341,342],{},"赛博朋克边框发光",[62,344,345],{},"扁平圆角",[41,347,348,351,354],{},[62,349,350],{},"字体",[62,352,353],{},"Google Fonts (Orbitron)",[62,355,356],{},"系统字体栈",[41,358,359,362,365],{},[62,360,361],{},"配色",[62,363,364],{},"多种霓虹色",[62,366,367],{},"单一 #22d3ee",[17,369,370,371,375],{},"纯从设计角度看，v5 确实\"更好\"——更可读、更符合现代设计系统、加载更快。但问题是：",[372,373,374],"strong",{},"用户要的不是\"更好\"，要的是 v4 的风格。"," Designer 没有区分\"设计优化\"和\"风格变更\"——它把两者当成了同一件事。",[17,377,378,379],{},"Manager 启动了全自动 7 阶段部署流水线。45 分钟内完成，6 个 Worker 全部 APPROVE。",[372,380,381],{},"从流程上看，\"一切正常\"。",[17,383,384],{},"用户打开网站后说了一句话：「这个不好看，回退到 v4。」",[24,386],{},[27,388,390],{"id":389},"四v5v4回退流水线","四、v5→v4：回退流水线",[17,392,393],{},"Manager 理解了这个需求——它不是简单地 Git revert，因为历史里已经混入了其他改动。最干净的方案是：Designer 对比差异 → Coder 修改。",[17,395,396],{},"AI Designer 接到任务后，产出了一个 26 项差异分析文档，分三个优先级：",[398,399,400,407,413],"ul",{},[401,402,403,406],"li",{},[372,404,405],{},"P0（7 项）","：球体几何体错误、颜色不对、环面缺失、拖拽旋转反向",[401,408,409,412],{},[372,410,411],{},"P1（13 项）","：FOV 偏移、粒子数量不足、导航样式差异、按钮风格不匹配",[401,414,415,418],{},[372,416,417],{},"P2（6 项）","：微妙的动画时间和缓动曲线差异",[17,420,421],{},"Coder 完成了 20 项 P0+P1 修复。Reviewer 审查通过（APPROVE）。Git 提交成功。Tester 
测试通过。",[17,423,424],{},"一切正常。然后 Ops 开始部署。",[24,426],{},[27,428,430],{"id":429},"五部署从-http-200-到-connection-reset","五、部署：从 HTTP 200 到 Connection Reset",[432,433,434],"h3",{"id":434},"部署流程",[17,436,437],{},"AI Ops Worker 的标准流程：",[439,440,445],"pre",{"className":441,"code":442,"language":443,"meta":444,"style":444},"language-bash shiki shiki-themes github-light github-dark","# 1. 本地构建镜像\ndocker build -t \u003C镜像名> .\n\n# 2. 压缩上传\ndocker save \u003C镜像名> | gzip > website.tar.gz\nsftp upload → \u002Ftmp\u002F\n\n# 3. 服务器加载\nssh \u003C用户名>@[服务器IP] \"docker load \u003C \u002Ftmp\u002Fwebsite.tar.gz\"\n\n# 4. 测试端口启动验证\ndocker run -d --name \u003C容器名>-test -p \u003C测试端口>:\u003C应用端口> \u003C镜像名>\ncurl http:\u002F\u002F127.0.0.1:\u003C测试端口>\u002F   # → 200 OK ✅\n\n# 5. 切换\ndocker stop \u003C容器名> && docker rm \u003C容器名>\ndocker rename \u003C容器名>-test \u003C容器名>\n","bash","",[446,447,448,457,489,496,502,530,545,550,556,577,582,588,649,672,677,683,715],"code",{"__ignoreMap":444},[449,450,453],"span",{"class":451,"line":452},"line",1,[449,454,456],{"class":455},"sJ8bj","# 1. 本地构建镜像\n",[449,458,460,464,468,472,476,479,483,486],{"class":451,"line":459},2,[449,461,463],{"class":462},"sScJk","docker",[449,465,467],{"class":466},"sZZnC"," build",[449,469,471],{"class":470},"sj4cs"," -t",[449,473,475],{"class":474},"szBVR"," \u003C",[449,477,478],{"class":466},"镜像",[449,480,482],{"class":481},"sVt8B","名",[449,484,485],{"class":474},">",[449,487,488],{"class":466}," .\n",[449,490,492],{"class":451,"line":491},3,[449,493,495],{"emptyLinePlaceholder":494},true,"\n",[449,497,499],{"class":451,"line":498},4,[449,500,501],{"class":455},"# 2. 
压缩上传\n",[449,503,505,507,510,512,514,516,518,521,524,527],{"class":451,"line":504},5,[449,506,463],{"class":462},[449,508,509],{"class":466}," save",[449,511,475],{"class":474},[449,513,478],{"class":466},[449,515,482],{"class":481},[449,517,485],{"class":474},[449,519,520],{"class":474}," |",[449,522,523],{"class":462}," gzip",[449,525,526],{"class":474}," >",[449,528,529],{"class":466}," website.tar.gz\n",[449,531,533,536,539,542],{"class":451,"line":532},6,[449,534,535],{"class":462},"sftp",[449,537,538],{"class":466}," upload",[449,540,541],{"class":466}," →",[449,543,544],{"class":466}," \u002Ftmp\u002F\n",[449,546,548],{"class":451,"line":547},7,[449,549,495],{"emptyLinePlaceholder":494},[449,551,553],{"class":451,"line":552},8,[449,554,555],{"class":455},"# 3. 服务器加载\n",[449,557,559,562,564,567,569,571,574],{"class":451,"line":558},9,[449,560,561],{"class":462},"ssh",[449,563,475],{"class":474},[449,565,566],{"class":466},"用户",[449,568,482],{"class":481},[449,570,485],{"class":474},[449,572,573],{"class":466},"@[服务器IP]",[449,575,576],{"class":466}," \"docker load \u003C \u002Ftmp\u002Fwebsite.tar.gz\"\n",[449,578,580],{"class":451,"line":579},10,[449,581,495],{"emptyLinePlaceholder":494},[449,583,585],{"class":451,"line":584},11,[449,586,587],{"class":455},"# 4. 
测试端口启动验证\n",[449,589,591,593,596,599,602,604,607,609,611,614,617,619,622,625,627,630,633,636,638,640,642,644,646],{"class":451,"line":590},12,[449,592,463],{"class":462},[449,594,595],{"class":466}," run",[449,597,598],{"class":470}," -d",[449,600,601],{"class":470}," --name",[449,603,475],{"class":474},[449,605,606],{"class":466},"容器",[449,608,482],{"class":481},[449,610,485],{"class":474},[449,612,613],{"class":466},"-test",[449,615,616],{"class":470}," -p",[449,618,475],{"class":474},[449,620,621],{"class":466},"测试端",[449,623,624],{"class":481},"口",[449,626,485],{"class":474},[449,628,629],{"class":466},":",[449,631,632],{"class":474},"\u003C",[449,634,635],{"class":466},"应用端",[449,637,624],{"class":481},[449,639,485],{"class":474},[449,641,475],{"class":474},[449,643,478],{"class":466},[449,645,482],{"class":481},[449,647,648],{"class":474},">\n",[449,650,652,655,658,660,662,664,666,669],{"class":451,"line":651},13,[449,653,654],{"class":462},"curl",[449,656,657],{"class":466}," http:\u002F\u002F127.0.0.1:",[449,659,632],{"class":474},[449,661,621],{"class":466},[449,663,624],{"class":481},[449,665,485],{"class":474},[449,667,668],{"class":466},"\u002F",[449,670,671],{"class":455},"   # → 200 OK ✅\n",[449,673,675],{"class":451,"line":674},14,[449,676,495],{"emptyLinePlaceholder":494},[449,678,680],{"class":451,"line":679},15,[449,681,682],{"class":455},"# 5. 
切换\n",[449,684,686,688,691,693,695,697,699,702,704,707,709,711,713],{"class":451,"line":685},16,[449,687,463],{"class":462},[449,689,690],{"class":466}," stop",[449,692,475],{"class":474},[449,694,606],{"class":466},[449,696,482],{"class":481},[449,698,485],{"class":474},[449,700,701],{"class":481}," && ",[449,703,463],{"class":462},[449,705,706],{"class":466}," rm",[449,708,475],{"class":474},[449,710,606],{"class":466},[449,712,482],{"class":481},[449,714,648],{"class":474},[449,716,718,720,723,725,727,729,731,733,735,737,739],{"class":451,"line":717},17,[449,719,463],{"class":462},[449,721,722],{"class":466}," rename",[449,724,475],{"class":474},[449,726,606],{"class":466},[449,728,482],{"class":481},[449,730,485],{"class":474},[449,732,613],{"class":466},[449,734,475],{"class":474},[449,736,606],{"class":466},[449,738,482],{"class":481},[449,740,648],{"class":474},[17,742,743],{},"切换完成。然后：",[439,745,747],{"className":441,"code":746,"language":443,"meta":444,"style":444},"$ curl https:\u002F\u002Fdeeeli.com\ncurl: (35) OpenSSL SSL_connect: Connection reset by peer\n",[446,748,749,760],{"__ignoreMap":444},[449,750,751,754,757],{"class":451,"line":452},[449,752,753],{"class":462},"$",[449,755,756],{"class":466}," curl",[449,758,759],{"class":466}," https:\u002F\u002Fdeeeli.com\n",[449,761,762,765],{"class":451,"line":459},[449,763,764],{"class":462},"curl:",[449,766,767],{"class":481}," (35) OpenSSL SSL_connect: Connection reset by peer\n",[17,769,770],{},"网站挂了。",[432,772,773],{"id":773},"症状分析",[35,775,776,786],{},[38,777,778],{},[41,779,780,783],{},[44,781,782],{},"测试",[44,784,785],{},"结果",[57,787,788,798,808,817],{},[41,789,790,795],{},[62,791,792],{},[446,793,794],{},"curl http:\u002F\u002Fdeeeli.com",[62,796,797],{},"301 → https，正常",[41,799,800,805],{},[62,801,802],{},[446,803,804],{},"curl https:\u002F\u002Fdeeeli.com",[62,806,807],{},"Connection reset",[41,809,810,815],{},[62,811,812],{},[446,813,814],{},"curl -k 
https:\u002F\u002F[服务器IP]",[62,816,807],{},[41,818,819,824],{},[62,820,821],{},[446,822,823],{},"curl http:\u002F\u002F127.0.0.1:\u003C应用端口>",[62,825,826],{},"Connection refused",[17,828,829,830],{},"关键发现：",[372,831,832],{},"HTTP 80 端口正常（nginx 在跑），HTTPS 443 连接被 reset，但 Nuxt 的 3000 端口根本没有监听。",[439,834,836],{"className":441,"code":835,"language":443,"meta":444,"style":444},"$ docker ps\nCONTAINER ID   IMAGE            STATUS\n\u003C容器ID>       nginx:alpine     Up 30 minutes\n\n$ docker ps -a | grep website\n\u003C容器ID>       \u003C镜像名>         Exited (137) 2 minutes ago\n",[446,837,838,848,862,874,878,898],{"__ignoreMap":444},[449,839,840,842,845],{"class":451,"line":452},[449,841,753],{"class":462},[449,843,844],{"class":466}," docker",[449,846,847],{"class":466}," ps\n",[449,849,850,853,856,859],{"class":451,"line":459},[449,851,852],{"class":462},"CONTAINER",[449,854,855],{"class":466}," ID",[449,857,858],{"class":466},"   IMAGE",[449,860,861],{"class":466},"            STATUS\n",[449,863,864,866,869,871],{"class":451,"line":491},[449,865,632],{"class":474},[449,867,868],{"class":481},"容器ID",[449,870,485],{"class":474},[449,872,873],{"class":481},"       nginx:alpine     Up 30 minutes\n",[449,875,876],{"class":451,"line":498},[449,877,495],{"emptyLinePlaceholder":494},[449,879,880,882,884,887,890,892,895],{"class":451,"line":504},[449,881,753],{"class":462},[449,883,844],{"class":466},[449,885,886],{"class":466}," ps",[449,888,889],{"class":470}," -a",[449,891,520],{"class":474},[449,893,894],{"class":462}," grep",[449,896,897],{"class":466}," website\n",[449,899,900,902,904,906,909,912,914,917,920],{"class":451,"line":532},[449,901,632],{"class":474},[449,903,868],{"class":481},[449,905,485],{"class":474},[449,907,908],{"class":474},"       \u003C",[449,910,911],{"class":481},"镜像名",[449,913,485],{"class":474},[449,915,916],{"class":481},"         Exited (",[449,918,919],{"class":462},"137",[449,921,922],{"class":481},") 2 minutes ago\n",[17,924,925],{},"退出码 
137。这是 Linux 内核发送 SIGKILL 的信号——OOM。",[24,927],{},[27,929,931],{"id":930},"六系统化排障隔离定位验证","六、系统化排障：隔离→定位→验证",[432,933,934],{"id":934},"从外到内逐层隔离",[439,936,941],{"className":937,"code":939,"language":940},[938],"language-text","用户 ─→ DNS ─→ CDN ─→ nginx ─→ Nuxt\n","text",[446,942,939],{"__ignoreMap":444},[944,945,946,959,965,975,981,987],"ol",{},[401,947,948,951,952,955,956],{},[372,949,950],{},"DNS 检查"," — 正常，",[446,953,954],{},"dig deeeli.com"," 返回 ",[449,957,958],{},"服务器IP",[401,960,961,964],{},[372,962,963],{},"CDN 检查"," — 无 CDN，直连服务器",[401,966,967,970,971,974],{},[372,968,969],{},"nginx 检查"," — ",[446,972,973],{},"curl -I http:\u002F\u002Fdeeeli.com"," → 301，nginx 活着",[401,976,977,980],{},[372,978,979],{},"SSL 层诊断"," — TLS 握手 OK，证书验证通过，但在发送 HTTP 请求时被 reset → nginx SSL 正常，上游问题",[401,982,983,986],{},[372,984,985],{},"端口检查"," — 3000 端口 Connection refused → Nuxt 没有监听",[401,988,989,992],{},[372,990,991],{},"容器状态"," — Exited (137) → OOM killed",[432,994,995],{"id":995},"日志回溯",[439,997,999],{"className":441,"code":998,"language":443,"meta":444,"style":444},"$ docker logs \u003C容器ID> --tail 100\n",[446,1000,1001],{"__ignoreMap":444},[449,1002,1003,1005,1007,1010,1012,1015,1018,1020,1023],{"class":451,"line":452},[449,1004,753],{"class":462},[449,1006,844],{"class":466},[449,1008,1009],{"class":466}," logs",[449,1011,475],{"class":474},[449,1013,1014],{"class":466},"容器I",[449,1016,1017],{"class":481},"D",[449,1019,485],{"class":474},[449,1021,1022],{"class":470}," --tail",[449,1024,1025],{"class":470}," 100\n",[17,1027,1028],{},"关键片段：",[439,1030,1033],{"className":1031,"code":1032,"language":940},[938],"[nuxt] [request] GET \u002F\n[nuxt] [ssr] Rendering page: \u002F\n\u003C--- Last few GCs --->\n[45:0x5a3c000]  1800 ms: Scavenge 950.0 (992.0) -> 948.0 (992.0) MB\n[45:0x5a3c000]  2000 ms: Mark-sweep 992.0 (1024.0) -> 990.0 (1024.0) MB\n[45:0x5a3c000]  2500 ms: Mark-sweep 1010.0 (1024.0) -> 1005.0 (1024.0) MB\n\n\u003C--- JS stacktrace --->\nFATAL ERROR: Ineffective 
mark-compacts near heap limit\nAllocation failed - JavaScript heap out of memory\n",[446,1034,1032],{"__ignoreMap":444},[17,1036,1037],{},"这是 V8 的经典 OOM 信息。Nuxt 在进行 SSR 时，V8 堆内存溢出。",[432,1039,1041],{"id":1040},"为什么-v5-没事v4-就-oom","为什么 v5 没事，v4 就 OOM？",[17,1043,1044],{},"v4 的 Three.js 场景：",[439,1046,1049],{"className":1047,"code":1048,"language":940},[938],"- IcosahedronGeometry（20 面的二十面体）\n- 双环面（两个 TorusGeometry）\n- 900 个粒子的 Points 系统\n- Canvas 纹理（每个粒子着色用）\n- 自定义 ShaderMaterial\n",[446,1050,1048],{"__ignoreMap":444},[17,1052,1053],{},"这些在 SSR 阶段全部加载到 V8 堆中。v5 的简化场景则大幅减少了内存开销。",[35,1055,1056,1069],{},[38,1057,1058],{},[41,1059,1060,1063,1066],{},[44,1061,1062],{},"组件",[44,1064,1065],{},"v5 内存",[44,1067,1068],{},"v4 内存",[57,1070,1071,1082,1093,1104,1115,1125],{},[41,1072,1073,1076,1079],{},[62,1074,1075],{},"几何体缓冲区",[62,1077,1078],{},"~10MB",[62,1080,1081],{},"~50MB",[41,1083,1084,1087,1090],{},[62,1085,1086],{},"Canvas 纹理",[62,1088,1089],{},"0",[62,1091,1092],{},"~80MB",[41,1094,1095,1098,1101],{},[62,1096,1097],{},"着色器编译",[62,1099,1100],{},"~30MB",[62,1102,1103],{},"~100MB",[41,1105,1106,1109,1112],{},[62,1107,1108],{},"粒子系统",[62,1110,1111],{},"~15MB",[62,1113,1114],{},"~45MB",[41,1116,1117,1120,1123],{},[62,1118,1119],{},"V8 堆其他",[62,1121,1122],{},"~150MB",[62,1124,1122],{},[41,1126,1127,1132,1137],{},[62,1128,1129],{},[372,1130,1131],{},"总计（SSR 峰值）",[62,1133,1134],{},[372,1135,1136],{},"~200MB",[62,1138,1139],{},[372,1140,1141],{},"~425MB",[17,1143,1144],{},"V8 的 GC 在堆接近 1GB 限制时来不及回收，峰值内存远超静态分析数字。加上 Nuxt 应用本身和页面组件的开销，1GB 的容器限制像纸一样被捅穿了。",[432,1146,1148],{"id":1147},"tester-为什么没发现","Tester 为什么没发现？",[17,1150,1151],{},"这是整件事最令人沮丧的部分。Tester 跑了单元测试、集成测试、E2E 测试，全部通过。但为什么没发现 OOM？",[17,1153,1154,1155],{},"答案：Tester 运行在本地开发环境，有 16GB 内存。Docker 容器限制的 1GB 对本地来说毫无意义——V8 可以自由扩展到 4GB 堆空间。",[372,1156,1157],{},"测试环境 ≠ 生产环境。",[24,1159],{},[27,1161,1163],{"id":1162},"七雪上加霜ssh-被封","七、雪上加霜：SSH 被封",[432,1165,1167],{"id":1166},"ops-的自动修复","Ops 的自动修复",[17,1169,1170],{},"容器崩溃后，AI Ops 
Worker 自动启动了修复流程：",[944,1172,1173,1179,1185],{},[401,1174,1175,1178],{},[372,1176,1177],{},"分析崩溃日志"," → 识别到 OOM",[401,1180,1181,1184],{},[372,1182,1183],{},"尝试重启容器"," — 重启成功，但第一次请求再次 OOM。这是设计问题，重启没用。",[401,1186,1187,1190],{},[372,1188,1189],{},"尝试增加内存"," — 容器内存限制调整失败，服务器资源不足。",[432,1192,1194],{"id":1193},"fail2ban-自伤","fail2ban 自伤",[17,1196,1197],{},"Ops Worker 选择了第三条路——通过 paramiko SSH 修改 Dockerfile。但在多次连接尝试后：",[439,1199,1202],{"className":1200,"code":1201,"language":940},[938],"paramiko.ssh_exception.SSHException: Error reading SSH protocol banner\nConnection reset by peer\n",[446,1203,1201],{"__ignoreMap":444},[17,1205,1206],{},"服务器的 fail2ban 检测到了异常频繁的 SSH 连接，把 Ops Worker 的 IP 封禁了。",[17,1208,1209],{},"加上 paramiko 在连接失败时的自动重试机制（默认 3 次），fail2ban 的触发速度比预期快得多。从第一个连接失败到被封禁，只用了不到两分钟。",[17,1211,1212],{},"同一个 Worker 一边在尝试修复容器，一边在触发安全防护——它在和自己的安全策略打架。",[24,1214],{},[27,1216,1218],{"id":1217},"八修复人与-ai-的协作","八、修复：人与 AI 的协作",[17,1220,1221],{},"AI Ops 进入了僵局：服务器 SSH 被封，无法远程操作。只能人工介入。",[432,1223,1224],{"id":1224},"人工兜底",[439,1226,1228],{"className":441,"code":1227,"language":443,"meta":444,"style":444},"# 通过云厂商 Web Terminal 登录（绕过 fail2ban）\nssh \u003C用户名>@[服务器IP]\n\n# 解除 fail2ban\nfail2ban-client set sshd unbanip \u003C本地IP>\n\n# 直接回退到 v5（内存安全版本）\ndocker stop \u003C容器名>\ndocker run -d --name \u003C容器名> \u003C镜像名>\n",[446,1229,1230,1235,1250,1254,1259,1283,1287,1292,1306],{"__ignoreMap":444},[449,1231,1232],{"class":451,"line":452},[449,1233,1234],{"class":455},"# 通过云厂商 Web Terminal 登录（绕过 fail2ban）\n",[449,1236,1237,1239,1241,1243,1245,1247],{"class":451,"line":459},[449,1238,561],{"class":462},[449,1240,475],{"class":474},[449,1242,566],{"class":466},[449,1244,482],{"class":481},[449,1246,485],{"class":474},[449,1248,1249],{"class":466},"@[服务器IP]\n",[449,1251,1252],{"class":451,"line":491},[449,1253,495],{"emptyLinePlaceholder":494},[449,1255,1256],{"class":451,"line":498},[449,1257,1258],{"class":455},"# 解除 
fail2ban\n",[449,1260,1261,1264,1267,1270,1273,1275,1278,1281],{"class":451,"line":504},[449,1262,1263],{"class":462},"fail2ban-client",[449,1265,1266],{"class":466}," set",[449,1268,1269],{"class":466}," sshd",[449,1271,1272],{"class":466}," unbanip",[449,1274,475],{"class":474},[449,1276,1277],{"class":466},"本地I",[449,1279,1280],{"class":481},"P",[449,1282,648],{"class":474},[449,1284,1285],{"class":451,"line":532},[449,1286,495],{"emptyLinePlaceholder":494},[449,1288,1289],{"class":451,"line":547},[449,1290,1291],{"class":455},"# 直接回退到 v5（内存安全版本）\n",[449,1293,1294,1296,1298,1300,1302,1304],{"class":451,"line":552},[449,1295,463],{"class":462},[449,1297,690],{"class":466},[449,1299,475],{"class":474},[449,1301,606],{"class":466},[449,1303,482],{"class":481},[449,1305,648],{"class":474},[449,1307,1308,1310,1312,1314,1316,1318,1320,1322,1324,1326,1328,1330],{"class":451,"line":558},[449,1309,463],{"class":462},[449,1311,595],{"class":466},[449,1313,598],{"class":470},[449,1315,601],{"class":470},[449,1317,475],{"class":474},[449,1319,606],{"class":466},[449,1321,482],{"class":481},[449,1323,485],{"class":474},[449,1325,475],{"class":474},[449,1327,478],{"class":466},[449,1329,482],{"class":481},[449,1331,648],{"class":474},[17,1333,1334],{},"网站恢复，v4 部署失败，v5 继续运行。",[432,1336,1338],{"id":1337},"根因修复clientonly-防线","根因修复：ClientOnly 防线",[17,1340,1341],{},"回退只是临时措施。真正的修复需要让 v4 在 1GB 容器中也能运行。两个方案组合：",[17,1343,1344],{},[372,1345,1346],{},"方案 C + B 组合：",[439,1348,1352],{"className":1349,"code":1350,"language":1351,"meta":444,"style":444},"language-html shiki shiki-themes github-light github-dark","\u003C!-- 方案 C：ClientOnly 包裹，SSR 时输出占位 -->\n\u003CClientOnly>\n  \u003CThreeBackground \u002F>\n\u003C\u002FClientOnly>\n","html",[446,1353,1354,1359,1369,1380],{"__ignoreMap":444},[449,1355,1356],{"class":451,"line":452},[449,1357,1358],{"class":455},"\u003C!-- 方案 C：ClientOnly 包裹，SSR 时输出占位 
-->\n",[449,1360,1361,1363,1367],{"class":451,"line":459},[449,1362,632],{"class":481},[449,1364,1366],{"class":1365},"s7hpK","ClientOnly",[449,1368,648],{"class":481},[449,1370,1371,1374,1377],{"class":451,"line":491},[449,1372,1373],{"class":481},"  \u003C",[449,1375,1376],{"class":1365},"ThreeBackground",[449,1378,1379],{"class":481}," \u002F>\n",[449,1381,1382,1385,1387],{"class":451,"line":498},[449,1383,1384],{"class":481},"\u003C\u002F",[449,1386,1366],{"class":1365},[449,1388,648],{"class":481},[439,1390,1394],{"className":1391,"code":1392,"language":1393,"meta":444,"style":444},"language-javascript shiki shiki-themes github-light github-dark","\u002F\u002F 方案 B：防御性 SSR 检查\nconst isSSR = typeof window === 'undefined';\nconst scene = isSSR ? null : new THREE.Scene();\n","javascript",[446,1395,1396,1401,1406],{"__ignoreMap":444},[449,1397,1398],{"class":451,"line":452},[449,1399,1400],{},"\u002F\u002F 方案 B：防御性 SSR 检查\n",[449,1402,1403],{"class":451,"line":459},[449,1404,1405],{},"const isSSR = typeof window === 'undefined';\n",[449,1407,1408],{"class":451,"line":491},[449,1409,1410],{},"const scene = isSSR ? 
null : new THREE.Scene();\n",[17,1412,1413,1414,1417,1418,1421],{},"一行 ",[446,1415,1416],{},"\u003CClientOnly>"," + 一行 ",[446,1419,1420],{},"isSSR"," 检查，彻底避免了 SSR 阶段的 Three.js 内存炸弹。",[24,1423],{},[27,1425,1427],{"id":1426},"九完整时间线","九、完整时间线",[35,1429,1430,1440],{},[38,1431,1432],{},[41,1433,1434,1437],{},[44,1435,1436],{},"时间",[44,1438,1439],{},"事件",[57,1441,1442,1450,1458,1466,1474,1482,1490,1498,1506,1514,1522],{},[41,1443,1444,1447],{},[62,1445,1446],{},"T+0",[62,1448,1449],{},"Designer 接到回退任务",[41,1451,1452,1455],{},[62,1453,1454],{},"T+15min",[62,1456,1457],{},"产出 26 项差异分析",[41,1459,1460,1463],{},[62,1461,1462],{},"T+25min",[62,1464,1465],{},"Coder 完成修复，20 项 P0+P1",[41,1467,1468,1471],{},[62,1469,1470],{},"T+35min",[62,1472,1473],{},"Reviewer 审查通过",[41,1475,1476,1479],{},[62,1477,1478],{},"T+40min",[62,1480,1481],{},"Git 提交 + Tester 测试通过",[41,1483,1484,1487],{},[62,1485,1486],{},"T+45min",[62,1488,1489],{},"Ops 构建镜像、上传、部署、切换",[41,1491,1492,1495],{},[62,1493,1494],{},"T+46min",[62,1496,1497],{},"Nuxt 容器 OOM（退出码 137），网站挂掉",[41,1499,1500,1503],{},[62,1501,1502],{},"T+47min",[62,1504,1505],{},"Ops 自动诊断、尝试重启\u002F加内存\u002F修改配置",[41,1507,1508,1511],{},[62,1509,1510],{},"T+52min",[62,1512,1513],{},"触发 fail2ban，SSH 被封",[41,1515,1516,1519],{},[62,1517,1518],{},"T+55min",[62,1520,1521],{},"Ops 报错，block 等待人工",[41,1523,1524,1527],{},[62,1525,1526],{},"T+60min",[62,1528,1529],{},"用户通过 Web Terminal 解除封禁，回退 v5",[17,1531,1532],{},"55 分钟，140 个 Kanban 任务，最终回到了起点。",[24,1534],{},[27,1536,1538],{"id":1537},"十数字说话140-次任务的真相","十、数字说话：140 次任务的真相",[17,1540,1541],{},"在 v4→v5→v4 的整个过程中，Kanban 系统创建了超过 140 个任务：",[35,1543,1544,1557],{},[38,1545,1546],{},[41,1547,1548,1551,1554],{},[44,1549,1550],{},"阶段",[44,1552,1553],{},"任务数",[44,1555,1556],{},"工人",[57,1558,1559,1570,1581,1592,1603,1613,1623,1633],{},[41,1560,1561,1564,1567],{},[62,1562,1563],{},"v4 最终确认（Designer 迭代）",[62,1565,1566],{},"~20 个",[62,1568,1569],{},"Designer × 3",[41,1571,1572,1575,1578],{},[62,1573,1574],{},"v5 
设计",[62,1576,1577],{},"~10 个",[62,1579,1580],{},"Designer × 2",[41,1582,1583,1586,1589],{},[62,1584,1585],{},"v5 实现上线",[62,1587,1588],{},"~30 个",[62,1590,1591],{},"Coder + Reviewer + Tester + Ops",[41,1593,1594,1597,1600],{},[62,1595,1596],{},"admin 侧边栏修复",[62,1598,1599],{},"~15 个",[62,1601,1602],{},"Architect + Coder + Reviewer + Ops",[41,1604,1605,1608,1610],{},[62,1606,1607],{},"三项视觉修复",[62,1609,1566],{},[62,1611,1612],{},"Designer + Coder + Ops + Review",[41,1614,1615,1618,1620],{},[62,1616,1617],{},"v5→v4 回退（含 OOM）",[62,1619,1588],{},[62,1621,1622],{},"Designer + Coder + Reviewer + Tester + Ops + Review",[41,1624,1625,1628,1630],{},[62,1626,1627],{},"blog 页对齐",[62,1629,1599],{},[62,1631,1632],{},"Designer + Coder + Reviewer + Ops",[41,1634,1635,1640,1645],{},[62,1636,1637],{},[372,1638,1639],{},"合计",[62,1641,1642],{},[372,1643,1644],{},"~140 个",[62,1646],{},[17,1648,1649],{},"平均每次\"改动\"经历了 6-12 个 Worker 任务。管道的 7 个阶段，每个 5-30 分钟。管道越长，问题发现越晚。",[24,1651],{},[27,1653,1655],{"id":1654},"十一踩坑总结7-条军规","十一、踩坑总结：7 条军规",[432,1657,1659],{"id":1658},"_1-环境差异是最大的坑","1. 环境差异是最大的坑",[17,1661,1662,1663,1666],{},"本地 ",[446,1664,1665],{},"npm run dev"," → 一切正常。Docker SSR → OOM。Dev 和 Prod 的内存边界完全不同。AI Reviewer 和 Tester 无法捕捉这种差异——它们工作在开发环境。",[432,1668,1670],{"id":1669},"_2-ssr-3d-库-内存炸弹","2. SSR + 3D 库 = 内存炸弹",[17,1672,1673,1674,1677,1678,1680,1681,1684],{},"Three.js 在 SSR 阶段会将所有几何体、纹理、着色器加载到 V8 堆中。1GB 容器限制在 v4 的复杂场景面前不堪一击。",[372,1675,1676],{},"解决办法","：",[446,1679,1416],{}," 包裹所有 3D 组件，或用 ",[446,1682,1683],{},"typeof window"," 守卫。",[432,1686,1688],{"id":1687},"_3-ai-ops-的自我毁灭","3. AI Ops 的自我毁灭",[17,1690,1691],{},"AI Ops Worker 的自动修复逻辑本身没问题，但它没有考虑到\"频繁的 SSH 连接会被封禁\"这一层。故障修复流程不应该触发安全机制把自己排除在外。",[432,1693,1695],{"id":1694},"_4-管道太长问题放大","4. 
管道太长，问题放大",[439,1697,1700],{"className":1698,"code":1699,"language":940},[938],"Designer → Coder → Reviewer → Git → Tester → Ops → AgentReview\n",[446,1701,1699],{"__ignoreMap":444},[17,1703,1704],{},"7 个阶段。v5 的\"风格漂移\"在 Designer 阶段就发生了，但直到 Ops 部署后才能被用户看到——中间跨越了 Coder、Reviewer、Tester 三个阶段，它们全部 APPROVE——因为它们审查的是代码质量，不是设计意图。",[432,1706,1708],{"id":1707},"_5-worker-之间的幻觉放大","5. Worker 之间的\"幻觉放大\"",[17,1710,1711,1712,1715],{},"每个 Worker 只完成自己被分配的任务，",[372,1713,1714],{},"没有人做\"整体决策\"。"," Designer 说\"26 项差异需要修\"，Coder 修了 20 项，Reviewer 说\"代码 OK\"，Tester 说\"功能 OK\"——但没人问：\"v4 和 v5 的差异真的需要全部回到 v4 吗？v5 的一些改进应该保留吗？\"",[432,1717,1719],{"id":1718},"_6-模型切换的连锁反应","6. 模型切换的连锁反应",[17,1721,1722],{},"部署过程中 MiniMax API 遇到 429 限流。Worker 自动切换到了 DeepSeek。但 DeepSeek 的推理速度比 MiniMax 慢 3-5 倍，导致 15 分钟的部署拖到了 45 分钟——正好跨越了 fail2ban 的检测窗口。",[432,1724,1726],{"id":1725},"_7-人有最终决策权但需要及时看到","7. 人有最终决策权，但需要及时看到",[17,1728,1729,1730],{},"如果用户在 Designer 阶段就看到 v5 效果图，根本不会有后续的 95 个任务。",[372,1731,1732],{},"在任何 UI 变更进入开发之前，强制预览展示给用户确认。",[24,1734],{},[27,1736,1738],{"id":1737},"十二预防措施不让它再发生","十二、预防措施：不让它再发生",[944,1740,1741,1747,1753,1759,1765,1771,1780,1790],{},[401,1742,1743,1746],{},[372,1744,1745],{},"Tester 容器化","：Tester 必须在与生产相同的内存限制下运行测试。本地 16GB 通过的测试，在 1GB 容器里可能连 SSR 都跑不完。\"在相同环境中测试和部署\"——AI 运维也需要遵守这个原则。",[401,1748,1749,1752],{},[372,1750,1751],{},"SSR 内存预算","：在 CI 中跑一次 production build + SSR 请求，记录内存峰值。当新改动导致 SSR 内存增长超过 20%，CI 应该 Block 部署。",[401,1754,1755,1758],{},[372,1756,1757],{},"渐进式部署","：先部署到 staging 端口，跑完整 E2E 测试，再切换。永远不要一步到位。",[401,1760,1761,1764],{},[372,1762,1763],{},"回滚方案","：每次部署前保留旧镜像，出问题一键回滚。",[401,1766,1767,1770],{},[372,1768,1769],{},"SSH 频率限制","：在 Ops Worker 中实现连接频率控制——每分钟最多 3 次新连接，防止触发 fail2ban。",[401,1772,1773,1776,1777,1779],{},[372,1774,1775],{},"ClientOnly 防御","：对所有含 Three.js \u002F WebGL \u002F Canvas 的组件，默认使用 ",[446,1778,1416],{}," 包裹。",[401,1781,1782,1785,1786,1789],{},[372,1783,1784],{},"Docker 资源监控","：在 compose 中加入 ",[446,1787,1788],{},"deploy.resources.limits"," 
和健康检查。容器不应该静默 OOM。",[401,1791,1792,1795],{},[372,1793,1794],{},"产品经理角色","：加入 ProductManager——一个会在 v5 设计稿出来后说\"等等，用户确认了吗？\"的人。",[24,1797],{},[27,1799,1800],{"id":1800},"结语",[17,1802,1803],{},"AI Ops 不是银弹。它能自动化 90% 的流程，但剩下那 10%——环境差异、资源边界、连锁反应——仍然需要人的判断。",[17,1805,1806],{},"但这也正是有趣的地方：AI 在\"翻车\"中学到的教训，比在\"顺利\"中多得多。这次事件之后，Manager 的 SOUL.md 里新增了一大段关于部署安全的规则：先验证再切换、保留回滚方案、SSH 连接频率限制、Docker 内存限制检查……",[17,1808,1809],{},"AI 学会了。代价是网站挂了 55 分钟。",[17,1811,1812],{},"下一次部署，同样的坑不会再踩。",[17,1814,1815],{},"（但会有新的坑。）",[1817,1818,1819],"style",{},"html .default .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html .dark .shiki span {color: var(--shiki-dark);background: var(--shiki-dark-bg);font-style: var(--shiki-dark-font-style);font-weight: var(--shiki-dark-font-weight);text-decoration: var(--shiki-dark-text-decoration);}html.dark .shiki span {color: var(--shiki-dark);background: var(--shiki-dark-bg);font-style: var(--shiki-dark-font-style);font-weight: var(--shiki-dark-font-weight);text-decoration: var(--shiki-dark-text-decoration);}html pre.shiki code .sJ8bj, html code.shiki .sJ8bj{--shiki-default:#6A737D;--shiki-dark:#6A737D}html pre.shiki code .sScJk, html code.shiki .sScJk{--shiki-default:#6F42C1;--shiki-dark:#B392F0}html pre.shiki code .sZZnC, html code.shiki .sZZnC{--shiki-default:#032F62;--shiki-dark:#9ECBFF}html pre.shiki code .sj4cs, html code.shiki .sj4cs{--shiki-default:#005CC5;--shiki-dark:#79B8FF}html pre.shiki code .szBVR, html code.shiki .szBVR{--shiki-default:#D73A49;--shiki-dark:#F97583}html pre.shiki code .sVt8B, html code.shiki 
.sVt8B{--shiki-default:#24292E;--shiki-dark:#E1E4E8}html pre.shiki code .s7hpK, html code.shiki .s7hpK{--shiki-default:#B31D28;--shiki-default-font-style:italic;--shiki-dark:#FDAEB7;--shiki-dark-font-style:italic}",{"title":444,"searchDepth":459,"depth":459,"links":1821},[1822,1823,1824,1825,1826,1830,1836,1840,1844,1845,1846,1855,1856],{"id":29,"depth":459,"text":30},{"id":189,"depth":459,"text":190},{"id":253,"depth":459,"text":254},{"id":389,"depth":459,"text":390},{"id":429,"depth":459,"text":430,"children":1827},[1828,1829],{"id":434,"depth":491,"text":434},{"id":773,"depth":491,"text":773},{"id":930,"depth":459,"text":931,"children":1831},[1832,1833,1834,1835],{"id":934,"depth":491,"text":934},{"id":995,"depth":491,"text":995},{"id":1040,"depth":491,"text":1041},{"id":1147,"depth":491,"text":1148},{"id":1162,"depth":459,"text":1163,"children":1837},[1838,1839],{"id":1166,"depth":491,"text":1167},{"id":1193,"depth":491,"text":1194},{"id":1217,"depth":459,"text":1218,"children":1841},[1842,1843],{"id":1224,"depth":491,"text":1224},{"id":1337,"depth":491,"text":1338},{"id":1426,"depth":459,"text":1427},{"id":1537,"depth":459,"text":1538},{"id":1654,"depth":459,"text":1655,"children":1847},[1848,1849,1850,1851,1852,1853,1854],{"id":1658,"depth":491,"text":1659},{"id":1669,"depth":491,"text":1670},{"id":1687,"depth":491,"text":1688},{"id":1694,"depth":491,"text":1695},{"id":1707,"depth":491,"text":1708},{"id":1718,"depth":491,"text":1719},{"id":1725,"depth":491,"text":1726},{"id":1737,"depth":459,"text":1738},{"id":1800,"depth":459,"text":1800},"2026-06-02","我让 9 个 AI Agent 协作部署一个个人网站。它们创建了 140 个 Kanban 任务，迭代了 v4→v5→v4，把网站搞挂了 55 分钟，最后还把 SSH 
封了。这是一份诚实的全链路复盘。",false,"md",{"author":1862},"陈德立","\u002Fblog\u002Fagent-ops-full-postmortem",{"title":5,"description":1858},"blog\u002Fagent-ops-full-postmortem",[1867,1868,1869,463,1870,1871,1872,1873,1874,1875,1876,1877,1878,1879,1880],"ai-agent","debugging","devops","hermes","incident","kanban","multi-agent","nginx","nuxt","ops","software-engineering","ssr","troubleshooting","workflow","WRBAUK6KREzsqZCOmfNQZjkWpl08HBUPd4efIQTdzq0"]