Files
aurak/server/scripts/cleanup-question-bank.cjs
T
Developer 5c974c50de feat: knowledge-base code review fixes + question bank cleanup
- 🔴 searchKnowledge: 移除随机mock向量,使用真实embedding
- 🔴 userId: 改为NOT NULL,清理遗留调试注释
- 🟡 文件移动事务安全:先移文件再创DB记录
- 🟡 Ollama嵌入并行化:串行→Promise.allSettled
- 🟡 三处重复降级代码提取为processChunksOneByOne(~200行→30行)
- 🟡 Chunk换算根据CJK比例动态调整(英4x/中2x/日2x)
- 🟡 findAll添加分页参数
- 🔵 清理冗余动态import、findByIds→findBy、日文标点补充
- chore: question-bank cleanup (删除47道概念/重复/ADV题)
- chore: qa-assessment-flow (Phase 1+2全量测试14项通过)
- fix: shuffleArray接收返回值(三处调用点)

Co-Authored-By: Claude <noreply@anthropic.com>
2026-06-25 11:27:16 +08:00

146 lines
8.2 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/**
* 清理题库中不符合"简单、应用为主"的题目
*
* 删除规则:
* 1. 纯概念/定义/术语类题目(考"什么是XX"而不是"遇到XX该怎么做")
* 2. 分类/层级背诵题(考"L1级别要求什么"等)
* 3. 完全重复的题目
* 4. 大量高度雷同的场景题(保留2-3个最佳,删除其余)
*
* 运行: node server/scripts/cleanup-question-bank.cjs
*/
const D = require('better-sqlite3');
const path = require('path');
// Open the SQLite database file at ../data/metadata.db, resolved relative to this script's directory.
const db = new D(path.join(__dirname, '../data/metadata.db'));
// bank_id value bound into the lookup and summary queries of this script.
const BANK = '984632e0-b35d-486d-9a19-27a14845db37';
/**
 * Look up items of the bank BANK whose question text matches a SQL LIKE pattern.
 *
 * The pattern is handed to LIKE unchanged: no `%` wildcards are added here, so
 * the caller decides whether the match is whole-text, prefix, or substring.
 *
 * @param {string} textLike - LIKE pattern compared against question_text.
 * @returns {Array<Object>} Rows with id, question_text and questionType,
 *   ordered by ROWID.
 */
function findIds(textLike) {
  const lookup = db.prepare(
    "SELECT id, question_text, questionType FROM question_bank_items WHERE bank_id=? AND question_text LIKE ? ORDER BY ROWID"
  );
  const rows = lookup.all(BANK, textLike);
  return rows;
}
/**
 * Delete one question_bank_items row by ID and log what was removed.
 *
 * @param {string} id - Value of the row's id column.
 * @param {string} reason - Short explanation printed in the log line.
 * @returns {boolean} true when a row was actually deleted; false when no row
 *   with that ID exists (a warning is logged instead).
 */
function del(id, reason) {
  const item = db.prepare("SELECT question_text, dimension, questionType FROM question_bank_items WHERE id=?").get(id);
  if (!item) {
    console.log(' ⚠️ 未找到:', String(id).substring(0, 8));
    return false;
  }
  const info = db.prepare("DELETE FROM question_bank_items WHERE id=?").run(id);
  // Guard against a null question_text so building the log preview cannot abort the run.
  const text = item.question_text == null ? '' : String(item.question_text);
  console.log(` 🗑️ ${item.questionType} ${item.dimension} | ${reason} | ${text.replace(/\n/g,' ').substring(0,60)}`);
  return info.changes > 0;
}

// Number of rows actually deleted so far; printed in the final summary.
let total = 0;

/**
 * Delete an item via del() and count it only when a row was really removed,
 * so that misses (unknown IDs) do not inflate the reported totals.
 *
 * @param {string} id - Value of the row's id column.
 * @param {string} reason - Short explanation printed in the log line.
 */
function d(id, reason) {
  if (del(id, reason)) total++;
}
console.log('=== 清理题库 ===\n');
// ═══════════════ DEV_PATTERN ═══════════════
console.log('--- DEV_PATTERN: 概念/术语题 ---');
// Concept/definition questions: SDD, Vibe Coding, Flow State, L1 level.
// Item UUIDs are random, so rows are located by LIKE patterns on the question
// text instead of hard-coded IDs.
const devPatternConcepts = [
  { like: '%瀑布开发和敏捷开发的核心区别%', reason: '概念对比:瀑布vs敏捷' },
  { like: '%规范驱动开发%核心思想%', reason: '概念定义:SDD核心思想' },
  { like: '%Vibe Coding(氛围编程)是一种什么样的编程方式%', reason: '概念定义:Vibe Coding是什么' },
  { like: '%Flow State(心流状态)的核心特征%', reason: '概念定义:Flow State特征' },
  { like: '%Vibe Coding中人和AI的分工应该是%', reason: '概念定义:Vibe Coding分工' },
  { like: '%SDD中的"规范"应该是什么样的%', reason: 'ADV概念:SDD规范' },
  { like: '%当你一直按Tab接受AI代码却不看%', reason: 'ADV术语:Vibe Coding挂机' },
  { like: '%"概率性"的,这意味着什么%', reason: 'ADV理论:概率性' },
  { like: '%L1级别的AI开发范式维度要求%', reason: '分类背诵:L1级别' },
  { like: '%请简述规范驱动开发%典型流程%', reason: '概念阐述:SDD流程' },
  { like: '%Vibe Coding有助于接近Flow State%三个核心条件%', reason: '概念阐述:Vibe Coding+Flow State' },
  { like: '%从确定性到概率性%这一变化对开发流程%', reason: 'ADV理论:确定性到概率性' },
];
for (const c of devPatternConcepts) {
  const items = findIds(c.like);
  for (const item of items) d(item.id, c.reason);
}
// DEV_PATTERN duplicates: keep the first match returned by findIds, delete the rest.
// The patterns carry % wildcards on both sides; a LIKE pattern without any
// wildcard only matches when the whole question text equals the string, so the
// duplicates would never be found.
const devPatternDups = [
  { like: '%你和AI分工完成一个功能:你负责设计,AI负责编码%', reason: '重复:责任划分' },
  { like: '%你和同事用AI一起开发一个功能。同事直接提交了AI生成的代码没有审查%', reason: '重复:同事提交没审查' },
];
for (const dup of devPatternDups) {
  const matches = findIds(dup.like);
  for (let i = 1; i < matches.length; i++) d(matches[i].id, dup.reason);
}
// ═══════════════ LLM ═══════════════
console.log('\n--- LLM: 概念/原理题 ---');
// LIKE pattern + log reason for each LLM concept/principle question to remove.
// Patterns without a leading % only match texts that start with the given string.
const llmConcepts = [
  { like: 'AI的工作原理是根据上文猜下文%', reason: '原理:AI工作机制' },
  { like: 'AI的"幻觉"是指AI会编造%', reason: '定义:幻觉术语' },
  { like: 'AI训练数据的截止日期意味着%', reason: '原理:训练数据截止' },
  { like: 'AI有时会编造看似合理但实际不存在的信息,这被称为"幻觉"%', reason: '定义:幻觉术语(重复)' },
  { like: 'AI的知识训练数据只截止到%', reason: '原理:知识截止' },
  { like: 'AI不知道自己的知识边界%', reason: '原理:AI知识边界' },
  { like: '以下哪个是AI的固有问题%', reason: '列举:AI固有问题' },
  { like: 'AI说了一段话,听起来很有道理,但你查了资料发现它说的内容不存在。这是什么现象%', reason: '定义:这是什么现象' },
  { like: '%传统AI(判别式)和生成式AI的核心差异%', reason: 'ADV概念:判别式vs生成式' },
  { like: 'AI的"上下文有限"是指什么问题%', reason: '定义:上下文有限' },
];
// Delete every row that matches any of the patterns above.
llmConcepts.forEach(({ like, reason }) => {
  findIds(like).forEach((row) => d(row.id, reason));
});
// LLM multiple-choice: near-identical hallucination scenario questions.
// Original note: 11 duplicates, keep 2 (the first one + the search-engine one), delete the rest.
// NOTE(review): the code keeps the first two rows by ROWID — confirm the second
// row really is the search-engine variant.
const hallMC = db.prepare("SELECT id, question_text FROM question_bank_items WHERE bank_id=? AND dimension=? AND questionType=? AND question_text LIKE '%场景%' AND (question_text LIKE '%fetchUser%' OR question_text LIKE '%validateUser%' OR question_text LIKE '%sendWelcome%') ORDER BY ROWID").all(BANK, 'LLM', 'MULTIPLE_CHOICE');
console.log(`\n--- LLM MC: 幻觉场景重复 (${hallMC.length} total, keep 2) ---`);
hallMC.forEach((row, idx) => {
  // Rows at index 0 and 1 are kept.
  if (idx < 2) return;
  d(row.id, `重复:幻觉场景MC #${idx + 1}`);
});
// LLM short-answer: duplicates of the "5-page document" scenario.
// Keep the first row by ROWID (noted as the cleanest one), delete the rest.
const sa5 = db.prepare("SELECT id, question_text FROM question_bank_items WHERE bank_id=? AND dimension=? AND questionType=? AND question_text LIKE '%5页%' ORDER BY ROWID").all(BANK, 'LLM', 'SHORT_ANSWER');
console.log(`\n--- LLM SA: 5页文档场景重复 (${sa5.length} total, keep 1) ---`);
sa5.forEach((row, idx) => {
  // Index 0 is the row that stays.
  if (idx === 0) return;
  d(row.id, `重复:5页文档SA #${idx + 1}`);
});
// ═══════════════ PROMPT ═══════════════
console.log('\n--- PROMPT: 分类背诵题 ---');
// LIKE pattern + log reason for each PROMPT classification-recall question to remove.
const promptConcepts = [
  { like: 'L1级别的技术能力维度要求是什么%', reason: '分类背诵:L1维度' },
];
// Delete every row that matches any of the patterns above.
promptConcepts.forEach(({ like, reason }) => {
  findIds(like).forEach((row) => d(row.id, reason));
});
// ═══════════════ WORK_CAPABILITY ═══════════════
console.log('\n--- WORK_CAPABILITY: 概念/分类题 ---');
// LIKE pattern + log reason for each WORK_CAPABILITY concept/classification question to remove.
const wcConcepts = [
  { like: '%"负责任AI"的组织原则中,"问责制"对员工的要求是什么%', reason: '概念:负责任AI问责制' },
  { like: '%智能体(Agent)与传统聊天AI最本质的区别是什么%', reason: 'ADV概念:Agent vs 聊天AI' },
  { like: '%智能体安全控制原则中"最小权限"是指什么%', reason: 'ADV概念:最小权限' },
  { like: '请简述AI的四个固有问题%', reason: '列举:AI四个固有问题' },
  { like: '数据分为"绝密""机密""公开"三个级别%', reason: '分类:数据分级' },
  { like: '%智能体安全的四条控制原则是什么%', reason: 'ADV列举:四条控制原则' },
];
for (const c of wcConcepts) {
  const items = findIds(c.like);
  for (const item of items) d(item.id, c.reason);
}
// WORK_CAPABILITY duplicate: keep the first match returned by findIds, delete the rest.
// The pattern carries % wildcards on both sides; a LIKE pattern without any
// wildcard only matches when the whole question text equals the string, so the
// duplicates would never be found.
const wcDups = findIds('%你正在使用AI助手分析一份包含客户信息的Excel表格%');
for (let i = 1; i < wcDups.length; i++) d(wcDups[i].id, '重复:客户Excel场景');
// ═══════════════ Summary ═══════════════
// Report how many items were deleted, how many remain in the bank, and the
// remaining count per dimension / question type; then close the database.
const remaining = db.prepare('SELECT COUNT(*) c FROM question_bank_items WHERE bank_id=?').get(BANK);
const divider = '═'.repeat(50);
console.log(`\n${divider}`);
console.log(` 删除: ${total}`);
// The "original" figure is reconstructed as remaining + deleted.
console.log(` 剩余: ${remaining.c} 题(原 ${remaining.c + total} 题)`);
console.log(`\n 各维度分布:`);
const byDim = db.prepare('SELECT dimension, questionType, COUNT(*) c FROM question_bank_items WHERE bank_id=? GROUP BY dimension, questionType ORDER BY dimension, questionType').all(BANK);
for (const { dimension, questionType, c } of byDim) {
  console.log(` ${dimension} ${questionType}: ${c}`);
}
db.close();