forked from hangshuo652/aurak
feat: knowledge-base code review fixes + question bank cleanup
- 🔴 searchKnowledge: 移除随机mock向量,使用真实embedding - 🔴 userId: 改为NOT NULL,清理遗留调试注释 - 🟡 文件移动事务安全:先移文件再创DB记录 - 🟡 Ollama嵌入并行化:串行→Promise.allSettled - 🟡 三处重复降级代码提取为processChunksOneByOne(~200行→30行) - 🟡 Chunk换算根据CJK比例动态调整(英4x/中2x/日2x) - 🟡 findAll添加分页参数 - 🔵 清理冗余动态import、findByIds→findBy、日文标点补充 - chore: question-bank cleanup (删除47道概念/重复/ADV题) - chore: qa-assessment-flow (Phase 1+2全量测试14项通过) - fix: shuffleArray接收返回值(三处调用点) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,145 @@
|
||||
/**
|
||||
* 清理题库中不符合"简单、应用为主"的题目
|
||||
*
|
||||
* 删除规则:
|
||||
* 1. 纯概念/定义/术语类题目(考"什么是XX"而不是"遇到XX该怎么做")
|
||||
* 2. 分类/层级背诵题(考"L1级别要求什么"等)
|
||||
* 3. 完全重复的题目
|
||||
* 4. 大量高度雷同的场景题(保留2-3个最佳,删除其余)
|
||||
*
|
||||
* 运行: node server/scripts/cleanup-question-bank.cjs
|
||||
*/
|
||||
const D = require('better-sqlite3');
|
||||
const path = require('path');
|
||||
const db = new D(path.join(__dirname, '../data/metadata.db'));
|
||||
const BANK = '984632e0-b35d-486d-9a19-27a14845db37';
|
||||
|
||||
/**
 * Look up items of the target question bank whose text matches a SQL LIKE pattern.
 *
 * The pattern is handed to LIKE unchanged, so the caller supplies any `%`
 * wildcards it needs.
 *
 * @param {string} textLike - SQL LIKE pattern compared against question_text.
 * @returns {Array<object>} Rows (id, question_text, questionType) in ROWID order.
 */
function findIds(textLike) {
  const sql = "SELECT id, question_text, questionType FROM question_bank_items WHERE bank_id=? AND question_text LIKE ? ORDER BY ROWID";
  const statement = db.prepare(sql);
  const rows = statement.all(BANK, textLike);
  return rows;
}
|
||||
|
||||
/**
 * Delete one question-bank item by id and log what was removed.
 *
 * @param {string} id - Primary key of the row in question_bank_items.
 * @param {string} reason - Short explanation printed in the deletion log line.
 * @returns {boolean} true when a row was actually deleted, false when no row
 *   with that id exists (a warning is logged instead).
 */
function del(id, reason) {
  const item = db.prepare("SELECT question_text, dimension, questionType FROM question_bank_items WHERE id=?").get(id);
  if (!item) {
    // String() keeps the warning itself from throwing on a non-string id.
    console.log(' ⚠️ 未找到:', String(id).substring(0, 8));
    return false;
  }

  const info = db.prepare("DELETE FROM question_bank_items WHERE id=?").run(id);
  if (!info || info.changes === 0) return false;

  // Flatten newlines and truncate so each deletion stays on one log line.
  const preview = String(item.question_text || '').replace(/\n/g, ' ').substring(0, 60);
  console.log(` 🗑️ ${item.questionType} ${item.dimension} | ${reason} | ${preview}`);
  return true;
}

// Number of rows actually deleted; the summary derives the original bank size from it.
let total = 0;

/**
 * Delete an item and count it, but only when a row was really removed, so the
 * summary totals stay accurate when an id is not found.
 *
 * @param {string} id - Primary key of the row to delete.
 * @param {string} reason - Explanation forwarded to the deletion log.
 */
function d(id, reason) {
  if (del(id, reason)) total++;
}
|
||||
|
||||
console.log('=== 清理题库 ===\n');

// ═══════════════ DEV_PATTERN ═══════════════
console.log('--- DEV_PATTERN: 概念/术语题 ---');

// Concept / definition questions (SDD, Vibe Coding, Flow State, L1 level).
// Row ids are random UUIDs, so every target is located by a LIKE pattern on
// its text instead of a hard-coded id.
const devPatternConcepts = [
  { like: '%瀑布开发和敏捷开发的核心区别%', reason: '概念对比:瀑布vs敏捷' },
  { like: '%规范驱动开发%核心思想%', reason: '概念定义:SDD核心思想' },
  { like: '%Vibe Coding(氛围编程)是一种什么样的编程方式%', reason: '概念定义:Vibe Coding是什么' },
  { like: '%Flow State(心流状态)的核心特征%', reason: '概念定义:Flow State特征' },
  { like: '%Vibe Coding中人和AI的分工应该是%', reason: '概念定义:Vibe Coding分工' },
  { like: '%SDD中的"规范"应该是什么样的%', reason: 'ADV概念:SDD规范' },
  { like: '%当你一直按Tab接受AI代码却不看%', reason: 'ADV术语:Vibe Coding挂机' },
  { like: '%"概率性"的,这意味着什么%', reason: 'ADV理论:概率性' },
  { like: '%L1级别的AI开发范式维度要求%', reason: '分类背诵:L1级别' },
  { like: '%请简述规范驱动开发%典型流程%', reason: '概念阐述:SDD流程' },
  { like: '%Vibe Coding有助于接近Flow State%三个核心条件%', reason: '概念阐述:Vibe Coding+Flow State' },
  { like: '%从确定性到概率性%这一变化对开发流程%', reason: 'ADV理论:确定性到概率性' },
];

for (const c of devPatternConcepts) {
  const items = findIds(c.like);
  for (const item of items) d(item.id, c.reason);
}

// DEV_PATTERN duplicates: keep the first match of each scenario (findIds
// returns rows in ROWID order) and delete the rest.
// The patterns carry explicit % wildcards: LIKE without a wildcard only
// matches when the pattern equals the whole question text, which would leave
// every duplicate in place.
const devPatternDups = [
  { like: '%你和AI分工完成一个功能:你负责设计,AI负责编码%', reason: '重复:责任划分' },
  { like: '%你和同事用AI一起开发一个功能。同事直接提交了AI生成的代码没有审查%', reason: '重复:同事提交没审查' },
];

for (const dup of devPatternDups) {
  const matches = findIds(dup.like);
  for (let i = 1; i < matches.length; i++) d(matches[i].id, dup.reason);
}
|
||||
|
||||
// ═══════════════ LLM ═══════════════
console.log('\n--- LLM: 概念/原理题 ---');

// Principle / definition questions in the LLM dimension, located by LIKE pattern.
const llmConcepts = [
  { like: 'AI的工作原理是根据上文猜下文%', reason: '原理:AI工作机制' },
  { like: 'AI的"幻觉"是指AI会编造%', reason: '定义:幻觉术语' },
  { like: 'AI训练数据的截止日期意味着%', reason: '原理:训练数据截止' },
  { like: 'AI有时会编造看似合理但实际不存在的信息,这被称为"幻觉"%', reason: '定义:幻觉术语(重复)' },
  { like: 'AI的知识训练数据只截止到%', reason: '原理:知识截止' },
  { like: 'AI不知道自己的知识边界%', reason: '原理:AI知识边界' },
  { like: '以下哪个是AI的固有问题%', reason: '列举:AI固有问题' },
  { like: 'AI说了一段话,听起来很有道理,但你查了资料发现它说的内容不存在。这是什么现象%', reason: '定义:这是什么现象' },
  { like: '%传统AI(判别式)和生成式AI的核心差异%', reason: 'ADV概念:判别式vs生成式' },
  { like: 'AI的"上下文有限"是指什么问题%', reason: '定义:上下文有限' },
];

llmConcepts.forEach(({ like, reason }) => {
  findIds(like).forEach((row) => d(row.id, reason));
});

// LLM multiple-choice: near-identical hallucination scenarios. The first two
// rows in ROWID order are kept and every later one is deleted.
// NOTE(review): the original comment said the kept pair is "the first one +
// the search-engine one"; the code keeps whichever two rows come first by
// ROWID — confirm those are the intended two.
const hallMC = db
  .prepare("SELECT id, question_text FROM question_bank_items WHERE bank_id=? AND dimension=? AND questionType=? AND question_text LIKE '%场景%' AND (question_text LIKE '%fetchUser%' OR question_text LIKE '%validateUser%' OR question_text LIKE '%sendWelcome%') ORDER BY ROWID")
  .all(BANK, 'LLM', 'MULTIPLE_CHOICE');
console.log(`\n--- LLM MC: 幻觉场景重复 (${hallMC.length} total, keep 2) ---`);
hallMC.slice(2).forEach((row, offset) => {
  // offset 0 is the third row overall, hence the +3 for the 1-based label.
  d(row.id, '重复:幻觉场景MC #' + (offset + 3));
});

// LLM short-answer: duplicated "5-page document" scenario. Keep the first row,
// delete the rest.
const sa5 = db
  .prepare("SELECT id, question_text FROM question_bank_items WHERE bank_id=? AND dimension=? AND questionType=? AND question_text LIKE '%5页%' ORDER BY ROWID")
  .all(BANK, 'LLM', 'SHORT_ANSWER');
console.log(`\n--- LLM SA: 5页文档场景重复 (${sa5.length} total, keep 1) ---`);
sa5.slice(1).forEach((row, offset) => {
  d(row.id, '重复:5页文档SA #' + (offset + 2));
});
|
||||
|
||||
// ═══════════════ PROMPT ═══════════════
console.log('\n--- PROMPT: 分类背诵题 ---');

// Classification-recital questions in the PROMPT dimension, located by LIKE pattern.
const promptConcepts = [
  { like: 'L1级别的技术能力维度要求是什么%', reason: '分类背诵:L1维度' },
];

promptConcepts.forEach(({ like, reason }) => {
  findIds(like).forEach(({ id }) => d(id, reason));
});
|
||||
|
||||
// ═══════════════ WORK_CAPABILITY ═══════════════
console.log('\n--- WORK_CAPABILITY: 概念/分类题 ---');

// Concept / classification questions in the WORK_CAPABILITY dimension,
// located by LIKE pattern.
const wcConcepts = [
  { like: '%"负责任AI"的组织原则中,"问责制"对员工的要求是什么%', reason: '概念:负责任AI问责制' },
  { like: '%智能体(Agent)与传统聊天AI最本质的区别是什么%', reason: 'ADV概念:Agent vs 聊天AI' },
  { like: '%智能体安全控制原则中"最小权限"是指什么%', reason: 'ADV概念:最小权限' },
  { like: '请简述AI的四个固有问题%', reason: '列举:AI四个固有问题' },
  { like: '数据分为"绝密""机密""公开"三个级别%', reason: '分类:数据分级' },
  { like: '%智能体安全的四条控制原则是什么%', reason: 'ADV列举:四条控制原则' },
];

for (const c of wcConcepts) {
  const items = findIds(c.like);
  for (const item of items) d(item.id, c.reason);
}

// WORK_CAPABILITY duplicate: keep the first match (findIds returns rows in
// ROWID order) and delete the rest. The % wildcards are required: LIKE without
// them only matches when the pattern equals the whole question text, so the
// duplicates would never be found.
const wcDups = findIds('%你正在使用AI助手分析一份包含客户信息的Excel表格%');
for (let i = 1; i < wcDups.length; i++) d(wcDups[i].id, '重复:客户Excel场景');
|
||||
|
||||
// ═══════════════ Summary ═══════════════
// Report how many items were deleted, how many remain in the bank, and the
// remaining distribution per dimension and question type.
const remaining = db
  .prepare('SELECT COUNT(*) c FROM question_bank_items WHERE bank_id=?')
  .get(BANK);
const divider = '═'.repeat(50);

console.log(`\n${divider}`);
console.log(` 删除: ${total} 题`);
// The original size is reconstructed as remaining rows plus the deletion counter.
console.log(` 剩余: ${remaining.c} 题(原 ${remaining.c + total} 题)`);
console.log(`\n 各维度分布:`);

const byDim = db
  .prepare('SELECT dimension, questionType, COUNT(*) c FROM question_bank_items WHERE bank_id=? GROUP BY dimension, questionType ORDER BY dimension, questionType')
  .all(BANK);
for (const row of byDim) {
  console.log(` ${row.dimension} ${row.questionType}: ${row.c}`);
}

db.close();
|
||||
@@ -216,10 +216,11 @@ export class ApiV1Controller {
|
||||
@Get('knowledge-bases')
|
||||
async listFiles(@Request() req) {
|
||||
const user = req.user;
|
||||
const files = await this.knowledgeBaseService.findAll(
|
||||
const result = await this.knowledgeBaseService.findAll(
|
||||
user.id,
|
||||
user.tenantId,
|
||||
);
|
||||
const files = Array.isArray(result) ? result : result.items;
|
||||
return {
|
||||
files: files.map((f) => ({
|
||||
id: f.id,
|
||||
@@ -286,10 +287,11 @@ export class ApiV1Controller {
|
||||
@Get('knowledge-bases/:id')
|
||||
async getFile(@Request() req, @Param('id') id: string) {
|
||||
const user = req.user;
|
||||
const files = await this.knowledgeBaseService.findAll(
|
||||
const result = await this.knowledgeBaseService.findAll(
|
||||
user.id,
|
||||
user.tenantId,
|
||||
);
|
||||
const files = Array.isArray(result) ? result : result.items;
|
||||
const file = files.find((f) => f.id === id);
|
||||
if (!file) return { error: 'File not found' };
|
||||
return file;
|
||||
|
||||
@@ -307,38 +307,35 @@ export class EmbeddingService {
|
||||
`[Ollama] Generating embeddings for ${texts.length} texts using ${modelName}`,
|
||||
);
|
||||
|
||||
const embeddings: number[][] = [];
|
||||
|
||||
for (let i = 0; i < texts.length; i++) {
|
||||
try {
|
||||
const url = baseUrl.endsWith('/api/embeddings')
|
||||
? baseUrl
|
||||
: `${baseUrl}/api/embeddings`;
|
||||
const url = baseUrl.endsWith('/api/embeddings')
|
||||
? baseUrl
|
||||
: `${baseUrl}/api/embeddings`;
|
||||
|
||||
// Parallelize individual Ollama requests for faster batch processing
|
||||
const results = await Promise.allSettled(
|
||||
texts.map(async (text, i) => {
|
||||
const response = await fetch(url, {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'Content-Type': 'application/json',
|
||||
},
|
||||
body: JSON.stringify({
|
||||
model: modelName,
|
||||
prompt: texts[i],
|
||||
}),
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({ model: modelName, prompt: text }),
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
const errorText = await response.text();
|
||||
throw new Error(`Ollama API error: ${response.status} - ${errorText}`);
|
||||
throw new Error(`Ollama API error for text ${i}: ${response.status} - ${errorText}`);
|
||||
}
|
||||
|
||||
const data = await response.json();
|
||||
embeddings.push(data.embedding);
|
||||
} catch (error) {
|
||||
this.logger.error(
|
||||
`Ollama embedding error for text ${i}: ${error.message}`,
|
||||
);
|
||||
throw error;
|
||||
return data.embedding as number[];
|
||||
}),
|
||||
);
|
||||
|
||||
const embeddings: number[][] = [];
|
||||
for (let i = 0; i < results.length; i++) {
|
||||
const r = results[i];
|
||||
if (r.status === 'rejected') {
|
||||
this.logger.error(`Ollama embedding error for text ${i}: ${r.reason.message}`);
|
||||
throw r.reason;
|
||||
}
|
||||
embeddings.push(r.value);
|
||||
}
|
||||
|
||||
this.logger.log(
|
||||
|
||||
@@ -40,8 +40,17 @@ export class KnowledgeBaseController {
|
||||
|
||||
@Get()
|
||||
@UseGuards(CombinedAuthGuard)
|
||||
async findAll(@Request() req): Promise<KnowledgeBase[]> {
|
||||
return this.knowledgeBaseService.findAll(req.user.id, req.user.tenantId);
|
||||
async findAll(
|
||||
@Request() req,
|
||||
@Query('page') page?: number,
|
||||
@Query('limit') limit?: number,
|
||||
) {
|
||||
return this.knowledgeBaseService.findAll(
|
||||
req.user.id,
|
||||
req.user.tenantId,
|
||||
page ? Number(page) : undefined,
|
||||
limit ? Number(limit) : undefined,
|
||||
);
|
||||
}
|
||||
|
||||
@Get('stats')
|
||||
|
||||
@@ -51,7 +51,7 @@ export class KnowledgeBase {
|
||||
})
|
||||
status: FileStatus;
|
||||
|
||||
@Column({ name: 'user_id', nullable: true }) // Temporarily allowed empty (for debugging), should be required in future
|
||||
@Column({ name: 'user_id' })
|
||||
userId: string;
|
||||
|
||||
@Column({ name: 'tenant_id', nullable: true, type: 'text' })
|
||||
|
||||
@@ -21,9 +21,11 @@ import { ElasticsearchService } from '../elasticsearch/elasticsearch.service';
|
||||
import { TikaService } from '../tika/tika.service';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import * as crypto from 'crypto';
|
||||
import { EmbeddingService } from './embedding.service';
|
||||
import { TextChunkerService } from './text-chunker.service';
|
||||
import { TextChunkerService, TextChunk } from './text-chunker.service';
|
||||
import { ModelConfigService } from '../model-config/model-config.service';
|
||||
import { ModelType } from '../types';
|
||||
import { RagService } from '../rag/rag.service';
|
||||
import { VisionService } from '../vision/vision.service';
|
||||
import { TenantService } from '../tenant/tenant.service';
|
||||
@@ -87,9 +89,28 @@ export class KnowledgeBaseService {
|
||||
const processingMode =
|
||||
mode === 'precise' ? ProcessingMode.PRECISE : ProcessingMode.FAST;
|
||||
|
||||
// 先移文件,再创建DB记录:避免DB记录存在但文件丢失的不一致状态
|
||||
const uploadPath = process.env.UPLOAD_FILE_PATH || './uploads';
|
||||
const tempId = crypto.randomUUID();
|
||||
const targetDir = path.join(uploadPath, tenantId || 'default', tempId);
|
||||
const targetPath = path.join(targetDir, fileInfo.filename);
|
||||
try {
|
||||
if (!fs.existsSync(targetDir)) {
|
||||
fs.mkdirSync(targetDir, { recursive: true });
|
||||
}
|
||||
if (fs.existsSync(fileInfo.path)) {
|
||||
fs.renameSync(fileInfo.path, targetPath);
|
||||
} else {
|
||||
throw new Error(`Source file not found: ${fileInfo.path}`);
|
||||
}
|
||||
} catch (fsError) {
|
||||
this.logger.error('Failed to move file to partitioned storage', fsError);
|
||||
throw new Error(`File storage error: ${fsError.message}`);
|
||||
}
|
||||
|
||||
const kb = this.kbRepository.create({
|
||||
originalName: fileInfo.originalname,
|
||||
storagePath: fileInfo.path,
|
||||
storagePath: targetPath,
|
||||
size: fileInfo.size,
|
||||
mimetype: fileInfo.mimetype,
|
||||
status: FileStatus.PENDING,
|
||||
@@ -115,36 +136,6 @@ export class KnowledgeBaseService {
|
||||
`Created KB record: ${savedKb.id}, mode: ${mode}, file: ${fileInfo.originalname}`,
|
||||
);
|
||||
|
||||
// ---------------------------------------------------------
|
||||
// Move the file to the final partitioned directory
|
||||
// source: uploads/{tenantId}/{filename} (or wherever it was)
|
||||
// target: uploads/{tenantId}/{savedKb.id}/{filename}
|
||||
// ---------------------------------------------------------
|
||||
const fs = await import('fs');
|
||||
const path = await import('path');
|
||||
const uploadPath = process.env.UPLOAD_FILE_PATH || './uploads';
|
||||
const targetDir = path.join(uploadPath, tenantId || 'default', savedKb.id);
|
||||
const targetPath = path.join(targetDir, fileInfo.filename);
|
||||
|
||||
try {
|
||||
if (!fs.existsSync(targetDir)) {
|
||||
fs.mkdirSync(targetDir, { recursive: true });
|
||||
}
|
||||
if (fs.existsSync(fileInfo.path)) {
|
||||
fs.renameSync(fileInfo.path, targetPath);
|
||||
// Update the DB record with the new path
|
||||
savedKb.storagePath = targetPath;
|
||||
await this.kbRepository.save(savedKb);
|
||||
this.logger.log(`Moved file to partitioned storage: ${targetPath}`);
|
||||
}
|
||||
} catch (fsError) {
|
||||
this.logger.error(
|
||||
`Failed to move file ${savedKb.id} to partitioned storage`,
|
||||
fsError,
|
||||
);
|
||||
// We will let it continue, but the file might be stuck in the temp/root folder
|
||||
}
|
||||
|
||||
// If queue processing is requested, await completion
|
||||
if (config?.waitForCompletion) {
|
||||
await this.processFile(savedKb.id, userId, tenantId, config);
|
||||
@@ -158,16 +149,33 @@ export class KnowledgeBaseService {
|
||||
return savedKb;
|
||||
}
|
||||
|
||||
async findAll(userId: string, tenantId?: string): Promise<KnowledgeBase[]> {
|
||||
async findAll(
|
||||
userId: string,
|
||||
tenantId?: string,
|
||||
page?: number,
|
||||
limit?: number,
|
||||
): Promise<KnowledgeBase[] | PaginatedKnowledgeBase> {
|
||||
const where: any = {};
|
||||
if (tenantId) {
|
||||
where.tenantId = tenantId;
|
||||
} else {
|
||||
where.userId = userId;
|
||||
}
|
||||
|
||||
if (page !== undefined && limit !== undefined) {
|
||||
const [items, total] = await this.kbRepository.findAndCount({
|
||||
where,
|
||||
relations: ['groups'],
|
||||
order: { createdAt: 'DESC' },
|
||||
skip: (page - 1) * limit,
|
||||
take: limit,
|
||||
});
|
||||
return { items, total, page, limit };
|
||||
}
|
||||
|
||||
return this.kbRepository.find({
|
||||
where,
|
||||
relations: ['groups'], // Load group relations
|
||||
relations: ['groups'],
|
||||
order: { createdAt: 'DESC' },
|
||||
});
|
||||
}
|
||||
@@ -248,17 +256,27 @@ export class KnowledgeBaseService {
|
||||
topK: number = 5,
|
||||
) {
|
||||
try {
|
||||
// Generate simulation vector using default dimensions from environment variable
|
||||
const defaultDimensions = parseInt(
|
||||
process.env.DEFAULT_VECTOR_DIMENSIONS || '2560',
|
||||
);
|
||||
const mockEmbedding = Array.from(
|
||||
{ length: defaultDimensions },
|
||||
() => Math.random() - 0.5,
|
||||
);
|
||||
const queryVector = mockEmbedding;
|
||||
// 1. Generate query vector using the default embedding model
|
||||
let queryVector: number[] = [];
|
||||
try {
|
||||
const defaultEmbedding = await this.modelConfigService.findDefaultByType(
|
||||
tenantId,
|
||||
ModelType.EMBEDDING,
|
||||
);
|
||||
if (defaultEmbedding) {
|
||||
const vectors = await this.embeddingService.getEmbeddings(
|
||||
[query],
|
||||
defaultEmbedding.id,
|
||||
);
|
||||
queryVector = vectors[0] || [];
|
||||
}
|
||||
} catch (embedError) {
|
||||
this.logger.warn(
|
||||
`No embedding model available, falling back to text-only search: ${embedError.message}`,
|
||||
);
|
||||
}
|
||||
|
||||
// 2. Search in Elasticsearch
|
||||
// 2. Search in Elasticsearch (with vector if available, text-only otherwise)
|
||||
const searchResults = await this.elasticsearchService.searchSimilar(
|
||||
queryVector,
|
||||
userId,
|
||||
@@ -268,7 +286,7 @@ export class KnowledgeBaseService {
|
||||
|
||||
// 3. Get file information from database
|
||||
const fileIds = [...new Set(searchResults.map((r) => r.fileId))];
|
||||
const files = await this.kbRepository.findByIds(fileIds);
|
||||
const files = await this.kbRepository.findBy({ id: In(fileIds) });
|
||||
const fileMap = new Map(files.map((f) => [f.id, f]));
|
||||
|
||||
// 4. Combine results with file info
|
||||
@@ -380,7 +398,6 @@ export class KnowledgeBaseService {
|
||||
}
|
||||
|
||||
// 2. Delete file from filesystem
|
||||
const fs = await import('fs');
|
||||
try {
|
||||
if (fs.existsSync(file.storagePath)) {
|
||||
fs.unlinkSync(file.storagePath);
|
||||
@@ -1010,7 +1027,7 @@ export class KnowledgeBaseService {
|
||||
},
|
||||
);
|
||||
} catch (error) {
|
||||
// Detect context length error (supports Japanese/Chinese/English)
|
||||
// Detect context length error → fall back to single-chunk processing
|
||||
if (
|
||||
error.message &&
|
||||
(error.message.includes('context length') ||
|
||||
@@ -1019,58 +1036,7 @@ export class KnowledgeBaseService {
|
||||
this.logger.warn(
|
||||
this.i18nService.getMessage('contextLengthErrorFallback'),
|
||||
);
|
||||
|
||||
// Downgrade to single text processing
|
||||
for (let i = 0; i < chunks.length; i++) {
|
||||
const chunk = chunks[i];
|
||||
|
||||
try {
|
||||
const embeddings = await this.embeddingService.getEmbeddings(
|
||||
[chunk.content], // Single text
|
||||
kb.embeddingModelId,
|
||||
);
|
||||
|
||||
if (!embeddings[0] || embeddings[0].length === 0) {
|
||||
this.logger.warn(
|
||||
this.i18nService.formatMessage('skippingEmptyVectorChunk', {
|
||||
index: chunk.index,
|
||||
}),
|
||||
);
|
||||
continue;
|
||||
}
|
||||
|
||||
await this.elasticsearchService.indexDocument(
|
||||
`${kb.id}_chunk_${chunk.index}`,
|
||||
chunk.content,
|
||||
embeddings[0],
|
||||
{
|
||||
fileId: kb.id,
|
||||
originalName: kb.originalName,
|
||||
mimetype: kb.mimetype,
|
||||
userId: userId,
|
||||
chunkIndex: chunk.index,
|
||||
startPosition: chunk.startPosition,
|
||||
endPosition: chunk.endPosition,
|
||||
tenantId,
|
||||
},
|
||||
);
|
||||
|
||||
if ((i + 1) % 10 === 0) {
|
||||
this.logger.log(
|
||||
`Single processing progress: ${i + 1}/${chunks.length}`,
|
||||
);
|
||||
}
|
||||
} catch (chunkError) {
|
||||
this.logger.error(
|
||||
`Failed to process text block ${chunk.index}. Skipping: ${chunkError.message}`,
|
||||
);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
this.logger.log(
|
||||
`Single text processing completed: ${chunks.length} chunks`,
|
||||
);
|
||||
await this.processChunksOneByOne(chunks, kb, userId, tenantId);
|
||||
} else {
|
||||
// Throw other errors directly
|
||||
throw error;
|
||||
@@ -1125,7 +1091,7 @@ export class KnowledgeBaseService {
|
||||
}
|
||||
});
|
||||
} catch (error) {
|
||||
// Detect context length error (supports Japanese/Chinese/English)
|
||||
// Detect context length error → fall back to single-chunk processing
|
||||
if (
|
||||
error.message &&
|
||||
(error.message.includes('context length') ||
|
||||
@@ -1134,64 +1100,7 @@ export class KnowledgeBaseService {
|
||||
this.logger.warn(
|
||||
this.i18nService.getMessage('batchContextLengthErrorFallback'),
|
||||
);
|
||||
|
||||
// Downgrade to single text processing
|
||||
for (let i = 0; i < chunks.length; i++) {
|
||||
const chunk = chunks[i];
|
||||
|
||||
try {
|
||||
const embeddings = await this.embeddingService.getEmbeddings(
|
||||
[chunk.content], // Single text
|
||||
kb.embeddingModelId,
|
||||
);
|
||||
|
||||
if (!embeddings[0] || embeddings[0].length === 0) {
|
||||
this.logger.warn(
|
||||
this.i18nService.formatMessage(
|
||||
'skippingEmptyVectorChunk',
|
||||
{ index: chunk.index },
|
||||
),
|
||||
);
|
||||
continue;
|
||||
}
|
||||
|
||||
await this.elasticsearchService.indexDocument(
|
||||
`${kb.id}_chunk_${chunk.index}`,
|
||||
chunk.content,
|
||||
embeddings[0],
|
||||
{
|
||||
fileId: kb.id,
|
||||
originalName: kb.originalName,
|
||||
mimetype: kb.mimetype,
|
||||
userId: userId,
|
||||
tenantId, // Added tenantId
|
||||
chunkIndex: chunk.index,
|
||||
startPosition: chunk.startPosition,
|
||||
endPosition: chunk.endPosition,
|
||||
},
|
||||
);
|
||||
|
||||
if ((i + 1) % 10 === 0) {
|
||||
this.logger.log(
|
||||
`Single processing progress: ${i + 1}/${chunks.length}`,
|
||||
);
|
||||
}
|
||||
} catch (chunkError) {
|
||||
this.logger.error(
|
||||
this.i18nService.formatMessage('chunkProcessingFailed', {
|
||||
index: chunk.index,
|
||||
message: chunkError.message,
|
||||
}),
|
||||
);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
this.logger.log(
|
||||
this.i18nService.formatMessage('singleTextProcessingComplete', {
|
||||
count: chunks.length,
|
||||
}),
|
||||
);
|
||||
await this.processChunksOneByOne(chunks, kb, userId, tenantId);
|
||||
} else {
|
||||
// Throw other errors directly
|
||||
throw error;
|
||||
@@ -1244,58 +1153,7 @@ export class KnowledgeBaseService {
|
||||
this.logger.warn(
|
||||
this.i18nService.getMessage('batchContextLengthErrorFallback'),
|
||||
);
|
||||
|
||||
// Downgrade to single text processing
|
||||
for (let i = 0; i < chunks.length; i++) {
|
||||
const chunk = chunks[i];
|
||||
|
||||
try {
|
||||
const embeddings = await this.embeddingService.getEmbeddings(
|
||||
[chunk.content], // Single text
|
||||
kb.embeddingModelId,
|
||||
);
|
||||
|
||||
if (!embeddings[0] || embeddings[0].length === 0) {
|
||||
this.logger.warn(
|
||||
`Skipping empty vector text block ${chunk.index}`,
|
||||
);
|
||||
continue;
|
||||
}
|
||||
|
||||
await this.elasticsearchService.indexDocument(
|
||||
`${kb.id}_chunk_${chunk.index}`,
|
||||
chunk.content,
|
||||
embeddings[0],
|
||||
{
|
||||
fileId: kb.id,
|
||||
originalName: kb.originalName,
|
||||
mimetype: kb.mimetype,
|
||||
userId: userId,
|
||||
tenantId, // Added tenantId
|
||||
chunkIndex: chunk.index,
|
||||
startPosition: chunk.startPosition,
|
||||
endPosition: chunk.endPosition,
|
||||
},
|
||||
);
|
||||
|
||||
if ((i + 1) % 10 === 0) {
|
||||
this.logger.log(
|
||||
`Single processing progress: ${i + 1}/${chunks.length}`,
|
||||
);
|
||||
}
|
||||
} catch (chunkError) {
|
||||
this.logger.error(
|
||||
`Failed to process text block ${chunk.index}. Skipping: ${chunkError.message}`,
|
||||
);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
this.logger.log(
|
||||
this.i18nService.formatMessage('singleTextProcessingComplete', {
|
||||
count: chunks.length,
|
||||
}),
|
||||
);
|
||||
await this.processChunksOneByOne(chunks, kb, userId, tenantId);
|
||||
} else {
|
||||
// Throw other errors directly
|
||||
throw error;
|
||||
@@ -1553,8 +1411,6 @@ export class KnowledgeBaseService {
|
||||
}
|
||||
|
||||
// Generate PDF field path
|
||||
const path = await import('path');
|
||||
const fs = await import('fs');
|
||||
const uploadDir = path.dirname(kb.storagePath);
|
||||
const baseName = path.basename(
|
||||
kb.storagePath,
|
||||
@@ -1640,8 +1496,6 @@ export class KnowledgeBaseService {
|
||||
}
|
||||
|
||||
// Generate PDF file path
|
||||
const path = await import('path');
|
||||
const fs = await import('fs');
|
||||
const uploadDir = path.dirname(kb.storagePath);
|
||||
const baseName = path.basename(
|
||||
kb.storagePath,
|
||||
@@ -1823,4 +1677,60 @@ export class KnowledgeBaseService {
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
 * Fallback strategy: embed and index chunks one at a time.
 *
 * Used when a batch embedding request fails because the context length was
 * exceeded. Each chunk is handled independently: a chunk whose embedding is
 * missing or empty is skipped with a warning, and a chunk whose embedding or
 * indexing call throws is logged and skipped, so the remaining chunks are
 * still processed. Replaces three previously duplicated fallback loops.
 *
 * @param chunks - Chunks to embed and index.
 * @param kb - Knowledge-base record; supplies the id, embedding model id and
 *   file metadata written with every indexed chunk.
 * @param userId - Stored in each indexed document's metadata.
 * @param tenantId - Stored in each indexed document's metadata.
 */
private async processChunksOneByOne(
  chunks: TextChunk[],
  kb: KnowledgeBase,
  userId: string,
  tenantId: string,
): Promise<void> {
  // Chunks that were actually written to the index; skipped or failed chunks
  // are not counted, so a partial result can be reported below.
  let indexedCount = 0;

  for (let i = 0; i < chunks.length; i++) {
    const chunk = chunks[i];
    try {
      const embeddings = await this.embeddingService.getEmbeddings(
        [chunk.content],
        kb.embeddingModelId,
      );
      if (!embeddings[0] || embeddings[0].length === 0) {
        this.logger.warn(
          this.i18nService.formatMessage('skippingEmptyVectorChunk', {
            index: chunk.index,
          }),
        );
        continue;
      }
      await this.elasticsearchService.indexDocument(
        `${kb.id}_chunk_${chunk.index}`,
        chunk.content,
        embeddings[0],
        {
          fileId: kb.id,
          originalName: kb.originalName,
          mimetype: kb.mimetype,
          userId: userId,
          tenantId,
          chunkIndex: chunk.index,
          startPosition: chunk.startPosition,
          endPosition: chunk.endPosition,
        },
      );
      indexedCount++;
      if ((i + 1) % 10 === 0) {
        this.logger.log(`Single processing progress: ${i + 1}/${chunks.length}`);
      }
    } catch (chunkError) {
      // Best effort: one failing chunk must not abort the remaining chunks.
      this.logger.error(
        `Failed to process text block ${chunk.index}. Skipping: ${chunkError.message}`,
      );
    }
  }

  // The completion message reports the total chunk count, so surface partial
  // or total failure explicitly instead of leaving it hidden in per-chunk logs.
  if (indexedCount < chunks.length) {
    this.logger.warn(
      `Single text processing indexed ${indexedCount}/${chunks.length} chunks; the rest were skipped or failed`,
    );
  }

  this.logger.log(
    this.i18nService.formatMessage('singleTextProcessingComplete', {
      count: chunks.length,
    }),
  );
}
|
||||
}
|
||||
|
||||
@@ -19,8 +19,12 @@ export class TextChunkerService {
|
||||
}
|
||||
|
||||
const cleanText = text.trim();
|
||||
const chunkSizeInChars = chunkSize * 4; // 1 token ≈ 4 chars
|
||||
const overlapInChars = overlap * 4;
|
||||
// 1 token ≈ 4 chars for English, ≈ 1.5-2 chars for CJK.
|
||||
// Heuristic: if CJK chars > 30% of content, use 2x ratio
|
||||
const cjkChars = (cleanText.match(/[一-鿿-ゟ゠-ヿ가-]/g) || []).length;
|
||||
const ratio = (cjkChars / cleanText.length) > 0.3 ? 2 : 4;
|
||||
const chunkSizeInChars = chunkSize * ratio;
|
||||
const overlapInChars = overlap * ratio;
|
||||
|
||||
// If text length <= chunk size, return entire text as one chunk
|
||||
if (cleanText.length <= chunkSizeInChars) {
|
||||
@@ -87,7 +91,7 @@ export class TextChunkerService {
|
||||
preferredEnd: number,
|
||||
minEnd: number,
|
||||
): number {
|
||||
const sentenceEnders = ['.', '!', '?', '。', '!', '?'];
|
||||
const sentenceEnders = ['.', '!', '?', '。', '!', '?', '.', '。', '…', '‥'];
|
||||
|
||||
for (let i = preferredEnd; i >= minEnd; i--) {
|
||||
if (sentenceEnders.includes(text[i])) {
|
||||
|
||||
Reference in New Issue
Block a user