forked from hangshuo652/aurak
feat: implement QuestionBank CRUD with pagination and template query
- Add pagination support to findAll (page, limit query params) - Add findByTemplateId method to service - Add GET /by-template/:templateId endpoint to controller - Service already includes CRUD for QuestionBank and QuestionBankItem
This commit is contained in:
@@ -0,0 +1,393 @@
|
||||
import { Injectable, Logger, BadRequestException } from '@nestjs/common';
|
||||
import { ConfigService } from '@nestjs/config';
|
||||
import { ModelConfigService } from '../model-config/model-config.service';
|
||||
import { TenantService } from '../tenant/tenant.service';
|
||||
// import { UserSettingService } from '../user-setting/user-setting.service';
|
||||
|
||||
/**
|
||||
* Chunk config service
|
||||
* Responsible for validating and managing chunk parameters to ensure they conform to model limits and environment variable settings
|
||||
*
|
||||
* Priority of limits:
|
||||
* 1. Environment variables (MAX_CHUNK_SIZE, MAX_OVERLAP_SIZE)
|
||||
* 2. Model settings in database (maxInputTokens, maxBatchSize)
|
||||
* 3. Default values
|
||||
*/
|
||||
import {
|
||||
DEFAULT_CHUNK_SIZE,
|
||||
MIN_CHUNK_SIZE,
|
||||
DEFAULT_CHUNK_OVERLAP,
|
||||
MIN_CHUNK_OVERLAP,
|
||||
DEFAULT_MAX_OVERLAP_RATIO,
|
||||
DEFAULT_MAX_BATCH_SIZE,
|
||||
DEFAULT_VECTOR_DIMENSIONS,
|
||||
} from '../common/constants';
|
||||
import { I18nService } from '../i18n/i18n.service';
|
||||
|
||||
@Injectable()
|
||||
export class ChunkConfigService {
|
||||
private readonly logger = new Logger(ChunkConfigService.name);
|
||||
|
||||
// Default settings
|
||||
private readonly DEFAULTS = {
|
||||
chunkSize: DEFAULT_CHUNK_SIZE,
|
||||
chunkOverlap: DEFAULT_CHUNK_OVERLAP,
|
||||
minChunkSize: MIN_CHUNK_SIZE,
|
||||
minChunkOverlap: MIN_CHUNK_OVERLAP,
|
||||
maxOverlapRatio: DEFAULT_MAX_OVERLAP_RATIO, // Overlap up to 50% of chunk size
|
||||
maxBatchSize: DEFAULT_MAX_BATCH_SIZE, // Default batch limit
|
||||
expectedDimensions: DEFAULT_VECTOR_DIMENSIONS, // Default vector dimensions
|
||||
};
|
||||
|
||||
// Upper limits set by environment variables (used first)
|
||||
private readonly envMaxChunkSize: number;
|
||||
private readonly envMaxOverlapSize: number;
|
||||
|
||||
constructor(
|
||||
private configService: ConfigService,
|
||||
private modelConfigService: ModelConfigService,
|
||||
private i18nService: I18nService,
|
||||
private tenantService: TenantService,
|
||||
) {
|
||||
// Load global limit settings from environment variables
|
||||
this.envMaxChunkSize = parseInt(
|
||||
this.configService.get<string>('MAX_CHUNK_SIZE', '8191'),
|
||||
);
|
||||
this.envMaxOverlapSize = parseInt(
|
||||
this.configService.get<string>('MAX_OVERLAP_SIZE', '2000'),
|
||||
);
|
||||
|
||||
this.logger.log(
|
||||
`Environment variable limits: MAX_CHUNK_SIZE=${this.envMaxChunkSize}, MAX_OVERLAP_SIZE=${this.envMaxOverlapSize}`,
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get model limit settings (read from database)
|
||||
*/
|
||||
async getModelLimits(modelId: string): Promise<{
|
||||
maxInputTokens: number;
|
||||
maxBatchSize: number;
|
||||
expectedDimensions: number;
|
||||
providerName: string;
|
||||
isVectorModel: boolean;
|
||||
}> {
|
||||
const modelConfig = await this.modelConfigService.findOne(modelId);
|
||||
|
||||
if (!modelConfig || modelConfig.type !== 'embedding') {
|
||||
throw new BadRequestException(
|
||||
this.i18nService.formatMessage('embeddingModelNotFound', {
|
||||
id: modelId,
|
||||
}),
|
||||
);
|
||||
}
|
||||
|
||||
// Get limits from database fields and fill with defaults
|
||||
const maxInputTokens = modelConfig.maxInputTokens || this.envMaxChunkSize;
|
||||
const maxBatchSize = modelConfig.maxBatchSize || this.DEFAULTS.maxBatchSize;
|
||||
const expectedDimensions =
|
||||
modelConfig.dimensions ||
|
||||
parseInt(
|
||||
this.configService.get(
|
||||
'DEFAULT_VECTOR_DIMENSIONS',
|
||||
String(this.DEFAULTS.expectedDimensions),
|
||||
),
|
||||
);
|
||||
const providerName = modelConfig.providerName || 'unknown';
|
||||
const isVectorModel = modelConfig.isVectorModel || false;
|
||||
|
||||
this.logger.log(
|
||||
this.i18nService.formatMessage('configLoaded', {
|
||||
name: modelConfig.name,
|
||||
id: modelConfig.modelId,
|
||||
}) +
|
||||
'\n' +
|
||||
` - Provider: ${providerName}\n` +
|
||||
` - Token limit: ${maxInputTokens}\n` +
|
||||
` - Batch limit: ${maxBatchSize}\n` +
|
||||
` - Vector dimensions: ${expectedDimensions}\n` +
|
||||
` - Is vector model: ${isVectorModel}`,
|
||||
);
|
||||
|
||||
return {
|
||||
maxInputTokens,
|
||||
maxBatchSize,
|
||||
expectedDimensions,
|
||||
providerName,
|
||||
isVectorModel,
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Validate and fix chunk config
|
||||
* Priority: Environment variable limits > Model limits > User settings
|
||||
*/
|
||||
async validateChunkConfig(
|
||||
chunkSize: number,
|
||||
chunkOverlap: number,
|
||||
modelId: string,
|
||||
): Promise<{
|
||||
chunkSize: number;
|
||||
chunkOverlap: number;
|
||||
warnings: string[];
|
||||
effectiveMaxChunkSize: number;
|
||||
effectiveMaxOverlapSize: number;
|
||||
}> {
|
||||
const warnings: string[] = [];
|
||||
const limits = await this.getModelLimits(modelId);
|
||||
|
||||
// 1. Calculate final limits (choose smaller of env var and model limit)
|
||||
const effectiveMaxChunkSize = Math.min(
|
||||
this.envMaxChunkSize,
|
||||
limits.maxInputTokens,
|
||||
);
|
||||
|
||||
const effectiveMaxOverlapSize = Math.min(
|
||||
this.envMaxOverlapSize,
|
||||
Math.floor(effectiveMaxChunkSize * this.DEFAULTS.maxOverlapRatio),
|
||||
);
|
||||
|
||||
// 2. Validate chunk size upper limit
|
||||
if (chunkSize > effectiveMaxChunkSize) {
|
||||
const reason =
|
||||
this.envMaxChunkSize < limits.maxInputTokens
|
||||
? `${this.i18nService.getMessage('environmentLimit')} ${this.envMaxChunkSize}`
|
||||
: `${this.i18nService.getMessage('modelLimit')} ${limits.maxInputTokens}`;
|
||||
|
||||
warnings.push(
|
||||
this.i18nService.formatMessage('chunkOverflow', {
|
||||
size: chunkSize,
|
||||
max: effectiveMaxChunkSize,
|
||||
reason,
|
||||
}),
|
||||
);
|
||||
chunkSize = effectiveMaxChunkSize;
|
||||
}
|
||||
|
||||
// 3. Validate chunk size lower limit
|
||||
if (chunkSize < this.DEFAULTS.minChunkSize) {
|
||||
warnings.push(
|
||||
this.i18nService.formatMessage('chunkUnderflow', {
|
||||
size: chunkSize,
|
||||
min: this.DEFAULTS.minChunkSize,
|
||||
}),
|
||||
);
|
||||
chunkSize = this.DEFAULTS.minChunkSize;
|
||||
}
|
||||
|
||||
// 4. Validate overlap size upper limit (env var first)
|
||||
if (chunkOverlap > effectiveMaxOverlapSize) {
|
||||
warnings.push(
|
||||
this.i18nService.formatMessage('overlapOverflow', {
|
||||
size: chunkOverlap,
|
||||
max: effectiveMaxOverlapSize,
|
||||
}),
|
||||
);
|
||||
chunkOverlap = effectiveMaxOverlapSize;
|
||||
}
|
||||
|
||||
// 5. Validate overlap doesn't exceed 50% of chunk size
|
||||
const maxOverlapByRatio = Math.floor(
|
||||
chunkSize * this.DEFAULTS.maxOverlapRatio,
|
||||
);
|
||||
if (chunkOverlap > maxOverlapByRatio) {
|
||||
warnings.push(
|
||||
this.i18nService.formatMessage('overlapRatioExceeded', {
|
||||
size: chunkOverlap,
|
||||
max: maxOverlapByRatio,
|
||||
}),
|
||||
);
|
||||
chunkOverlap = maxOverlapByRatio;
|
||||
}
|
||||
|
||||
if (chunkOverlap < this.DEFAULTS.minChunkOverlap) {
|
||||
warnings.push(
|
||||
this.i18nService.formatMessage('overlapUnderflow', {
|
||||
size: chunkOverlap,
|
||||
min: this.DEFAULTS.minChunkOverlap,
|
||||
}),
|
||||
);
|
||||
chunkOverlap = this.DEFAULTS.minChunkOverlap;
|
||||
}
|
||||
|
||||
// 6. Add safety check for batch processing
|
||||
// During batch processing, ensure total length of multiple texts doesn't exceed model limits
|
||||
const safetyMargin = 0.8; // 80% safety margin to leave space for batch processing
|
||||
const safeChunkSize = Math.floor(effectiveMaxChunkSize * safetyMargin);
|
||||
|
||||
if (chunkSize > safeChunkSize) {
|
||||
warnings.push(
|
||||
this.i18nService.formatMessage('batchOverflowWarning', {
|
||||
safeSize: safeChunkSize,
|
||||
size: chunkSize,
|
||||
percent: Math.round(safetyMargin * 100),
|
||||
}),
|
||||
);
|
||||
}
|
||||
|
||||
// 7. Check if estimated chunk count is reasonable
|
||||
const estimatedChunkCount = this.estimateChunkCount(
|
||||
1000000, // Assume 1MB text
|
||||
chunkSize,
|
||||
);
|
||||
|
||||
if (estimatedChunkCount > 50000) {
|
||||
warnings.push(
|
||||
this.i18nService.formatMessage('estimatedChunkCountExcessive', {
|
||||
count: estimatedChunkCount,
|
||||
}),
|
||||
);
|
||||
}
|
||||
|
||||
return {
|
||||
chunkSize,
|
||||
chunkOverlap,
|
||||
warnings,
|
||||
effectiveMaxChunkSize,
|
||||
effectiveMaxOverlapSize,
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Get recommended batch size
|
||||
*/
|
||||
async getRecommendedBatchSize(
|
||||
modelId: string,
|
||||
currentBatchSize: number = 100,
|
||||
): Promise<number> {
|
||||
const limits = await this.getModelLimits(modelId);
|
||||
|
||||
// Choose smaller of configured value and model limit
|
||||
const recommended = Math.min(
|
||||
currentBatchSize,
|
||||
limits.maxBatchSize,
|
||||
200, // Safety upper limit
|
||||
);
|
||||
|
||||
if (recommended < currentBatchSize) {
|
||||
this.logger.warn(
|
||||
this.i18nService.formatMessage('batchSizeAdjusted', {
|
||||
old: currentBatchSize,
|
||||
new: recommended,
|
||||
limit: limits.maxBatchSize,
|
||||
}),
|
||||
);
|
||||
}
|
||||
|
||||
return Math.max(10, recommended); // Minimum 10
|
||||
}
|
||||
|
||||
/**
|
||||
* Estimate chunk count
|
||||
*/
|
||||
estimateChunkCount(textLength: number, chunkSize: number): number {
|
||||
const chunkSizeInChars = chunkSize * 4; // 1 token ≈ 4 chars
|
||||
return Math.ceil(textLength / chunkSizeInChars);
|
||||
}
|
||||
|
||||
/**
|
||||
* Validate vector dimensions
|
||||
*/
|
||||
async validateDimensions(
|
||||
modelId: string,
|
||||
actualDimensions: number,
|
||||
): Promise<boolean> {
|
||||
const limits = await this.getModelLimits(modelId);
|
||||
|
||||
if (actualDimensions !== limits.expectedDimensions) {
|
||||
this.logger.warn(
|
||||
this.i18nService.formatMessage('dimensionMismatch', {
|
||||
id: modelId,
|
||||
expected: limits.expectedDimensions,
|
||||
actual: actualDimensions,
|
||||
}),
|
||||
);
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get config summary (for logging)
|
||||
*/
|
||||
async getConfigSummary(
|
||||
chunkSize: number,
|
||||
chunkOverlap: number,
|
||||
modelId: string,
|
||||
): Promise<string> {
|
||||
const limits = await this.getModelLimits(modelId);
|
||||
|
||||
return [
|
||||
`Model: ${modelId}`,
|
||||
`Chunk size: ${chunkSize} tokens (limit: ${limits.maxInputTokens})`,
|
||||
`Overlap size: ${chunkOverlap} tokens`,
|
||||
`Batch size: ${limits.maxBatchSize}`,
|
||||
`Vector dimensions: ${limits.expectedDimensions}`,
|
||||
].join(', ');
|
||||
}
|
||||
|
||||
/**
|
||||
* Get config limits for frontend
|
||||
* Used for frontend slider max value settings
|
||||
*/
|
||||
async getFrontendLimits(
|
||||
modelId: string,
|
||||
userId: string,
|
||||
tenantId?: string,
|
||||
): Promise<{
|
||||
maxChunkSize: number;
|
||||
maxOverlapSize: number;
|
||||
minOverlapSize: number;
|
||||
defaultChunkSize: number;
|
||||
defaultOverlapSize: number;
|
||||
modelInfo: {
|
||||
name: string;
|
||||
maxInputTokens: number;
|
||||
maxBatchSize: number;
|
||||
expectedDimensions: number;
|
||||
};
|
||||
}> {
|
||||
const limits = await this.getModelLimits(modelId);
|
||||
|
||||
// Calculate final limits (choose smaller of env var and model limit)
|
||||
const maxChunkSize = Math.min(this.envMaxChunkSize, limits.maxInputTokens);
|
||||
const maxOverlapSize = Math.min(
|
||||
this.envMaxOverlapSize,
|
||||
Math.floor(maxChunkSize * this.DEFAULTS.maxOverlapRatio),
|
||||
);
|
||||
|
||||
// Get model config name
|
||||
const modelConfig = await this.modelConfigService.findOne(modelId);
|
||||
const modelName = modelConfig?.name || 'Unknown';
|
||||
|
||||
// Get defaults from tenant or user settings
|
||||
let defaultChunkSize = this.DEFAULTS.chunkSize;
|
||||
let defaultOverlapSize = this.DEFAULTS.chunkOverlap;
|
||||
|
||||
if (tenantId) {
|
||||
const tenantSettings = await this.tenantService.getSettings(tenantId);
|
||||
if (tenantSettings?.chunkSize)
|
||||
defaultChunkSize = tenantSettings.chunkSize;
|
||||
if (tenantSettings?.chunkOverlap)
|
||||
defaultOverlapSize = tenantSettings.chunkOverlap;
|
||||
}
|
||||
|
||||
return {
|
||||
maxChunkSize,
|
||||
maxOverlapSize,
|
||||
minOverlapSize: this.DEFAULTS.minChunkOverlap,
|
||||
defaultChunkSize: Math.min(defaultChunkSize, maxChunkSize),
|
||||
defaultOverlapSize: Math.max(
|
||||
this.DEFAULTS.minChunkOverlap,
|
||||
Math.min(defaultOverlapSize, maxOverlapSize),
|
||||
),
|
||||
modelInfo: {
|
||||
name: modelName,
|
||||
maxInputTokens: limits.maxInputTokens,
|
||||
maxBatchSize: limits.maxBatchSize,
|
||||
expectedDimensions: limits.expectedDimensions,
|
||||
},
|
||||
};
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,11 @@
|
||||
import { IsNotEmpty, IsOptional, IsString } from 'class-validator';
|
||||
|
||||
export class CreateKnowledgeBaseDto {
|
||||
@IsString()
|
||||
@IsNotEmpty()
|
||||
name: string;
|
||||
|
||||
@IsString()
|
||||
@IsOptional()
|
||||
description?: string;
|
||||
}
|
||||
@@ -0,0 +1,286 @@
|
||||
import { Injectable, Logger } from '@nestjs/common';
|
||||
import { ConfigService } from '@nestjs/config';
|
||||
import { ModelConfigService } from '../model-config/model-config.service';
|
||||
import { I18nService } from '../i18n/i18n.service';
|
||||
|
||||
export interface EmbeddingResponse {
|
||||
data: Array<{
|
||||
embedding: number[];
|
||||
index: number;
|
||||
}>;
|
||||
model: string;
|
||||
usage: {
|
||||
prompt_tokens: number;
|
||||
total_tokens: number;
|
||||
};
|
||||
}
|
||||
|
||||
@Injectable()
|
||||
export class EmbeddingService {
|
||||
private readonly logger = new Logger(EmbeddingService.name);
|
||||
private readonly defaultDimensions: number;
|
||||
|
||||
constructor(
|
||||
private modelConfigService: ModelConfigService,
|
||||
private configService: ConfigService,
|
||||
private i18nService: I18nService,
|
||||
) {
|
||||
this.defaultDimensions = parseInt(
|
||||
this.configService.get<string>('DEFAULT_VECTOR_DIMENSIONS', '2560'),
|
||||
);
|
||||
this.logger.log(
|
||||
`Default vector dimensions set to ${this.defaultDimensions}`,
|
||||
);
|
||||
}
|
||||
|
||||
async getEmbeddings(
|
||||
texts: string[],
|
||||
embeddingModelConfigId: string,
|
||||
): Promise<number[][]> {
|
||||
this.logger.log(`Generating embeddings for ${texts.length} texts`);
|
||||
|
||||
const modelConfig = await this.modelConfigService.findOne(
|
||||
embeddingModelConfigId,
|
||||
);
|
||||
if (!modelConfig || modelConfig.type !== 'embedding') {
|
||||
throw new Error(
|
||||
this.i18nService.formatMessage('embeddingModelNotFound', {
|
||||
id: embeddingModelConfigId,
|
||||
}),
|
||||
);
|
||||
}
|
||||
|
||||
if (modelConfig.isEnabled === false) {
|
||||
throw new Error(
|
||||
`Model ${modelConfig.name} is disabled and cannot generate embeddings`,
|
||||
);
|
||||
}
|
||||
|
||||
if (!modelConfig.baseUrl) {
|
||||
throw new Error(
|
||||
`Model ${modelConfig.name} does not have baseUrl configured`,
|
||||
);
|
||||
}
|
||||
|
||||
// Determine max batch size based on model name
|
||||
const maxBatchSize = this.getMaxBatchSizeForModel(
|
||||
modelConfig.modelId,
|
||||
modelConfig.maxBatchSize,
|
||||
);
|
||||
|
||||
// Split processing if batch size exceeds limit
|
||||
if (texts.length > maxBatchSize) {
|
||||
this.logger.log(
|
||||
`Splitting ${texts.length} texts into batches (model batch limit: ${maxBatchSize})`,
|
||||
);
|
||||
|
||||
const allEmbeddings: number[][] = [];
|
||||
|
||||
for (let i = 0; i < texts.length; i += maxBatchSize) {
|
||||
const batch = texts.slice(i, i + maxBatchSize);
|
||||
const batchEmbeddings = await this.getEmbeddingsForBatch(
|
||||
batch,
|
||||
modelConfig,
|
||||
maxBatchSize,
|
||||
);
|
||||
|
||||
allEmbeddings.push(...batchEmbeddings);
|
||||
|
||||
// Wait briefly to avoid API rate limiting
|
||||
if (i + maxBatchSize < texts.length) {
|
||||
await new Promise((resolve) => setTimeout(resolve, 100)); // Wait 100ms
|
||||
}
|
||||
}
|
||||
|
||||
return allEmbeddings;
|
||||
} else {
|
||||
// Normal processing (within batch size)
|
||||
return await this.getEmbeddingsForBatch(
|
||||
texts,
|
||||
modelConfig,
|
||||
maxBatchSize,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Determine max batch size based on model ID
|
||||
*/
|
||||
private getMaxBatchSizeForModel(
|
||||
modelId: string,
|
||||
configuredMaxBatchSize?: number,
|
||||
): number {
|
||||
// Model-specific batch size limits
|
||||
if (
|
||||
modelId.includes('text-embedding-004') ||
|
||||
modelId.includes('text-embedding-v4') ||
|
||||
modelId.includes('text-embedding-ada-002')
|
||||
) {
|
||||
return Math.min(10, configuredMaxBatchSize || 100); // Google limit: 10
|
||||
} else if (
|
||||
modelId.includes('text-embedding-3') ||
|
||||
modelId.includes('text-embedding-003')
|
||||
) {
|
||||
return Math.min(2048, configuredMaxBatchSize || 2048); // OpenAI v3 limit: 2048
|
||||
} else {
|
||||
// Default: smaller of configured max or 100
|
||||
return Math.min(configuredMaxBatchSize || 100, 100);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Process single batch embedding
|
||||
*/
|
||||
private async getEmbeddingsForBatch(
|
||||
texts: string[],
|
||||
modelConfig: any,
|
||||
maxBatchSize: number,
|
||||
): Promise<number[][]> {
|
||||
const apiUrl = modelConfig.baseUrl.endsWith('/embeddings')
|
||||
? modelConfig.baseUrl
|
||||
: `${modelConfig.baseUrl}/embeddings`;
|
||||
|
||||
let lastError;
|
||||
const MAX_RETRIES = 3;
|
||||
|
||||
for (let attempt = 1; attempt <= MAX_RETRIES; attempt++) {
|
||||
try {
|
||||
const controller = new AbortController();
|
||||
const timeoutId = setTimeout(() => {
|
||||
controller.abort();
|
||||
this.logger.error(`Embedding API timeout after 60s: ${apiUrl}`);
|
||||
}, 60000); // 60s timeout
|
||||
|
||||
this.logger.log(
|
||||
`[Model call] Type: Embedding, Model: ${modelConfig.name} (${modelConfig.modelId}), Text count: ${texts.length}`,
|
||||
);
|
||||
this.logger.log(
|
||||
`Calling embedding API (attempt ${attempt}/${MAX_RETRIES}): ${apiUrl}`,
|
||||
);
|
||||
|
||||
let response;
|
||||
try {
|
||||
response = await fetch(apiUrl, {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'Content-Type': 'application/json',
|
||||
Authorization: `Bearer ${modelConfig.apiKey}`,
|
||||
},
|
||||
body: JSON.stringify({
|
||||
encoding_format: 'float',
|
||||
input: texts,
|
||||
model: modelConfig.modelId,
|
||||
}),
|
||||
signal: controller.signal,
|
||||
});
|
||||
} finally {
|
||||
clearTimeout(timeoutId);
|
||||
}
|
||||
|
||||
if (!response.ok) {
|
||||
const errorText = await response.text();
|
||||
|
||||
// Detect batch size limit error
|
||||
if (
|
||||
errorText.includes('batch size is invalid') ||
|
||||
errorText.includes('batch_size') ||
|
||||
errorText.includes('invalid') ||
|
||||
errorText.includes('larger than')
|
||||
) {
|
||||
this.logger.warn(
|
||||
`Batch size limit error detected. Splitting batch in half and retrying: ${maxBatchSize} -> ${Math.floor(maxBatchSize / 2)}`,
|
||||
);
|
||||
|
||||
// Split batch into smaller units and retry
|
||||
if (texts.length > 1) {
|
||||
const midPoint = Math.floor(texts.length / 2);
|
||||
const firstHalf = texts.slice(0, midPoint);
|
||||
const secondHalf = texts.slice(midPoint);
|
||||
|
||||
const firstResult = await this.getEmbeddingsForBatch(
|
||||
firstHalf,
|
||||
modelConfig,
|
||||
Math.floor(maxBatchSize / 2),
|
||||
);
|
||||
const secondResult = await this.getEmbeddingsForBatch(
|
||||
secondHalf,
|
||||
modelConfig,
|
||||
Math.floor(maxBatchSize / 2),
|
||||
);
|
||||
|
||||
return [...firstResult, ...secondResult];
|
||||
}
|
||||
}
|
||||
|
||||
// Detect context length excess error
|
||||
if (
|
||||
errorText.includes('context length') ||
|
||||
errorText.includes('exceeds')
|
||||
) {
|
||||
const avgLength =
|
||||
texts.reduce((s, t) => s + t.length, 0) / texts.length;
|
||||
const totalLength = texts.reduce((s, t) => s + t.length, 0);
|
||||
this.logger.error(
|
||||
`Text length exceeds limit: ${texts.length} texts, ` +
|
||||
`total ${totalLength} characters, average ${Math.round(avgLength)} characters, ` +
|
||||
`model limit: ${modelConfig.maxInputTokens || 8192} tokens`,
|
||||
);
|
||||
throw new Error(
|
||||
`Text length exceeds model limit. ` +
|
||||
`Current: ${texts.length} texts with total ${totalLength} characters, ` +
|
||||
`model limit: ${modelConfig.maxInputTokens || 8192} tokens. ` +
|
||||
`Advice: Reduce chunk size or batch size`,
|
||||
);
|
||||
}
|
||||
|
||||
// Retry on 429 (Too Many Requests) or 5xx (Server Error)
|
||||
if (response.status === 429 || response.status >= 500) {
|
||||
this.logger.warn(
|
||||
`Temporary error from embedding API (${response.status}): ${errorText}`,
|
||||
);
|
||||
throw new Error(`API Error ${response.status}: ${errorText}`);
|
||||
}
|
||||
|
||||
this.logger.error(`Embedding API error details: ${errorText}`);
|
||||
this.logger.error(
|
||||
`Request parameters: model=${modelConfig.modelId}, inputLength=${texts[0]?.length}`,
|
||||
);
|
||||
throw new Error(
|
||||
`Embedding API call failed: ${response.statusText} - ${errorText}`,
|
||||
);
|
||||
}
|
||||
|
||||
const data: EmbeddingResponse = await response.json();
|
||||
const embeddings = data.data.map((item) => item.embedding);
|
||||
|
||||
// Get dimensions from actual response
|
||||
const actualDimensions =
|
||||
embeddings[0]?.length || this.defaultDimensions;
|
||||
this.logger.log(
|
||||
`Got ${embeddings.length} embedding vectors from ${modelConfig.name}. Dimensions: ${actualDimensions}`,
|
||||
);
|
||||
|
||||
return embeddings;
|
||||
} catch (error) {
|
||||
lastError = error;
|
||||
|
||||
// If not the last attempt and error appears temporary (or for robustness on all), retry after waiting
|
||||
if (attempt < MAX_RETRIES) {
|
||||
const delay = Math.pow(2, attempt - 1) * 1000; // 1s, 2s, 4s
|
||||
this.logger.warn(
|
||||
`Embedding request failed. Retrying after ${delay}ms: ${error.message}`,
|
||||
);
|
||||
await new Promise((resolve) => setTimeout(resolve, delay));
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
throw lastError;
|
||||
}
|
||||
|
||||
private getEstimatedDimensions(modelId: string): number {
|
||||
// Use default dimensions from environment variable
|
||||
return this.defaultDimensions;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,362 @@
|
||||
import {
|
||||
Body,
|
||||
Controller,
|
||||
Delete,
|
||||
Get,
|
||||
Param,
|
||||
Post,
|
||||
Query,
|
||||
Request,
|
||||
UseGuards,
|
||||
Res,
|
||||
NotFoundException,
|
||||
InternalServerErrorException,
|
||||
} from '@nestjs/common';
|
||||
import { Response } from 'express';
|
||||
import * as path from 'path';
|
||||
import { Logger } from '@nestjs/common';
|
||||
import { KnowledgeBaseService } from './knowledge-base.service';
|
||||
import { CombinedAuthGuard } from '../auth/combined-auth.guard';
|
||||
import { RolesGuard } from '../auth/roles.guard';
|
||||
import { Roles } from '../auth/roles.decorator';
|
||||
import { UserRole } from '../user/user-role.enum';
|
||||
import { Public } from '../auth/public.decorator';
|
||||
import { KnowledgeBase } from './knowledge-base.entity';
|
||||
import { ChunkConfigService } from './chunk-config.service';
|
||||
import { KnowledgeGroupService } from '../knowledge-group/knowledge-group.service';
|
||||
import { I18nService } from '../i18n/i18n.service';
|
||||
|
||||
@Controller('knowledge-bases')
|
||||
@UseGuards(CombinedAuthGuard, RolesGuard)
|
||||
export class KnowledgeBaseController {
|
||||
private readonly logger = new Logger(KnowledgeBaseController.name);
|
||||
|
||||
constructor(
|
||||
private readonly knowledgeBaseService: KnowledgeBaseService,
|
||||
private readonly chunkConfigService: ChunkConfigService,
|
||||
private readonly knowledgeGroupService: KnowledgeGroupService,
|
||||
private readonly i18nService: I18nService,
|
||||
) {}
|
||||
|
||||
@Get()
|
||||
@UseGuards(CombinedAuthGuard)
|
||||
async findAll(@Request() req): Promise<KnowledgeBase[]> {
|
||||
return this.knowledgeBaseService.findAll(req.user.id, req.user.tenantId);
|
||||
}
|
||||
|
||||
@Get('stats')
|
||||
@UseGuards(CombinedAuthGuard)
|
||||
async getStats(
|
||||
@Request() req,
|
||||
): Promise<{ total: number; uncategorized: number }> {
|
||||
return this.knowledgeBaseService.getStats(req.user.id, req.user.tenantId);
|
||||
}
|
||||
|
||||
@Delete('clear')
|
||||
@Roles(UserRole.TENANT_ADMIN, UserRole.SUPER_ADMIN)
|
||||
async clearAll(@Request() req): Promise<{ message: string }> {
|
||||
await this.knowledgeBaseService.clearAll(req.user.id, req.user.tenantId);
|
||||
return { message: this.i18nService.getMessage('kbCleared') };
|
||||
}
|
||||
|
||||
@Post('search')
|
||||
async search(@Request() req, @Body() body: { query: string; topK?: number }) {
|
||||
return this.knowledgeBaseService.searchKnowledge(
|
||||
req.user.id,
|
||||
req.user.tenantId, // New
|
||||
body.query,
|
||||
body.topK || 5,
|
||||
);
|
||||
}
|
||||
|
||||
@Post('rag-search')
|
||||
async ragSearch(
|
||||
@Request() req,
|
||||
@Body() body: { query: string; settings: any },
|
||||
) {
|
||||
return this.knowledgeBaseService.ragSearch(
|
||||
req.user.id,
|
||||
req.user.tenantId, // New
|
||||
body.query,
|
||||
body.settings,
|
||||
);
|
||||
}
|
||||
|
||||
@Delete(':id')
|
||||
@Roles(UserRole.TENANT_ADMIN, UserRole.SUPER_ADMIN)
|
||||
async deleteFile(
|
||||
@Request() req,
|
||||
@Param('id') fileId: string,
|
||||
): Promise<{ message: string }> {
|
||||
await this.knowledgeBaseService.deleteFile(
|
||||
fileId,
|
||||
req.user.id,
|
||||
req.user.tenantId,
|
||||
);
|
||||
return { message: this.i18nService.getMessage('fileDeleted') };
|
||||
}
|
||||
|
||||
@Post(':id/retry')
|
||||
@Roles(UserRole.TENANT_ADMIN, UserRole.SUPER_ADMIN)
|
||||
async retryFile(
|
||||
@Request() req,
|
||||
@Param('id') fileId: string,
|
||||
): Promise<KnowledgeBase> {
|
||||
return this.knowledgeBaseService.retryFailedFile(
|
||||
fileId,
|
||||
req.user.id,
|
||||
req.user.tenantId,
|
||||
);
|
||||
}
|
||||
|
||||
@Get(':id/chunks')
|
||||
async getFileChunks(@Request() req, @Param('id') fileId: string) {
|
||||
return this.knowledgeBaseService.getFileChunks(
|
||||
fileId,
|
||||
req.user.id,
|
||||
req.user.tenantId,
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get chunk config limits (for frontend slider settings)
|
||||
* Query parameter: embeddingModelId - embedding model ID
|
||||
*/
|
||||
@Get('chunk-config/limits')
|
||||
async getChunkConfigLimits(
|
||||
@Request() req,
|
||||
@Query('embeddingModelId') embeddingModelId: string,
|
||||
) {
|
||||
if (!embeddingModelId) {
|
||||
return {
|
||||
maxChunkSize: parseInt(process.env.MAX_CHUNK_SIZE || '8191'),
|
||||
maxOverlapSize: parseInt(process.env.MAX_OVERLAP_SIZE || '2000'),
|
||||
minOverlapSize: 25,
|
||||
defaultChunkSize: 200,
|
||||
defaultOverlapSize: 40,
|
||||
modelInfo: {
|
||||
name: this.i18nService.getMessage('modelNotConfigured'),
|
||||
maxInputTokens: parseInt(process.env.MAX_CHUNK_SIZE || '8191'),
|
||||
maxBatchSize: 2048,
|
||||
expectedDimensions: parseInt(
|
||||
process.env.DEFAULT_VECTOR_DIMENSIONS || '2560',
|
||||
),
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
return await this.chunkConfigService.getFrontendLimits(
|
||||
embeddingModelId,
|
||||
req.user.id,
|
||||
req.user.tenantId,
|
||||
);
|
||||
}
|
||||
|
||||
// File group management - requires admin permission
|
||||
@Post(':id/groups')
|
||||
@Roles(UserRole.TENANT_ADMIN, UserRole.SUPER_ADMIN)
|
||||
async addFileToGroups(
|
||||
@Param('id') fileId: string,
|
||||
@Body() body: { groupIds: string[] },
|
||||
@Request() req,
|
||||
) {
|
||||
await this.knowledgeGroupService.addFilesToGroup(
|
||||
fileId,
|
||||
body.groupIds,
|
||||
req.user.id,
|
||||
req.user.tenantId,
|
||||
);
|
||||
return { message: this.i18nService.getMessage('groupSyncSuccess') };
|
||||
}
|
||||
|
||||
@Delete(':id/groups/:groupId')
|
||||
@Roles(UserRole.TENANT_ADMIN, UserRole.SUPER_ADMIN)
|
||||
async removeFileFromGroup(
|
||||
@Param('id') fileId: string,
|
||||
@Param('groupId') groupId: string,
|
||||
@Request() req,
|
||||
) {
|
||||
await this.knowledgeGroupService.removeFileFromGroup(
|
||||
fileId,
|
||||
groupId,
|
||||
req.user.id,
|
||||
req.user.tenantId,
|
||||
);
|
||||
return { message: this.i18nService.getMessage('fileDeletedFromGroup') };
|
||||
}
|
||||
|
||||
// PDF preview - public access
|
||||
@Public()
|
||||
@Get(':id/pdf')
|
||||
async getPDFPreview(
|
||||
@Param('id') fileId: string,
|
||||
@Query('token') token: string,
|
||||
@Res() res: Response,
|
||||
) {
|
||||
try {
|
||||
if (!token) {
|
||||
throw new NotFoundException(
|
||||
this.i18nService.getMessage('accessDeniedNoToken'),
|
||||
);
|
||||
}
|
||||
|
||||
const jwt = await import('jsonwebtoken');
|
||||
const secret = process.env.JWT_SECRET;
|
||||
if (!secret) {
|
||||
throw new InternalServerErrorException(
|
||||
this.i18nService.getMessage('jwtSecretRequired'),
|
||||
);
|
||||
}
|
||||
|
||||
let decoded;
|
||||
try {
|
||||
decoded = jwt.verify(token, secret) as any;
|
||||
} catch {
|
||||
throw new NotFoundException(
|
||||
this.i18nService.getMessage('invalidToken'),
|
||||
);
|
||||
}
|
||||
|
||||
if (decoded.type !== 'pdf-access' || decoded.fileId !== fileId) {
|
||||
throw new NotFoundException(
|
||||
this.i18nService.getMessage('invalidToken'),
|
||||
);
|
||||
}
|
||||
|
||||
const pdfPath = await this.knowledgeBaseService.ensurePDFExists(
|
||||
fileId,
|
||||
decoded.userId,
|
||||
decoded.tenantId, // New
|
||||
);
|
||||
|
||||
const fs = await import('fs');
|
||||
const path = await import('path');
|
||||
|
||||
if (!fs.existsSync(pdfPath)) {
|
||||
throw new NotFoundException(
|
||||
this.i18nService.getMessage('pdfFileNotFound'),
|
||||
);
|
||||
}
|
||||
|
||||
const stat = fs.statSync(pdfPath);
|
||||
const fileName = path.basename(pdfPath);
|
||||
|
||||
if (stat.size === 0) {
|
||||
this.logger.warn(`PDF file is empty: ${pdfPath}`);
|
||||
try {
|
||||
fs.unlinkSync(pdfPath); // Delete empty file
|
||||
} catch (e) {}
|
||||
throw new NotFoundException(
|
||||
this.i18nService.getMessage('pdfFileEmpty'),
|
||||
);
|
||||
}
|
||||
|
||||
res.setHeader('Content-Type', 'application/pdf');
|
||||
res.setHeader('Content-Length', stat.size);
|
||||
|
||||
const stream = fs.createReadStream(pdfPath);
|
||||
stream.pipe(res);
|
||||
} catch (error) {
|
||||
if (error instanceof NotFoundException) {
|
||||
throw error;
|
||||
}
|
||||
this.logger.error(`PDF preview error: ${error.message}`);
|
||||
throw new NotFoundException(
|
||||
this.i18nService.getMessage('pdfConversionFailed'),
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// Get PDF preview URL
|
||||
@Get(':id/pdf-url')
|
||||
async getPDFUrl(
|
||||
@Param('id') fileId: string,
|
||||
@Query('force') force: string,
|
||||
@Request() req,
|
||||
) {
|
||||
try {
|
||||
// Trigger PDF conversion
|
||||
await this.knowledgeBaseService.ensurePDFExists(
|
||||
fileId,
|
||||
req.user.id,
|
||||
req.user.tenantId,
|
||||
force === 'true',
|
||||
);
|
||||
|
||||
// Generate temporary access token
|
||||
const jwt = await import('jsonwebtoken');
|
||||
|
||||
const secret = process.env.JWT_SECRET;
|
||||
if (!secret) {
|
||||
throw new InternalServerErrorException(
|
||||
this.i18nService.getMessage('jwtSecretRequired'),
|
||||
);
|
||||
}
|
||||
|
||||
const token = jwt.sign(
|
||||
{
|
||||
fileId,
|
||||
userId: req.user.id,
|
||||
tenantId: req.user.tenantId,
|
||||
type: 'pdf-access',
|
||||
},
|
||||
secret,
|
||||
{ expiresIn: '1h' },
|
||||
);
|
||||
|
||||
return {
|
||||
url: `/api/knowledge-bases/${fileId}/pdf?token=${token}`,
|
||||
};
|
||||
} catch (error) {
|
||||
if (error.message.includes('LibreOffice')) {
|
||||
throw new InternalServerErrorException(
|
||||
this.i18nService.formatMessage('pdfServiceUnavailable', {
|
||||
message: error.message,
|
||||
}),
|
||||
);
|
||||
}
|
||||
throw new InternalServerErrorException(error.message);
|
||||
}
|
||||
}
|
||||
|
||||
@Get(':id/pdf-status')
|
||||
async getPDFStatus(@Param('id') fileId: string, @Request() req) {
|
||||
return await this.knowledgeBaseService.getPDFStatus(
|
||||
fileId,
|
||||
req.user.id,
|
||||
req.user.tenantId,
|
||||
);
|
||||
}
|
||||
|
||||
// Get specific page of PDF as image
|
||||
@Get(':id/page/:index')
|
||||
async getPageImage(
|
||||
@Param('id') fileId: string,
|
||||
@Param('index') index: number,
|
||||
@Request() req,
|
||||
@Res() res: Response,
|
||||
) {
|
||||
try {
|
||||
const imagePath = await this.knowledgeBaseService.getPageAsImage(
|
||||
fileId,
|
||||
Number(index),
|
||||
req.user.id,
|
||||
req.user.tenantId,
|
||||
);
|
||||
|
||||
const fs = await import('fs');
|
||||
if (!fs.existsSync(imagePath)) {
|
||||
throw new NotFoundException(
|
||||
this.i18nService.getMessage('pageImageNotFound'),
|
||||
);
|
||||
}
|
||||
|
||||
res.sendFile(path.resolve(imagePath));
|
||||
} catch (error) {
|
||||
this.logger.error(`Failed to get PDF page image: ${error.message}`);
|
||||
throw new NotFoundException(
|
||||
this.i18nService.getMessage('pdfPageImageFailed'),
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,99 @@
|
||||
import {
|
||||
Column,
|
||||
CreateDateColumn,
|
||||
Entity,
|
||||
PrimaryGeneratedColumn,
|
||||
UpdateDateColumn,
|
||||
ManyToMany,
|
||||
ManyToOne,
|
||||
JoinColumn,
|
||||
} from 'typeorm';
|
||||
import { KnowledgeGroup } from '../knowledge-group/knowledge-group.entity';
|
||||
import { Tenant } from '../tenant/tenant.entity';
|
||||
|
||||
export enum FileStatus {
|
||||
PENDING = 'pending',
|
||||
INDEXING = 'indexing',
|
||||
EXTRACTED = 'extracted', // Text extraction completed and saved to database
|
||||
VECTORIZED = 'vectorized', // Vectorization completed and indexed to ES
|
||||
FAILED = 'failed',
|
||||
}
|
||||
|
||||
export enum ProcessingMode {
|
||||
FAST = 'fast', // Fast mode - use Tika
|
||||
PRECISE = 'precise', // Precise mode - use Vision Pipeline
|
||||
}
|
||||
|
||||
@Entity('knowledge_bases')
|
||||
export class KnowledgeBase {
|
||||
@PrimaryGeneratedColumn('uuid')
|
||||
id: string;
|
||||
|
||||
@Column({ name: 'original_name' })
|
||||
originalName: string;
|
||||
|
||||
@Column({ nullable: true })
|
||||
title: string;
|
||||
|
||||
@Column({ name: 'storage_path' })
|
||||
storagePath: string;
|
||||
|
||||
@Column({ type: 'integer', default: 0 })
|
||||
size: number;
|
||||
|
||||
@Column({ length: 100, nullable: true })
|
||||
mimetype: string;
|
||||
|
||||
@Column({
|
||||
type: 'simple-enum',
|
||||
enum: FileStatus,
|
||||
default: FileStatus.PENDING,
|
||||
})
|
||||
status: FileStatus;
|
||||
|
||||
@Column({ name: 'user_id', nullable: true }) // Temporarily allowed empty (for debugging), should be required in future
|
||||
userId: string;
|
||||
|
||||
@Column({ name: 'tenant_id', nullable: true, type: 'text' })
|
||||
tenantId: string;
|
||||
|
||||
@ManyToOne(() => Tenant, { nullable: true, onDelete: 'CASCADE' })
|
||||
@JoinColumn({ name: 'tenant_id' })
|
||||
tenant: Tenant;
|
||||
|
||||
@Column({ type: 'text', nullable: true })
|
||||
content: string; // Stores text content extracted by Tika
|
||||
|
||||
// Index setting parameters
|
||||
@Column({ name: 'chunk_size', type: 'integer', default: 1000 })
|
||||
chunkSize: number;
|
||||
|
||||
@Column({ name: 'chunk_overlap', type: 'integer', default: 200 })
|
||||
chunkOverlap: number;
|
||||
|
||||
@Column({ name: 'embedding_model_id', nullable: true })
|
||||
embeddingModelId: string;
|
||||
|
||||
@Column({
|
||||
type: 'simple-enum',
|
||||
enum: ProcessingMode,
|
||||
default: ProcessingMode.FAST,
|
||||
name: 'processing_mode',
|
||||
})
|
||||
processingMode: ProcessingMode;
|
||||
|
||||
@Column({ type: 'json', nullable: true })
|
||||
metadata: any; // Stores additional metadata (image descriptions, confidence, etc.)
|
||||
|
||||
@Column({ name: 'pdf_path', nullable: true })
|
||||
pdfPath: string; // PDF file path (for preview)
|
||||
|
||||
@ManyToMany(() => KnowledgeGroup, (group) => group.knowledgeBases)
|
||||
groups: KnowledgeGroup[];
|
||||
|
||||
@CreateDateColumn({ name: 'created_at' })
|
||||
createdAt: Date;
|
||||
|
||||
@UpdateDateColumn({ name: 'updated_at' })
|
||||
updatedAt: Date;
|
||||
}
|
||||
@@ -0,0 +1,52 @@
|
||||
import { Module, forwardRef } from '@nestjs/common';
|
||||
import { TypeOrmModule } from '@nestjs/typeorm';
|
||||
import { KnowledgeBase } from './knowledge-base.entity';
|
||||
import { KnowledgeGroup } from '../knowledge-group/knowledge-group.entity';
|
||||
import { KnowledgeBaseService } from './knowledge-base.service';
|
||||
import { KnowledgeBaseController } from './knowledge-base.controller';
|
||||
import { ElasticsearchModule } from '../elasticsearch/elasticsearch.module';
|
||||
import { TikaModule } from '../tika/tika.module';
|
||||
import { ModelConfigModule } from '../model-config/model-config.module';
|
||||
import { EmbeddingService } from './embedding.service';
|
||||
import { TextChunkerService } from './text-chunker.service';
|
||||
import { RagModule } from '../rag/rag.module';
|
||||
import { VisionModule } from '../vision/vision.module';
|
||||
import { MemoryMonitorService } from './memory-monitor.service';
|
||||
import { ChunkConfigService } from './chunk-config.service';
|
||||
import { LibreOfficeModule } from '../libreoffice/libreoffice.module';
|
||||
import { Pdf2ImageModule } from '../pdf2image/pdf2image.module';
|
||||
import { VisionPipelineModule } from '../vision-pipeline/vision-pipeline.module';
|
||||
import { KnowledgeGroupModule } from '../knowledge-group/knowledge-group.module';
|
||||
import { ChatModule } from '../chat/chat.module';
|
||||
import { UserModule } from '../user/user.module';
|
||||
import { TenantModule } from '../tenant/tenant.module';
|
||||
import { CombinedAuthGuard } from '../auth/combined-auth.guard';
|
||||
|
||||
@Module({
|
||||
imports: [
|
||||
TypeOrmModule.forFeature([KnowledgeBase, KnowledgeGroup]),
|
||||
forwardRef(() => ElasticsearchModule),
|
||||
TikaModule,
|
||||
ModelConfigModule,
|
||||
forwardRef(() => RagModule),
|
||||
VisionModule,
|
||||
LibreOfficeModule,
|
||||
Pdf2ImageModule,
|
||||
VisionPipelineModule,
|
||||
forwardRef(() => KnowledgeGroupModule),
|
||||
forwardRef(() => ChatModule),
|
||||
UserModule,
|
||||
TenantModule,
|
||||
],
|
||||
controllers: [KnowledgeBaseController],
|
||||
providers: [
|
||||
KnowledgeBaseService,
|
||||
EmbeddingService,
|
||||
TextChunkerService,
|
||||
MemoryMonitorService,
|
||||
ChunkConfigService,
|
||||
CombinedAuthGuard,
|
||||
],
|
||||
exports: [KnowledgeBaseService, EmbeddingService],
|
||||
})
|
||||
export class KnowledgeBaseModule {}
|
||||
@@ -0,0 +1,1826 @@
|
||||
import {
|
||||
Injectable,
|
||||
Logger,
|
||||
NotFoundException,
|
||||
ForbiddenException,
|
||||
Inject,
|
||||
forwardRef,
|
||||
} from '@nestjs/common';
|
||||
import { ConfigService } from '@nestjs/config';
|
||||
import { DEFAULT_LANGUAGE } from '../common/constants';
|
||||
import { I18nService } from '../i18n/i18n.service';
|
||||
import { InjectRepository } from '@nestjs/typeorm';
|
||||
import { Repository, In } from 'typeorm';
|
||||
import {
|
||||
FileStatus,
|
||||
KnowledgeBase,
|
||||
ProcessingMode,
|
||||
} from './knowledge-base.entity';
|
||||
import { KnowledgeGroup } from '../knowledge-group/knowledge-group.entity';
|
||||
import { ElasticsearchService } from '../elasticsearch/elasticsearch.service';
|
||||
import { TikaService } from '../tika/tika.service';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import { EmbeddingService } from './embedding.service';
|
||||
import { TextChunkerService } from './text-chunker.service';
|
||||
import { ModelConfigService } from '../model-config/model-config.service';
|
||||
import { RagService } from '../rag/rag.service';
|
||||
import { VisionService } from '../vision/vision.service';
|
||||
import { TenantService } from '../tenant/tenant.service';
|
||||
import { MemoryMonitorService } from './memory-monitor.service';
|
||||
import { ChunkConfigService } from './chunk-config.service';
|
||||
import { VisionPipelineService } from '../vision-pipeline/vision-pipeline.service';
|
||||
import { LibreOfficeService } from '../libreoffice/libreoffice.service';
|
||||
import { Pdf2ImageService } from '../pdf2image/pdf2image.service';
|
||||
import {
|
||||
DOC_EXTENSIONS,
|
||||
IMAGE_EXTENSIONS,
|
||||
} from '../common/file-support.constants';
|
||||
import { ChatService } from '../chat/chat.service';
|
||||
import { UserSettingService } from '../user/user-setting.service';
|
||||
|
||||
export interface PaginatedKnowledgeBase {
|
||||
items: KnowledgeBase[];
|
||||
total: number;
|
||||
page: number;
|
||||
limit: number;
|
||||
}
|
||||
|
||||
@Injectable()
|
||||
export class KnowledgeBaseService {
|
||||
private readonly logger = new Logger(KnowledgeBaseService.name);
|
||||
|
||||
constructor(
|
||||
@InjectRepository(KnowledgeBase)
|
||||
private kbRepository: Repository<KnowledgeBase>,
|
||||
@InjectRepository(KnowledgeGroup)
|
||||
private groupRepository: Repository<KnowledgeGroup>,
|
||||
@Inject(forwardRef(() => ElasticsearchService))
|
||||
private elasticsearchService: ElasticsearchService,
|
||||
private tikaService: TikaService,
|
||||
private embeddingService: EmbeddingService,
|
||||
private textChunkerService: TextChunkerService,
|
||||
private modelConfigService: ModelConfigService,
|
||||
@Inject(forwardRef(() => RagService))
|
||||
private ragService: RagService,
|
||||
private visionService: VisionService,
|
||||
private tenantService: TenantService,
|
||||
private memoryMonitor: MemoryMonitorService,
|
||||
private chunkConfigService: ChunkConfigService,
|
||||
private visionPipelineService: VisionPipelineService,
|
||||
private libreOfficeService: LibreOfficeService,
|
||||
private pdf2ImageService: Pdf2ImageService,
|
||||
private configService: ConfigService,
|
||||
private i18nService: I18nService,
|
||||
@Inject(forwardRef(() => ChatService))
|
||||
private chatService: ChatService,
|
||||
private userSettingService: UserSettingService,
|
||||
) {}
|
||||
|
||||
async createAndIndex(
|
||||
fileInfo: any,
|
||||
userId: string,
|
||||
tenantId: string,
|
||||
config?: any,
|
||||
): Promise<KnowledgeBase> {
|
||||
const mode = config?.mode || 'fast';
|
||||
const processingMode =
|
||||
mode === 'precise' ? ProcessingMode.PRECISE : ProcessingMode.FAST;
|
||||
|
||||
const kb = this.kbRepository.create({
|
||||
originalName: fileInfo.originalname,
|
||||
storagePath: fileInfo.path,
|
||||
size: fileInfo.size,
|
||||
mimetype: fileInfo.mimetype,
|
||||
status: FileStatus.PENDING,
|
||||
userId: userId,
|
||||
tenantId: tenantId,
|
||||
chunkSize: config?.chunkSize || 200,
|
||||
chunkOverlap: config?.chunkOverlap || 40,
|
||||
embeddingModelId: config?.embeddingModelId || null,
|
||||
processingMode: processingMode,
|
||||
});
|
||||
|
||||
// Associate groups
|
||||
if (config?.groupIds && config.groupIds.length > 0) {
|
||||
const groups = await this.groupRepository.find({
|
||||
where: { id: In(config.groupIds), tenantId: tenantId },
|
||||
});
|
||||
kb.groups = groups;
|
||||
}
|
||||
|
||||
const savedKb = await this.kbRepository.save(kb);
|
||||
|
||||
this.logger.log(
|
||||
`Created KB record: ${savedKb.id}, mode: ${mode}, file: ${fileInfo.originalname}`,
|
||||
);
|
||||
|
||||
// ---------------------------------------------------------
|
||||
// Move the file to the final partitioned directory
|
||||
// source: uploads/{tenantId}/{filename} (or wherever it was)
|
||||
// target: uploads/{tenantId}/{savedKb.id}/{filename}
|
||||
// ---------------------------------------------------------
|
||||
const fs = await import('fs');
|
||||
const path = await import('path');
|
||||
const uploadPath = process.env.UPLOAD_FILE_PATH || './uploads';
|
||||
const targetDir = path.join(uploadPath, tenantId || 'default', savedKb.id);
|
||||
const targetPath = path.join(targetDir, fileInfo.filename);
|
||||
|
||||
try {
|
||||
if (!fs.existsSync(targetDir)) {
|
||||
fs.mkdirSync(targetDir, { recursive: true });
|
||||
}
|
||||
if (fs.existsSync(fileInfo.path)) {
|
||||
fs.renameSync(fileInfo.path, targetPath);
|
||||
// Update the DB record with the new path
|
||||
savedKb.storagePath = targetPath;
|
||||
await this.kbRepository.save(savedKb);
|
||||
this.logger.log(`Moved file to partitioned storage: ${targetPath}`);
|
||||
}
|
||||
} catch (fsError) {
|
||||
this.logger.error(
|
||||
`Failed to move file ${savedKb.id} to partitioned storage`,
|
||||
fsError,
|
||||
);
|
||||
// We will let it continue, but the file might be stuck in the temp/root folder
|
||||
}
|
||||
|
||||
// If queue processing is requested, await completion
|
||||
if (config?.waitForCompletion) {
|
||||
await this.processFile(savedKb.id, userId, tenantId, config);
|
||||
} else {
|
||||
// Otherwise trigger asynchronously (default)
|
||||
this.processFile(savedKb.id, userId, tenantId, config).catch((err) => {
|
||||
this.logger.error(`Error processing file ${savedKb.id}`, err);
|
||||
});
|
||||
}
|
||||
|
||||
return savedKb;
|
||||
}
|
||||
|
||||
async findAll(userId: string, tenantId?: string): Promise<KnowledgeBase[]> {
|
||||
const where: any = {};
|
||||
if (tenantId) {
|
||||
where.tenantId = tenantId;
|
||||
} else {
|
||||
where.userId = userId;
|
||||
}
|
||||
return this.kbRepository.find({
|
||||
where,
|
||||
relations: ['groups'], // Load group relations
|
||||
order: { createdAt: 'DESC' },
|
||||
});
|
||||
}
|
||||
|
||||
async findOne(
|
||||
id: string,
|
||||
userId: string,
|
||||
tenantId: string,
|
||||
): Promise<KnowledgeBase> {
|
||||
const kb = await this.kbRepository.findOne({
|
||||
where: { id },
|
||||
relations: ['groups'],
|
||||
});
|
||||
|
||||
if (!kb) {
|
||||
throw new NotFoundException(
|
||||
this.i18nService.getMessage('knowledgeBaseNotFound'),
|
||||
);
|
||||
}
|
||||
|
||||
// Check permission using TenantService
|
||||
const hasAccess = await this.tenantService.canAccessTenant(
|
||||
userId,
|
||||
kb.tenantId,
|
||||
tenantId,
|
||||
);
|
||||
if (!hasAccess) {
|
||||
throw new ForbiddenException(
|
||||
`You do not have permission to access this knowledge base`,
|
||||
);
|
||||
}
|
||||
|
||||
return kb;
|
||||
}
|
||||
|
||||
async getStats(
|
||||
userId: string,
|
||||
tenantId?: string,
|
||||
): Promise<{ total: number; uncategorized: number }> {
|
||||
const where: any = {};
|
||||
if (tenantId) {
|
||||
where.tenantId = tenantId;
|
||||
} else {
|
||||
where.userId = userId;
|
||||
}
|
||||
|
||||
// Get total count
|
||||
const total = await this.kbRepository.count({ where });
|
||||
|
||||
// Get uncategorized count (files with no groups)
|
||||
// We need to use query builder to check for empty groups relation
|
||||
const uncategorizedQuery = this.kbRepository
|
||||
.createQueryBuilder('kb')
|
||||
.leftJoin('kb.groups', 'groups');
|
||||
|
||||
// Apply where conditions
|
||||
if (tenantId) {
|
||||
uncategorizedQuery.where('kb.tenantId = :tenantId', { tenantId });
|
||||
} else {
|
||||
uncategorizedQuery.where('kb.userId = :userId', { userId });
|
||||
}
|
||||
|
||||
// Count files where groups array is empty
|
||||
const uncategorizedCount = await uncategorizedQuery
|
||||
.andWhere('groups.id IS NULL')
|
||||
.getCount();
|
||||
|
||||
return {
|
||||
total,
|
||||
uncategorized: uncategorizedCount,
|
||||
};
|
||||
}
|
||||
|
||||
async searchKnowledge(
|
||||
userId: string,
|
||||
tenantId: string,
|
||||
query: string,
|
||||
topK: number = 5,
|
||||
) {
|
||||
try {
|
||||
// Generate simulation vector using default dimensions from environment variable
|
||||
const defaultDimensions = parseInt(
|
||||
process.env.DEFAULT_VECTOR_DIMENSIONS || '2560',
|
||||
);
|
||||
const mockEmbedding = Array.from(
|
||||
{ length: defaultDimensions },
|
||||
() => Math.random() - 0.5,
|
||||
);
|
||||
const queryVector = mockEmbedding;
|
||||
|
||||
// 2. Search in Elasticsearch
|
||||
const searchResults = await this.elasticsearchService.searchSimilar(
|
||||
queryVector,
|
||||
userId,
|
||||
topK,
|
||||
tenantId, // Ensure shared visibility within tenant
|
||||
);
|
||||
|
||||
// 3. Get file information from database
|
||||
const fileIds = [...new Set(searchResults.map((r) => r.fileId))];
|
||||
const files = await this.kbRepository.findByIds(fileIds);
|
||||
const fileMap = new Map(files.map((f) => [f.id, f]));
|
||||
|
||||
// 4. Combine results with file info
|
||||
const results = searchResults.map((result) => {
|
||||
const file = fileMap.get(result.fileId);
|
||||
return {
|
||||
...result,
|
||||
file: file
|
||||
? {
|
||||
id: file.id,
|
||||
name: file.originalName,
|
||||
mimetype: file.mimetype,
|
||||
size: file.size,
|
||||
createdAt: file.createdAt,
|
||||
}
|
||||
: null,
|
||||
};
|
||||
});
|
||||
|
||||
return {
|
||||
query,
|
||||
results,
|
||||
total: results.length,
|
||||
};
|
||||
} catch (error) {
|
||||
this.logger.error(
|
||||
`Metadata search failed for tenant ${tenantId}:`,
|
||||
error.stack || error.message,
|
||||
);
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
async ragSearch(
|
||||
userId: string,
|
||||
tenantId: string,
|
||||
query: string,
|
||||
settings: any,
|
||||
) {
|
||||
this.logger.log(
|
||||
`RAG search request: userId=${userId}, query="${query}", settings=${JSON.stringify(settings)}`,
|
||||
);
|
||||
|
||||
try {
|
||||
const ragResults = await this.ragService.searchKnowledge(
|
||||
query,
|
||||
userId,
|
||||
settings.topK,
|
||||
settings.similarityThreshold,
|
||||
settings.selectedEmbeddingId,
|
||||
settings.enableFullTextSearch,
|
||||
settings.enableRerank,
|
||||
settings.selectedRerankId,
|
||||
undefined,
|
||||
undefined,
|
||||
settings.rerankSimilarityThreshold,
|
||||
tenantId, // Ensure shared visibility within tenant for RAG
|
||||
);
|
||||
|
||||
const sources = this.ragService.extractSources(ragResults);
|
||||
const ragPrompt = this.ragService.buildRagPrompt(
|
||||
query,
|
||||
ragResults,
|
||||
settings.language || DEFAULT_LANGUAGE,
|
||||
);
|
||||
|
||||
const result = {
|
||||
searchResults: ragResults,
|
||||
sources,
|
||||
ragPrompt,
|
||||
hasRelevantContent: ragResults.length > 0,
|
||||
};
|
||||
|
||||
this.logger.log(
|
||||
`RAG search completed: found ${ragResults.length} results`,
|
||||
);
|
||||
return result;
|
||||
} catch (error) {
|
||||
this.logger.error(
|
||||
`RAG search failed for user ${userId}:`,
|
||||
error.stack || error.message,
|
||||
);
|
||||
// Return empty result instead of throwing error to keep system running
|
||||
return {
|
||||
searchResults: [],
|
||||
sources: [],
|
||||
ragPrompt: query, // Use original query
|
||||
hasRelevantContent: false,
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
async deleteFile(
|
||||
fileId: string,
|
||||
userId: string,
|
||||
tenantId: string,
|
||||
): Promise<void> {
|
||||
this.logger.log(`Deleting file ${fileId} for user ${userId}`);
|
||||
|
||||
try {
|
||||
// 1. Get file info
|
||||
const file = await this.kbRepository.findOne({
|
||||
where: { id: fileId, tenantId }, // Filter by tenantId
|
||||
});
|
||||
if (!file) {
|
||||
throw new NotFoundException(
|
||||
this.i18nService.getMessage('fileNotFound'),
|
||||
);
|
||||
}
|
||||
|
||||
// 2. Delete file from filesystem
|
||||
const fs = await import('fs');
|
||||
try {
|
||||
if (fs.existsSync(file.storagePath)) {
|
||||
fs.unlinkSync(file.storagePath);
|
||||
this.logger.log(`Deleted file: ${file.storagePath}`);
|
||||
}
|
||||
} catch (error) {
|
||||
this.logger.warn(`Failed to delete file ${file.storagePath}:`, error);
|
||||
}
|
||||
|
||||
// 3. Delete from Elasticsearch
|
||||
try {
|
||||
await this.elasticsearchService.deleteByFileId(
|
||||
fileId,
|
||||
userId,
|
||||
tenantId,
|
||||
);
|
||||
this.logger.log(`Deleted ES documents for file ${fileId}`);
|
||||
} catch (error) {
|
||||
this.logger.warn(
|
||||
`Failed to delete ES documents for file ${fileId}:`,
|
||||
error,
|
||||
);
|
||||
}
|
||||
|
||||
// 4. Remove from all groups (cleanup M2M relations)
|
||||
const fileWithGroups = await this.kbRepository.findOne({
|
||||
where: { id: fileId, tenantId },
|
||||
relations: ['groups'],
|
||||
});
|
||||
|
||||
if (
|
||||
fileWithGroups &&
|
||||
fileWithGroups.groups &&
|
||||
fileWithGroups.groups.length > 0
|
||||
) {
|
||||
// Clear groups to remove entries from join table
|
||||
fileWithGroups.groups = [];
|
||||
await this.kbRepository.save(fileWithGroups);
|
||||
this.logger.log(`Cleared group associations for file ${fileId}`);
|
||||
}
|
||||
|
||||
// 5. Delete from SQLite
|
||||
await this.kbRepository.delete({ id: fileId });
|
||||
this.logger.log(`Deleted database record for file ${fileId}`);
|
||||
} catch (error) {
|
||||
this.logger.error(`Failed to delete file ${fileId}`, error);
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
async clearAll(userId: string, tenantId: string): Promise<void> {
|
||||
this.logger.log(
|
||||
`Clearing all knowledge base data for user ${userId} in tenant ${tenantId}`,
|
||||
);
|
||||
|
||||
try {
|
||||
// Get all files for the specific tenant and delete them one by one
|
||||
const files = await this.kbRepository.find({ where: { tenantId } });
|
||||
|
||||
for (const file of files) {
|
||||
await this.deleteFile(file.id, userId, tenantId);
|
||||
}
|
||||
|
||||
this.logger.log(`Cleared all knowledge base data for user ${userId}`);
|
||||
} catch (error) {
|
||||
this.logger.error(
|
||||
`Failed to clear knowledge base for user ${userId}`,
|
||||
error,
|
||||
);
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
private async processFile(
|
||||
kbId: string,
|
||||
userId: string,
|
||||
tenantId: string,
|
||||
config?: any,
|
||||
) {
|
||||
this.logger.log(
|
||||
`Starting processing for file ${kbId}, mode: ${config?.mode || 'fast'}`,
|
||||
);
|
||||
await this.updateStatus(kbId, FileStatus.INDEXING);
|
||||
|
||||
try {
|
||||
const kb = await this.kbRepository.findOne({ where: { id: kbId } });
|
||||
if (!kb) {
|
||||
this.logger.error(`KB not found: ${kbId}`);
|
||||
return;
|
||||
}
|
||||
|
||||
// Memory monitor - pre-processing check
|
||||
const memBefore = this.memoryMonitor.getMemoryUsage();
|
||||
this.logger.log(
|
||||
`Memory state - before processing: ${memBefore.heapUsed}/${memBefore.heapTotal}MB`,
|
||||
);
|
||||
|
||||
// Select processing flow based on mode
|
||||
const mode = config?.mode || 'fast';
|
||||
|
||||
if (mode === 'precise') {
|
||||
// Precise mode - use Vision Pipeline
|
||||
await this.processPreciseMode(kb, userId, tenantId, config);
|
||||
} else {
|
||||
// Fast mode - use Tika
|
||||
await this.processFastMode(kb, userId, tenantId, config);
|
||||
}
|
||||
|
||||
this.logger.log(`File ${kbId} processed successfully in ${mode} mode.`);
|
||||
} catch (error) {
|
||||
this.logger.error(`Failed to process file ${kbId}`, error);
|
||||
await this.updateStatus(kbId, FileStatus.FAILED);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Fast mode processing (existing flow)
|
||||
*/
|
||||
private async processFastMode(
|
||||
kb: KnowledgeBase,
|
||||
userId: string,
|
||||
tenantId: string,
|
||||
config?: any,
|
||||
) {
|
||||
// 1. Extract text using Tika
|
||||
let text = await this.tikaService.extractText(kb.storagePath);
|
||||
|
||||
// Use vision model for image files
|
||||
if (this.visionService.isImageFile(kb.mimetype)) {
|
||||
const settings = await this.tenantService.getSettings(
|
||||
tenantId || 'default',
|
||||
);
|
||||
const visionModelId = settings?.selectedVisionId;
|
||||
if (visionModelId) {
|
||||
const visionModel =
|
||||
await this.modelConfigService.findOne(visionModelId);
|
||||
if (
|
||||
visionModel &&
|
||||
visionModel.type === 'vision' &&
|
||||
visionModel.isEnabled !== false
|
||||
) {
|
||||
text = await this.visionService.extractImageContent(kb.storagePath, {
|
||||
baseUrl: visionModel.baseUrl || '',
|
||||
apiKey: visionModel.apiKey || '',
|
||||
modelId: visionModel.modelId,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!text || text.trim().length === 0) {
|
||||
this.logger.warn(this.i18nService.getMessage('noTextExtracted'));
|
||||
}
|
||||
|
||||
// Check text size
|
||||
const textSizeMB = Math.round(text.length / 1024 / 1024);
|
||||
if (textSizeMB > 50) {
|
||||
this.logger.warn(
|
||||
this.i18nService.formatMessage('extractedTextTooLarge', {
|
||||
size: textSizeMB,
|
||||
}),
|
||||
);
|
||||
}
|
||||
|
||||
// Save text to database
|
||||
await this.kbRepository.update(kb.id, { content: text });
|
||||
await this.updateStatus(kb.id, FileStatus.EXTRACTED);
|
||||
|
||||
// Async vectorization
|
||||
await this.vectorizeToElasticsearch(
|
||||
kb.id,
|
||||
userId,
|
||||
tenantId,
|
||||
text,
|
||||
config,
|
||||
).catch((err) => {
|
||||
this.logger.error(`Error vectorizing file ${kb.id}`, err);
|
||||
});
|
||||
|
||||
// Auto-generate title (async execution)
|
||||
this.generateTitle(kb.id).catch((err) => {
|
||||
this.logger.error(`Error generating title for file ${kb.id}`, err);
|
||||
});
|
||||
|
||||
// Trigger PDF conversion asynchronously (for document files)
|
||||
this.ensurePDFExists(kb.id, userId, tenantId).catch((err) => {
|
||||
this.logger.warn(
|
||||
this.i18nService.formatMessage('pdfConversionFailedDetail', {
|
||||
id: kb.id,
|
||||
}),
|
||||
err,
|
||||
);
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Precise mode processing (new flow)
|
||||
*/
|
||||
private async processPreciseMode(
|
||||
kb: KnowledgeBase,
|
||||
userId: string,
|
||||
tenantId: string,
|
||||
config?: any,
|
||||
) {
|
||||
// Check if precise mode is supported
|
||||
const preciseFormats = ['.pdf', '.doc', '.docx', '.ppt', '.pptx'];
|
||||
const ext = kb.originalName
|
||||
.toLowerCase()
|
||||
.substring(kb.originalName.lastIndexOf('.'));
|
||||
|
||||
if (!preciseFormats.includes(ext)) {
|
||||
this.logger.warn(
|
||||
this.i18nService.formatMessage('preciseModeUnsupported', { ext }),
|
||||
);
|
||||
return this.processFastMode(kb, userId, tenantId, config);
|
||||
}
|
||||
|
||||
// Check if Vision model is configured
|
||||
const settings = await this.tenantService.getSettings(
|
||||
tenantId || 'default',
|
||||
);
|
||||
const visionModelId = settings?.selectedVisionId;
|
||||
if (!visionModelId) {
|
||||
this.logger.warn(
|
||||
this.i18nService.getMessage('visionModelNotConfiguredFallback'),
|
||||
);
|
||||
return this.processFastMode(kb, userId, tenantId, config);
|
||||
}
|
||||
|
||||
const visionModel = await this.modelConfigService.findOne(visionModelId);
|
||||
if (
|
||||
!visionModel ||
|
||||
visionModel.type !== 'vision' ||
|
||||
visionModel.isEnabled === false
|
||||
) {
|
||||
this.logger.warn(
|
||||
this.i18nService.getMessage('visionModelInvalidFallback'),
|
||||
);
|
||||
return this.processFastMode(kb, userId, tenantId, config);
|
||||
}
|
||||
|
||||
// Call Vision Pipeline
|
||||
try {
|
||||
const result = await this.visionPipelineService.processPreciseMode(
|
||||
kb.storagePath,
|
||||
{
|
||||
userId,
|
||||
tenantId, // New
|
||||
modelId: visionModelId,
|
||||
fileId: kb.id,
|
||||
fileName: kb.originalName,
|
||||
skipQualityCheck: false,
|
||||
},
|
||||
);
|
||||
|
||||
if (!result.success) {
|
||||
this.logger.error(`Vision pipeline failed, falling back to fast mode`);
|
||||
this.logger.warn(this.i18nService.getMessage('visionPipelineFailed'));
|
||||
return this.processFastMode(kb, userId, tenantId, config);
|
||||
}
|
||||
|
||||
// Save text content to database
|
||||
const combinedText = result.results.map((r) => r.text).join('\n\n');
|
||||
const metadata = {
|
||||
processedPages: result.processedPages,
|
||||
failedPages: result.failedPages,
|
||||
cost: result.cost,
|
||||
duration: result.duration,
|
||||
results: result.results.map((r) => ({
|
||||
pageIndex: r.pageIndex,
|
||||
confidence: r.confidence,
|
||||
layout: r.layout,
|
||||
imageCount: r.images.length,
|
||||
})),
|
||||
};
|
||||
await this.kbRepository.update(kb.id, {
|
||||
content: combinedText,
|
||||
metadata: metadata as any,
|
||||
});
|
||||
|
||||
await this.updateStatus(kb.id, FileStatus.EXTRACTED);
|
||||
this.logger.log(
|
||||
this.i18nService.formatMessage('preciseModeComplete', {
|
||||
pages: result.processedPages,
|
||||
cost: result.cost.toFixed(2),
|
||||
}),
|
||||
);
|
||||
|
||||
// Async vectorization and Elasticsearch indexing
|
||||
// Create each page as separate document with metadata
|
||||
this.indexPreciseResults(
|
||||
kb,
|
||||
userId,
|
||||
tenantId,
|
||||
kb.embeddingModelId,
|
||||
result.results,
|
||||
).catch((err) => {
|
||||
this.logger.error(`Error indexing precise results for ${kb.id}`, err);
|
||||
});
|
||||
|
||||
// Trigger PDF conversion asynchronously
|
||||
this.ensurePDFExists(kb.id, userId, tenantId).catch((err) => {
|
||||
this.logger.warn(`Initial PDF conversion failed for ${kb.id}`, err);
|
||||
});
|
||||
|
||||
// Auto-generate title (async execution)
|
||||
this.generateTitle(kb.id).catch((err) => {
|
||||
this.logger.error(`Error generating title for file ${kb.id}`, err);
|
||||
});
|
||||
} catch (error) {
|
||||
this.logger.error(`Vision pipeline error: ${error.message}`, error.stack);
|
||||
this.logger.error(`Falling back to fast mode for file ${kb.id}`);
|
||||
return this.processFastMode(kb, userId, tenantId, config);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Index precise mode results
|
||||
*/
|
||||
private async indexPreciseResults(
|
||||
kb: KnowledgeBase,
|
||||
userId: string,
|
||||
tenantId: string,
|
||||
embeddingModelId: string,
|
||||
results: any[],
|
||||
): Promise<void> {
|
||||
this.logger.log(`Indexing ${results.length} precise results for ${kb.id}`);
|
||||
|
||||
// Check index existence - get actual model dimensions
|
||||
const actualDimensions = await this.getActualModelDimensions(
|
||||
embeddingModelId,
|
||||
);
|
||||
await this.elasticsearchService.createIndexIfNotExists(actualDimensions);
|
||||
|
||||
// Batch vectorization and indexing
|
||||
const batchSize = parseInt(process.env.CHUNK_BATCH_SIZE || '50');
|
||||
|
||||
for (let i = 0; i < results.length; i += batchSize) {
|
||||
const batch = results.slice(i, i + batchSize);
|
||||
const texts = batch.map((r) => r.text);
|
||||
|
||||
try {
|
||||
// Generate vectors
|
||||
const embeddings = await this.embeddingService.getEmbeddings(
|
||||
texts,
|
||||
embeddingModelId,
|
||||
);
|
||||
|
||||
// Index each result
|
||||
for (let j = 0; j < batch.length; j++) {
|
||||
const result = batch[j];
|
||||
const embedding = embeddings[j];
|
||||
|
||||
if (!embedding || embedding.length === 0) {
|
||||
this.logger.warn(
|
||||
this.i18nService.formatMessage('skippingEmptyVectorPage', {
|
||||
page: result.pageIndex,
|
||||
}),
|
||||
);
|
||||
continue;
|
||||
}
|
||||
|
||||
await this.elasticsearchService.indexDocument(
|
||||
`${kb.id}_page_${result.pageIndex}`,
|
||||
result.text,
|
||||
embedding,
|
||||
{
|
||||
fileId: kb.id,
|
||||
originalName: kb.originalName,
|
||||
mimetype: kb.mimetype,
|
||||
userId: userId,
|
||||
tenantId: tenantId, // New
|
||||
pageNumber: result.pageIndex,
|
||||
images: result.images,
|
||||
layout: result.layout,
|
||||
confidence: result.confidence,
|
||||
source: 'precise',
|
||||
mode: 'vision',
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
this.logger.log(
|
||||
`Batch ${Math.floor(i / batchSize) + 1} completed: ${batch.length} pages`,
|
||||
);
|
||||
} catch (error) {
|
||||
this.logger.error(
|
||||
`Batch ${Math.floor(i / batchSize) + 1} processing failed`,
|
||||
error,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
await this.updateStatus(kb.id, FileStatus.VECTORIZED);
|
||||
this.logger.log(`Precise mode indexing completed: ${results.length} pages`);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get specific page of PDF as image
|
||||
*/
|
||||
async getPageAsImage(
|
||||
fileId: string,
|
||||
pageIndex: number,
|
||||
userId: string,
|
||||
tenantId: string,
|
||||
): Promise<string> {
|
||||
const pdfPath = await this.ensurePDFExists(fileId, userId, tenantId);
|
||||
|
||||
// Convert specific pages
|
||||
const result = await this.pdf2ImageService.convertToImages(pdfPath, {
|
||||
density: 150,
|
||||
quality: 75,
|
||||
format: 'jpeg',
|
||||
});
|
||||
|
||||
// Find images for corresponding page numbers
|
||||
const pageImage = result.images.find(
|
||||
(img) => img.pageIndex === pageIndex + 1,
|
||||
);
|
||||
if (!pageImage) {
|
||||
throw new NotFoundException(
|
||||
this.i18nService.formatMessage('pageImageNotFoundDetail', {
|
||||
page: pageIndex + 1,
|
||||
}),
|
||||
);
|
||||
}
|
||||
|
||||
return pageImage.path;
|
||||
}
|
||||
|
||||
private async vectorizeToElasticsearch(
|
||||
kbId: string,
|
||||
userId: string,
|
||||
tenantId: string,
|
||||
text: string,
|
||||
config?: any,
|
||||
) {
|
||||
try {
|
||||
const kb = await this.kbRepository.findOne({
|
||||
where: { id: kbId, tenantId },
|
||||
});
|
||||
if (!kb) return;
|
||||
|
||||
// Memory monitor - pre-vectorization check
|
||||
const memBeforeChunk = this.memoryMonitor.getMemoryUsage();
|
||||
this.logger.log(
|
||||
`Pre-vectorization memory: ${memBeforeChunk.heapUsed}/${memBeforeChunk.heapTotal}MB`,
|
||||
);
|
||||
|
||||
this.logger.debug(`File ${kbId}: Validating chunk config...`);
|
||||
// 1. Validate and fix chunk config (based on model limits and env vars)
|
||||
const validatedConfig = await this.chunkConfigService.validateChunkConfig(
|
||||
kb.chunkSize,
|
||||
kb.chunkOverlap,
|
||||
kb.embeddingModelId,
|
||||
);
|
||||
this.logger.debug(`File ${kbId}: Chunk config validated.`);
|
||||
|
||||
// If config modified, log warning and update database
|
||||
if (validatedConfig.warnings.length > 0) {
|
||||
this.logger.warn(
|
||||
this.i18nService.formatMessage('chunkConfigCorrection', {
|
||||
warnings: validatedConfig.warnings.join(', '),
|
||||
}),
|
||||
);
|
||||
|
||||
// Update config in database
|
||||
if (
|
||||
validatedConfig.chunkSize !== kb.chunkSize ||
|
||||
validatedConfig.chunkOverlap !== kb.chunkOverlap
|
||||
) {
|
||||
await this.kbRepository.update(kbId, {
|
||||
chunkSize: validatedConfig.chunkSize,
|
||||
chunkOverlap: validatedConfig.chunkOverlap,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Display config summary (including actual limits applied)
|
||||
this.logger.debug(`File ${kbId}: Getting config summary...`);
|
||||
const configSummary = await this.chunkConfigService.getConfigSummary(
|
||||
validatedConfig.chunkSize,
|
||||
validatedConfig.chunkOverlap,
|
||||
kb.embeddingModelId,
|
||||
);
|
||||
this.logger.log(`Chunk config: ${configSummary}`);
|
||||
this.logger.log(
|
||||
`Config limits: chunk=${validatedConfig.effectiveMaxChunkSize}, overlap=${validatedConfig.effectiveMaxOverlapSize}`,
|
||||
);
|
||||
|
||||
// 2. Split text using validated config
|
||||
const chunks = this.textChunkerService.chunkText(
|
||||
text,
|
||||
validatedConfig.chunkSize,
|
||||
validatedConfig.chunkOverlap,
|
||||
);
|
||||
this.logger.log(`File ${kbId} split into ${chunks.length} text blocks`);
|
||||
|
||||
if (chunks.length === 0) {
|
||||
this.logger.warn(
|
||||
this.i18nService.formatMessage('noChunksGenerated', { id: kbId }),
|
||||
);
|
||||
await this.updateStatus(kbId, FileStatus.VECTORIZED);
|
||||
return;
|
||||
}
|
||||
|
||||
// 3. Validate chunk count is reasonable
|
||||
const estimatedChunkCount = this.chunkConfigService.estimateChunkCount(
|
||||
text.length,
|
||||
validatedConfig.chunkSize,
|
||||
);
|
||||
if (chunks.length > estimatedChunkCount * 1.2) {
|
||||
this.logger.warn(
|
||||
this.i18nService.formatMessage('chunkCountAnomaly', {
|
||||
actual: chunks.length,
|
||||
estimated: estimatedChunkCount,
|
||||
}),
|
||||
);
|
||||
}
|
||||
|
||||
// 4. Get recommended batch size (based on model limits)
|
||||
const recommendedBatchSize =
|
||||
await this.chunkConfigService.getRecommendedBatchSize(
|
||||
kb.embeddingModelId,
|
||||
parseInt(process.env.CHUNK_BATCH_SIZE || '100'),
|
||||
);
|
||||
|
||||
// 5. Estimate memory usage
|
||||
const avgChunkSize =
|
||||
chunks.reduce((sum, c) => sum + c.content.length, 0) / chunks.length;
|
||||
const estimatedMemory = this.memoryMonitor.estimateMemoryUsage(
|
||||
chunks.length,
|
||||
avgChunkSize,
|
||||
parseInt(process.env.DEFAULT_VECTOR_DIMENSIONS || '2560'),
|
||||
);
|
||||
this.logger.log(
|
||||
`Estimated memory usage: ${estimatedMemory}MB (batch size: ${recommendedBatchSize})`,
|
||||
);
|
||||
|
||||
// 6. Get actual model dimensions and check index exists
|
||||
const actualDimensions = await this.getActualModelDimensions(
|
||||
kb.embeddingModelId,
|
||||
);
|
||||
await this.elasticsearchService.createIndexIfNotExists(actualDimensions);
|
||||
|
||||
// 7. Batch vectorization and indexing
|
||||
const useBatching = this.memoryMonitor.shouldUseBatching(
|
||||
chunks.length,
|
||||
avgChunkSize,
|
||||
actualDimensions,
|
||||
);
|
||||
|
||||
if (useBatching) {
|
||||
try {
|
||||
await this.processInBatches(
|
||||
chunks,
|
||||
async (batch, batchIndex) => {
|
||||
// Verify batch size not exceeding model limit
|
||||
if (batch.length > recommendedBatchSize) {
|
||||
this.logger.warn(
|
||||
this.i18nService.formatMessage('batchSizeExceeded', {
|
||||
index: batchIndex,
|
||||
actual: batch.length,
|
||||
limit: recommendedBatchSize,
|
||||
}),
|
||||
);
|
||||
}
|
||||
|
||||
const chunkTexts = batch.map((chunk) => chunk.content);
|
||||
const embeddings = await this.embeddingService.getEmbeddings(
|
||||
chunkTexts,
|
||||
kb.embeddingModelId,
|
||||
);
|
||||
|
||||
// Validate dimension consistency
|
||||
if (
|
||||
embeddings.length > 0 &&
|
||||
embeddings[0].length !== actualDimensions
|
||||
) {
|
||||
this.logger.warn(
|
||||
`Vector dimension mismatch: expected ${actualDimensions}, got ${embeddings[0].length}`,
|
||||
);
|
||||
}
|
||||
|
||||
// Index this batch data immediately
|
||||
for (let i = 0; i < batch.length; i++) {
|
||||
const chunk = batch[i];
|
||||
const embedding = embeddings[i];
|
||||
|
||||
if (!embedding || embedding.length === 0) {
|
||||
this.logger.warn(
|
||||
this.i18nService.formatMessage('skippingEmptyVectorChunk', {
|
||||
index: chunk.index,
|
||||
}),
|
||||
);
|
||||
continue;
|
||||
}
|
||||
|
||||
await this.elasticsearchService.indexDocument(
|
||||
`${kb.id}_chunk_${chunk.index}`,
|
||||
chunk.content,
|
||||
embedding,
|
||||
{
|
||||
fileId: kb.id,
|
||||
originalName: kb.originalName,
|
||||
mimetype: kb.mimetype,
|
||||
userId: userId,
|
||||
chunkIndex: chunk.index,
|
||||
startPosition: chunk.startPosition,
|
||||
tenantId, // Passing tenantId to ES
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
this.logger.log(
|
||||
`Batch ${batchIndex} completed: ${batch.length} chunks`,
|
||||
);
|
||||
},
|
||||
{
|
||||
batchSize: recommendedBatchSize,
|
||||
onBatchComplete: (batchIndex, totalBatches) => {
|
||||
const mem = this.memoryMonitor.getMemoryUsage();
|
||||
this.logger.log(
|
||||
`Batch ${batchIndex}/${totalBatches} completed, memory: ${mem.heapUsed}MB`,
|
||||
);
|
||||
},
|
||||
},
|
||||
);
|
||||
} catch (error) {
|
||||
// Detect context length error (supports Japanese/Chinese/English)
|
||||
if (
|
||||
error.message &&
|
||||
(error.message.includes('context length') ||
|
||||
error.message.includes('context length exceeded'))
|
||||
) {
|
||||
this.logger.warn(
|
||||
this.i18nService.getMessage('contextLengthErrorFallback'),
|
||||
);
|
||||
|
||||
// Downgrade to single text processing
|
||||
for (let i = 0; i < chunks.length; i++) {
|
||||
const chunk = chunks[i];
|
||||
|
||||
try {
|
||||
const embeddings = await this.embeddingService.getEmbeddings(
|
||||
[chunk.content], // Single text
|
||||
kb.embeddingModelId,
|
||||
);
|
||||
|
||||
if (!embeddings[0] || embeddings[0].length === 0) {
|
||||
this.logger.warn(
|
||||
this.i18nService.formatMessage('skippingEmptyVectorChunk', {
|
||||
index: chunk.index,
|
||||
}),
|
||||
);
|
||||
continue;
|
||||
}
|
||||
|
||||
await this.elasticsearchService.indexDocument(
|
||||
`${kb.id}_chunk_${chunk.index}`,
|
||||
chunk.content,
|
||||
embeddings[0],
|
||||
{
|
||||
fileId: kb.id,
|
||||
originalName: kb.originalName,
|
||||
mimetype: kb.mimetype,
|
||||
userId: userId,
|
||||
chunkIndex: chunk.index,
|
||||
startPosition: chunk.startPosition,
|
||||
endPosition: chunk.endPosition,
|
||||
tenantId,
|
||||
},
|
||||
);
|
||||
|
||||
if ((i + 1) % 10 === 0) {
|
||||
this.logger.log(
|
||||
`Single processing progress: ${i + 1}/${chunks.length}`,
|
||||
);
|
||||
}
|
||||
} catch (chunkError) {
|
||||
this.logger.error(
|
||||
`Failed to process text block ${chunk.index}. Skipping: ${chunkError.message}`,
|
||||
);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
this.logger.log(
|
||||
`Single text processing completed: ${chunks.length} chunks`,
|
||||
);
|
||||
} else {
|
||||
// Throw other errors directly
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Small files, batch processing (but need to check batch limits)
|
||||
const chunkTexts = chunks.map((chunk) => chunk.content);
|
||||
|
||||
// Force batch processing if chunk count exceeds model batch limit
|
||||
if (chunks.length > recommendedBatchSize) {
|
||||
this.logger.warn(
|
||||
this.i18nService.formatMessage('chunkLimitExceededForceBatch', {
|
||||
actual: chunks.length,
|
||||
limit: recommendedBatchSize,
|
||||
}),
|
||||
);
|
||||
try {
|
||||
await this.processInBatches(chunks, async (batch, batchIndex) => {
|
||||
const batchTexts = batch.map((c) => c.content);
|
||||
const embeddings = await this.embeddingService.getEmbeddings(
|
||||
batchTexts,
|
||||
kb.embeddingModelId,
|
||||
);
|
||||
|
||||
for (let i = 0; i < batch.length; i++) {
|
||||
const chunk = batch[i];
|
||||
const embedding = embeddings[i];
|
||||
|
||||
if (!embedding || embedding.length === 0) {
|
||||
this.logger.warn(
|
||||
`Skipping empty vector text block ${chunk.index}`,
|
||||
);
|
||||
continue;
|
||||
}
|
||||
|
||||
await this.elasticsearchService.indexDocument(
|
||||
`${kb.id}_chunk_${chunk.index}`,
|
||||
chunk.content,
|
||||
embedding,
|
||||
{
|
||||
fileId: kb.id,
|
||||
originalName: kb.originalName,
|
||||
mimetype: kb.mimetype,
|
||||
userId: userId,
|
||||
chunkIndex: chunk.index,
|
||||
startPosition: chunk.startPosition,
|
||||
endPosition: chunk.endPosition,
|
||||
tenantId, // Passing tenantId to ES metadata
|
||||
},
|
||||
);
|
||||
}
|
||||
});
|
||||
} catch (error) {
|
||||
// Detect context length error (supports Japanese/Chinese/English)
|
||||
if (
|
||||
error.message &&
|
||||
(error.message.includes('context length') ||
|
||||
error.message.includes('context length exceeded'))
|
||||
) {
|
||||
this.logger.warn(
|
||||
this.i18nService.getMessage('batchContextLengthErrorFallback'),
|
||||
);
|
||||
|
||||
// Downgrade to single text processing
|
||||
for (let i = 0; i < chunks.length; i++) {
|
||||
const chunk = chunks[i];
|
||||
|
||||
try {
|
||||
const embeddings = await this.embeddingService.getEmbeddings(
|
||||
[chunk.content], // Single text
|
||||
kb.embeddingModelId,
|
||||
);
|
||||
|
||||
if (!embeddings[0] || embeddings[0].length === 0) {
|
||||
this.logger.warn(
|
||||
this.i18nService.formatMessage(
|
||||
'skippingEmptyVectorChunk',
|
||||
{ index: chunk.index },
|
||||
),
|
||||
);
|
||||
continue;
|
||||
}
|
||||
|
||||
await this.elasticsearchService.indexDocument(
|
||||
`${kb.id}_chunk_${chunk.index}`,
|
||||
chunk.content,
|
||||
embeddings[0],
|
||||
{
|
||||
fileId: kb.id,
|
||||
originalName: kb.originalName,
|
||||
mimetype: kb.mimetype,
|
||||
userId: userId,
|
||||
tenantId, // Added tenantId
|
||||
chunkIndex: chunk.index,
|
||||
startPosition: chunk.startPosition,
|
||||
endPosition: chunk.endPosition,
|
||||
},
|
||||
);
|
||||
|
||||
if ((i + 1) % 10 === 0) {
|
||||
this.logger.log(
|
||||
`Single processing progress: ${i + 1}/${chunks.length}`,
|
||||
);
|
||||
}
|
||||
} catch (chunkError) {
|
||||
this.logger.error(
|
||||
this.i18nService.formatMessage('chunkProcessingFailed', {
|
||||
index: chunk.index,
|
||||
message: chunkError.message,
|
||||
}),
|
||||
);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
this.logger.log(
|
||||
this.i18nService.formatMessage('singleTextProcessingComplete', {
|
||||
count: chunks.length,
|
||||
}),
|
||||
);
|
||||
} else {
|
||||
// Throw other errors directly
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Process if file is small enough
|
||||
try {
|
||||
const embeddings = await this.embeddingService.getEmbeddings(
|
||||
chunkTexts,
|
||||
kb.embeddingModelId,
|
||||
);
|
||||
|
||||
for (let i = 0; i < chunks.length; i++) {
|
||||
const chunk = chunks[i];
|
||||
const embedding = embeddings[i];
|
||||
|
||||
if (!embedding || embedding.length === 0) {
|
||||
this.logger.warn(
|
||||
this.i18nService.formatMessage('skippingEmptyVectorChunk', {
|
||||
index: chunk.index,
|
||||
}),
|
||||
);
|
||||
continue;
|
||||
}
|
||||
|
||||
await this.elasticsearchService.indexDocument(
|
||||
`${kb.id}_chunk_${chunk.index}`,
|
||||
chunk.content,
|
||||
embedding,
|
||||
{
|
||||
fileId: kb.id,
|
||||
originalName: kb.originalName,
|
||||
mimetype: kb.mimetype,
|
||||
userId: userId,
|
||||
tenantId, // Added tenantId
|
||||
chunkIndex: chunk.index,
|
||||
startPosition: chunk.startPosition,
|
||||
endPosition: chunk.endPosition,
|
||||
},
|
||||
);
|
||||
}
|
||||
} catch (error) {
|
||||
// Detect context length error (supports Japanese/Chinese/English)
|
||||
if (
|
||||
error.message &&
|
||||
(error.message.includes('context length') ||
|
||||
error.message.includes('context length exceeded'))
|
||||
) {
|
||||
this.logger.warn(
|
||||
this.i18nService.getMessage('batchContextLengthErrorFallback'),
|
||||
);
|
||||
|
||||
// Downgrade to single text processing
|
||||
for (let i = 0; i < chunks.length; i++) {
|
||||
const chunk = chunks[i];
|
||||
|
||||
try {
|
||||
const embeddings = await this.embeddingService.getEmbeddings(
|
||||
[chunk.content], // Single text
|
||||
kb.embeddingModelId,
|
||||
);
|
||||
|
||||
if (!embeddings[0] || embeddings[0].length === 0) {
|
||||
this.logger.warn(
|
||||
`Skipping empty vector text block ${chunk.index}`,
|
||||
);
|
||||
continue;
|
||||
}
|
||||
|
||||
await this.elasticsearchService.indexDocument(
|
||||
`${kb.id}_chunk_${chunk.index}`,
|
||||
chunk.content,
|
||||
embeddings[0],
|
||||
{
|
||||
fileId: kb.id,
|
||||
originalName: kb.originalName,
|
||||
mimetype: kb.mimetype,
|
||||
userId: userId,
|
||||
tenantId, // Added tenantId
|
||||
chunkIndex: chunk.index,
|
||||
startPosition: chunk.startPosition,
|
||||
endPosition: chunk.endPosition,
|
||||
},
|
||||
);
|
||||
|
||||
if ((i + 1) % 10 === 0) {
|
||||
this.logger.log(
|
||||
`Single processing progress: ${i + 1}/${chunks.length}`,
|
||||
);
|
||||
}
|
||||
} catch (chunkError) {
|
||||
this.logger.error(
|
||||
`Failed to process text block ${chunk.index}. Skipping: ${chunkError.message}`,
|
||||
);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
this.logger.log(
|
||||
this.i18nService.formatMessage('singleTextProcessingComplete', {
|
||||
count: chunks.length,
|
||||
}),
|
||||
);
|
||||
} else {
|
||||
// Throw other errors directly
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
await this.updateStatus(kbId, FileStatus.VECTORIZED);
|
||||
const memAfter = this.memoryMonitor.getMemoryUsage();
|
||||
this.logger.log(
|
||||
this.i18nService.formatMessage('fileVectorizationComplete', {
|
||||
id: kbId,
|
||||
count: chunks.length,
|
||||
memory: memAfter.heapUsed,
|
||||
}),
|
||||
);
|
||||
} catch (error) {
|
||||
this.logger.error(
|
||||
this.i18nService.formatMessage('fileVectorizationFailed', { id: kbId }),
|
||||
error,
|
||||
);
|
||||
|
||||
// Save error info to metadata
|
||||
try {
|
||||
const kb = await this.kbRepository.findOne({ where: { id: kbId } });
|
||||
if (kb) {
|
||||
const metadata = kb.metadata || {};
|
||||
metadata.lastError = error.message;
|
||||
metadata.failedAt = new Date().toISOString();
|
||||
await this.kbRepository.update(kbId, { metadata });
|
||||
}
|
||||
} catch (e) {
|
||||
this.logger.warn(
|
||||
`Failed to update metadata for failed file ${kbId}`,
|
||||
e,
|
||||
);
|
||||
}
|
||||
|
||||
await this.updateStatus(kbId, FileStatus.FAILED);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Batch processing with memory control
|
||||
*/
|
||||
private async processInBatches<T>(
|
||||
items: T[],
|
||||
processor: (batch: T[], batchIndex: number) => Promise<void>,
|
||||
options?: {
|
||||
batchSize?: number;
|
||||
onBatchComplete?: (batchIndex: number, totalBatches: number) => void;
|
||||
},
|
||||
): Promise<void> {
|
||||
const totalItems = items.length;
|
||||
if (totalItems === 0) return;
|
||||
|
||||
const startTime = Date.now();
|
||||
this.logger.log(
|
||||
this.i18nService.formatMessage('batchProcessingStarted', {
|
||||
count: totalItems,
|
||||
}),
|
||||
);
|
||||
|
||||
// Use provided batch size or fallback to env/default
|
||||
const initialBatchSize =
|
||||
options?.batchSize || parseInt(process.env.CHUNK_BATCH_SIZE || '100');
|
||||
const totalBatches = Math.ceil(totalItems / initialBatchSize);
|
||||
|
||||
for (let i = 0; i < totalItems; ) {
|
||||
// Check memory and wait
|
||||
await this.memoryMonitor.waitForMemoryAvailable();
|
||||
|
||||
// Dynamically adjust batch size (start from initialBatchSize, memory monitor can reduce if needed)
|
||||
// Note: memoryMonitor.getDynamicBatchSize may return larger values based on memory situation,
|
||||
// but we must respect model limits (initialBatchSize)
|
||||
const currentMem = this.memoryMonitor.getMemoryUsage().heapUsed;
|
||||
const dynamicBatchSize =
|
||||
this.memoryMonitor.getDynamicBatchSize(currentMem);
|
||||
|
||||
// Ensure we don't exceed the model's limit (initialBatchSize) even if memory allows more
|
||||
const batchSize = Math.min(dynamicBatchSize, initialBatchSize);
|
||||
|
||||
// Get current batch
|
||||
const batch = items.slice(i, i + batchSize);
|
||||
const batchIndex = Math.floor(i / batchSize) + 1;
|
||||
|
||||
this.logger.log(
|
||||
this.i18nService.formatMessage('batchProcessingProgress', {
|
||||
index: batchIndex,
|
||||
total: totalBatches,
|
||||
count: batch.length,
|
||||
}),
|
||||
);
|
||||
|
||||
// Process batch
|
||||
await processor(batch, batchIndex);
|
||||
|
||||
// Callback notification
|
||||
if (options?.onBatchComplete) {
|
||||
options.onBatchComplete(batchIndex, totalBatches);
|
||||
}
|
||||
|
||||
// Force GC (if memory is near threshold)
|
||||
if (currentMem > 800) {
|
||||
this.memoryMonitor.forceGC();
|
||||
}
|
||||
|
||||
// Clear references to help GC
|
||||
batch.length = 0;
|
||||
|
||||
i += batchSize;
|
||||
}
|
||||
|
||||
const duration = ((Date.now() - startTime) / 1000).toFixed(2);
|
||||
this.logger.log(
|
||||
this.i18nService.formatMessage('batchProcessingComplete', {
|
||||
count: totalItems,
|
||||
duration,
|
||||
}),
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Retry vectorization for failed files
|
||||
*/
|
||||
async retryFailedFile(
|
||||
fileId: string,
|
||||
userId: string,
|
||||
tenantId: string,
|
||||
): Promise<KnowledgeBase> {
|
||||
this.logger.log(
|
||||
`Retrying failed file ${fileId} for user ${userId} in tenant ${tenantId}`,
|
||||
);
|
||||
|
||||
// 1. Get file with tenant restriction
|
||||
const kb = await this.kbRepository.findOne({
|
||||
where: { id: fileId, tenantId },
|
||||
});
|
||||
|
||||
if (!kb) {
|
||||
throw new NotFoundException(this.i18nService.getMessage('fileNotFound'));
|
||||
}
|
||||
|
||||
if (kb.status !== FileStatus.FAILED) {
|
||||
throw new Error(
|
||||
this.i18nService.formatMessage('onlyFailedFilesRetryable', {
|
||||
status: kb.status,
|
||||
}),
|
||||
);
|
||||
}
|
||||
|
||||
if (!kb.content || kb.content.trim().length === 0) {
|
||||
throw new Error(this.i18nService.getMessage('emptyFileRetryFailed'));
|
||||
}
|
||||
|
||||
// 2. Reset status to INDEXING
|
||||
await this.updateStatus(fileId, FileStatus.INDEXING);
|
||||
|
||||
// 3. Trigger vectorization asynchronously (reuse existing logic)
|
||||
this.vectorizeToElasticsearch(fileId, userId, tenantId, kb.content, {
|
||||
chunkSize: kb.chunkSize,
|
||||
chunkOverlap: kb.chunkOverlap,
|
||||
embeddingModelId: kb.embeddingModelId,
|
||||
}).catch((err) => {
|
||||
this.logger.error(`Retry vectorization failed for file ${fileId}`, err);
|
||||
});
|
||||
|
||||
// 4. Return updated file status
|
||||
const updatedKb = await this.kbRepository.findOne({
|
||||
where: { id: fileId, tenantId },
|
||||
});
|
||||
if (!updatedKb) {
|
||||
throw new NotFoundException(this.i18nService.getMessage('fileNotFound'));
|
||||
}
|
||||
return updatedKb;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get all chunk information for a file
|
||||
*/
|
||||
async getFileChunks(fileId: string, userId: string, tenantId: string) {
|
||||
this.logger.log(
|
||||
`Getting chunks for file ${fileId}, user ${userId}, tenant ${tenantId}`,
|
||||
);
|
||||
|
||||
// 1. Get file with tenant check
|
||||
const kb = await this.kbRepository.findOne({
|
||||
where: { id: fileId, tenantId },
|
||||
});
|
||||
|
||||
if (!kb) {
|
||||
throw new NotFoundException(this.i18nService.getMessage('fileNotFound'));
|
||||
}
|
||||
|
||||
// 2. Get all chunks from Elasticsearch
|
||||
const chunks = await this.elasticsearchService.getFileChunks(
|
||||
fileId,
|
||||
userId,
|
||||
tenantId,
|
||||
);
|
||||
|
||||
// 3. Return chunk info
|
||||
return {
|
||||
fileId: kb.id,
|
||||
fileName: kb.originalName,
|
||||
totalChunks: chunks.length,
|
||||
chunkSize: kb.chunkSize,
|
||||
chunkOverlap: kb.chunkOverlap,
|
||||
chunks: chunks.map((chunk) => ({
|
||||
index: chunk.chunkIndex,
|
||||
content: chunk.content,
|
||||
contentLength: chunk.content.length,
|
||||
startPosition: chunk.startPosition,
|
||||
endPosition: chunk.endPosition,
|
||||
})),
|
||||
};
|
||||
}
|
||||
|
||||
private async updateStatus(id: string, status: FileStatus) {
|
||||
await this.kbRepository.update(id, { status });
|
||||
}
|
||||
|
||||
// PDF preview related methods
|
||||
async ensurePDFExists(
|
||||
fileId: string,
|
||||
userId: string,
|
||||
tenantId: string,
|
||||
force: boolean = false,
|
||||
): Promise<string> {
|
||||
const kb = await this.kbRepository.findOne({
|
||||
where: { id: fileId, tenantId },
|
||||
});
|
||||
|
||||
if (!kb) {
|
||||
throw new NotFoundException(this.i18nService.getMessage('fileNotFound'));
|
||||
}
|
||||
|
||||
// If original file is PDF, return the original file path directly
|
||||
if (kb.mimetype === 'application/pdf') {
|
||||
return kb.storagePath;
|
||||
}
|
||||
|
||||
// Check if preview conversion is supported (only documents or images allowed)
|
||||
const ext = kb.originalName.toLowerCase().split('.').pop() || '';
|
||||
const isConvertible = [...DOC_EXTENSIONS, ...IMAGE_EXTENSIONS].includes(
|
||||
ext,
|
||||
);
|
||||
|
||||
if (!isConvertible) {
|
||||
this.logger.log(
|
||||
`Skipping PDF conversion for unsupported format: .${ext} (${kb.originalName})`,
|
||||
);
|
||||
throw new Error(this.i18nService.getMessage('pdfPreviewNotSupported'));
|
||||
}
|
||||
|
||||
// Generate PDF field path
|
||||
const path = await import('path');
|
||||
const fs = await import('fs');
|
||||
const uploadDir = path.dirname(kb.storagePath);
|
||||
const baseName = path.basename(
|
||||
kb.storagePath,
|
||||
path.extname(kb.storagePath),
|
||||
);
|
||||
const pdfPath = path.join(uploadDir, `${baseName}.pdf`);
|
||||
|
||||
// Delete if forced regeneration specified and file exists
|
||||
if (force && fs.existsSync(pdfPath)) {
|
||||
try {
|
||||
fs.unlinkSync(pdfPath);
|
||||
this.logger.log(`Forced regeneration: Deleted existing PDF ${pdfPath}`);
|
||||
} catch (e) {
|
||||
this.logger.warn(
|
||||
`Failed to delete existing PDF for regeneration: ${e.message}`,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// Check if already converted and regeneration not needed
|
||||
if (fs.existsSync(pdfPath) && !force) {
|
||||
if (!kb.pdfPath) {
|
||||
await this.kbRepository.update(kb.id, { pdfPath: pdfPath });
|
||||
}
|
||||
return pdfPath;
|
||||
}
|
||||
|
||||
// Need to convert to PDF
|
||||
try {
|
||||
this.logger.log(
|
||||
`Starting PDF conversion for ${kb.originalName} at ${kb.storagePath}`,
|
||||
);
|
||||
|
||||
// Convert file
|
||||
await this.libreOfficeService.convertToPDF(kb.storagePath);
|
||||
|
||||
// Check conversion result
|
||||
if (!fs.existsSync(pdfPath)) {
|
||||
throw new Error(
|
||||
`PDF conversion completed but file not found at ${pdfPath}`,
|
||||
);
|
||||
}
|
||||
|
||||
const stats = fs.statSync(pdfPath);
|
||||
if (stats.size === 0) {
|
||||
fs.unlinkSync(pdfPath);
|
||||
throw new Error(`PDF conversion failed: output file is empty`);
|
||||
}
|
||||
|
||||
await this.kbRepository.update(kb.id, { pdfPath: pdfPath });
|
||||
|
||||
this.logger.log(`PDF conversion successful: ${pdfPath}`);
|
||||
return pdfPath;
|
||||
} catch (error) {
|
||||
this.logger.error(
|
||||
`PDF conversion failed for ${fileId}: ${error.message}`,
|
||||
error.stack,
|
||||
);
|
||||
throw new Error(
|
||||
this.i18nService.formatMessage('pdfConversionFailedDetail', {
|
||||
id: fileId,
|
||||
}),
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
async getPDFStatus(fileId: string, userId: string, tenantId: string) {
|
||||
const kb = await this.kbRepository.findOne({
|
||||
where: { id: fileId, tenantId },
|
||||
});
|
||||
|
||||
if (!kb) {
|
||||
throw new NotFoundException(this.i18nService.getMessage('fileNotFound'));
|
||||
}
|
||||
|
||||
// If original file is PDF
|
||||
if (kb.mimetype === 'application/pdf') {
|
||||
const token = this.generateTempToken(fileId, userId, tenantId);
|
||||
return {
|
||||
status: 'ready',
|
||||
url: `/api/knowledge-bases/${fileId}/pdf?token=${token}`,
|
||||
};
|
||||
}
|
||||
|
||||
// Generate PDF file path
|
||||
const path = await import('path');
|
||||
const fs = await import('fs');
|
||||
const uploadDir = path.dirname(kb.storagePath);
|
||||
const baseName = path.basename(
|
||||
kb.storagePath,
|
||||
path.extname(kb.storagePath),
|
||||
);
|
||||
const pdfPath = path.join(uploadDir, `${baseName}.pdf`);
|
||||
|
||||
// Check if converted
|
||||
if (fs.existsSync(pdfPath)) {
|
||||
if (!kb.pdfPath) {
|
||||
kb.pdfPath = pdfPath;
|
||||
await this.kbRepository.save(kb);
|
||||
}
|
||||
const token = this.generateTempToken(fileId, userId, tenantId);
|
||||
return {
|
||||
status: 'ready',
|
||||
url: `/api/knowledge-bases/${fileId}/pdf?token=${token}`,
|
||||
};
|
||||
}
|
||||
|
||||
// Conversion needed
|
||||
return {
|
||||
status: 'pending',
|
||||
};
|
||||
}
|
||||
|
||||
private generateTempToken(
|
||||
fileId: string,
|
||||
userId: string,
|
||||
tenantId: string,
|
||||
): string {
|
||||
const jwt = require('jsonwebtoken');
|
||||
|
||||
const secret = process.env.JWT_SECRET;
|
||||
if (!secret) {
|
||||
throw new Error(this.i18nService.getMessage('jwtSecretRequired'));
|
||||
}
|
||||
|
||||
return jwt.sign({ fileId, userId, tenantId, type: 'pdf-access' }, secret, {
|
||||
expiresIn: '1h',
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Get actual model dimensions (with cache check and probe logic)
|
||||
*/
|
||||
private async getActualModelDimensions(
|
||||
embeddingModelId: string,
|
||||
): Promise<number> {
|
||||
const defaultDimensions = parseInt(
|
||||
process.env.DEFAULT_VECTOR_DIMENSIONS || '2560',
|
||||
);
|
||||
|
||||
try {
|
||||
// 1. Prioritize getting from model config
|
||||
const modelConfig =
|
||||
await this.modelConfigService.findOne(embeddingModelId);
|
||||
|
||||
if (modelConfig && modelConfig.dimensions) {
|
||||
this.logger.log(
|
||||
`Got dimensions from ${modelConfig.name} config: ${modelConfig.dimensions}`,
|
||||
);
|
||||
return modelConfig.dimensions;
|
||||
}
|
||||
|
||||
// 2. Otherwise probe for dimensions
|
||||
this.logger.log(`Probing model dimensions: ${embeddingModelId}`);
|
||||
const probeEmbeddings = await this.embeddingService.getEmbeddings(
|
||||
['probe'],
|
||||
embeddingModelId,
|
||||
);
|
||||
|
||||
if (probeEmbeddings.length > 0) {
|
||||
const actualDimensions = probeEmbeddings[0].length;
|
||||
this.logger.log(
|
||||
`Detected actual model dimensions: ${actualDimensions}`,
|
||||
);
|
||||
|
||||
// Update model config for next use
|
||||
if (modelConfig) {
|
||||
try {
|
||||
await this.modelConfigService.update(modelConfig.id, {
|
||||
dimensions: actualDimensions,
|
||||
});
|
||||
this.logger.log(
|
||||
`Updated model ${modelConfig.name} dimension config to ${actualDimensions}`,
|
||||
);
|
||||
} catch (updateErr) {
|
||||
this.logger.warn(
|
||||
`Failed to update model dimension config: ${updateErr.message}`,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
return actualDimensions;
|
||||
}
|
||||
} catch (err) {
|
||||
this.logger.warn(
|
||||
`Failed to get dimensions. Using default: ${defaultDimensions}`,
|
||||
err.message,
|
||||
);
|
||||
}
|
||||
|
||||
return defaultDimensions;
|
||||
}
|
||||
|
||||
/**
|
||||
* Auto-generate document title using AI
|
||||
*/
|
||||
async generateTitle(kbId: string): Promise<string | null> {
|
||||
this.logger.log(`Generating automatic title for file ${kbId}`);
|
||||
|
||||
try {
|
||||
const kb = await this.kbRepository.findOne({ where: { id: kbId } });
|
||||
if (!kb || !kb.content || kb.content.trim().length === 0) {
|
||||
return null;
|
||||
}
|
||||
const tenantId = kb.tenantId;
|
||||
|
||||
// Skip if title already exists
|
||||
if (kb.title) {
|
||||
return kb.title;
|
||||
}
|
||||
|
||||
// Get content sample (max 2500 characters)
|
||||
const contentSample = kb.content.substring(0, 2500);
|
||||
|
||||
// Get language from org settings, or use default
|
||||
const userSettings = await this.userSettingService.getByUser(kb.userId);
|
||||
const language = userSettings.language || 'zh';
|
||||
|
||||
// Build prompt
|
||||
const prompt = this.i18nService.getDocumentTitlePrompt(
|
||||
language,
|
||||
contentSample,
|
||||
);
|
||||
|
||||
// Call LLM to generate title
|
||||
let generatedTitle: string | undefined;
|
||||
try {
|
||||
generatedTitle = await this.chatService.generateSimpleChat(
|
||||
[{ role: 'user', content: prompt }],
|
||||
kb.userId,
|
||||
kb.tenantId,
|
||||
);
|
||||
} catch (err) {
|
||||
this.logger.warn(
|
||||
`Failed to generate title for document ${kbId} due to LLM configuration issue: ${err.message}`,
|
||||
);
|
||||
return null; // Skip title generation if LLM is not configured for this tenant
|
||||
}
|
||||
|
||||
if (generatedTitle && generatedTitle.trim().length > 0) {
|
||||
// Remove extra quotes and newlines
|
||||
const cleanedTitle = generatedTitle
|
||||
.trim()
|
||||
.replace(/^["']|["']$/g, '')
|
||||
.substring(0, 100);
|
||||
await this.kbRepository.update(kbId, { title: cleanedTitle });
|
||||
|
||||
// Also update ES chunks
|
||||
await this.elasticsearchService
|
||||
.updateTitleByFileId(kbId, cleanedTitle, tenantId)
|
||||
.catch((err) => {
|
||||
this.logger.error(
|
||||
`Failed to update title in Elasticsearch for ${kbId}`,
|
||||
err,
|
||||
);
|
||||
});
|
||||
|
||||
this.logger.log(
|
||||
`Successfully generated title for ${kbId}: ${cleanedTitle}`,
|
||||
);
|
||||
return cleanedTitle;
|
||||
}
|
||||
} catch (error) {
|
||||
this.logger.error(`Failed to generate title for ${kbId}`, error);
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,255 @@
|
||||
import { Injectable, Logger } from '@nestjs/common';
|
||||
|
||||
export interface MemoryStats {
|
||||
heapUsed: number; // Used heap memory (MB)
|
||||
heapTotal: number; // Total heap memory (MB)
|
||||
external: number; // External memory (MB)
|
||||
rss: number; // RSS (Resident Set Size) (MB)
|
||||
timestamp: Date;
|
||||
}
|
||||
|
||||
@Injectable()
|
||||
export class MemoryMonitorService {
|
||||
private readonly logger = new Logger(MemoryMonitorService.name);
|
||||
private readonly MAX_MEMORY_MB: number;
|
||||
private readonly BATCH_SIZE: number;
|
||||
private readonly GC_THRESHOLD_MB: number;
|
||||
|
||||
constructor() {
|
||||
// Load config from env vars. Default values for memory optimization
|
||||
this.MAX_MEMORY_MB = parseInt(process.env.MAX_MEMORY_USAGE_MB || '1024'); // 1GB limit
|
||||
this.BATCH_SIZE = parseInt(process.env.CHUNK_BATCH_SIZE || '100'); // 100 chunks per batch
|
||||
this.GC_THRESHOLD_MB = parseInt(process.env.GC_THRESHOLD_MB || '800'); // Trigger GC at 800MB
|
||||
|
||||
this.logger.log(
|
||||
`Memory monitor initialized: limit=${this.MAX_MEMORY_MB}MB, batchSize=${this.BATCH_SIZE}, GCThreshold=${this.GC_THRESHOLD_MB}MB`,
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get current memory usage
|
||||
*/
|
||||
getMemoryUsage(): MemoryStats {
|
||||
const usage = process.memoryUsage();
|
||||
return {
|
||||
heapUsed: Math.round(usage.heapUsed / 1024 / 1024),
|
||||
heapTotal: Math.round(usage.heapTotal / 1024 / 1024),
|
||||
external: Math.round((usage.external || 0) / 1024 / 1024),
|
||||
rss: Math.round(usage.rss / 1024 / 1024),
|
||||
timestamp: new Date(),
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if memory is approaching limit
|
||||
*/
|
||||
isMemoryHigh(): boolean {
|
||||
const usage = this.getMemoryUsage();
|
||||
return usage.heapUsed > this.MAX_MEMORY_MB * 0.85; // 85% threshold
|
||||
}
|
||||
|
||||
/**
|
||||
* Wait for memory to become available (with timeout)
|
||||
*/
|
||||
async waitForMemoryAvailable(timeoutMs: number = 30000): Promise<void> {
|
||||
const startTime = Date.now();
|
||||
|
||||
while (this.isMemoryHigh()) {
|
||||
if (Date.now() - startTime > timeoutMs) {
|
||||
throw new Error(
|
||||
`Memory wait timeout: current ${this.getMemoryUsage().heapUsed}MB > ${this.MAX_MEMORY_MB * 0.85}MB`,
|
||||
);
|
||||
}
|
||||
|
||||
this.logger.warn(
|
||||
`Memory usage too high. Waiting for release... ${this.getMemoryUsage().heapUsed}/${this.MAX_MEMORY_MB}MB`,
|
||||
);
|
||||
|
||||
// Force garbage collection (if available)
|
||||
if (global.gc) {
|
||||
this.logger.log('Running forced garbage collection...');
|
||||
global.gc();
|
||||
}
|
||||
|
||||
await new Promise((resolve) => setTimeout(resolve, 1000));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Force garbage collection (if available)
|
||||
*/
|
||||
forceGC(): void {
|
||||
if (global.gc) {
|
||||
const before = this.getMemoryUsage();
|
||||
global.gc();
|
||||
const after = this.getMemoryUsage();
|
||||
this.logger.log(
|
||||
`GC completed: ${before.heapUsed}MB → ${after.heapUsed}MB (${before.heapUsed - after.heapUsed}MB freed)`,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Dynamically adjust batch size
|
||||
*/
|
||||
getDynamicBatchSize(currentMemoryMB: number): number {
|
||||
const baseBatchSize = this.BATCH_SIZE;
|
||||
|
||||
if (currentMemoryMB > this.GC_THRESHOLD_MB) {
|
||||
// Memory pressure, reduce batch size
|
||||
const reduced = Math.max(10, Math.floor(baseBatchSize * 0.5));
|
||||
this.logger.warn(
|
||||
`Memory pressure (${currentMemoryMB}MB), adjusting batch size: ${baseBatchSize} → ${reduced}`,
|
||||
);
|
||||
return reduced;
|
||||
} else if (currentMemoryMB < this.MAX_MEMORY_MB * 0.4) {
|
||||
// Enough memory, increase batch size
|
||||
const increased = Math.min(200, Math.floor(baseBatchSize * 1.2));
|
||||
if (increased > baseBatchSize) {
|
||||
this.logger.log(
|
||||
`Memory available (${currentMemoryMB}MB), adjusting batch size: ${baseBatchSize} → ${increased}`,
|
||||
);
|
||||
}
|
||||
return increased;
|
||||
}
|
||||
|
||||
return baseBatchSize;
|
||||
}
|
||||
|
||||
/**
|
||||
* Process large data: auto-batching and memory control
|
||||
*/
|
||||
async processInBatches<T, R>(
|
||||
items: T[],
|
||||
processor: (batch: T[], batchIndex: number) => Promise<R[]>,
|
||||
options?: {
|
||||
onBatchComplete?: (
|
||||
batchIndex: number,
|
||||
totalBatches: number,
|
||||
results: R[],
|
||||
) => Promise<void> | void;
|
||||
maxConcurrency?: number;
|
||||
},
|
||||
): Promise<R[]> {
|
||||
const totalItems = items.length;
|
||||
if (totalItems === 0) return [];
|
||||
|
||||
const startTime = Date.now();
|
||||
this.logger.log(`Starting batch processing: ${totalItems} items`);
|
||||
|
||||
const allResults: R[] = [];
|
||||
let processedCount = 0;
|
||||
|
||||
for (let i = 0; i < totalItems; ) {
|
||||
// Check memory state and wait
|
||||
await this.waitForMemoryAvailable();
|
||||
|
||||
// Dynamically adjust batch size
|
||||
const currentMem = this.getMemoryUsage().heapUsed;
|
||||
const batchSize = this.getDynamicBatchSize(currentMem);
|
||||
|
||||
// Get current batch
|
||||
const batch = items.slice(i, i + batchSize);
|
||||
const batchIndex = Math.floor(i / batchSize) + 1;
|
||||
const totalBatches = Math.ceil(totalItems / batchSize);
|
||||
|
||||
this.logger.log(
|
||||
`Processing batch ${batchIndex}/${totalBatches}: ${batch.length} items (cumulative ${processedCount}/${totalItems})`,
|
||||
);
|
||||
|
||||
// Process batch
|
||||
const batchResults = await processor(batch, batchIndex);
|
||||
allResults.push(...batchResults);
|
||||
processedCount += batch.length;
|
||||
|
||||
// Callback notification
|
||||
if (options?.onBatchComplete) {
|
||||
await options.onBatchComplete(batchIndex, totalBatches, batchResults);
|
||||
}
|
||||
|
||||
// Force GC if memory near threshold
|
||||
if (currentMem > this.GC_THRESHOLD_MB) {
|
||||
this.forceGC();
|
||||
}
|
||||
|
||||
// Clear references to help GC
|
||||
batch.length = 0;
|
||||
|
||||
i += batchSize;
|
||||
}
|
||||
|
||||
const duration = ((Date.now() - startTime) / 1000).toFixed(2);
|
||||
const finalMem = this.getMemoryUsage();
|
||||
this.logger.log(
|
||||
`Batch processing completed: ${totalItems} items, duration ${duration}s, final memory ${finalMem.heapUsed}MB`,
|
||||
);
|
||||
|
||||
return allResults;
|
||||
}
|
||||
|
||||
/**
|
||||
* Estimate memory required for processing
|
||||
*/
|
||||
estimateMemoryUsage(
|
||||
itemCount: number,
|
||||
itemSizeBytes: number,
|
||||
vectorDim: number,
|
||||
): number {
|
||||
// Text content memory
|
||||
const textMemory = itemCount * itemSizeBytes;
|
||||
|
||||
// Vector memory (dimension * 4 bytes per vector)
|
||||
const vectorMemory = itemCount * vectorDim * 4;
|
||||
|
||||
// Object overhead (~100 bytes per object)
|
||||
const overhead = itemCount * 100;
|
||||
|
||||
const totalMB = Math.round(
|
||||
(textMemory + vectorMemory + overhead) / 1024 / 1024,
|
||||
);
|
||||
|
||||
return totalMB;
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if batching should be used
|
||||
*/
|
||||
shouldUseBatching(
|
||||
itemCount: number,
|
||||
itemSizeBytes: number,
|
||||
vectorDim: number,
|
||||
): boolean {
|
||||
const estimatedMB = this.estimateMemoryUsage(
|
||||
itemCount,
|
||||
itemSizeBytes,
|
||||
vectorDim,
|
||||
);
|
||||
const threshold = this.MAX_MEMORY_MB * 0.7; // 70% threshold
|
||||
|
||||
if (estimatedMB > threshold) {
|
||||
this.logger.warn(
|
||||
`Estimated memory ${estimatedMB}MB exceeds threshold ${threshold}MB, using batch processing`,
|
||||
);
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get recommended batch size
|
||||
*/
|
||||
getRecommendedBatchSize(itemSizeBytes: number, vectorDim: number): number {
|
||||
// Goal: max 200MB memory per batch
|
||||
const targetMemoryMB = 200;
|
||||
const targetMemoryBytes = targetMemoryMB * 1024 * 1024;
|
||||
|
||||
// Memory per item = text + vector + overhead
|
||||
const singleItemMemory = itemSizeBytes + vectorDim * 4 + 100;
|
||||
|
||||
const batchSize = Math.floor(targetMemoryBytes / singleItemMemory);
|
||||
|
||||
// Limit between 10-200
|
||||
return Math.max(10, Math.min(200, batchSize));
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,106 @@
|
||||
import { Injectable } from '@nestjs/common';
|
||||
|
||||
export interface TextChunk {
|
||||
content: string;
|
||||
index: number;
|
||||
startPosition: number;
|
||||
endPosition: number;
|
||||
}
|
||||
|
||||
@Injectable()
|
||||
export class TextChunkerService {
|
||||
chunkText(
|
||||
text: string,
|
||||
chunkSize: number = 1000,
|
||||
overlap: number = 200,
|
||||
): TextChunk[] {
|
||||
if (!text || text.trim().length === 0) {
|
||||
return [];
|
||||
}
|
||||
|
||||
const cleanText = text.trim();
|
||||
const chunkSizeInChars = chunkSize * 4; // 1 token ≈ 4 chars
|
||||
const overlapInChars = overlap * 4;
|
||||
|
||||
// If text length <= chunk size, return entire text as one chunk
|
||||
if (cleanText.length <= chunkSizeInChars) {
|
||||
return [
|
||||
{
|
||||
content: cleanText,
|
||||
index: 0,
|
||||
startPosition: 0,
|
||||
endPosition: cleanText.length,
|
||||
},
|
||||
];
|
||||
}
|
||||
|
||||
const chunks: TextChunk[] = [];
|
||||
let start = 0;
|
||||
let index = 0;
|
||||
|
||||
while (start < cleanText.length) {
|
||||
let end = Math.min(start + chunkSizeInChars, cleanText.length);
|
||||
|
||||
// Split by sentence boundaries
|
||||
if (end < cleanText.length) {
|
||||
const sentenceEnd = this.findSentenceEnd(
|
||||
cleanText,
|
||||
end,
|
||||
start + chunkSizeInChars * 0.8,
|
||||
);
|
||||
if (sentenceEnd > start) {
|
||||
end = sentenceEnd;
|
||||
}
|
||||
}
|
||||
|
||||
const content = cleanText.slice(start, end).trim();
|
||||
if (content.length > 0) {
|
||||
chunks.push({
|
||||
content,
|
||||
index,
|
||||
startPosition: start,
|
||||
endPosition: end,
|
||||
});
|
||||
index++;
|
||||
}
|
||||
|
||||
// Fix infinite loop: if we reached the end, stop here.
|
||||
if (end >= cleanText.length) {
|
||||
break;
|
||||
}
|
||||
|
||||
// Calculate start position of next chunk
|
||||
const newStart = end - overlapInChars;
|
||||
// Protect against infinite loop if overlap is too large or chunk too small
|
||||
if (newStart <= start) {
|
||||
start = end; // Force advance if overlap would cause stagnation
|
||||
} else {
|
||||
start = newStart;
|
||||
}
|
||||
}
|
||||
|
||||
return chunks;
|
||||
}
|
||||
|
||||
private findSentenceEnd(
|
||||
text: string,
|
||||
preferredEnd: number,
|
||||
minEnd: number,
|
||||
): number {
|
||||
const sentenceEnders = ['.', '!', '?', '。', '!', '?'];
|
||||
|
||||
for (let i = preferredEnd; i >= minEnd; i--) {
|
||||
if (sentenceEnders.includes(text[i])) {
|
||||
return i + 1;
|
||||
}
|
||||
}
|
||||
|
||||
for (let i = preferredEnd; i >= minEnd; i--) {
|
||||
if (text[i] === ' ' || text[i] === '\n') {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
|
||||
return preferredEnd;
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user