Files
aurak/server/src/assessment/graph/nodes/grader.node.ts
T

389 lines
15 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import { ChatOpenAI } from '@langchain/openai';
import {
SystemMessage,
HumanMessage,
AIMessage,
} from '@langchain/core/messages';
import { RunnableConfig } from '@langchain/core/runnables';
import { EvaluationState } from '../state';
import { safeParseJson } from '../../../common/json-utils';
/**
 * "I don't know"-style phrases (lower-case) recognised by the rule-based grading path.
 */
const GRADER_RULE_GIVE_UP_PHRASES: readonly string[] = [
  '不知道',
  "don't know",
  'わかりません',
];

/**
 * "I don't know"-style phrases (lower-case) recognised by the LLM grading path.
 */
const GRADER_LLM_GIVE_UP_PHRASES: readonly string[] = [
  '不知道',
  '不会',
  "don't know",
  'no idea',
  '不知',
  'わかりません',
  'わからん',
  '知らない',
  '不明',
  'わからない',
];

/**
 * Returns message content as text: strings pass through, anything else is JSON-encoded.
 */
function graderMessageText(content: unknown): string {
  if (typeof content === 'string') {
    return content;
  }
  // JSON.stringify(undefined) yields undefined at runtime, hence the fallback.
  return JSON.stringify(content) ?? '';
}

/**
 * Detects a short "I don't know" style answer.
 *
 * The answer must be short so that a long, substantive answer that merely
 * contains one of the phrases is not treated as giving up. CJK phrases keep
 * the original limit (fewer than 10 characters). ASCII phrases use a limit of
 * 20 because "don't know" alone is already 10 characters long, which made the
 * English check unreachable under the shared 10-character limit.
 */
function graderSaysDontKnow(
  answer: string,
  phrases: readonly string[],
): boolean {
  const normalized = answer
    .trim()
    .toLowerCase()
    .replace(/[\u2018\u2019]/g, "'"); // typographic apostrophes -> ASCII
  return phrases.some((phrase) => {
    const maxLength = /^[\x00-\x7f]*$/.test(phrase) ? 20 : 10;
    return normalized.length < maxLength && normalized.includes(phrase);
  });
}

/**
 * Coerces a model-provided score into a number clamped to [0, 10].
 * Returns null when the value cannot be interpreted as a finite number.
 */
function graderNormalizeScore(raw: unknown): number | null {
  const value =
    typeof raw === 'number'
      ? raw
      : typeof raw === 'string'
        ? parseFloat(raw)
        : NaN;
  if (!Number.isFinite(value)) {
    return null;
  }
  return Math.min(10, Math.max(0, value));
}

/**
 * Builds the examiner system prompt for the given language ('zh', 'ja', anything
 * else falls back to English) and appends the judgment anchor when one is set.
 */
function graderBuildSystemPrompt(
  language: unknown,
  questionText: unknown,
  keyPointsText: string,
  judgment: unknown,
): string {
  let prompt: string;
  if (language === 'zh') {
    prompt = `你是一位考官。请评分并给出反馈。
规则:
1. 只用中文。
2. 多轮追问时,用户回答含所有轮次(第N轮回答:标记),综合判断已覆盖内容。
问题:${questionText}
关键点:${keyPointsText}
评分标准:不要求深度,不要求使用特定术语,只看用户是否理解了概念。
用户理解核心概念就给分。即使没有使用关键点中的原词,只要意思到位就算覆盖。
例如关键点是"上下文窗口有限",用户说"信息太多超过AI处理长度"也是覆盖。
评分原则:往宽了给分,不确定时就给高分。明显正确就给8-10分,部分正确5-7分,完全不沾边才0-2分。
返回JSON
- score: 0-10
- feedback: 评语
- should_follow_up: true/false
- follow_up_question: 追问(仅true时需要,针对未覆盖的关键点,false时null)
请以 JSON 格式返回响应:
{"score":0到10,"feedback":"评语","should_follow_up":true或false,"follow_up_question":"追问或null"}
示例(需要追问):
{"score":6,"feedback":"提到了安全性和性能,未说明依赖关系。","should_follow_up":true,"follow_up_question":"你如何让AI在计划中明确任务依赖关系?"}
示例(不需追问):
{"score":8,"feedback":"回答完整。","should_follow_up":false,"follow_up_question":null}`;
  } else if (language === 'ja') {
    prompt = `あなたは試験官です。採点とフィードバックを提供してください。
ルール:
1. 日本語のみ使用。
2. 複数ラウンドの回答は「第N輪回答:」でマークされ、全ラウンドを総合判断。
質問:${questionText}
キーポイント:${keyPointsText}
評価基準:正確性、網羅性、深さ。
部分点可(5〜7点)、見当違いのみ0〜2点。
JSON形式:
- score: 0〜10
- feedback: 評価
- should_follow_up: true/false
- follow_up_question: 追質問(true時のみ、未カバーのポイントに焦点、false時null)
JSON 形式で回答してください:
{"score":0から10,"feedback":"評価","should_follow_up":trueかfalse,"follow_up_question":"追質問かnull"}
例(追質問が必要):
{"score":6,"feedback":"安全性と性能に言及したが、依存関係が不明。","should_follow_up":true,"follow_up_question":"AIに計画内のタスク依存関係を明示させる方法は?"}
例(不要):
{"score":8,"feedback":"回答は完全。","should_follow_up":false,"follow_up_question":null}`;
  } else {
    prompt = `You are an examiner. Grade and give feedback.
Rules:
1. English only.
2. Multi-round answers are tagged "第N轮回答:". Consider all rounds.
Question: ${questionText}
Key points: ${keyPointsText}
Criteria: accuracy, completeness, depth.
Give partial credit (5-7 for partial), 0-2 only for off-target.
Return JSON:
- score: 0-10
- feedback: text
- should_follow_up: true/false
- follow_up_question: question (only when true, target uncovered points, null when false)
Format as JSON:
{"score":0-10,"feedback":"...","should_follow_up":true|false,"follow_up_question":"question or null"}
Example (follow-up needed):
{"score":6,"feedback":"Covered security and performance, missed dependencies.","should_follow_up":true,"follow_up_question":"How would you make the AI clarify task dependencies?"}
Example (no follow-up):
{"score":8,"feedback":"Complete answer.","should_follow_up":false,"follow_up_question":null}`;
  }

  if (judgment) {
    if (language === 'zh') {
      prompt += `\n\n【判定依据(通过标准)】${judgment}`;
    } else if (language === 'ja') {
      prompt += `\n\n【判定基準(合格基準)】${judgment}`;
    } else {
      prompt += `\n\n【Judgment Criteria (Pass Standard)】${judgment}`;
    }
  }
  return prompt;
}

/**
 * Node responsible for grading the user's answer and deciding if a follow-up is needed.
 *
 * Grading strategy, in order:
 *  1. Multiple-choice questions with a `correctAnswer` are graded by
 *     case-insensitive comparison (10 or 0 points, never a follow-up).
 *  2. Questions whose `followupHints` parse to a mapping with a `branches`
 *     array are graded by keyword coverage without calling the model.
 *  3. Everything else is graded by the chat model, which must return JSON
 *     with `score`, `feedback`, `should_follow_up` and `follow_up_question`.
 *
 * Every grading return path merges the new score into `state.scores`, resets
 * `followUpCount` to 0 when moving on and increments it when staying on the
 * same question for a follow-up.
 *
 * @param state  Current evaluation state; the last message must be a
 *               `HumanMessage`, otherwise the node returns `{}`.
 * @param config Runnable config; `config.configurable.model` must provide
 *               the chat model used for LLM grading.
 * @returns A partial state update (feedback, scores, follow-up decision,
 *          next question index).
 * @throws Error when no model is present in the node configuration.
 */
export const graderNode = async (
  state: EvaluationState,
  config?: RunnableConfig,
): Promise<Partial<EvaluationState>> => {
  const { model } = (config?.configurable as any) || {};
  const { questions, currentQuestionIndex } = state;
  // The entry log already treats messages/questions as possibly missing, so guard here too.
  const messages = state.messages ?? [];
  const currentFollowUpCount = state.followUpCount || 0;

  console.log('[GraderNode] Entering node...', {
    currentIndex: currentQuestionIndex,
    numMessages: state.messages?.length,
    questionCount: state.questionCount,
    hasQuestions: !!questions?.length,
  });

  if (!model) {
    throw new Error('Missing model in node configuration');
  }

  const lastUserMessage = messages[messages.length - 1];
  console.log('[GraderNode] Incoming Messages Count:', messages.length);
  if (lastUserMessage) {
    console.log(
      '[GraderNode] Last Message Type:',
      lastUserMessage.constructor.name,
    );
    console.log(
      '[GraderNode] Last Message Content:',
      graderMessageText(lastUserMessage.content).substring(0, 50),
    );
  }

  // Only a fresh user answer can be graded.
  if (!(lastUserMessage instanceof HumanMessage)) {
    console.log(
      '[GraderNode] Last message is not HumanMessage, skipping grading.',
    );
    return {};
  }

  const isZh = state.language === 'zh';
  const isJa = state.language === 'ja';

  const currentQuestion = questions?.[currentQuestionIndex];
  if (!currentQuestion) {
    console.error(
      `[GraderNode] Question at index ${currentQuestionIndex} not found!`,
    );
    return { currentQuestionIndex: currentQuestionIndex + 1 };
  }

  const scoreKey = currentQuestion.id || currentQuestionIndex.toString();
  const userAnswerText = graderMessageText(lastUserMessage.content);

  // ── Multiple choice: exact (case-insensitive) match against the key ──
  const isChoice = currentQuestion.questionType === 'MULTIPLE_CHOICE';
  const expectedAnswer = currentQuestion.correctAnswer;
  if (isChoice && expectedAnswer) {
    const userAnswer = userAnswerText.trim();
    const isCorrect =
      userAnswer.toUpperCase() === String(expectedAnswer).trim().toUpperCase();
    console.log('[GraderNode] Choice grading:', {
      userAnswer,
      expectedAnswer,
      isCorrect,
    });
    // NOTE(review): feedback text is Chinese regardless of state.language.
    const feedback = isCorrect
      ? '✅ 正确'
      : `❌ 错误,正确答案是 ${expectedAnswer}`;
    const feedbackMessage = new AIMessage({
      content: `Score: ${isCorrect ? 10 : 0}\nFeedback: ${feedback}`,
    } as any);
    return {
      messages: [feedbackMessage],
      feedbackHistory: [feedbackMessage],
      scores: { ...state.scores, [scoreKey]: isCorrect ? 10 : 0 },
      shouldFollowUp: false,
      followUpCount: 0,
      currentQuestionIndex: currentQuestionIndex + 1,
    };
  }

  // ── Rule-based grading: use structured followupMapping if available ──
  if (currentQuestion.followupHints) {
    let mapping: any = null;
    if (typeof currentQuestion.followupHints === 'string') {
      try {
        mapping = JSON.parse(currentQuestion.followupHints);
      } catch (hintsError) {
        // Best effort: malformed hints fall through to LLM grading, but leave a trace.
        console.warn(
          '[GraderNode] followupHints is not valid JSON, falling back to LLM grading:',
          hintsError,
        );
      }
    } else if (typeof currentQuestion.followupHints === 'object') {
      mapping = currentQuestion.followupHints;
    }

    if (mapping && Array.isArray(mapping.branches)) {
      const answerLower = userAnswerText.toLowerCase();

      // Score based on keyword coverage
      let bestScore = mapping.defaultScore ?? 5;
      let matchedFollowup: string =
        typeof mapping.defaultFollowup === 'string'
          ? mapping.defaultFollowup
          : '';
      let matchedAll = true;
      const maxFollowUps = mapping.maxFollowups ?? 2;

      for (const branch of mapping.branches) {
        const kws: unknown[] = Array.isArray(branch?.keywords)
          ? branch.keywords
          : [];
        if (kws.length === 0) {
          continue;
        }
        const matchCount = kws.filter(
          (kw) =>
            typeof kw === 'string' && answerLower.includes(kw.toLowerCase()),
        ).length;
        if (matchCount >= kws.length * 0.5) {
          // A branch counts as matched when at least half of its keywords appear.
          const branchScore = branch.score ?? 7;
          if (branchScore > bestScore) bestScore = branchScore;
          if (typeof branch.followup === 'string' && branch.followup) {
            matchedFollowup = branch.followup;
          }
        } else if (matchCount === 0) {
          matchedAll = false;
        }
      }

      // completionThreshold is on a 0-100 scale, scores on 0-10.
      const completionThreshold = mapping.completionThreshold ?? 80;
      const tooShort = userAnswerText.trim().length < 8;
      const saysIDontKnow = graderSaysDontKnow(
        userAnswerText,
        GRADER_RULE_GIVE_UP_PHRASES,
      );

      let shouldFollowUp: boolean;
      if (saysIDontKnow || tooShort) {
        shouldFollowUp = false;
        bestScore = Math.min(bestScore, 2);
      } else if (bestScore >= completionThreshold / 10) {
        shouldFollowUp = false;
      } else if (currentFollowUpCount >= maxFollowUps) {
        shouldFollowUp = false;
      } else {
        // Same rule as the LLM path: never hold the user on this question
        // when there is no follow-up question to ask.
        shouldFollowUp = matchedFollowup.trim().length > 0;
      }

      // NOTE(review): the non-follow-up text always claims the key points were
      // covered, even when the score was capped or follow-ups ran out, and it
      // is Chinese regardless of state.language.
      const feedbackMessage = new AIMessage(
        `Score: ${bestScore}/10\n\nFeedback: ${shouldFollowUp ? matchedFollowup : '回答已覆盖关键点。'}`,
      );
      const feedbackHistoryMessages = shouldFollowUp
        ? [feedbackMessage, new AIMessage(matchedFollowup)]
        : [feedbackMessage];
      console.log('[GraderNode] Rule grading:', {
        score: bestScore,
        shouldFollowUp,
        matchedAll,
        followup: matchedFollowup.substring(0, 60),
      });
      return {
        feedbackHistory: feedbackHistoryMessages,
        scores: { ...state.scores, [scoreKey]: bestScore },
        shouldFollowUp,
        followUpCount: shouldFollowUp ? currentFollowUpCount + 1 : 0,
        currentQuestionIndex: shouldFollowUp
          ? currentQuestionIndex
          : currentQuestionIndex + 1,
      } as any;
    }
  }

  // ── LLM grading ──
  const rawKeyPoints: unknown = currentQuestion.keyPoints;
  const keyPointsText = Array.isArray(rawKeyPoints)
    ? rawKeyPoints.join(', ')
    : typeof rawKeyPoints === 'string'
      ? rawKeyPoints
      : '';
  const systemPrompt = graderBuildSystemPrompt(
    state.language,
    currentQuestion.questionText,
    keyPointsText,
    currentQuestion.judgment,
  );

  const maxFollowUps = (currentQuestion as any).maxFollowUps ?? 2;

  // During follow-ups the model sees every answer given for this question,
  // tagged exactly the way the system prompt announces ("第N轮回答:").
  let allAnswers = userAnswerText;
  if (currentFollowUpCount > 0) {
    const roundSuffix = isJa ? '輪回答:' : '轮回答:';
    allAnswers = messages
      .filter((m) => m instanceof HumanMessage)
      .slice(-(currentFollowUpCount + 1))
      .map((m, i) => `第${i + 1}${roundSuffix}${graderMessageText(m.content)}`)
      .join('\n\n');
  }

  console.log('[GraderNode] === START GRADING ===');
  console.log('[GraderNode] User answer length:', userAnswerText.length);
  console.log(
    '[GraderNode] Question:',
    currentQuestion?.questionText?.substring(0, 100),
  );
  console.log('[GraderNode] Target dimension:', currentQuestion?.dimension);

  const scoreLabel = isZh ? '得分' : isJa ? 'スコア' : 'Score';
  const feedbackLabel = isZh ? '反馈' : isJa ? 'フィードバック' : 'Feedback';

  try {
    const response = await model.invoke([
      new SystemMessage(systemPrompt),
      new HumanMessage(allAnswers),
    ]);
    console.log('[GraderNode] LLM invoke completed');

    try {
      const rawContent = graderMessageText(response.content);
      console.log('[GraderNode] Raw AI response length:', rawContent.length);
      console.log(
        '[GraderNode] Raw AI response:',
        rawContent.substring(0, 800),
      );

      const result = safeParseJson<any>(rawContent);
      if (!result) {
        console.error(
          '[GraderNode] Failed to parse JSON. Raw content:',
          rawContent,
        );
        throw new Error('Invalid JSON format from AI');
      }

      // A missing or non-numeric score is treated like an unparsable reply.
      const score = graderNormalizeScore(result.score);
      if (score === null) {
        console.error('[GraderNode] Invalid score from AI:', result.score);
        throw new Error('Invalid score from AI');
      }
      const feedbackText =
        typeof result.feedback === 'string'
          ? result.feedback
          : String(result.feedback ?? '');

      console.log('[GraderNode] === GRADING RESULT ===');
      console.log(
        '[GraderNode] Parsed result:',
        JSON.stringify(result, null, 2),
      );
      console.log('[GraderNode] Score value:', result.score);
      console.log(
        '[GraderNode] Feedback value:',
        feedbackText.substring(0, 200),
      );

      const newScores = {
        ...state.scores,
        [scoreKey]: score,
      };

      const saysIDontKnow = graderSaysDontKnow(
        userAnswerText,
        GRADER_LLM_GIVE_UP_PHRASES,
      );

      let shouldFollowUp = result.should_follow_up === true;
      if (currentFollowUpCount >= maxFollowUps || score >= 8 || saysIDontKnow) {
        shouldFollowUp = false;
      }

      // The prompt's format line can lead the model to answer with the literal
      // string "null"; treat that, and any non-string value, as "no question".
      const followUpQuestion =
        typeof result.follow_up_question === 'string' &&
        result.follow_up_question.trim().toLowerCase() !== 'null'
          ? result.follow_up_question.trim()
          : '';

      let followupHintMsg: AIMessage | null = null;
      if (shouldFollowUp && followUpQuestion) {
        followupHintMsg = new AIMessage(followUpQuestion);
      } else if (shouldFollowUp) {
        // A follow-up without a question would leave the user stuck.
        shouldFollowUp = false;
      }

      const feedbackMessage = new AIMessage(
        `${scoreLabel}: ${score}/10\n\n${feedbackLabel}: ${feedbackText}`,
      );

      console.log('[GraderNode] Final State decision:', {
        shouldFollowUp,
        nextIndex: shouldFollowUp
          ? currentQuestionIndex
          : currentQuestionIndex + 1,
        score,
        saysIDontKnow,
      });

      const feedbackHistoryMessages = followupHintMsg
        ? [feedbackMessage, followupHintMsg]
        : [feedbackMessage];

      return {
        feedbackHistory: feedbackHistoryMessages,
        scores: newScores,
        shouldFollowUp: shouldFollowUp,
        followUpCount: shouldFollowUp ? currentFollowUpCount + 1 : 0,
        currentQuestionIndex: shouldFollowUp
          ? currentQuestionIndex
          : currentQuestionIndex + 1,
      } as any;
    } catch (parseError) {
      // The model replied, but the reply could not be used: neutral default score.
      console.error('[GraderNode] Failed to parse grade:', parseError);
      // NOTE(review): message body is Chinese regardless of state.language.
      const fallbackMsg = new AIMessage(
        `${scoreLabel}: 5/10\n\n评分解析失败,默认给5分。`,
      );
      return {
        feedbackHistory: [fallbackMsg],
        scores: { ...state.scores, [scoreKey]: 5 },
        shouldFollowUp: false,
        followUpCount: 0,
        currentQuestionIndex: currentQuestionIndex + 1,
      } as any;
    }
  } catch (error) {
    // The model call itself failed: neutral default score, move on.
    console.error('[GraderNode] LLM grading failed:', error);
    // NOTE(review): message body is Chinese regardless of state.language.
    const fallbackMsg = new AIMessage(
      `${scoreLabel}: 5/10\n\n${feedbackLabel}: 评分服务暂时不可用,默认给5分。`,
    );
    return {
      feedbackHistory: [fallbackMsg],
      scores: { ...state.scores, [scoreKey]: 5 },
      shouldFollowUp: false,
      followUpCount: 0,
      currentQuestionIndex: currentQuestionIndex + 1,
    } as any;
  }
};