389 lines
15 KiB
TypeScript
389 lines
15 KiB
TypeScript
import { ChatOpenAI } from '@langchain/openai';
|
||
import {
|
||
SystemMessage,
|
||
HumanMessage,
|
||
AIMessage,
|
||
} from '@langchain/core/messages';
|
||
import { RunnableConfig } from '@langchain/core/runnables';
|
||
import { EvaluationState } from '../state';
|
||
import { safeParseJson } from '../../../common/json-utils';
|
||
|
||
/** Languages the grader can phrase its output in; anything else is treated as English. */
type GraderLanguage = 'zh' | 'ja' | 'en';

/** Structural view of the chat model handed over via `config.configurable.model`. */
interface GraderModel {
  invoke(messages: unknown[]): Promise<{ content: unknown } | null | undefined>;
}

/** Only the `content` field of a conversation message is read by this node. */
interface GraderMessageLike {
  content: unknown;
}

/** One keyword branch of a structured `followupHints` mapping. */
interface GraderFollowupBranch {
  keywords?: unknown;
  score?: number;
  followup?: unknown;
}

/** Structured `followupHints` payload that enables rule-based grading. */
interface GraderFollowupMapping {
  branches: Array<GraderFollowupBranch | null | undefined>;
  defaultScore?: number;
  defaultFollowup?: unknown;
  maxFollowups?: number;
  completionThreshold?: number;
}

/** Outcome of rule-based (keyword coverage) grading. */
interface GraderRuleOutcome {
  score: number;
  shouldFollowUp: boolean;
  completed: boolean;
  matchedAll: boolean;
  followup: string;
}

/** User-facing strings per language. The `zh` entries are the strings this node has always emitted. */
const GRADER_TEXT: Record<
  GraderLanguage,
  {
    scoreLabel: string;
    feedbackLabel: string;
    choiceCorrect: string;
    choiceWrong: (answer: string) => string;
    ruleCovered: string;
    ruleNotCovered: string;
    parseFailed: string;
    serviceUnavailable: string;
  }
> = {
  zh: {
    scoreLabel: '得分',
    feedbackLabel: '反馈',
    choiceCorrect: '✅ 正确',
    choiceWrong: (answer) => `❌ 错误,正确答案是 ${answer}`,
    ruleCovered: '回答已覆盖关键点。',
    ruleNotCovered: '回答未覆盖全部关键点。',
    parseFailed: '评分解析失败,默认给5分。',
    serviceUnavailable: '评分服务暂时不可用,默认给5分。',
  },
  ja: {
    scoreLabel: 'スコア',
    feedbackLabel: 'フィードバック',
    choiceCorrect: '✅ 正解',
    choiceWrong: (answer) => `❌ 不正解。正解は ${answer} です`,
    ruleCovered: '回答は要点をカバーしています。',
    ruleNotCovered: '回答は要点を十分にカバーしていません。',
    parseFailed: '採点結果の解析に失敗したため、デフォルトで5点とします。',
    serviceUnavailable: '採点サービスが一時的に利用できないため、デフォルトで5点とします。',
  },
  en: {
    scoreLabel: 'Score',
    feedbackLabel: 'Feedback',
    choiceCorrect: '✅ Correct',
    choiceWrong: (answer) => `❌ Incorrect. The correct answer is ${answer}`,
    ruleCovered: 'The answer covers the key points.',
    ruleNotCovered: 'The answer does not cover all key points.',
    parseFailed: 'Failed to parse the grading result; defaulting to 5 points.',
    serviceUnavailable: 'The grading service is temporarily unavailable; defaulting to 5 points.',
  },
};

/** Score assigned when the LLM call or the parsing of its reply fails. */
const GRADER_FALLBACK_SCORE = 5;
/** LLM scores at or above this value never trigger a follow-up. */
const GRADER_NO_FOLLOW_UP_SCORE = 8;
/** Rule-based answers shorter than this (after trimming) are treated as non-answers. */
const GRADER_MIN_ANSWER_LENGTH = 8;

const GRADER_DONT_KNOW_CJK = [
  '不知道',
  '不会',
  '不知',
  'わかりません',
  'わからん',
  '知らない',
  '不明',
  'わからない',
];
const GRADER_DONT_KNOW_LATIN = ["don't know", 'no idea'];
/** Exclusive length limits: a longer answer is assumed to carry real content besides the marker. */
const GRADER_DONT_KNOW_CJK_LIMIT = 10;
// "don't know" alone is 10 characters, so the CJK limit can never match English answers.
const GRADER_DONT_KNOW_LATIN_LIMIT = 24;

/** Maps `state.language` to a supported language, defaulting to English. */
const resolveGraderLanguage = (language: unknown): GraderLanguage =>
  language === 'zh' ? 'zh' : language === 'ja' ? 'ja' : 'en';

/** Returns string content unchanged and JSON-encodes anything else (never throws on `undefined`). */
const graderContentToText = (content: unknown): string =>
  typeof content === 'string' ? content : (JSON.stringify(content) ?? '');

/** Flattens a model reply to text: strings pass through, arrays contribute their string items and `text` fields. */
const extractGraderResponseText = (content: unknown): string => {
  if (!Array.isArray(content)) return graderContentToText(content);
  return content
    .map((part: unknown) => {
      if (typeof part === 'string') return part;
      if (part !== null && typeof part === 'object' && 'text' in part) {
        const { text } = part as { text?: unknown };
        return typeof text === 'string' ? text : '';
      }
      return '';
    })
    .join('');
};

/** True when a short answer is essentially "I don't know" in Chinese, Japanese or English. */
const isGraderDontKnowAnswer = (answer: string): boolean => {
  // Fold typographic apostrophes so "don’t know" matches as well.
  const normalized = answer.trim().toLowerCase().replace(/[\u2018\u2019]/g, "'");
  if (
    normalized.length < GRADER_DONT_KNOW_CJK_LIMIT &&
    GRADER_DONT_KNOW_CJK.some((marker) => normalized.includes(marker))
  ) {
    return true;
  }
  return (
    normalized.length < GRADER_DONT_KNOW_LATIN_LIMIT &&
    GRADER_DONT_KNOW_LATIN.some((marker) => normalized.includes(marker))
  );
};

/** Coerces an LLM-provided score to a number clamped to 0..10; returns null when it is not numeric. */
const normalizeGraderScore = (raw: unknown): number | null => {
  let value: unknown = raw;
  if (typeof raw === 'string') {
    const trimmed = raw.trim();
    if (trimmed === '') return null;
    value = Number(trimmed);
  }
  if (typeof value !== 'number' || !Number.isFinite(value)) return null;
  return Math.min(10, Math.max(0, value));
};

/** Reads `followupHints` (object or JSON string) and returns it only if it has a `branches` array. */
const parseGraderFollowupMapping = (hints: unknown): GraderFollowupMapping | null => {
  let candidate: unknown = hints;
  if (typeof hints === 'string') {
    try {
      candidate = JSON.parse(hints);
    } catch {
      // Best effort: non-JSON hints simply mean rule-based grading is not available.
      console.log('[GraderNode] followupHints is not JSON, skipping rule-based grading.');
      return null;
    }
  }
  if (
    candidate !== null &&
    typeof candidate === 'object' &&
    Array.isArray((candidate as { branches?: unknown }).branches)
  ) {
    return candidate as GraderFollowupMapping;
  }
  return null;
};

/** Scores an answer by keyword coverage of the mapping's branches and decides whether to follow up. */
const gradeWithFollowupMapping = (
  mapping: GraderFollowupMapping,
  answerText: string,
  followUpCount: number,
): GraderRuleOutcome => {
  const loweredAnswer = answerText.toLowerCase();
  let score = mapping.defaultScore ?? 5;
  let followup = typeof mapping.defaultFollowup === 'string' ? mapping.defaultFollowup : '';
  let matchedAll = true;

  for (const branch of mapping.branches) {
    if (!branch) continue;
    const rawKeywords = branch.keywords;
    const keywords: string[] = Array.isArray(rawKeywords)
      ? rawKeywords.filter((kw): kw is string => typeof kw === 'string')
      : [];
    if (keywords.length === 0) continue;

    const hits = keywords.filter((kw) => loweredAnswer.includes(kw.toLowerCase())).length;
    if (hits >= keywords.length * 0.5) {
      // A branch counts as matched once at least half of its keywords appear.
      const branchScore = branch.score ?? 7;
      if (branchScore > score) score = branchScore;
      // The last matched branch that carries a follow-up wins, regardless of its score.
      if (typeof branch.followup === 'string' && branch.followup) followup = branch.followup;
    } else if (hits === 0) {
      matchedAll = false;
    }
  }

  const maxFollowUps = mapping.maxFollowups ?? 2;
  // The threshold is on a 0..100 scale while scores are on 0..10.
  const completionThreshold = mapping.completionThreshold ?? 80;
  const gaveUp =
    answerText.trim().length < GRADER_MIN_ANSWER_LENGTH || isGraderDontKnowAnswer(answerText);
  if (gaveUp) score = Math.min(score, 2);

  const completed = !gaveUp && score >= completionThreshold / 10;
  const shouldFollowUp = !gaveUp && !completed && followUpCount < maxFollowUps;
  return { score, shouldFollowUp, completed, matchedAll, followup };
};

/** Builds the examiner system prompt for one language, appending the judgment criteria when present. */
const buildGraderSystemPrompt = (
  language: GraderLanguage,
  questionText: string,
  keyPoints: readonly string[],
  judgment?: string | null,
): string => {
  const keyPointList = keyPoints.join(', ');
  let prompt: string;
  let judgmentHeading: string;

  switch (language) {
    case 'zh':
      judgmentHeading = '【判定依据(通过标准)】';
      prompt = `你是一位考官。请评分并给出反馈。

规则:
1. 只用中文。
2. 多轮追问时,用户回答含所有轮次(第N轮回答:标记),综合判断已覆盖内容。

问题:${questionText}
关键点:${keyPointList}

评分标准:不要求深度,不要求使用特定术语,只看用户是否理解了概念。
用户理解核心概念就给分。即使没有使用关键点中的原词,只要意思到位就算覆盖。
例如关键点是"上下文窗口有限",用户说"信息太多超过AI处理长度"也是覆盖。
评分原则:往宽了给分,不确定时就给高分。明显正确就给8-10分,部分正确5-7分,完全不沾边才0-2分。

返回JSON:
- score: 0-10
- feedback: 评语
- should_follow_up: true/false
- follow_up_question: 追问(仅true时需要,针对未覆盖的关键点,false时null)

请以 JSON 格式返回响应:
{"score":0到10,"feedback":"评语","should_follow_up":true或false,"follow_up_question":"追问或null"}

示例(需要追问):
{"score":6,"feedback":"提到了安全性和性能,未说明依赖关系。","should_follow_up":true,"follow_up_question":"你如何让AI在计划中明确任务依赖关系?"}

示例(不需追问):
{"score":8,"feedback":"回答完整。","should_follow_up":false,"follow_up_question":null}`;
      break;
    case 'ja':
      judgmentHeading = '【判定基準(合格基準)】';
      prompt = `あなたは試験官です。採点とフィードバックを提供してください。

ルール:
1. 日本語のみ使用。
2. 複数ラウンドの回答は「第N輪回答:」でマークされ、全ラウンドを総合判断。

質問:${questionText}
キーポイント:${keyPointList}

評価基準:正確性、網羅性、深さ。
部分点可(5〜7点)、見当違いのみ0〜2点。

JSON形式:
- score: 0〜10
- feedback: 評価
- should_follow_up: true/false
- follow_up_question: 追質問(true時のみ、未カバーのポイントに焦点、false時null)

JSON 形式で回答してください:
{"score":0から10,"feedback":"評価","should_follow_up":trueかfalse,"follow_up_question":"追質問かnull"}

例(追質問が必要):
{"score":6,"feedback":"安全性と性能に言及したが、依存関係が不明。","should_follow_up":true,"follow_up_question":"AIに計画内のタスク依存関係を明示させる方法は?"}

例(不要):
{"score":8,"feedback":"回答は完全。","should_follow_up":false,"follow_up_question":null}`;
      break;
    default:
      judgmentHeading = '【Judgment Criteria (Pass Standard)】';
      prompt = `You are an examiner. Grade and give feedback.

Rules:
1. English only.
2. Multi-round answers are tagged "第N轮回答:". Consider all rounds.

Question: ${questionText}
Key points: ${keyPointList}

Criteria: accuracy, completeness, depth.
Give partial credit (5-7 for partial), 0-2 only for off-target.

Return JSON:
- score: 0-10
- feedback: text
- should_follow_up: true/false
- follow_up_question: question (only when true, target uncovered points, null when false)

Format as JSON:
{"score":0-10,"feedback":"...","should_follow_up":true|false,"follow_up_question":"question or null"}

Example (follow-up needed):
{"score":6,"feedback":"Covered security and performance, missed dependencies.","should_follow_up":true,"follow_up_question":"How would you make the AI clarify task dependencies?"}

Example (no follow-up):
{"score":8,"feedback":"Complete answer.","should_follow_up":false,"follow_up_question":null}`;
      break;
  }

  return judgment ? `${prompt}\n\n${judgmentHeading}${judgment}` : prompt;
};

/**
 * Node responsible for grading the user's answer and deciding if a follow-up is needed.
 *
 * Grading strategy, in order:
 * 1. Multiple-choice questions with a `correctAnswer` are graded by case-insensitive comparison (10 or 0).
 * 2. Questions whose `followupHints` contain a `branches` array are graded by keyword coverage.
 * 3. Everything else is graded by the chat model, which must reply with a JSON verdict.
 *
 * If the last message is not a `HumanMessage` the node returns an empty update. If the current
 * question is missing it only advances `currentQuestionIndex`. A failed model call or an
 * unusable model reply yields a default score of 5 and moves on to the next question.
 *
 * @param state - Current evaluation state (questions, messages, scores, follow-up counter, language).
 * @param config - Runnable config; `config.configurable.model` must hold the chat model.
 * @returns Partial state update with feedback messages, merged scores and follow-up bookkeeping.
 * @throws Error when no model is present in `config.configurable`.
 */
export const graderNode = async (
  state: EvaluationState,
  config?: RunnableConfig,
): Promise<Partial<EvaluationState>> => {
  const { model } = (config?.configurable ?? {}) as { model?: GraderModel };
  const { currentQuestionIndex } = state;
  const questions = state.questions ?? [];
  const messages: GraderMessageLike[] = state.messages ?? [];
  const currentFollowUpCount = state.followUpCount ?? 0;

  console.log('[GraderNode] Entering node...', {
    currentIndex: currentQuestionIndex,
    numMessages: messages.length,
    questionCount: state.questionCount,
    hasQuestions: questions.length > 0,
  });

  if (!model) {
    throw new Error('Missing model in node configuration');
  }

  const lastUserMessage = messages[messages.length - 1];

  console.log('[GraderNode] Incoming Messages Count:', messages.length);
  if (lastUserMessage) {
    console.log('[GraderNode] Last Message Type:', lastUserMessage.constructor.name);
    console.log(
      '[GraderNode] Last Message Content:',
      graderContentToText(lastUserMessage.content).substring(0, 50),
    );
  }

  if (!(lastUserMessage instanceof HumanMessage)) {
    console.log('[GraderNode] Last message is not HumanMessage, skipping grading.');
    return {};
  }

  const language = resolveGraderLanguage(state.language);
  const text = GRADER_TEXT[language];

  const currentQuestion = questions[currentQuestionIndex];
  if (!currentQuestion) {
    console.error(`[GraderNode] Question at index ${currentQuestionIndex} not found!`);
    return { currentQuestionIndex: currentQuestionIndex + 1 };
  }

  const scoreKey = currentQuestion.id || currentQuestionIndex.toString();
  const answerText = graderContentToText(lastUserMessage.content);

  // Every grading path produces the same shape of update; earlier scores are always preserved.
  const finish = (
    history: AIMessage[],
    score: number,
    followUp: boolean,
  ): Partial<EvaluationState> => ({
    feedbackHistory: history,
    scores: { ...state.scores, [scoreKey]: score },
    shouldFollowUp: followUp,
    followUpCount: followUp ? currentFollowUpCount + 1 : 0,
    currentQuestionIndex: followUp ? currentQuestionIndex : currentQuestionIndex + 1,
  });

  // ── Multiple choice: exact (case-insensitive) comparison, never a follow-up ──
  const expectedAnswer = currentQuestion.correctAnswer;
  if (currentQuestion.questionType === 'MULTIPLE_CHOICE' && expectedAnswer) {
    const userAnswer = answerText.trim();
    const isCorrect = userAnswer.toUpperCase() === String(expectedAnswer).trim().toUpperCase();

    console.log('[GraderNode] Choice grading:', { userAnswer, expectedAnswer, isCorrect });

    const choiceScore = isCorrect ? 10 : 0;
    const choiceFeedback = isCorrect ? text.choiceCorrect : text.choiceWrong(String(expectedAnswer));
    const feedbackMessage = new AIMessage(`Score: ${choiceScore}\nFeedback: ${choiceFeedback}`);

    return {
      messages: [feedbackMessage],
      ...finish([feedbackMessage], choiceScore, false),
    };
  }

  // ── Rule-based grading: use structured followupMapping if available ──
  const mapping = currentQuestion.followupHints
    ? parseGraderFollowupMapping(currentQuestion.followupHints)
    : null;
  if (mapping) {
    const outcome = gradeWithFollowupMapping(mapping, answerText, currentFollowUpCount);
    // Only claim the key points are covered when the completion threshold was actually reached.
    const closingText = outcome.completed ? text.ruleCovered : text.ruleNotCovered;
    const feedbackMessage = new AIMessage(
      `Score: ${outcome.score}/10\n\nFeedback: ${outcome.shouldFollowUp ? outcome.followup : closingText}`,
    );
    const history =
      outcome.shouldFollowUp && outcome.followup
        ? [feedbackMessage, new AIMessage(outcome.followup)]
        : [feedbackMessage];

    console.log('[GraderNode] Rule grading:', {
      score: outcome.score,
      shouldFollowUp: outcome.shouldFollowUp,
      matchedAll: outcome.matchedAll,
      followup: outcome.followup.substring(0, 60),
    });

    return finish(history, outcome.score, outcome.shouldFollowUp);
  }

  // ── LLM grading ──
  const keyPoints: string[] = currentQuestion.keyPoints ?? [];
  const systemPrompt = buildGraderSystemPrompt(
    language,
    currentQuestion.questionText,
    keyPoints,
    currentQuestion.judgment,
  );
  const maxFollowUps = (currentQuestion as { maxFollowUps?: number }).maxFollowUps ?? 2;

  // During follow-up rounds the model sees every answer given for this question, tagged per round.
  let allAnswers = answerText;
  if (currentFollowUpCount > 0) {
    allAnswers = messages
      .filter((m) => m instanceof HumanMessage)
      .slice(-(currentFollowUpCount + 1))
      .map((m, i) => `第${i + 1}轮回答:${graderContentToText(m.content)}`)
      .join('\n\n');
  }

  console.log('[GraderNode] === START GRADING ===');
  console.log('[GraderNode] User answer length:', answerText.length);
  console.log('[GraderNode] Question:', currentQuestion?.questionText?.substring(0, 100));
  console.log('[GraderNode] Target dimension:', currentQuestion?.dimension);

  let rawContent: string;
  try {
    const response = await model.invoke([
      new SystemMessage(systemPrompt),
      new HumanMessage(allAnswers),
    ]);
    console.log('[GraderNode] LLM invoke completed');
    rawContent = extractGraderResponseText(response?.content);
  } catch (error) {
    console.error('[GraderNode] LLM grading failed:', error);
    const fallbackMsg = new AIMessage(
      `${text.scoreLabel}: ${GRADER_FALLBACK_SCORE}/10\n\n${text.feedbackLabel}: ${text.serviceUnavailable}`,
    );
    return finish([fallbackMsg], GRADER_FALLBACK_SCORE, false);
  }

  try {
    console.log('[GraderNode] Raw AI response length:', rawContent.length);
    console.log('[GraderNode] Raw AI response:', rawContent.substring(0, 800));

    const result = safeParseJson<Record<string, unknown>>(rawContent);
    if (!result) {
      console.error('[GraderNode] Failed to parse JSON. Raw content:', rawContent);
      throw new Error('Invalid JSON format from AI');
    }

    const score = normalizeGraderScore(result.score);
    if (score === null) {
      throw new Error('Invalid or missing score from AI');
    }
    const feedback = typeof result.feedback === 'string' ? result.feedback : '';
    const rawQuestion =
      typeof result.follow_up_question === 'string' ? result.follow_up_question.trim() : '';
    // Models sometimes echo the literal string "null" from the format description.
    const followUpQuestion = rawQuestion.toLowerCase() === 'null' ? '' : rawQuestion;

    console.log('[GraderNode] === GRADING RESULT ===');
    console.log('[GraderNode] Parsed result:', JSON.stringify(result, null, 2));
    console.log('[GraderNode] Score value:', score);
    console.log('[GraderNode] Feedback value:', feedback.substring(0, 200));

    const saysIDontKnow = isGraderDontKnowAnswer(answerText);
    // Follow up only if the model asked for it AND supplied a question AND none of the stop rules apply.
    const shouldFollowUp =
      result.should_follow_up === true &&
      followUpQuestion !== '' &&
      currentFollowUpCount < maxFollowUps &&
      score < GRADER_NO_FOLLOW_UP_SCORE &&
      !saysIDontKnow;

    const feedbackMessage = new AIMessage(
      `${text.scoreLabel}: ${score}/10\n\n${text.feedbackLabel}: ${feedback}`,
    );

    console.log('[GraderNode] Final State decision:', {
      shouldFollowUp,
      nextIndex: shouldFollowUp ? currentQuestionIndex : currentQuestionIndex + 1,
      score,
      saysIDontKnow,
    });

    const history = shouldFollowUp
      ? [feedbackMessage, new AIMessage(followUpQuestion)]
      : [feedbackMessage];
    return finish(history, score, shouldFollowUp);
  } catch (parseError) {
    console.error('[GraderNode] Failed to parse grade:', parseError);
    const fallbackMsg = new AIMessage(
      `${text.scoreLabel}: ${GRADER_FALLBACK_SCORE}/10\n\n${text.feedbackLabel}: ${text.parseFailed}`,
    );
    return finish([fallbackMsg], GRADER_FALLBACK_SCORE, false);
  }
};
|