// aurak/server/src/assessment/graph/nodes/grader.node.ts

import { ChatOpenAI } from '@langchain/openai';
import {
  SystemMessage,
  HumanMessage,
  AIMessage,
} from '@langchain/core/messages';
import { RunnableConfig } from '@langchain/core/runnables';
import { EvaluationState } from '../state';
import { safeParseJson } from '../../../common/json-utils';

/** Highest score a question can receive. */
const MAX_SCORE = 10;
/** Score recorded when the model call fails or its output is unusable. */
const FALLBACK_SCORE = 5;
/** Follow-up limit used when neither the question nor its mapping sets one. */
const DEFAULT_MAX_FOLLOW_UPS = 2;
/** Model scores at or above this value end the follow-up loop. */
const MODEL_PASS_SCORE = 8;
/** Rule-graded answers shorter than this are treated as non-answers. */
const MIN_RULE_ANSWER_LENGTH = 8;
/** Answers shorter than this that contain a give-up phrase count as giving up. */
const GIVE_UP_BASE_LIMIT = 10;
/** Extra characters tolerated around a give-up phrase (e.g. "I ", "sorry"). */
const GIVE_UP_SLACK = 8;

/** Lower-case phrases that signal the learner has no answer. */
const GIVE_UP_PHRASES: readonly string[] = [
  '不知道',
  '不会',
  "don't know",
  'no idea',
  '不知',
  'わかりません',
  'わからん',
  '知らない',
  '不明',
  'わからない',
];

type Language = 'zh' | 'ja' | 'en';
type Question = NonNullable<EvaluationState['questions']>[number];

/** Minimal structural view of the chat model this node needs. */
interface GraderModel {
  invoke(
    messages: Array<SystemMessage | HumanMessage>,
  ): Promise<{ content: unknown }>;
}

/** User-facing strings emitted by the grader, per language. */
interface Phrases {
  scoreLabel: string;
  feedbackLabel: string;
  choiceCorrect: string;
  choiceIncorrect: (expected: string) => string;
  keyPointsCovered: string;
  keyPointsMissing: string;
  parseFailed: string;
  serviceUnavailable: string;
}

const PHRASES: Record<Language, Phrases> = {
  zh: {
    scoreLabel: '得分',
    feedbackLabel: '反馈',
    choiceCorrect: '✅ 正确',
    choiceIncorrect: (expected) => `❌ 错误，正确答案是 ${expected}`,
    keyPointsCovered: '回答已覆盖关键点。',
    keyPointsMissing: '回答未覆盖关键点。',
    parseFailed: '评分解析失败，默认给5分。',
    serviceUnavailable: '评分服务暂时不可用，默认给5分。',
  },
  ja: {
    scoreLabel: 'スコア',
    feedbackLabel: 'フィードバック',
    choiceCorrect: '✅ 正解',
    choiceIncorrect: (expected) => `❌ 不正解。正解は ${expected} です`,
    keyPointsCovered: '回答は要点を網羅しています。',
    keyPointsMissing: '回答は要点を網羅していません。',
    parseFailed: '採点結果の解析に失敗したため、デフォルトで5点としました。',
    serviceUnavailable:
      '採点サービスが一時的に利用できないため、デフォルトで5点としました。',
  },
  en: {
    scoreLabel: 'Score',
    feedbackLabel: 'Feedback',
    choiceCorrect: '✅ Correct',
    choiceIncorrect: (expected) =>
      `❌ Incorrect. The correct answer is ${expected}`,
    keyPointsCovered: 'Your answer covers the key points.',
    keyPointsMissing: 'Your answer does not cover the key points.',
    parseFailed:
      'Failed to parse the grading result; a default score of 5 was given.',
    serviceUnavailable:
      'The grading service is temporarily unavailable; a default score of 5 was given.',
  },
};

/** Everything the individual grading strategies need about the current turn. */
interface GradingContext {
  state: EvaluationState;
  question: Question;
  questionIndex: number;
  followUpCount: number;
  language: Language;
  /** Text of the latest learner message. */
  answer: string;
}

/** Reads `key` from `source` when it is a non-null object, else `undefined`. */
function readField(source: unknown, key: string): unknown {
  return typeof source === 'object' && source !== null
    ? Reflect.get(source, key)
    : undefined;
}

/** Converts a number or numeric string to a finite number, else `null`. */
function toFiniteNumber(raw: unknown): number | null {
  const value = typeof raw === 'string' ? Number.parseFloat(raw) : raw;
  return typeof value === 'number' && Number.isFinite(value) ? value : null;
}

/**
 * Flattens message content to text. Strings pass through; content-block
 * arrays yield their concatenated text parts; anything else is JSON-encoded.
 */
function messageText(content: unknown): string {
  if (typeof content === 'string') {
    return content;
  }
  if (Array.isArray(content)) {
    const textParts = content
      .map((part) => (typeof part === 'string' ? part : readField(part, 'text')))
      .filter((text): text is string => typeof text === 'string');
    if (textParts.length > 0) {
      return textParts.join('');
    }
  }
  return JSON.stringify(content) ?? '';
}

/**
 * True when the answer is essentially just a give-up phrase. The length limit
 * scales with the phrase so that "I don't know" (longer than the base limit)
 * is still recognised.
 */
function isGiveUpAnswer(answer: string): boolean {
  const normalized = answer.trim().toLowerCase().replace(/[’‘]/g, "'");
  return GIVE_UP_PHRASES.some(
    (phrase) =>
      normalized.includes(phrase) &&
      normalized.length <
        Math.max(GIVE_UP_BASE_LIMIT, phrase.length + GIVE_UP_SLACK),
  );
}

/** Key under which the current question's score is stored. */
function scoreKey(ctx: GradingContext): string {
  return ctx.question.id || ctx.questionIndex.toString();
}

/**
 * Builds the state update shared by every grading strategy. Existing scores
 * are always carried over so no strategy can drop earlier results.
 */
function buildUpdate(
  ctx: GradingContext,
  score: number,
  feedbackHistory: AIMessage[],
  shouldFollowUp: boolean,
): Partial<EvaluationState> {
  return {
    feedbackHistory,
    scores: { ...ctx.state.scores, [scoreKey(ctx)]: score },
    shouldFollowUp,
    followUpCount: shouldFollowUp ? ctx.followUpCount + 1 : 0,
    currentQuestionIndex: shouldFollowUp
      ? ctx.questionIndex
      : ctx.questionIndex + 1,
  };
}

/** Grades a multiple-choice answer by case-insensitive exact match. */
function gradeChoice(
  ctx: GradingContext,
  expectedAnswer: string,
): Partial<EvaluationState> {
  const phrases = PHRASES[ctx.language];
  const userAnswer = ctx.answer.trim();
  const isCorrect =
    userAnswer.toUpperCase() === expectedAnswer.trim().toUpperCase();
  const score = isCorrect ? MAX_SCORE : 0;

  console.log('[GraderNode] Choice grading:', {
    userAnswer,
    expectedAnswer,
    isCorrect,
  });

  const feedback = isCorrect
    ? phrases.choiceCorrect
    : phrases.choiceIncorrect(expectedAnswer);
  const feedbackMessage = new AIMessage(
    `Score: ${score}\nFeedback: ${feedback}`,
  );

  return {
    ...buildUpdate(ctx, score, [feedbackMessage], false),
    messages: [feedbackMessage],
  };
}

/**
 * Extracts a usable follow-up mapping from `followupHints`, which may be an
 * object or a JSON string. Returns `null` when there is no `branches` array.
 */
function parseFollowupMapping(
  hints: unknown,
): { mapping: object; branches: unknown[] } | null {
  let candidate: unknown = hints;
  if (typeof hints === 'string') {
    try {
      candidate = JSON.parse(hints);
    } catch (error) {
      // Malformed hints are not fatal: fall back to model grading.
      console.warn('[GraderNode] followupHints is not valid JSON:', error);
      return null;
    }
  }
  if (typeof candidate !== 'object' || candidate === null) {
    return null;
  }
  const branches: unknown = Reflect.get(candidate, 'branches');
  return Array.isArray(branches) ? { mapping: candidate, branches } : null;
}

/**
 * Grades by keyword coverage: a branch matches when at least half of its
 * keywords occur in the answer; the best matching branch score wins.
 */
function gradeByRules(
  ctx: GradingContext,
  mapping: object,
  branches: unknown[],
): Partial<EvaluationState> {
  const phrases = PHRASES[ctx.language];
  const answerLower = ctx.answer.toLowerCase();

  let bestScore = toFiniteNumber(readField(mapping, 'defaultScore')) ?? 5;
  const defaultFollowUp = readField(mapping, 'defaultFollowup');
  let followUp = typeof defaultFollowUp === 'string' ? defaultFollowUp : '';
  let matchedAll = true;

  for (const branch of branches) {
    const rawKeywords = readField(branch, 'keywords');
    const keywords = Array.isArray(rawKeywords)
      ? rawKeywords.filter((kw): kw is string => typeof kw === 'string')
      : [];
    if (keywords.length === 0) {
      continue;
    }
    const matchCount = keywords.filter((kw) =>
      answerLower.includes(kw.toLowerCase()),
    ).length;
    if (matchCount >= keywords.length * 0.5) {
      bestScore = Math.max(
        bestScore,
        toFiniteNumber(readField(branch, 'score')) ?? 7,
      );
      const branchFollowUp = readField(branch, 'followup');
      if (typeof branchFollowUp === 'string' && branchFollowUp) {
        followUp = branchFollowUp;
      }
    } else if (matchCount === 0) {
      matchedAll = false;
    }
  }

  const maxFollowUps =
    toFiniteNumber(readField(mapping, 'maxFollowups')) ??
    DEFAULT_MAX_FOLLOW_UPS;
  // completionThreshold is on a 0-100 scale; scores are on 0-10.
  const passScore =
    (toFiniteNumber(readField(mapping, 'completionThreshold')) ?? 80) / 10;
  const tooShort = ctx.answer.trim().length < MIN_RULE_ANSWER_LENGTH;

  let shouldFollowUp = false;
  if (tooShort || isGiveUpAnswer(ctx.answer)) {
    bestScore = Math.min(bestScore, 2);
  } else if (
    bestScore < passScore &&
    ctx.followUpCount < maxFollowUps &&
    followUp.trim().length > 0 // never keep the learner on a question with nothing to ask
  ) {
    shouldFollowUp = true;
  }

  let feedbackText: string;
  if (shouldFollowUp) {
    feedbackText = followUp;
  } else if (bestScore >= passScore) {
    feedbackText = phrases.keyPointsCovered;
  } else {
    feedbackText = phrases.keyPointsMissing;
  }
  const feedbackMessage = new AIMessage(
    `Score: ${bestScore}/10\n\nFeedback: ${feedbackText}`,
  );
  const feedbackHistory = shouldFollowUp
    ? [feedbackMessage, new AIMessage(followUp)]
    : [feedbackMessage];

  console.log('[GraderNode] Rule grading:', {
    score: bestScore,
    shouldFollowUp,
    matchedAll,
    followup: followUp.substring(0, 60),
  });

  return buildUpdate(ctx, bestScore, feedbackHistory, shouldFollowUp);
}

/** Builds the examiner system prompt for the question in the given language. */
function buildSystemPrompt(language: Language, question: Question): string {
  const keyPoints = (question.keyPoints ?? []).join(', ');
  let prompt: string;
  let anchorLabel: string;

  switch (language) {
    case 'zh':
      anchorLabel = '【判定依据（通过标准）】';
      prompt = `你是一位考官。请评分并给出反馈。

规则：
1. 只用中文。
2. 多轮追问时，用户回答含所有轮次（第N轮回答：标记），综合判断已覆盖内容。

问题：${question.questionText}
关键点：${keyPoints}

  评分标准：不要求深度，不要求使用特定术语，只看用户是否理解了概念。
  用户理解核心概念就给分。即使没有使用关键点中的原词，只要意思到位就算覆盖。
  例如关键点是"上下文窗口有限"，用户说"信息太多超过AI处理长度"也是覆盖。
  评分原则：往宽了给分，不确定时就给高分。明显正确就给8-10分，部分正确5-7分，完全不沾边才0-2分。

返回JSON：
- score: 0-10
- feedback: 评语
- should_follow_up: true/false
- follow_up_question: 追问（仅true时需要，针对未覆盖的关键点，false时null）

请以 JSON 格式返回响应：
{"score":0到10,"feedback":"评语","should_follow_up":true或false,"follow_up_question":"追问或null"}

示例（需要追问）：
{"score":6,"feedback":"提到了安全性和性能，未说明依赖关系。","should_follow_up":true,"follow_up_question":"你如何让AI在计划中明确任务依赖关系？"}

示例（不需追问）：
{"score":8,"feedback":"回答完整。","should_follow_up":false,"follow_up_question":null}`;
      break;
    case 'ja':
      anchorLabel = '【判定基準（合格基準）】';
      prompt = `あなたは試験官です。採点とフィードバックを提供してください。

ルール：
1. 日本語のみ使用。
2. 複数ラウンドの回答は「第N輪回答：」でマークされ、全ラウンドを総合判断。

質問：${question.questionText}
キーポイント：${keyPoints}

評価基準：正確性、網羅性、深さ。
部分点可（5〜7点）、見当違いのみ0〜2点。

JSON形式：
- score: 0〜10
- feedback: 評価
- should_follow_up: true/false
- follow_up_question: 追質問（true時のみ、未カバーのポイントに焦点、false時null）

JSON 形式で回答してください：
{"score":0から10,"feedback":"評価","should_follow_up":trueかfalse,"follow_up_question":"追質問かnull"}

例（追質問が必要）：
{"score":6,"feedback":"安全性と性能に言及したが、依存関係が不明。","should_follow_up":true,"follow_up_question":"AIに計画内のタスク依存関係を明示させる方法は？"}

例（不要）：
{"score":8,"feedback":"回答は完全。","should_follow_up":false,"follow_up_question":null}`;
      break;
    default:
      anchorLabel = '【Judgment Criteria (Pass Standard)】';
      prompt = `You are an examiner. Grade and give feedback.

Rules:
1. English only.
2. Multi-round answers are tagged "第N轮回答：". Consider all rounds.

Question: ${question.questionText}
Key points: ${keyPoints}

Criteria: accuracy, completeness, depth.
Give partial credit (5-7 for partial), 0-2 only for off-target.

Return JSON:
- score: 0-10
- feedback: text
- should_follow_up: true/false
- follow_up_question: question (only when true, target uncovered points, null when false)

Format as JSON:
{"score":0-10,"feedback":"...","should_follow_up":true|false,"follow_up_question":"question or null"}

Example (follow-up needed):
{"score":6,"feedback":"Covered security and performance, missed dependencies.","should_follow_up":true,"follow_up_question":"How would you make the AI clarify task dependencies?"}

Example (no follow-up):
{"score":8,"feedback":"Complete answer.","should_follow_up":false,"follow_up_question":null}`;
      break;
  }

  return question.judgment
    ? `${prompt}\n\n${anchorLabel}${question.judgment}`
    : prompt;
}

/** Validated view of the model's JSON verdict. */
interface ModelVerdict {
  score: number;
  feedback: string;
  wantsFollowUp: boolean;
  /** Trimmed follow-up question, or '' when none was supplied. */
  followUpQuestion: string;
}

/**
 * Validates the parsed model output. Returns `null` when it is not an object
 * or carries no usable numeric score, so the caller can fall back.
 */
function interpretVerdict(result: unknown): ModelVerdict | null {
  if (typeof result !== 'object' || result === null) {
    return null;
  }
  const rawScore = toFiniteNumber(readField(result, 'score'));
  if (rawScore === null) {
    return null;
  }
  const feedback = readField(result, 'feedback');
  const question = readField(result, 'follow_up_question');
  const trimmedQuestion = typeof question === 'string' ? question.trim() : '';

  return {
    score: Math.min(MAX_SCORE, Math.max(0, rawScore)),
    feedback: typeof feedback === 'string' ? feedback : '',
    wantsFollowUp: readField(result, 'should_follow_up') === true,
    // The prompt's format line can lead the model to emit the string "null".
    followUpQuestion:
      trimmedQuestion.toLowerCase() === 'null' ? '' : trimmedQuestion,
  };
}

/** Asks the chat model to grade the answer and interprets its JSON verdict. */
async function gradeWithModel(
  ctx: GradingContext,
  model: GraderModel,
): Promise<Partial<EvaluationState>> {
  const { question, followUpCount, state } = ctx;
  const phrases = PHRASES[ctx.language];
  const scoreHeader = `${phrases.scoreLabel}: ${FALLBACK_SCORE}/10`;

  const maxFollowUps =
    toFiniteNumber(readField(question, 'maxFollowUps')) ??
    DEFAULT_MAX_FOLLOW_UPS;

  // During a follow-up loop, grade every answer given to this question so far.
  let allAnswers = ctx.answer;
  if (followUpCount > 0) {
    allAnswers = (state.messages ?? [])
      .filter((m) => m instanceof HumanMessage)
      .slice(-(followUpCount + 1))
      .map((m, i) => `第${i + 1}轮回答：${messageText(m.content)}`)
      .join('\n\n');
  }

  console.log('[GraderNode] === START GRADING ===');
  console.log('[GraderNode] User answer length:', ctx.answer.length);
  console.log(
    '[GraderNode] Question:',
    question?.questionText?.substring(0, 100),
  );
  console.log('[GraderNode] Target dimension:', question?.dimension);

  let rawContent: string;
  try {
    const response = await model.invoke([
      new SystemMessage(buildSystemPrompt(ctx.language, question)),
      new HumanMessage(allAnswers),
    ]);
    console.log('[GraderNode] LLM invoke completed');
    rawContent = messageText(response.content);
  } catch (error) {
    console.error('[GraderNode] LLM grading failed:', error);
    const fallbackMsg = new AIMessage(
      `${scoreHeader}\n\n${phrases.feedbackLabel}: ${phrases.serviceUnavailable}`,
    );
    return buildUpdate(ctx, FALLBACK_SCORE, [fallbackMsg], false);
  }

  console.log('[GraderNode] Raw AI response length:', rawContent.length);
  console.log('[GraderNode] Raw AI response:', rawContent.substring(0, 800));

  let verdict: ModelVerdict | null = null;
  try {
    verdict = interpretVerdict(safeParseJson<unknown>(rawContent));
  } catch (parseError) {
    console.error('[GraderNode] Failed to parse grade:', parseError);
  }
  if (!verdict) {
    console.error(
      '[GraderNode] Failed to parse JSON. Raw content:',
      rawContent,
    );
    const fallbackMsg = new AIMessage(
      `${scoreHeader}\n\n${phrases.parseFailed}`,
    );
    return buildUpdate(ctx, FALLBACK_SCORE, [fallbackMsg], false);
  }

  console.log('[GraderNode] === GRADING RESULT ===');
  console.log('[GraderNode] Parsed result:', JSON.stringify(verdict, null, 2));

  const saysIDontKnow = isGiveUpAnswer(ctx.answer);
  const shouldFollowUp =
    verdict.wantsFollowUp &&
    verdict.followUpQuestion !== '' &&
    followUpCount < maxFollowUps &&
    verdict.score < MODEL_PASS_SCORE &&
    !saysIDontKnow;

  const feedbackMessage = new AIMessage(
    `${phrases.scoreLabel}: ${verdict.score}/10\n\n${phrases.feedbackLabel}: ${verdict.feedback}`,
  );
  const feedbackHistory = shouldFollowUp
    ? [feedbackMessage, new AIMessage(verdict.followUpQuestion)]
    : [feedbackMessage];

  console.log('[GraderNode] Final State decision:', {
    shouldFollowUp,
    nextIndex: shouldFollowUp ? ctx.questionIndex : ctx.questionIndex + 1,
    score: verdict.score,
    saysIDontKnow,
  });

  return buildUpdate(ctx, verdict.score, feedbackHistory, shouldFollowUp);
}

/**
 * Node responsible for grading the user's answer and deciding if a follow-up is needed.
 *
 * Strategy, in order:
 * 1. Multiple-choice questions with a `correctAnswer` are graded by exact match.
 * 2. Questions whose `followupHints` contain a `branches` array are graded by
 *    keyword coverage.
 * 3. Everything else is graded by the chat model in `config.configurable.model`.
 *
 * @param state - Current evaluation state; the last message must be a
 *   `HumanMessage` for grading to happen.
 * @param config - Runnable config carrying the chat model under
 *   `configurable.model`.
 * @returns A partial state update. Empty when the last message is not from the
 *   user; only an advanced `currentQuestionIndex` when the question is missing.
 * @throws Error when no model is configured.
 */
export const graderNode = async (
  state: EvaluationState,
  config?: RunnableConfig,
): Promise<Partial<EvaluationState>> => {
  const { model } = (config?.configurable ?? {}) as { model?: GraderModel };
  const messages = state.messages ?? [];
  const questions = state.questions ?? [];
  const { currentQuestionIndex } = state;
  const currentFollowUpCount = state.followUpCount || 0;

  console.log('[GraderNode] Entering node...', {
    currentIndex: currentQuestionIndex,
    numMessages: messages.length,
    questionCount: state.questionCount,
    hasQuestions: questions.length > 0,
  });

  if (!model) {
    throw new Error('Missing model in node configuration');
  }

  const lastUserMessage = messages[messages.length - 1];

  console.log('[GraderNode] Incoming Messages Count:', messages.length);
  if (lastUserMessage) {
    console.log(
      '[GraderNode] Last Message Type:',
      lastUserMessage.constructor.name,
    );
    console.log(
      '[GraderNode] Last Message Content:',
      messageText(lastUserMessage.content).substring(0, 50),
    );
  }

  if (!(lastUserMessage instanceof HumanMessage)) {
    console.log(
      '[GraderNode] Last message is not HumanMessage, skipping grading.',
    );
    return {};
  }

  const currentQuestion = questions[currentQuestionIndex];
  if (!currentQuestion) {
    console.error(
      `[GraderNode] Question at index ${currentQuestionIndex} not found!`,
    );
    return { currentQuestionIndex: currentQuestionIndex + 1 };
  }

  let language: Language = 'en';
  if (state.language === 'zh') {
    language = 'zh';
  } else if (state.language === 'ja') {
    language = 'ja';
  }

  const ctx: GradingContext = {
    state,
    question: currentQuestion,
    questionIndex: currentQuestionIndex,
    followUpCount: currentFollowUpCount,
    language,
    answer: messageText(lastUserMessage.content),
  };

  const expectedAnswer = currentQuestion.correctAnswer;
  if (currentQuestion.questionType === 'MULTIPLE_CHOICE' && expectedAnswer) {
    return gradeChoice(ctx, expectedAnswer);
  }

  // Rule-based grading: use the structured follow-up mapping if available.
  if (currentQuestion.followupHints) {
    const parsed = parseFollowupMapping(currentQuestion.followupHints);
    if (parsed) {
      return gradeByRules(ctx, parsed.mapping, parsed.branches);
    }
  }

  return gradeWithModel(ctx, model);
};