# Source: Hugging Face Spaces - upstage / ko-freshqa-leaderboard (status: Running)
# File size: 14,136 Bytes

import re
import pandas as pd
from openai import OpenAI
from typing import List, Dict, Any, Tuple
import time
import random

from config import Config
from src.utils import get_current_date_str

class FreshEval:

    def __init__(self, model: str='solar-pro2', api_key: str=None):
        """Create an evaluator bound to one judge model and one API key.

        Args:
            model: Model name sent with every request.
            api_key: Key for the Upstage endpoint. When falsy,
                ``Config.UPSTAGE_API_KEY`` is used instead.
        """
        self.model = model
        self.api_key = api_key or Config.UPSTAGE_API_KEY
        self.client = OpenAI(
            api_key=self.api_key,
            base_url="https://api.upstage.ai/v1/solar"
        )

        # Generation settings applied to every judge request.
        self.temperature = 0.0
        self.max_tokens = 256
        self.chat_completions = True

        # Per-model evidence budgets. They are not read anywhere else in this
        # class - presumably consumed by retrieval code elsewhere (TODO confirm).
        # `str.startswith` takes a tuple of prefixes, which replaces the
        # original bitwise `|` between two boolean results.
        if model.startswith(('gpt-4', 'solar')):
            self.num_organic_results = 15
            self.num_related_questions = 3
            self.num_questions_and_answers = 3
            self.num_retrieved_evidences = 15
        else:
            self.num_organic_results = 15
            self.num_related_questions = 2
            self.num_questions_and_answers = 2
            self.num_retrieved_evidences = 5


    def _is_rate_limit_error(self, error: Exception) -> bool:
        """429 에러 감지 함수"""
        error_str = str(error)
        error_type = type(error).__name__
        
        # 1. HTTP 상태 코드 확인
        if hasattr(error, 'response') and hasattr(error.response, 'status_code'):
            if error.response.status_code == 429:
                # print(f"✅ HTTP 429 에러 감지: {error.response.status_code}")
                return True
        
        # 2. 텍스트 기반 감지 (백업)
        error_lower = error_str.lower()
        if ("429" in error_lower or 
            "rate" in error_lower or 
            "limit" in error_lower or
            "too_many_requests" in error_lower or
            "request limit" in error_lower):
            # print(f"✅ 텍스트 기반 429 에러 감지")
            return True
        
        return False


    def call_llm_api(self, prompt:str, current_date:str) -> str:
        """Send *prompt* to the judge model, retrying on rate-limit errors.

        Retry strategy depends on how many keys the rotator returned by
        ``get_rotator()`` exposes through its ``keys`` attribute:

        * at most one key: up to 7 attempts with exponential back-off
          (``3 * 2**attempt`` seconds plus up to 1 second of jitter);
        * two or more keys: up to ``3 * num_keys`` attempts; after each
          rate-limit error the code sleeps 2 seconds and continues with the
          key returned by ``rotator.pick_key()``.

        Errors that ``_is_rate_limit_error`` does not recognise are re-raised
        immediately.

        Args:
            prompt: Full evaluation prompt, sent as the last user message.
            current_date: Date string interpolated into the system message and
                into the primed "today's date" exchange.

        Returns:
            The text of the first choice in the API response.

        Raises:
            Exception: The last rate-limit error once single-key retries are
                exhausted, or a summary error (chained to the last rate-limit
                error) once every multi-key attempt has failed.
        """
        from src.api_key_rotator import get_rotator

        rotator = get_rotator()
        num_keys = len(rotator.keys)
        base_delay = 3.0

        def _make_api_call(eval_instance: FreshEval) -> str:
            """Issue one request using the client and settings of *eval_instance*."""
            if eval_instance.chat_completions:
                # Chat completions API
                response = eval_instance.client.chat.completions.create(
                    model=eval_instance.model,
                    temperature=eval_instance.temperature,
                    max_tokens=eval_instance.max_tokens,
                    messages=[
                        {
                            "role": "system", 
                            "content": (
                                f"You are a helpful assistant. Respond as concisely as possible. Knowledge cutoff: {current_date}."
                            )
                        },
                        {
                            "role": "user",
                            "content": "What's today's date?"
                        },
                        {
                            "role": "assistant",
                            "content": f"Today is {current_date} in Pacific Standard Time."
                        },
                        {
                            "role": "user",
                            "content": prompt
                        }
                    ],
                )
                return response.choices[0].message.content
            else:
                # Completions API
                response = eval_instance.client.completions.create(
                    model=eval_instance.model,
                    temperature=eval_instance.temperature,
                    max_tokens=eval_instance.max_tokens,
                    prompt=prompt,
                )
                return response.choices[0].text

        def _instance_for_key(api_key) -> FreshEval:
            """Build an evaluator for *api_key* that mirrors this instance's settings."""
            clone = FreshEval(model=self.model, api_key=api_key)
            # A fresh instance starts from the constructor defaults; copy the
            # generation settings so values changed on `self` are not dropped.
            clone.temperature = self.temperature
            clone.max_tokens = self.max_tokens
            clone.chat_completions = self.chat_completions
            return clone

        # Start with the key this instance was configured with.
        current_instance = _instance_for_key(self.api_key)

        # Zero or one rotator key: nothing to rotate to, so rely on back-off
        # only. (With zero keys the multi-key loop below would run no attempt
        # at all and report a misleading "429 on every key" error.)
        if num_keys <= 1:
            max_retries = 7
            for attempt in range(max_retries):
                try:
                    return _make_api_call(current_instance)
                except Exception as e:
                    out_of_retries = attempt == max_retries - 1
                    if out_of_retries or not self._is_rate_limit_error(e):
                        raise
                    # Exponential back-off with jitter.
                    delay = base_delay * (2 ** attempt) + random.uniform(0, 1)
                    time.sleep(delay)

            # Defensive: every loop iteration above either returns or raises.
            raise Exception("call llm api:최대 재시도 횟수 초과")

        # Two or more keys: rotate through them, at most three full rounds.
        max_attempts = num_keys * 3
        last_rate_limit_error = None

        for _ in range(max_attempts):
            try:
                return _make_api_call(current_instance)  # success: return at once
            except Exception as e:
                if not self._is_rate_limit_error(e):
                    # Anything other than a rate-limit error propagates immediately.
                    raise
                last_rate_limit_error = e
                # Wait 2 seconds before switching to the next key.
                time.sleep(2)
                current_instance = _instance_for_key(rotator.pick_key())

        # Every attempt hit a rate limit; keep the last error as the cause.
        raise Exception(
            f"모든 API 키에서 429 에러 발생 (최대 {max_attempts}회 시도)"
        ) from last_rate_limit_error


    def call_fresheval(self, mode:str, question:str, evaluation:str, current_date:str) -> str:
        """FreshEval 평가 함수"""

        fresheval_question = f'\nquestion: {question}{evaluation}'

        # 환경변수 기반 프롬프트(본체: prefix + demo) 우선 사용
        env_prompt_body = None
        if mode == 'Relaxed':
            env_prompt_body = Config.FRESHQA_PROMPT_RELAXED
        elif mode == 'Strict':
            env_prompt_body = Config.FRESHQA_PROMPT_STRICT

        if env_prompt_body and str(env_prompt_body).strip():
            base_prompt = str(env_prompt_body).strip()
        else:
            raise ValueError(f"{mode} 평가 프롬프트 설정이 없습니다.")

        fresheval_prompt = base_prompt + fresheval_question
        
        # 평가
        answer = self.call_llm_api(fresheval_prompt, current_date)

        return answer


    def extract_ratings(self, response:str) -> Tuple[bool, Dict[str, str]]:
        """평가 결과에서 등급 추출"""
        def _clean(text: str) -> str:
            # 양끝 장식/공백 제거 + 내부 흔적 정리 + 소문자화
            text = re.sub(r'^[*`_~\s]+|[*`_~\s]+$', '', text)
            text = re.sub(r'[*`_~]', '', text)
            return text.strip().strip('.').strip().lower()

        def _judge(val: str):
            """
            문자열에서 correct/incorrect 판정.
            - 'incorrect'가 보이면 무조건 FALSE
            - 'partially correct'는 모호 → None
            - 'correct'는 TRUE
            """
            if re.search(r'(?i)\bincorrect\b', val):
                return 'FALSE'
            if re.search(r'(?i)\bpartial(?:ly)?\s+correct\b', val):
                return None
            if re.search(r'(?i)\bcorrect\b', val):
                return 'TRUE'
            return None

        def _from_label(block_label: str):
            """
            라벨(예: 'Final Evaluation' 또는 'Evaluation') 기준으로
            - 같은 줄 캡처 먼저 시도
            - 실패하면 라벨 이후 ~ 다음 빈 줄 이전 범위에서 판정 키워드 탐색
            """
            # 같은 줄 캡처: 라벨 ± 장식 ± 콜론 이후 ~ 줄끝
            same_line = re.search(
                rf'(?i){block_label}\s*(?:[*`_~]*\s*:\s*|:\s*[*`_~]*)\s*([^\r\n]+)',
                response
            )
            if same_line:
                val = _clean(same_line.group(1))
                j = _judge(val)
                if j is not None:
                    return j

            # 위치만 찾고(값 없이 줄바꿈된 케이스), 다음 빈 줄(or 섹션) 전까지 스캔
            pos = re.search(
                rf'(?i){block_label}\s*(?:[*`_~]*\s*:\s*|:\s*[*`_~]*)',
                response
            )
            if pos:
                tail = response[pos.end():]
                # 다음 '빈 줄(연속 개행)' 또는 다음 섹션 시작 전까지만 본다 (너무 멀리 안가도록)
                m_stop = re.search(r'\n\s*\n', tail)
                segment = tail[:m_stop.start()] if m_stop else tail[:300]  # 안전한 상한
                seg_clean = _clean(segment)
                j = _judge(seg_clean)
                if j is not None:
                    return j
            return None

        # 1) Final Evaluation 최우선
        final_judgement = _from_label('final\s+evaluation')
        if final_judgement:
            return True, {'rating': final_judgement}

        # 2) Evaluation
        eval_judgement = _from_label('evaluation')
        if eval_judgement:
            return True, {'rating': eval_judgement}

        # 3) 폴백: credited 문장
        if re.search(r'(?i)thus,\s*the\s*response\s*is\s*credited\b', response):
            return True, {'rating': 'TRUE'}
        if re.search(r'(?i)thus,\s*the\s*response\s*is\s*not\s*credited\b', response):
            return True, {'rating': 'FALSE'}

        # 4) 실패
        return False, {'rating': None}


    def evaluate_single_row(self, row: pd.Series, mode: str, current_date:str) -> Dict[str, Any]:
        """단일 행 평가"""
        question = row['question']
        response = row['model_response']
        correct_answers = [row[f'answer_{i}'] for i in range(10)]
        correct_answers = [str(x) for x in correct_answers if pd.notna(x) and str(x).strip()]


        # model_response가 비어있거나 NaN인 경우 바로 틀렸다는 결과로 처리하고 return
        if pd.isna(response) or (isinstance(response, str) and response.strip() == ''):
            # print('model_response가 비어있음. rating=0으로 처리')
            row_dict = row.to_dict()
            row_dict['rating'] = 0
            row_dict['explanation'] = "model_response가 비어있음"
            return row_dict
        
        # 평가 템플릿 생성
        evaluation_template = (
            "\ncorrect answer(s): {correct_answers}"
            "\nresponse: {response}"
            "\ncomment: "
        )
        evaluation = evaluation_template.format(
            correct_answers=' | '.join(correct_answers),
            response=response,
        )

        # 평가
        fresheval_response = self.call_fresheval(
            mode=mode,
            question=question,
            evaluation=evaluation,
            current_date=current_date
        )

        is_valid_eval, eval_result = self.extract_ratings(fresheval_response)

        # if is_valid_eval:
        #     print('완료')
        
        # 재평가 횟수 제한 (최대 3회)
        max_retries = 3
        retry_count = 0

        # 재시도 loop
        while not is_valid_eval and retry_count < max_retries:
            retry_count += 1
            # print(f'유효하지 않은 평가, 재평가 중... ({retry_count}/{max_retries})\n response: {fresheval_response}')
            
            fresheval_response = self.call_fresheval(
                mode=mode,
                question=question,
                evaluation=evaluation,
                current_date=current_date
            )

            is_valid_eval, eval_result = self.extract_ratings(fresheval_response)
            # if is_valid_eval:
            #     print('완료')
        
        # 최대 재시도 횟수 초과 시 기본 값 사용
        if not is_valid_eval:
            # print(f'⚠️  최대 재시도 횟수({max_retries}) 초과. 기본값 사용: rating=0')
            eval_result = {'rating': 0}
            fresheval_response = "재시도 횟수 초과로 인한 기본 평가"
        
        row_dict = row.to_dict()
        row_dict['rating'] = eval_result['rating']
        row_dict['explanation'] = fresheval_response
        
        # 📊 DEBUG: FALSE인 경우에만 상세 출력
        # if eval_result['rating'] == 'FALSE':
        #     print(f"\n{'='*80}")
        #     print(f"❌ FALSE 평가된 질문")
        #     print(f"   Mode: {mode}")
        #     print(f"   Question: {question}")
        #     print(f"   Correct Answers: {' | '.join(correct_answers)}")
        #     print(f"   Model Response: {response}")
        #     print(f"\n   LLM 평가 응답:")
        #     print(f"   {fresheval_response}")
        #     print(f"   최종 Rating: {eval_result['rating']}")
        #     print(f"{'='*80}\n")

        return row_dict

    
    def evaluate_dataframe(self, df: pd.DataFrame, mode: str) -> pd.DataFrame:
        """Evaluate every row of *df* and return the results as a new DataFrame.

        The current date is fetched once via ``get_current_date_str()`` and
        reused for all rows. A progress line is printed per row.

        Args:
            df: Table whose rows are passed to ``evaluate_single_row``.
            mode: Evaluation mode forwarded to ``evaluate_single_row``.

        Returns:
            A DataFrame built from the per-row result dicts, in input order.
        """
        current_date = get_current_date_str()
        total = len(df)

        freshevals = []
        # Count positions with enumerate rather than using the index label:
        # labels need not be 0..n-1 (filtered or reordered frames) and need
        # not be numeric, in which case `index + 1` raises TypeError.
        for position, (_, row) in enumerate(df.iterrows(), start=1):
            print(f'{mode} 평가 중... {position}/{total}')
            freshevals.append(self.evaluate_single_row(row, mode, current_date))

        return pd.DataFrame(freshevals)