Spaces:
Running
Running
import random
import re
import time
from typing import Any, Dict, List, Tuple

import pandas as pd
from openai import OpenAI

from config import Config
from src.utils import get_current_date_str
class FreshEval:
    """LLM-based evaluator for FreshQA-style question answering."""

    def __init__(self, model: str = 'solar-pro2', api_key: str = None):
        """Set up the OpenAI-compatible client and per-model retrieval budgets.

        Args:
            model: Model identifier. 'gpt-4*' / 'solar*' models get larger
                related-question / evidence budgets.
            api_key: API key; falls back to Config.UPSTAGE_API_KEY when falsy.
        """
        self.model = model
        self.api_key = api_key or Config.UPSTAGE_API_KEY
        self.client = OpenAI(
            api_key=self.api_key,
            base_url="https://api.upstage.ai/v1/solar"
        )
        self.temperature = 0.0
        self.max_tokens = 256
        self.chat_completions = True
        # Fix: the original used bitwise `|` between the two boolean tests;
        # str.startswith accepts a tuple of prefixes, which is the idiomatic form.
        if model.startswith(('gpt-4', 'solar')):
            self.num_organic_results = 15
            self.num_related_questions = 3
            self.num_questions_and_answers = 3
            self.num_retrieved_evidences = 15
        else:
            self.num_organic_results = 15
            self.num_related_questions = 2
            self.num_questions_and_answers = 2
            self.num_retrieved_evidences = 5
| def _is_rate_limit_error(self, error: Exception) -> bool: | |
| """429 ์๋ฌ ๊ฐ์ง ํจ์""" | |
| error_str = str(error) | |
| error_type = type(error).__name__ | |
| # 1. HTTP ์ํ ์ฝ๋ ํ์ธ | |
| if hasattr(error, 'response') and hasattr(error.response, 'status_code'): | |
| if error.response.status_code == 429: | |
| # print(f"โ HTTP 429 ์๋ฌ ๊ฐ์ง: {error.response.status_code}") | |
| return True | |
| # 2. ํ ์คํธ ๊ธฐ๋ฐ ๊ฐ์ง (๋ฐฑ์ ) | |
| error_lower = error_str.lower() | |
| if ("429" in error_lower or | |
| "rate" in error_lower or | |
| "limit" in error_lower or | |
| "too_many_requests" in error_lower or | |
| "request limit" in error_lower): | |
| # print(f"โ ํ ์คํธ ๊ธฐ๋ฐ 429 ์๋ฌ ๊ฐ์ง") | |
| return True | |
| return False | |
| def call_llm_api(self, prompt:str, current_date:str) -> str: | |
| """LLM API ํธ์ถ ํจ์ (ํค ํ์ ๋ฐ ๋ฐฑ์คํ ์ง์)""" | |
| from src.api_key_rotator import get_rotator | |
| rotator = get_rotator() | |
| num_keys = len(rotator.keys) | |
| base_delay = 3.0 | |
| def _make_api_call(eval_instance: FreshEval) -> str: | |
| """API ํธ์ถ ํฌํผ ํจ์""" | |
| if eval_instance.chat_completions: | |
| # Chat completions API | |
| response = eval_instance.client.chat.completions.create( | |
| model=eval_instance.model, | |
| temperature=eval_instance.temperature, | |
| max_tokens=eval_instance.max_tokens, | |
| messages=[ | |
| { | |
| "role": "system", | |
| "content": ( | |
| f"You are a helpful assistant. Respond as concisely as possible. Knowledge cutoff: {current_date}." | |
| ) | |
| }, | |
| { | |
| "role": "user", | |
| "content": "What's today's date?" | |
| }, | |
| { | |
| "role": "assistant", | |
| "content": f"Today is {current_date} in Pacific Standard Time." | |
| }, | |
| { | |
| "role": "user", | |
| "content": prompt | |
| } | |
| ], | |
| ) | |
| return response.choices[0].message.content | |
| else: | |
| # Completions API | |
| response = eval_instance.client.completions.create( | |
| model=eval_instance.model, | |
| temperature=eval_instance.temperature, | |
| max_tokens=eval_instance.max_tokens, | |
| prompt=prompt, | |
| ) | |
| return response.choices[0].text | |
| # ํ์ฌ ํค๋ก ์์ | |
| current_key = self.api_key | |
| current_instance = FreshEval(model=self.model, api_key=current_key) | |
| # ํค๊ฐ 1๊ฐ์ธ ๊ฒฝ์ฐ: ๊ธฐ์กด ๋ฐฑ์คํ ๋ก์ง๋ง ์ฌ์ฉ | |
| if num_keys == 1: | |
| max_retries = 7 | |
| for attempt in range(max_retries): | |
| try: | |
| return _make_api_call(current_instance) | |
| except Exception as e: | |
| if self._is_rate_limit_error(e): | |
| if attempt < max_retries - 1: | |
| # ์ง์์ ๋ฐฑ์คํ | |
| delay = base_delay * (2 ** attempt) + random.uniform(0, 1) | |
| time.sleep(delay) | |
| continue | |
| # else: | |
| # print(f"โ ์ต๋ ์ฌ์๋ ํ์ ์ด๊ณผ") | |
| raise e | |
| # max_retries ์ด๊ณผํ ๋๊น์ง return ๋์ง ์์ผ๋ฉด ์๋ฌ ๋ฐ์ | |
| raise Exception("call llm api:์ต๋ ์ฌ์๋ ํ์ ์ด๊ณผ") | |
| # ํค๊ฐ 2๊ฐ ์ด์์ธ ๊ฒฝ์ฐ: ํค ์ ํ ๋ก์ง (3์ด ๋๊ธฐ ํฌํจ) | |
| # ์ฑ๊ณตํ ๋๊น์ง ํค๋ฅผ ์ํํ๋ฉฐ ์๋ (์ต๋ ๋ชจ๋ ํค๋ฅผ 3๋ฐํด๊น์ง) | |
| max_attempts = num_keys * 3 # ๋ชจ๋ ํค๋ฅผ ์ต๋ 3๋ฐํด๊น์ง ์๋ | |
| key_attempt_count = 0 | |
| # ํ์ฌ ํค๋ก ์ฒซ ์๋ | |
| for attempt in range(max_attempts): | |
| try: | |
| return _make_api_call(current_instance) # ์ฑ๊ณตํ๋ฉด ์ฆ์ ๋ฐํ | |
| except Exception as e: | |
| if self._is_rate_limit_error(e): | |
| key_attempt_count += 1 | |
| # ๋ค์ ํค๋ก ์ ํํ๊ธฐ ์ ์ 2์ด ๋๊ธฐ | |
| time.sleep(2) | |
| current_key = rotator.pick_key() | |
| # print("๐ ํค ์ ํ") | |
| current_instance = FreshEval(model=self.model, api_key=current_key) | |
| continue # ๋ค์ ํค๋ก ๊ณ์ ์๋ | |
| else: | |
| # 429๊ฐ ์๋ ์๋ฌ๋ ์ฆ์ ์ ํ | |
| raise | |
| # ์ต๋ ์๋ ํ์ ์ด๊ณผ (๋ชจ๋ ํค๋ฅผ ์ฌ๋ฌ ๋ฐํด ์๋ํ์ง๋ง ๋ชจ๋ ์คํจ) | |
| raise Exception(f"๋ชจ๋ API ํค์์ 429 ์๋ฌ ๋ฐ์ (์ต๋ {max_attempts}ํ ์๋)") | |
| def call_fresheval(self, mode:str, question:str, evaluation:str, current_date:str) -> str: | |
| """FreshEval ํ๊ฐ ํจ์""" | |
| fresheval_question = f'\nquestion: {question}{evaluation}' | |
| # ํ๊ฒฝ๋ณ์ ๊ธฐ๋ฐ ํ๋กฌํํธ(๋ณธ์ฒด: prefix + demo) ์ฐ์ ์ฌ์ฉ | |
| env_prompt_body = None | |
| if mode == 'Relaxed': | |
| env_prompt_body = Config.FRESHQA_PROMPT_RELAXED | |
| elif mode == 'Strict': | |
| env_prompt_body = Config.FRESHQA_PROMPT_STRICT | |
| if env_prompt_body and str(env_prompt_body).strip(): | |
| base_prompt = str(env_prompt_body).strip() | |
| else: | |
| raise ValueError(f"{mode} ํ๊ฐ ํ๋กฌํํธ ์ค์ ์ด ์์ต๋๋ค.") | |
| fresheval_prompt = base_prompt + fresheval_question | |
| # ํ๊ฐ | |
| answer = self.call_llm_api(fresheval_prompt, current_date) | |
| return answer | |
| def extract_ratings(self, response:str) -> Tuple[bool, Dict[str, str]]: | |
| """ํ๊ฐ ๊ฒฐ๊ณผ์์ ๋ฑ๊ธ ์ถ์ถ""" | |
| def _clean(text: str) -> str: | |
| # ์๋ ์ฅ์/๊ณต๋ฐฑ ์ ๊ฑฐ + ๋ด๋ถ ํ์ ์ ๋ฆฌ + ์๋ฌธ์ํ | |
| text = re.sub(r'^[*`_~\s]+|[*`_~\s]+$', '', text) | |
| text = re.sub(r'[*`_~]', '', text) | |
| return text.strip().strip('.').strip().lower() | |
| def _judge(val: str): | |
| """ | |
| ๋ฌธ์์ด์์ correct/incorrect ํ์ . | |
| - 'incorrect'๊ฐ ๋ณด์ด๋ฉด ๋ฌด์กฐ๊ฑด FALSE | |
| - 'partially correct'๋ ๋ชจํธ โ None | |
| - 'correct'๋ TRUE | |
| """ | |
| if re.search(r'(?i)\bincorrect\b', val): | |
| return 'FALSE' | |
| if re.search(r'(?i)\bpartial(?:ly)?\s+correct\b', val): | |
| return None | |
| if re.search(r'(?i)\bcorrect\b', val): | |
| return 'TRUE' | |
| return None | |
| def _from_label(block_label: str): | |
| """ | |
| ๋ผ๋ฒจ(์: 'Final Evaluation' ๋๋ 'Evaluation') ๊ธฐ์ค์ผ๋ก | |
| - ๊ฐ์ ์ค ์บก์ฒ ๋จผ์ ์๋ | |
| - ์คํจํ๋ฉด ๋ผ๋ฒจ ์ดํ ~ ๋ค์ ๋น ์ค ์ด์ ๋ฒ์์์ ํ์ ํค์๋ ํ์ | |
| """ | |
| # ๊ฐ์ ์ค ์บก์ฒ: ๋ผ๋ฒจ ยฑ ์ฅ์ ยฑ ์ฝ๋ก ์ดํ ~ ์ค๋ | |
| same_line = re.search( | |
| rf'(?i){block_label}\s*(?:[*`_~]*\s*:\s*|:\s*[*`_~]*)\s*([^\r\n]+)', | |
| response | |
| ) | |
| if same_line: | |
| val = _clean(same_line.group(1)) | |
| j = _judge(val) | |
| if j is not None: | |
| return j | |
| # ์์น๋ง ์ฐพ๊ณ (๊ฐ ์์ด ์ค๋ฐ๊ฟ๋ ์ผ์ด์ค), ๋ค์ ๋น ์ค(or ์น์ ) ์ ๊น์ง ์ค์บ | |
| pos = re.search( | |
| rf'(?i){block_label}\s*(?:[*`_~]*\s*:\s*|:\s*[*`_~]*)', | |
| response | |
| ) | |
| if pos: | |
| tail = response[pos.end():] | |
| # ๋ค์ '๋น ์ค(์ฐ์ ๊ฐํ)' ๋๋ ๋ค์ ์น์ ์์ ์ ๊น์ง๋ง ๋ณธ๋ค (๋๋ฌด ๋ฉ๋ฆฌ ์๊ฐ๋๋ก) | |
| m_stop = re.search(r'\n\s*\n', tail) | |
| segment = tail[:m_stop.start()] if m_stop else tail[:300] # ์์ ํ ์ํ | |
| seg_clean = _clean(segment) | |
| j = _judge(seg_clean) | |
| if j is not None: | |
| return j | |
| return None | |
| # 1) Final Evaluation ์ต์ฐ์ | |
| final_judgement = _from_label('final\s+evaluation') | |
| if final_judgement: | |
| return True, {'rating': final_judgement} | |
| # 2) Evaluation | |
| eval_judgement = _from_label('evaluation') | |
| if eval_judgement: | |
| return True, {'rating': eval_judgement} | |
| # 3) ํด๋ฐฑ: credited ๋ฌธ์ฅ | |
| if re.search(r'(?i)thus,\s*the\s*response\s*is\s*credited\b', response): | |
| return True, {'rating': 'TRUE'} | |
| if re.search(r'(?i)thus,\s*the\s*response\s*is\s*not\s*credited\b', response): | |
| return True, {'rating': 'FALSE'} | |
| # 4) ์คํจ | |
| return False, {'rating': None} | |
| def evaluate_single_row(self, row: pd.Series, mode: str, current_date:str) -> Dict[str, Any]: | |
| """๋จ์ผ ํ ํ๊ฐ""" | |
| question = row['question'] | |
| response = row['model_response'] | |
| correct_answers = [row[f'answer_{i}'] for i in range(10)] | |
| correct_answers = [str(x) for x in correct_answers if pd.notna(x) and str(x).strip()] | |
| # model_response๊ฐ ๋น์ด์๊ฑฐ๋ NaN์ธ ๊ฒฝ์ฐ ๋ฐ๋ก ํ๋ ธ๋ค๋ ๊ฒฐ๊ณผ๋ก ์ฒ๋ฆฌํ๊ณ return | |
| if pd.isna(response) or (isinstance(response, str) and response.strip() == ''): | |
| # print('model_response๊ฐ ๋น์ด์์. rating=0์ผ๋ก ์ฒ๋ฆฌ') | |
| row_dict = row.to_dict() | |
| row_dict['rating'] = 0 | |
| row_dict['explanation'] = "model_response๊ฐ ๋น์ด์์" | |
| return row_dict | |
| # ํ๊ฐ ํ ํ๋ฆฟ ์์ฑ | |
| evaluation_template = ( | |
| "\ncorrect answer(s): {correct_answers}" | |
| "\nresponse: {response}" | |
| "\ncomment: " | |
| ) | |
| evaluation = evaluation_template.format( | |
| correct_answers=' | '.join(correct_answers), | |
| response=response, | |
| ) | |
| # ํ๊ฐ | |
| fresheval_response = self.call_fresheval( | |
| mode=mode, | |
| question=question, | |
| evaluation=evaluation, | |
| current_date=current_date | |
| ) | |
| is_valid_eval, eval_result = self.extract_ratings(fresheval_response) | |
| # if is_valid_eval: | |
| # print('์๋ฃ') | |
| # ์ฌํ๊ฐ ํ์ ์ ํ (์ต๋ 3ํ) | |
| max_retries = 3 | |
| retry_count = 0 | |
| # ์ฌ์๋ loop | |
| while not is_valid_eval and retry_count < max_retries: | |
| retry_count += 1 | |
| # print(f'์ ํจํ์ง ์์ ํ๊ฐ, ์ฌํ๊ฐ ์ค... ({retry_count}/{max_retries})\n response: {fresheval_response}') | |
| fresheval_response = self.call_fresheval( | |
| mode=mode, | |
| question=question, | |
| evaluation=evaluation, | |
| current_date=current_date | |
| ) | |
| is_valid_eval, eval_result = self.extract_ratings(fresheval_response) | |
| # if is_valid_eval: | |
| # print('์๋ฃ') | |
| # ์ต๋ ์ฌ์๋ ํ์ ์ด๊ณผ ์ ๊ธฐ๋ณธ ๊ฐ ์ฌ์ฉ | |
| if not is_valid_eval: | |
| # print(f'โ ๏ธ ์ต๋ ์ฌ์๋ ํ์({max_retries}) ์ด๊ณผ. ๊ธฐ๋ณธ๊ฐ ์ฌ์ฉ: rating=0') | |
| eval_result = {'rating': 0} | |
| fresheval_response = "์ฌ์๋ ํ์ ์ด๊ณผ๋ก ์ธํ ๊ธฐ๋ณธ ํ๊ฐ" | |
| row_dict = row.to_dict() | |
| row_dict['rating'] = eval_result['rating'] | |
| row_dict['explanation'] = fresheval_response | |
| # ๐ DEBUG: FALSE์ธ ๊ฒฝ์ฐ์๋ง ์์ธ ์ถ๋ ฅ | |
| # if eval_result['rating'] == 'FALSE': | |
| # print(f"\n{'='*80}") | |
| # print(f"โ FALSE ํ๊ฐ๋ ์ง๋ฌธ") | |
| # print(f" Mode: {mode}") | |
| # print(f" Question: {question}") | |
| # print(f" Correct Answers: {' | '.join(correct_answers)}") | |
| # print(f" Model Response: {response}") | |
| # print(f"\n LLM ํ๊ฐ ์๋ต:") | |
| # print(f" {fresheval_response}") | |
| # print(f" ์ต์ข Rating: {eval_result['rating']}") | |
| # print(f"{'='*80}\n") | |
| return row_dict | |
| def evaluate_dataframe(self, df: pd.DataFrame, mode: str) -> pd.DataFrame: | |
| """๋ฐ์ดํฐํ๋ ์ ํ๊ฐ""" | |
| freshevals = [] | |
| current_date = get_current_date_str() | |
| len_df = len(df) | |
| for index, row in df.iterrows(): | |
| print(f'{mode} ํ๊ฐ ์ค... {index+1}/{len_df}') | |
| row_dict = self.evaluate_single_row(row, mode, current_date) | |
| freshevals.append(row_dict) | |
| return pd.DataFrame(freshevals) |