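"""FreshEval: LLM-based grading of model responses against reference answers.

Wraps the Upstage Solar (OpenAI-compatible) chat API and handles rate limits
via exponential backoff and API-key rotation.
"""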
import re
import pandas as pd
from openai import OpenAI
from typing import List, Dict, Any, Tuple
import time
import random
from config import Config
from src.utils import get_current_date_str


class FreshEval:
    def __init__(self, model: str = 'solar-pro2', api_key: str = None):
        self.model = model
        self.api_key = api_key or Config.UPSTAGE_API_KEY
        self.client = OpenAI(
            api_key=self.api_key,
            base_url="https://api.upstage.ai/v1/solar"
        )
        self.temperature = 0.0
        self.max_tokens = 256
        self.chat_completions = True
        if model.startswith('gpt-4') or model.startswith('solar'):
            self.num_organic_results = 15
            self.num_related_questions = 3
            self.num_questions_and_answers = 3
            self.num_retrieved_evidences = 15
        else:
            self.num_organic_results = 15
            self.num_related_questions = 2
            self.num_questions_and_answers = 2
            self.num_retrieved_evidences = 5

    def _is_rate_limit_error(self, error: Exception) -> bool:
        """Detect HTTP 429 (rate limit) errors."""
        error_str = str(error)
        error_type = type(error).__name__
        # 1. Check the HTTP status code when the exception carries a response
        if hasattr(error, 'response') and hasattr(error.response, 'status_code'):
            if error.response.status_code == 429:
                # print(f"HTTP 429 error detected: {error.response.status_code}")
                return True
        # 2. Fall back to text-based detection
        error_lower = error_str.lower()
        if ("429" in error_lower or
                "rate" in error_lower or
                "limit" in error_lower or
                "too_many_requests" in error_lower or
                "request limit" in error_lower):
            # print("429 error detected from error text")
            return True
        return False

    def call_llm_api(self, prompt: str, current_date: str) -> str:
        """Call the LLM API (supports key rotation and exponential backoff)."""
        from src.api_key_rotator import get_rotator
        rotator = get_rotator()
        num_keys = len(rotator.keys)
        base_delay = 3.0

        def _make_api_call(eval_instance: 'FreshEval') -> str:
            """Helper that issues a single API call."""
            if eval_instance.chat_completions:
                # Chat completions API
                response = eval_instance.client.chat.completions.create(
                    model=eval_instance.model,
                    temperature=eval_instance.temperature,
                    max_tokens=eval_instance.max_tokens,
                    messages=[
                        {
                            "role": "system",
                            "content": (
                                f"You are a helpful assistant. Respond as concisely as possible. Knowledge cutoff: {current_date}."
                            )
                        },
                        {
                            "role": "user",
                            "content": "What's today's date?"
                        },
                        {
                            "role": "assistant",
                            "content": f"Today is {current_date} in Pacific Standard Time."
                        },
                        {
                            "role": "user",
                            "content": prompt
                        }
                    ],
                )
                return response.choices[0].message.content
            else:
                # Completions API
                response = eval_instance.client.completions.create(
                    model=eval_instance.model,
                    temperature=eval_instance.temperature,
                    max_tokens=eval_instance.max_tokens,
                    prompt=prompt,
                )
                return response.choices[0].text

        # Start with the current key
        current_key = self.api_key
        current_instance = FreshEval(model=self.model, api_key=current_key)

        # Single key: use the plain exponential-backoff logic only
        if num_keys == 1:
            max_retries = 7
            for attempt in range(max_retries):
                try:
                    return _make_api_call(current_instance)
                except Exception as e:
                    if self._is_rate_limit_error(e):
                        if attempt < max_retries - 1:
                            # Exponential backoff with jitter
                            delay = base_delay * (2 ** attempt) + random.uniform(0, 1)
                            time.sleep(delay)
                            continue
                        # else:
                        #     print("Maximum retries exceeded")
                    raise
            # Raise if the loop exits without returning (retries exhausted)
            raise Exception("call_llm_api: maximum retry count exceeded")

        # Two or more keys: rotate keys (with a short wait between switches),
        # cycling through all keys until success, up to three full cycles
        max_attempts = num_keys * 3  # try every key for at most three cycles
        key_attempt_count = 0
        # Start attempting with the current key
        for attempt in range(max_attempts):
            try:
                return _make_api_call(current_instance)  # return immediately on success
            except Exception as e:
                if self._is_rate_limit_error(e):
                    key_attempt_count += 1
                    # Wait 2 seconds before switching to the next key
                    time.sleep(2)
                    current_key = rotator.pick_key()
                    # print("Switching API key")
                    current_instance = FreshEval(model=self.model, api_key=current_key)
                    continue  # keep trying with the next key
                else:
                    # Propagate non-429 errors immediately
                    raise
        # Maximum attempts exceeded (every key was tried several times and failed)
        raise Exception(f"All API keys returned 429 errors (up to {max_attempts} attempts)")

    def call_fresheval(self, mode: str, question: str, evaluation: str, current_date: str) -> str:
        """Run a FreshEval evaluation for a single question."""
        fresheval_question = f'\nquestion: {question}{evaluation}'
        # Prefer the prompt body (prefix + demos) configured via environment variables
        env_prompt_body = None
        if mode == 'Relaxed':
            env_prompt_body = Config.FRESHQA_PROMPT_RELAXED
        elif mode == 'Strict':
            env_prompt_body = Config.FRESHQA_PROMPT_STRICT
        if env_prompt_body and str(env_prompt_body).strip():
            base_prompt = str(env_prompt_body).strip()
        else:
            raise ValueError(f"No evaluation prompt is configured for mode '{mode}'.")
        fresheval_prompt = base_prompt + fresheval_question
        # Evaluate
        answer = self.call_llm_api(fresheval_prompt, current_date)
        return answer

    def extract_ratings(self, response: str) -> Tuple[bool, Dict[str, str]]:
        """Extract the rating from an evaluation response."""
        def _clean(text: str) -> str:
            # Strip decoration/whitespace at both ends, remove inner markup, lowercase
            text = re.sub(r'^[*`_~\s]+|[*`_~\s]+$', '', text)
            text = re.sub(r'[*`_~]', '', text)
            return text.strip().strip('.').strip().lower()

        def _judge(val: str):
            """
            Decide correct/incorrect from a string.
            - 'incorrect' always means FALSE
            - 'partially correct' is ambiguous -> None
            - 'correct' means TRUE
            """
            if re.search(r'(?i)\bincorrect\b', val):
                return 'FALSE'
            if re.search(r'(?i)\bpartial(?:ly)?\s+correct\b', val):
                return None
            if re.search(r'(?i)\bcorrect\b', val):
                return 'TRUE'
            return None

        def _from_label(block_label: str):
            """
            Starting from a label (e.g. 'Final Evaluation' or 'Evaluation'):
            - first try to capture a value on the same line
            - otherwise scan from the label up to the next blank line for a verdict keyword
            """
            # Same-line capture: label, optional markup/colon, then the rest of the line
            same_line = re.search(
                rf'(?i){block_label}\s*(?:[*`_~]*\s*:\s*|:\s*[*`_~]*)\s*([^\r\n]+)',
                response
            )
            if same_line:
                val = _clean(same_line.group(1))
                j = _judge(val)
                if j is not None:
                    return j
            # Label found with no value on the same line (line-break case):
            # scan up to the next blank line (or section start)
            pos = re.search(
                rf'(?i){block_label}\s*(?:[*`_~]*\s*:\s*|:\s*[*`_~]*)',
                response
            )
            if pos:
                tail = response[pos.end():]
                # Only look until the next blank line (consecutive newlines) so we
                # don't scan too far into the response
                m_stop = re.search(r'\n\s*\n', tail)
                segment = tail[:m_stop.start()] if m_stop else tail[:300]  # safe upper bound
                seg_clean = _clean(segment)
                j = _judge(seg_clean)
                if j is not None:
                    return j
            return None

        # 1) 'Final Evaluation' takes priority
        final_judgement = _from_label(r'final\s+evaluation')
        if final_judgement:
            return True, {'rating': final_judgement}
        # 2) 'Evaluation'
        eval_judgement = _from_label('evaluation')
        if eval_judgement:
            return True, {'rating': eval_judgement}
        # 3) Fallback: 'credited' sentences
        if re.search(r'(?i)thus,\s*the\s*response\s*is\s*credited\b', response):
            return True, {'rating': 'TRUE'}
        if re.search(r'(?i)thus,\s*the\s*response\s*is\s*not\s*credited\b', response):
            return True, {'rating': 'FALSE'}
        # 4) Extraction failed
        return False, {'rating': None}

    def evaluate_single_row(self, row: pd.Series, mode: str, current_date: str) -> Dict[str, Any]:
        """Evaluate a single row."""
        question = row['question']
        response = row['model_response']
        correct_answers = [row[f'answer_{i}'] for i in range(10)]
        correct_answers = [str(x) for x in correct_answers if pd.notna(x) and str(x).strip()]
        # If model_response is empty or NaN, mark the row incorrect and return immediately
        if pd.isna(response) or (isinstance(response, str) and response.strip() == ''):
            # print('model_response is empty; setting rating=0')
            row_dict = row.to_dict()
            row_dict['rating'] = 0
            row_dict['explanation'] = "model_response is empty"
            return row_dict
        # Build the evaluation template
        evaluation_template = (
            "\ncorrect answer(s): {correct_answers}"
            "\nresponse: {response}"
            "\ncomment: "
        )
        evaluation = evaluation_template.format(
            correct_answers=' | '.join(correct_answers),
            response=response,
        )
        # Evaluate
        fresheval_response = self.call_fresheval(
            mode=mode,
            question=question,
            evaluation=evaluation,
            current_date=current_date
        )
        is_valid_eval, eval_result = self.extract_ratings(fresheval_response)
        # if is_valid_eval:
        #     print('done')
        # Limit re-evaluation attempts (up to 3)
        max_retries = 3
        retry_count = 0
        # Retry loop
        while not is_valid_eval and retry_count < max_retries:
            retry_count += 1
            # print(f'Invalid evaluation, re-evaluating... ({retry_count}/{max_retries})\n response: {fresheval_response}')
            fresheval_response = self.call_fresheval(
                mode=mode,
                question=question,
                evaluation=evaluation,
                current_date=current_date
            )
            is_valid_eval, eval_result = self.extract_ratings(fresheval_response)
            # if is_valid_eval:
            #     print('done')
        # Fall back to a default value once the retry limit is exceeded
        if not is_valid_eval:
            # print(f'Maximum retries ({max_retries}) exceeded. Using default: rating=0')
            eval_result = {'rating': 0}
            fresheval_response = "Default evaluation after exceeding the retry limit"
        row_dict = row.to_dict()
        row_dict['rating'] = eval_result['rating']
        row_dict['explanation'] = fresheval_response
        # DEBUG: detailed output only when the rating is FALSE
        # if eval_result['rating'] == 'FALSE':
        #     print(f"\n{'='*80}")
        #     print("Question rated FALSE")
        #     print(f"  Mode: {mode}")
        #     print(f"  Question: {question}")
        #     print(f"  Correct Answers: {' | '.join(correct_answers)}")
        #     print(f"  Model Response: {response}")
        #     print("\n  LLM evaluation response:")
        #     print(f"  {fresheval_response}")
        #     print(f"  Final Rating: {eval_result['rating']}")
        #     print(f"{'='*80}\n")
        return row_dict

    def evaluate_dataframe(self, df: pd.DataFrame, mode: str) -> pd.DataFrame:
        """Evaluate every row of a dataframe."""
        freshevals = []
        current_date = get_current_date_str()
        len_df = len(df)
        for index, row in df.iterrows():
            print(f'{mode} evaluation in progress... {index+1}/{len_df}')
            row_dict = self.evaluate_single_row(row, mode, current_date)
            freshevals.append(row_dict)
        return pd.DataFrame(freshevals)
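

# Minimal usage sketch (not part of the original module). It assumes the input
# dataframe carries the columns evaluate_single_row() reads: 'question',
# 'model_response', and 'answer_0'..'answer_9'. The sample values below are
# illustrative only; a valid Config.UPSTAGE_API_KEY (and the key rotator) must
# be configured for the API call to succeed.
if __name__ == '__main__':
    sample = {
        'question': 'Who is the current CEO of OpenAI?',
        'model_response': 'Sam Altman',
    }
    # Unused answer slots stay empty; evaluate_single_row() filters out blanks.
    for i in range(10):
        sample[f'answer_{i}'] = 'Sam Altman' if i == 0 else ''
    df = pd.DataFrame([sample])

    evaluator = FreshEval(model='solar-pro2')
    result = evaluator.evaluate_dataframe(df, mode='Relaxed')
    print(result[['question', 'rating', 'explanation']])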