# app.py (GGUF + llama-cpp-python version)
from fastapi import FastAPI
from pydantic import BaseModel
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import uvicorn
import json
import ast
# 1. Create the FastAPI app instance
app = FastAPI()
# 2. Prepare to load the GGUF model
# # TheBloke's SOLAR model, used as an example.
# # 'repo_id' is the repository that hosts the model; 'filename' is the specific GGUF file inside it.
# model_repo_id = "TheBloke/SOLAR-10.7B-Instruct-v1.0-GGUF"
# model_filename = "solar-10.7b-instruct-v1.0.Q4_K_S.gguf"
# model_repo_id = "lmstudio-community/gemma-2-2b-it-GGUF"
# model_filename = "gemma-2-2b-it-Q4_K_M.gguf"
model_repo_id = "Qwen/Qwen2-0.5B-Instruct-GGUF"
# model_filename = "qwen2-0_5b-instruct-q4_k_m.gguf"
model_filename = "qwen2-0_5b-instruct-q2_k.gguf"
# # Testing with the 7B Qwen 2.5 model instead
# model_repo_id = "Triangle104/Qwen2.5-7B-Instruct-Q4_K_S-GGUF"
# model_filename = "qwen2.5-7b-instruct-q4_k_s.gguf"
# Download the GGUF file from the Hugging Face Hub and get its local path.
# This step runs only once, when the server starts.
model_path = hf_hub_download(repo_id=model_repo_id, filename=model_filename)
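# (hf_hub_download caches files under ~/.cache/huggingface/hub by default, so
#  repeated startups reuse the local copy instead of re-downloading.)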
# Load the GGUF model into memory with llama-cpp-python.
# n_gpu_layers=-1 means offload as much as possible to the GPU; set it to 0 in a CPU-only environment.
llm = Llama(
    model_path=model_path,
    n_ctx=4096,        # Maximum number of tokens the model can handle at once (context window)
    n_threads=8,       # Number of CPU threads to use
    n_gpu_layers=0     # Number of layers to offload to the GPU (-1 means offload as many as possible)
)
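# Optional sanity check, not in the original file: a one-off completion to confirm
# the model loaded. The prompt string here is only an illustration.
# print(llm("### User:\nSay hello.\n### Assistant:\n", max_tokens=16)["choices"][0]["text"])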
# The request body schema is the same as before
class TranslationRequest(BaseModel):
    text: str
# 3. Define the API endpoint
@app.post("/translate")
async def translate_all_in_one(request: TranslationRequest):
    korean_text = request.text
    # Prompt format suited to the GGUF model (Llama-2 Chat style)
    prompt = f"""### User:
You are a helpful translation and pronunciation assistant.
Given the following Korean text, perform three tasks.
1. Translate the text into natural, everyday English.
2. Translate the text into natural, everyday Japanese.
3. Write out the pronunciation of the Japanese translation from task 2 in Korean (Hangul).
Format your response as a single, valid JSON object with the keys "english", "japanese", and "pronunciation".
Korean Text: "{korean_text}"
### Assistant:
"""
    # Run text generation through the model
    output = llm(
        prompt,
        max_tokens=512,
        stop=["### User:", "</s>"],  # Strings that stop response generation
        temperature=0.7,
        top_k=50,
        echo=False  # Do not echo the prompt back in the output
    )
    generated_output = output["choices"][0]["text"].strip()
    try:
        # 1. Clean up unneeded parts before and after the text the model generated
        # (strip surrounding quotes/whitespace and any markdown code fences)
        clean_output = generated_output.strip().strip("'\"")
        if clean_output.startswith("```json"):
            clean_output = clean_output[7:]
        if clean_output.endswith("```"):
            clean_output = clean_output[:-3]
        clean_output = clean_output.strip()
        # 2. Use ast.literal_eval to safely convert the string into a Python dictionary
        # This is exactly what fixes the single-quote problem!
        parsed_data = ast.literal_eval(clean_output)
        # 3. Return the parsed dictionary (FastAPI serializes it to JSON)
        return parsed_data
    except (ValueError, SyntaxError) as e:
        # ast.literal_eval raises ValueError or SyntaxError when parsing fails
        print(f"AST parsing error: {e}")
        print(f"Raw model output: {generated_output}")
        return {"error": "Failed to parse model output as a dictionary", "raw_output": generated_output}
@app.get("/")
def read_root():
return {"message": "GGUF Translation API is running"} |