# app.py
import os
import time
import json
import re
import html
import urllib.parse
from collections import OrderedDict

import gradio as gr
import requests
from bs4 import BeautifulSoup
from langdetect import detect, DetectorFactory

DetectorFactory.seed = 0  # deterministic language detection

# Hugging Face transformers
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Optional OpenAI client (openai>=1.0 SDK). The legacy `error` module no longer
# exists in that SDK, so importing it here would fail and silently disable the
# client even when the package is installed; import only the OpenAI class.
try:
    from openai import OpenAI
except Exception:
    OpenAI = None

# Optional datasets for Tatoeba examples
try:
    from datasets import load_dataset
except Exception:
    load_dataset = None

# ---------- Config ----------
MODEL_NAME = "Helsinki-NLP/opus-mt-en-pt"  # literal MT
MAX_FREE_CANDIDATES = 8
TATOEBA_CONFIG = ("Helsinki-NLP/tatoeba_mt", "eng-por")  # dataset id + config

HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; translator-bot/1.0; +https://example.com/bot)"
}
REQUEST_TIMEOUT = 8  # seconds
SLEEP_BETWEEN_REQUESTS = 0.5  # polite pacing for scrapers

OPENAI_KEY = os.getenv("OPENAI_API_KEY")
openai_client = None
if OPENAI_KEY and OpenAI is not None:
    try:
        openai_client = OpenAI(api_key=OPENAI_KEY)
    except Exception:
        openai_client = None

# Load HF model/tokenizer (literal MT)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

# ---------- Utilities ----------
def safe_request(url):
    """GET page with basic error handling and headers."""
    try:
        r = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        time.sleep(SLEEP_BETWEEN_REQUESTS)
        if r.status_code == 200:
            return r.text
        return None
    except Exception:
        return None

def normalize_text(s):
    if not s:
        return ""
    s = re.sub(r"\s+", " ", s).strip()
    return s


def is_probably_portuguese(text):
    """Detect Portuguese robustly (langdetect + heuristic)."""
    if not text or len(text.strip()) < 2:
        return False
    try:
        lang = detect(text)
        if lang == "pt":
            return True
    except Exception:
        pass
    # fallback heuristic: presence of Portuguese diacritics or common PT glue-words
    pt_markers = ["ão", "â", "ê", "ô", "é", "à", "í", "õ", "ç", " que ", " para ", " com "]
    low = text.lower()
    return any(m in low for m in pt_markers)
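
# Illustrative behaviour (assumptions about typical langdetect output, not guarantees):
#   is_probably_portuguese("não faz mal")  # -> True   ("ão" marker, even if langdetect hesitates)
#   is_probably_portuguese("coração")      # -> True   ("ão" marker)
#   is_probably_portuguese("hello world")  # -> False  (no PT markers, langdetect reports "en")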

def dedupe_preserve_order(items):
    seen = set()
    out = []
    for item in items:
        key = normalize_text(item).lower()
        if key and key not in seen:
            seen.add(key)
            out.append(item)
    return out

# ---------- MT (literal) ----------
def literal_mt(text):
    """Return a short literal PT translation from the HF model, or None on failure."""
    try:
        inputs = tokenizer(text, return_tensors="pt", truncation=True)
        outputs = model.generate(**inputs, max_length=120)
        out = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return normalize_text(out)
    except Exception:
        return None
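
# Illustrative call (the output depends on the downloaded opus-mt-en-pt weights;
# the value shown is an assumption, not a guaranteed result):
#   literal_mt("break the ice")  # -> e.g. "quebrar o gelo", or None if generation fails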

# ---------- Tatoeba examples (corpus-backed idioms) ----------
_tatoeba_cache = None


def load_tatoeba_dataset():
    global _tatoeba_cache
    if _tatoeba_cache is not None:
        return _tatoeba_cache
    if load_dataset is None:
        _tatoeba_cache = None
        return None
    try:
        ds = load_dataset(*TATOEBA_CONFIG, split="train")  # may take time on first run
        _tatoeba_cache = ds
        return ds
    except Exception:
        _tatoeba_cache = None
        return None

def find_tatoeba_candidates(phrase, max_results=3):
    """Search the loaded Tatoeba ENG->POR pairs for examples containing the phrase (best-effort)."""
    ds = load_tatoeba_dataset()
    if ds is None:
        return []
    # safe lowercase match on English side
    phrase_lc = phrase.lower()
    results = []
    try:
        # iterate lazily to avoid loading the whole dataset into memory
        for doc in ds:
            en = doc.get("translation", {}).get("en") or doc.get("en") or ""
            pt = doc.get("translation", {}).get("pt") or doc.get("pt") or ""
            if not en or not pt:
                continue
            if phrase_lc in en.lower():
                # prefer shorter pt strings (phrases) sometimes
                example = normalize_text(pt)
                if example and is_probably_portuguese(example):
                    results.append({"pt": example, "en": normalize_text(en)})
                if len(results) >= max_results:
                    break
    except Exception:
        pass
    return results[:max_results]
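
# Sketch of the expected return shape (illustrative; whether anything is found
# depends on the dataset schema — some tatoeba_mt configs expose
# "sourceString"/"targetString" instead of a "translation" dict, in which case
# this lookup falls through to the empty-string defaults and returns []):
#   find_tatoeba_candidates("break the ice")
#   # -> [{"pt": "...", "en": "..."}, ...]  (up to max_results entries)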

# ---------- Linguee scraper ----------
def linguee_search(phrase, max_results=4):
    """Scrape Linguee search page for phrase candidates (best-effort)."""
    url = "https://www.linguee.com/english-portuguese/search?source=english&query=" + urllib.parse.quote(phrase)
    html_text = safe_request(url)
    candidates = []
    if not html_text:
        return candidates
    soup = BeautifulSoup(html_text, "html.parser")
    # Common Linguee link class
    for a in soup.select("a.dictLink"):
        txt = normalize_text(a.get_text())
        if txt and is_probably_portuguese(txt):
            candidates.append({"pt": txt, "source_url": url})
    # fallback: look for translations in more generic tags
    if not candidates:
        for tag in soup.find_all(["a", "span", "div"]):
            txt = normalize_text(tag.get_text())
            if txt and len(txt.split()) <= 6 and is_probably_portuguese(txt):
                candidates.append({"pt": txt, "source_url": url})
    # dedupe and limit
    seen = set()
    out = []
    for c in candidates:
        key = c["pt"].lower()
        if key not in seen:
            seen.add(key)
            out.append(c)
            if len(out) >= max_results:
                break
    return out

# ---------- Reverso Context scraper ----------
def reverso_search(phrase, max_results=4):
    """Scrape Reverso Context for translation candidates (best-effort)."""
    # Reverso sometimes uses a hyphenated path; rely on safe URL encoding
    url = "https://context.reverso.net/translation/english-portuguese/" + urllib.parse.quote(phrase)
    html_text = safe_request(url)
    candidates = []
    if not html_text:
        return candidates
    soup = BeautifulSoup(html_text, "html.parser")
    # Reverso often shows translations in elements whose class contains 'translation'
    for el in soup.select(".translation, .display, .translations, .translation .text"):
        txt = normalize_text(el.get_text())
        if txt and is_probably_portuguese(txt):
            candidates.append({"pt": txt, "source_url": url})
    # fallback: anchors that may hold translations
    if not candidates:
        for a in soup.find_all("a"):
            txt = normalize_text(a.get_text())
            if txt and len(txt.split()) <= 6 and is_probably_portuguese(txt):
                candidates.append({"pt": txt, "source_url": url})
    # dedupe & limit
    out = []
    seen = set()
    for c in candidates:
        key = c["pt"].lower()
        if key not in seen:
            seen.add(key)
            out.append(c)
            if len(out) >= max_results:
                break
    return out

# ---------- Glosbe scraper ----------
def glosbe_search(phrase, max_results=4):
    """Scrape Glosbe page for EN->PT matches (best-effort)."""
    url = "https://glosbe.com/en/pt/" + urllib.parse.quote(phrase)
    html_text = safe_request(url)
    candidates = []
    if not html_text:
        return candidates
    soup = BeautifulSoup(html_text, "html.parser")
    # Glosbe often lists translations in elements with class 'translation' or table rows
    for el in soup.select(".translation, .meaning, .wrap"):
        txt = normalize_text(el.get_text())
        if txt and is_probably_portuguese(txt):
            candidates.append({"pt": txt, "source_url": url})
    # fallback: anchors
    if not candidates:
        for a in soup.find_all("a"):
            txt = normalize_text(a.get_text())
            if txt and len(txt.split()) <= 6 and is_probably_portuguese(txt):
                candidates.append({"pt": txt, "source_url": url})
    out = []
    seen = set()
    for c in candidates:
        key = c["pt"].lower()
        if key not in seen:
            seen.add(key)
            out.append(c)
            if len(out) >= max_results:
                break
    return out

# ---------- Sinonimos (to expand PT candidates) ----------
def sinonimos_search(pt_phrase, max_results=4):
    url = "https://www.sinonimos.com.br/" + urllib.parse.quote(pt_phrase.replace(" ", "-"))
    html_text = safe_request(url)
    if not html_text:
        return []
    soup = BeautifulSoup(html_text, "html.parser")
    syns = []
    # the site marks synonyms with anchors of class "sinonimo"
    for a in soup.select("a.sinonimo"):
        txt = normalize_text(a.get_text())
        if txt and is_probably_portuguese(txt):
            syns.append(txt)
            if len(syns) >= max_results:
                break
    # fallback: generic anchor text
    if not syns:
        for a in soup.find_all("a"):
            txt = normalize_text(a.get_text())
            if txt and is_probably_portuguese(txt) and len(txt.split()) <= 4:
                syns.append(txt)
                if len(syns) >= max_results:
                    break
    return dedupe_preserve_order(syns)[:max_results]

# ---------- Merge, classify, rank ----------
def classify_candidate(pt_text, literal_text):
    """Return a type label: 'literal' if identical to the literal MT, else 'idiomatic' or 'paraphrase'."""
    if not pt_text:
        return "unknown"
    if normalize_text(pt_text).lower() == normalize_text(literal_text).lower():
        return "literal"
    # short candidates that differ from the literal MT are treated as idiomatic
    if len(pt_text.split()) <= 3:
        return "idiomatic"
    # everything else is a paraphrase
    return "paraphrase"

def rank_candidates(candidates):
    """
    candidates: list of dicts with keys 'pt', 'sources' (list) and optionally 'example'.
    Ranking heuristic (best-effort):
      - prefer items that appear in multiple sources
      - prefer Tatoeba-backed items (they carry a corpus example)
      - break ties by shorter phrase first, keeping source order otherwise
    """
    # count source hits
    for c in candidates:
        c_sources = set(c.get("sources", []))
        c["score"] = len(c_sources)
        # small boost if a Tatoeba example exists
        if c.get("example"):
            c["score"] += 1.5
    # sort by score desc, then shorter phrase first
    candidates_sorted = sorted(candidates, key=lambda x: (-x["score"], len(x.get("pt", "").split())))
    return candidates_sorted
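
# Sketch of input/output (illustrative values, not real scraper output):
#   rank_candidates([
#       {"pt": "quebrar o gelo", "sources": ["Linguee", "Reverso"]},                        # score 2
#       {"pt": "descontrair", "sources": ["Glosbe"]},                                       # score 1
#       {"pt": "quebrar o gelo", "sources": ["Tatoeba"], "example": {"pt": "…", "en": "…"}},  # score 2.5
#   ])
#   # -> Tatoeba-backed entry first, then the two-source entry, then the single-source one.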

# ---------- GPT (hybrid) ----------
def gpt_analyze(phrase):
    """Ask GPT to provide literal, idioms, explanation; returns structured dict or None on failure."""
    if openai_client is None:
        return None
    prompt = (
        "You are a bilingual English → Brazilian Portuguese translation assistant.\n\n"
        f"For the English expression: \"{phrase}\"\n\n"
        "Return a JSON object with these keys:\n"
        " - literal: a short literal word-for-word PT translation (string)\n"
        " - idioms: an array of 2-4 idiomatic or natural Brazilian Portuguese equivalents (strings)\n"
        " - paraphrases: an array of 1-3 short paraphrases in PT that convey the meaning\n"
        " - explanation: a short explanation in Portuguese and/or English of the meaning and nuance\n\n"
        "Only output the JSON (no extra commentary). Keep strings short. If no idiom exists, return plausible paraphrases.\n"
    )
    try:
        # openai>=1.0 client API: chat completions
        resp = openai_client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.3,
            max_tokens=500,
        )
        txt = resp.choices[0].message.content
        # The model should return JSON — try to locate the first {...} block
        first_brace = txt.find("{")
        last_brace = txt.rfind("}")
        if first_brace >= 0 and last_brace > first_brace:
            json_text = txt[first_brace:last_brace + 1]
            return json.loads(json_text)
        # fallback: try to parse the whole reply
        return json.loads(txt)
    except Exception:
        return None
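
# The prompt above asks for JSON shaped roughly like this (illustrative example;
# actual model output will vary):
#   {
#     "literal": "quebrar o gelo",
#     "idioms": ["quebrar o gelo", "descontrair o ambiente"],
#     "paraphrases": ["deixar a conversa mais leve"],
#     "explanation": "Usado para começar uma conversa de forma descontraída."
#   }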

# ---------- Main orchestration ----------
def build_free_stack_candidates(expr):
    """Gather candidates from Tatoeba + Linguee + Reverso + Glosbe + Sinonimos; return (literal, ranked list)."""
    literal = literal_mt(expr) or ""
    candidates = []
    # 1) Tatoeba examples (parallel corpus)
    tatoeba_hits = find_tatoeba_candidates(expr, max_results=4)
    for hit in tatoeba_hits:
        pt = hit.get("pt")
        example_en = hit.get("en")
        candidates.append({
            "pt": pt,
            "sources": ["Tatoeba"],
            "example": {"pt": pt, "en": example_en},
        })
    # 2) Linguee
    ling_hits = linguee_search(expr, max_results=4)
    for h in ling_hits:
        candidates.append({"pt": normalize_text(h["pt"]), "sources": ["Linguee"], "source_url": h.get("source_url")})
    # 3) Reverso
    rev_hits = reverso_search(expr, max_results=3)
    for h in rev_hits:
        candidates.append({"pt": normalize_text(h["pt"]), "sources": ["Reverso"], "source_url": h.get("source_url")})
    # 4) Glosbe
    glos_hits = glosbe_search(expr, max_results=3)
    for h in glos_hits:
        candidates.append({"pt": normalize_text(h["pt"]), "sources": ["Glosbe"], "source_url": h.get("source_url")})
    # 5) Expand using Sinonimos for the top candidate (if one exists)
    if candidates:
        top_pt = normalize_text(candidates[0]["pt"])
        syns = sinonimos_search(top_pt, max_results=4)
        for s in syns:
            candidates.append({"pt": s, "sources": ["Sinonimos"], "source_url": f"https://www.sinonimos.com.br/{urllib.parse.quote(top_pt)}"})
    # dedupe by pt text (preserve order), merging source lists so that
    # rank_candidates' multi-source preference actually takes effect
    seen = {}
    unique = []
    for c in candidates:
        key = normalize_text(c.get("pt", "")).lower()
        if not key:
            continue
        if key in seen:
            seen[key]["sources"] = sorted(set(seen[key].get("sources", [])) | set(c.get("sources", [])))
        else:
            seen[key] = c
            unique.append(c)
    # classify and fill missing info
    for c in unique:
        c["type"] = classify_candidate(c.get("pt", ""), literal)
        if "sources" not in c:
            c["sources"] = ["unknown"]
    # ranking
    ranked = rank_candidates(unique)
    return literal, ranked
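
# Illustrative call (results depend on network access and on the scrapers still
# matching each site's markup):
#   literal, ranked = build_free_stack_candidates("break the ice")
#   # literal -> e.g. "quebrar o gelo"
#   # ranked  -> [{"pt": ..., "sources": [...], "type": ..., "score": ...}, ...]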

def build_response(expr):
    expr = expr.strip()
    if not expr:
        return {"status": "error", "message": "Empty input."}
    # Try the GPT hybrid path first
    gpt_result = None
    if openai_client:
        try:
            gpt_result = gpt_analyze(expr)
        except Exception:
            gpt_result = None
    if gpt_result:
        # Build a structured response from GPT output
        status = "✅ Hybrid Mode: GPT-4o-mini + HF literal MT"
        literal_mt_text = literal_mt(expr) or ""
        options = []
        # literal from GPT if provided, else MT
        literal_from_gpt = gpt_result.get("literal") if isinstance(gpt_result, dict) else None
        literal_text = normalize_text(literal_from_gpt) if literal_from_gpt else literal_mt_text
        options.append({
            "translation": literal_text,
            "type": "literal",
            "sources": ["GPT", "HF-MT"],
            "example": None,
            "confidence": 95,
            "rationale": "Literal translation from GPT (backed by HF literal MT).",
        })
        # idioms
        idioms = gpt_result.get("idioms") or []
        for idi in idioms[:4]:
            idi_norm = normalize_text(idi)
            options.append({
                "translation": idi_norm,
                "type": "idiomatic",
                "sources": ["GPT"],
                "example": None,
                "confidence": 90,
                "rationale": "GPT suggested idiomatic equivalent.",
            })
        # paraphrases
        for para in (gpt_result.get("paraphrases") or [])[:2]:
            para_norm = normalize_text(para)
            options.append({
                "translation": para_norm,
                "type": "paraphrase",
                "sources": ["GPT"],
                "example": None,
                "confidence": 85,
                "rationale": "GPT paraphrase to capture meaning.",
            })