Commit 7fab6d4 · Parent: 2e4a760
feat: implement HFInferenceJudgeHandler for free-tier AI analysis
Replace MockJudgeHandler with real AI analysis using the HuggingFace Inference API:
- Add HFInferenceJudgeHandler with chat_completion API
- Model fallback chain: Llama 3.1 → Mistral → Zephyr (ungated)
- Robust JSON extraction (handles markdown blocks, nested braces)
- Tenacity retry with exponential backoff for rate limits
- Fix app.py to use HF Inference when no paid API keys present
Priority: User API key → Env API key → HF Inference (free)
Hackathon judges now get real AI analysis without needing API keys.
Set HF_TOKEN as Space secret for best model (Llama 3.1).
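In outline, the key-resolution order described above amounts to the following (a minimal sketch, not the exact code from `src/app.py`; the real version also builds the judge handler and a backend label):

```python
# Minimal sketch of the priority chain described above (illustrative only).
import os

def pick_backend(user_api_key: str | None) -> str:
    if user_api_key:  # 1. User-provided API key (BYOK)
        return "paid-api (user key)"
    if os.getenv("OPENAI_API_KEY") or os.getenv("ANTHROPIC_API_KEY"):
        return "paid-api (env key)"  # 2. Key configured in the environment
    return "hf-inference (free)"  # 3. Free HuggingFace Inference fallback
```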
- .env.example +14 -0
- docs/implementation/03_phase_judge.md +414 -14
- docs/implementation/04_phase_ui.md +84 -18
- pyproject.toml +1 -0
- src/agent_factory/judges.py +199 -1
- src/app.py +45 -34
- src/prompts/report.py +4 -4
- tests/unit/agent_factory/test_judges_hf.py +137 -0
- uv.lock +2 -0
.env.example
CHANGED

@@ -11,6 +11,20 @@ ANTHROPIC_API_KEY=sk-ant-your-key-here
 OPENAI_MODEL=gpt-5.1
 ANTHROPIC_MODEL=claude-sonnet-4-5-20250929
 
+# ============== HUGGINGFACE (FREE TIER) ==============
+
+# HuggingFace Token - enables Llama 3.1 (best quality free model)
+# Get yours at: https://huggingface.co/settings/tokens
+#
+# WITHOUT HF_TOKEN: Falls back to ungated models (zephyr-7b-beta)
+# WITH HF_TOKEN: Uses Llama 3.1 8B Instruct (requires accepting license)
+#
+# For HuggingFace Spaces deployment:
+# Set this as a "Secret" in Space Settings → Variables and secrets
+# Users/judges don't need their own token - the Space secret is used
+#
+HF_TOKEN=hf_your-token-here
+
 # ============== AGENT CONFIGURATION ==============
 
 MAX_ITERATIONS=10
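Worth noting for this file: `InferenceClient` picks the token up from the environment on its own, so the Space secret never has to be passed around explicitly. A minimal sketch, assuming `HF_TOKEN` is exported as described above:

```python
# Sketch: InferenceClient reads HF_TOKEN from the environment automatically.
import os
from huggingface_hub import InferenceClient

client = InferenceClient()  # token picked up from HF_TOKEN if present
print("Gated models available:", bool(os.getenv("HF_TOKEN")))
```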
docs/implementation/03_phase_judge.md
CHANGED

@@ -350,20 +350,333 @@ class JudgeHandler:
 )
 
 
-class MockJudgeHandler:
+class HFInferenceJudgeHandler:
     """
-    Mock JudgeHandler for demo mode without LLM calls.
+    JudgeHandler using HuggingFace Inference API for FREE LLM calls.
+
+    This is the DEFAULT for demo mode - provides real AI analysis without
+    requiring users to have OpenAI/Anthropic API keys.
 
+    Model Fallback Chain (handles gated models and rate limits):
+    1. meta-llama/Llama-3.1-8B-Instruct (best quality, requires HF_TOKEN)
+    2. mistralai/Mistral-7B-Instruct-v0.3 (good quality, may require token)
+    3. HuggingFaceH4/zephyr-7b-beta (ungated, always works)
+
+    Rate Limit Handling:
+    - Exponential backoff with 3 retries
+    - Falls back to next model on persistent 429/503 errors
     """
 
+    # Model fallback chain: gated (best) → ungated (fallback)
+    FALLBACK_MODELS = [
+        "meta-llama/Llama-3.1-8B-Instruct",  # Best quality (gated)
+        "mistralai/Mistral-7B-Instruct-v0.3",  # Good quality
+        "HuggingFaceH4/zephyr-7b-beta",  # Ungated fallback
+    ]
+
+    def __init__(self, model_id: str | None = None):
+        """
+        Initialize with HF Inference client.
+
+        Args:
+            model_id: HuggingFace model ID. If None, uses fallback chain.
+                Will automatically use HF_TOKEN from env if available.
+        """
+        from huggingface_hub import InferenceClient
+        import os
+
+        self.model_id = model_id or self.FALLBACK_MODELS[0]
+        self._fallback_models = self.FALLBACK_MODELS.copy()
+
+        # InferenceClient auto-reads HF_TOKEN from env
+        self.client = InferenceClient(model=self.model_id)
+        self._has_token = bool(os.getenv("HF_TOKEN"))
+
+        self.call_count = 0
+        self.last_question = None
+        self.last_evidence = None
+
+        logger.info(
+            "HFInferenceJudgeHandler initialized",
+            model=self.model_id,
+            has_token=self._has_token,
+        )
+
+    def _extract_json(self, response: str) -> dict | None:
+        """
+        Robustly extract JSON from LLM response.
+
+        Handles:
+        - Raw JSON: {"key": "value"}
+        - Markdown code blocks: ```json\n{"key": "value"}\n```
+        - Preamble text: "Here is the JSON:\n{"key": "value"}"
+        - Nested braces: {"outer": {"inner": "value"}}
+
+        Returns:
+            Parsed dict or None if extraction fails
+        """
+        import json
+        import re
+
+        # Strategy 1: Try markdown code block first
+        code_block_match = re.search(r"```(?:json)?\s*(\{[\s\S]*?\})\s*```", response)
+        if code_block_match:
+            try:
+                return json.loads(code_block_match.group(1))
+            except json.JSONDecodeError:
+                pass
+
+        # Strategy 2: Find outermost JSON object with brace matching
+        # This handles nested objects correctly
+        start = response.find("{")
+        if start == -1:
+            return None
+
+        depth = 0
+        end = start
+        in_string = False
+        escape_next = False
+
+        for i, char in enumerate(response[start:], start):
+            if escape_next:
+                escape_next = False
+                continue
+
+            if char == "\\":
+                escape_next = True
+                continue
+
+            if char == '"' and not escape_next:
+                in_string = not in_string
+                continue
+
+            if in_string:
+                continue
+
+            if char == "{":
+                depth += 1
+            elif char == "}":
+                depth -= 1
+                if depth == 0:
+                    end = i + 1
+                    break
+
+        if depth == 0 and end > start:
+            try:
+                return json.loads(response[start:end])
+            except json.JSONDecodeError:
+                pass
+
+        return None
+
+    async def _call_with_retry(
+        self,
+        messages: list[dict],
+        max_retries: int = 3,
+    ) -> str:
+        """
+        Call HF Inference with exponential backoff retry.
+
+        Args:
+            messages: Chat messages in OpenAI format
+            max_retries: Max retry attempts
+
+        Returns:
+            Response text
+
+        Raises:
+            Exception if all retries fail
+        """
+        import asyncio
+        import time
+
+        last_error = None
+
+        for attempt in range(max_retries):
+            try:
+                loop = asyncio.get_event_loop()
+                response = await loop.run_in_executor(
+                    None,
+                    lambda: self.client.chat_completion(
+                        messages=messages,
+                        max_tokens=1024,
+                        temperature=0.1,
+                    )
+                )
+                return response.choices[0].message.content
+
+            except Exception as e:
+                last_error = e
+                error_str = str(e).lower()
+
+                # Check if rate limited or service unavailable
+                is_rate_limit = "429" in error_str or "rate" in error_str
+                is_unavailable = "503" in error_str or "unavailable" in error_str
+                is_auth_error = "401" in error_str or "403" in error_str
+
+                if is_auth_error:
+                    # Gated model without token - try fallback immediately
+                    logger.warning("Auth error, trying fallback model", error=str(e))
+                    if self._try_fallback_model():
+                        continue
+                    raise
+
+                if is_rate_limit or is_unavailable:
+                    # Exponential backoff: 1s, 2s, 4s
+                    wait_time = 2 ** attempt
+                    logger.warning(
+                        "Rate limited, retrying",
+                        attempt=attempt + 1,
+                        wait=wait_time,
+                        error=str(e),
+                    )
+                    await asyncio.sleep(wait_time)
+                    continue
+
+                # Other errors - raise immediately
+                raise
+
+        # All retries failed - try fallback model
+        if self._try_fallback_model():
+            return await self._call_with_retry(messages, max_retries=1)
+
+        raise last_error or Exception("All retries failed")
+
+    def _try_fallback_model(self) -> bool:
+        """
+        Try to switch to a fallback model.
+
+        Returns:
+            True if successfully switched, False if no fallbacks left
+        """
+        from huggingface_hub import InferenceClient
+
+        # Remove current model from fallbacks
+        if self.model_id in self._fallback_models:
+            self._fallback_models.remove(self.model_id)
+
+        if not self._fallback_models:
+            return False
+
+        # Switch to next model
+        self.model_id = self._fallback_models[0]
+        self.client = InferenceClient(model=self.model_id)
+        logger.info("Switched to fallback model", model=self.model_id)
+        return True
+
+    async def assess(
+        self,
+        question: str,
+        evidence: List[Evidence],
+    ) -> JudgeAssessment:
         """
+        Assess evidence using HuggingFace Inference API.
+
+        Uses chat_completion API for model-agnostic prompts.
+        Includes retry logic and fallback model chain.
 
         Args:
+            question: The user's research question
+            evidence: List of Evidence objects from search
+
+        Returns:
+            JudgeAssessment with evaluation results
         """
+        self.call_count += 1
+        self.last_question = question
+        self.last_evidence = evidence
+
+        # Format the prompt
+        if evidence:
+            user_prompt = format_user_prompt(question, evidence)
+        else:
+            user_prompt = format_empty_evidence_prompt(question)
+
+        # Build messages in OpenAI-compatible format (works with chat_completion)
+        json_schema = """{
+    "details": {
+        "mechanism_score": <int 0-10>,
+        "mechanism_reasoning": "<string>",
+        "clinical_evidence_score": <int 0-10>,
+        "clinical_reasoning": "<string>",
+        "drug_candidates": ["<string>", ...],
+        "key_findings": ["<string>", ...]
+    },
+    "sufficient": <bool>,
+    "confidence": <float 0-1>,
+    "recommendation": "continue" | "synthesize",
+    "next_search_queries": ["<string>", ...],
+    "reasoning": "<string>"
+}"""
+
+        messages = [
+            {
+                "role": "system",
+                "content": f"{SYSTEM_PROMPT}\n\nIMPORTANT: Respond with ONLY valid JSON matching this schema:\n{json_schema}",
+            },
+            {
+                "role": "user",
+                "content": user_prompt,
+            },
+        ]
+
+        try:
+            # Call with retry and fallback
+            response = await self._call_with_retry(messages)
+
+            # Robust JSON extraction
+            data = self._extract_json(response)
+            if data:
+                return JudgeAssessment(**data)
+
+            # If no valid JSON, return fallback
+            logger.warning(
+                "HF Inference returned invalid JSON",
+                response=response[:200],
+                model=self.model_id,
+            )
+            return self._create_fallback_assessment(question, "Invalid JSON response")
+
+        except Exception as e:
+            logger.error("HF Inference failed", error=str(e), model=self.model_id)
+            return self._create_fallback_assessment(question, str(e))
+
+    def _create_fallback_assessment(
+        self,
+        question: str,
+        error: str,
+    ) -> JudgeAssessment:
+        """Create a fallback assessment when inference fails."""
+        return JudgeAssessment(
+            details=AssessmentDetails(
+                mechanism_score=0,
+                mechanism_reasoning=f"Assessment failed: {error}",
+                clinical_evidence_score=0,
+                clinical_reasoning=f"Assessment failed: {error}",
+                drug_candidates=[],
+                key_findings=[],
+            ),
+            sufficient=False,
+            confidence=0.0,
+            recommendation="continue",
+            next_search_queries=[
+                f"{question} mechanism",
+                f"{question} clinical trials",
+                f"{question} drug candidates",
+            ],
+            reasoning=f"HF Inference failed: {error}. Recommend retrying.",
+        )
+
+
+class MockJudgeHandler:
+    """
+    Mock JudgeHandler for UNIT TESTING ONLY.
+
+    NOT for production use. Use HFInferenceJudgeHandler for demo mode.
+    """
+
+    def __init__(self, mock_response: JudgeAssessment | None = None):
+        """Initialize with optional mock response for testing."""
         self.mock_response = mock_response
         self.call_count = 0
         self.last_question = None

@@ -374,7 +687,7 @@ class MockJudgeHandler:
         question: str,
         evidence: List[Evidence],
     ) -> JudgeAssessment:
-        """Return the mock response."""
+        """Return the mock response (for testing only)."""
         self.call_count += 1
         self.last_question = question
         self.last_evidence = evidence

@@ -382,21 +695,21 @@ class MockJudgeHandler:
         if self.mock_response:
             return self.mock_response
 
-        # Default mock response
+        # Default mock response for tests
         return JudgeAssessment(
             details=AssessmentDetails(
                 mechanism_score=7,
+                mechanism_reasoning="Mock assessment for testing",
                 clinical_evidence_score=6,
+                clinical_reasoning="Mock assessment for testing",
+                drug_candidates=["TestDrug"],
+                key_findings=["Test finding"],
             ),
             sufficient=len(evidence) >= 3,
             confidence=0.75,
             recommendation="synthesize" if len(evidence) >= 3 else "continue",
             next_search_queries=["query 1", "query 2"] if len(evidence) < 3 else [],
+            reasoning="Mock assessment for unit testing only",
         )
 ```
 

@@ -547,8 +860,89 @@ class TestJudgeHandler:
         assert "failed" in result.reasoning.lower()
 
 
+class TestHFInferenceJudgeHandler:
+    """Tests for HFInferenceJudgeHandler."""
+
+    @pytest.mark.asyncio
+    async def test_extract_json_raw(self):
+        """Should extract raw JSON."""
+        from src.agent_factory.judges import HFInferenceJudgeHandler
+
+        handler = HFInferenceJudgeHandler.__new__(HFInferenceJudgeHandler)
+        # Bypass __init__ for unit testing extraction
+
+        result = handler._extract_json('{"key": "value"}')
+        assert result == {"key": "value"}
+
+    @pytest.mark.asyncio
+    async def test_extract_json_markdown_block(self):
+        """Should extract JSON from markdown code block."""
+        from src.agent_factory.judges import HFInferenceJudgeHandler
+
+        handler = HFInferenceJudgeHandler.__new__(HFInferenceJudgeHandler)
+
+        response = '''Here is the assessment:
+```json
+{"key": "value", "nested": {"inner": 1}}
+```
+'''
+        result = handler._extract_json(response)
+        assert result == {"key": "value", "nested": {"inner": 1}}
+
+    @pytest.mark.asyncio
+    async def test_extract_json_with_preamble(self):
+        """Should extract JSON with preamble text."""
+        from src.agent_factory.judges import HFInferenceJudgeHandler
+
+        handler = HFInferenceJudgeHandler.__new__(HFInferenceJudgeHandler)
+
+        response = 'Here is your JSON response:\n{"sufficient": true, "confidence": 0.85}'
+        result = handler._extract_json(response)
+        assert result == {"sufficient": True, "confidence": 0.85}
+
+    @pytest.mark.asyncio
+    async def test_extract_json_nested_braces(self):
+        """Should handle nested braces correctly."""
+        from src.agent_factory.judges import HFInferenceJudgeHandler
+
+        handler = HFInferenceJudgeHandler.__new__(HFInferenceJudgeHandler)
+
+        response = '{"details": {"mechanism_score": 8}, "reasoning": "test"}'
+        result = handler._extract_json(response)
+        assert result["details"]["mechanism_score"] == 8
+
+    @pytest.mark.asyncio
+    async def test_hf_handler_uses_fallback_models(self):
+        """HFInferenceJudgeHandler should have fallback model chain."""
+        from src.agent_factory.judges import HFInferenceJudgeHandler
+
+        # Check class has fallback models defined
+        assert len(HFInferenceJudgeHandler.FALLBACK_MODELS) >= 3
+        assert "zephyr-7b-beta" in HFInferenceJudgeHandler.FALLBACK_MODELS[-1]
+
+    @pytest.mark.asyncio
+    async def test_hf_handler_fallback_on_auth_error(self):
+        """Should fall back to ungated model on auth error."""
+        from src.agent_factory.judges import HFInferenceJudgeHandler
+        from unittest.mock import MagicMock, patch
+
+        with patch("src.agent_factory.judges.InferenceClient") as mock_client_class:
+            # First call raises 403, second succeeds
+            mock_client = MagicMock()
+            mock_client.chat_completion.side_effect = [
+                Exception("403 Forbidden: gated model"),
+                MagicMock(choices=[MagicMock(message=MagicMock(content='{"sufficient": false}'))])
+            ]
+            mock_client_class.return_value = mock_client
+
+            handler = HFInferenceJudgeHandler()
+            # Manually trigger fallback test
+            assert handler._try_fallback_model() is True
+            assert handler.model_id != "meta-llama/Llama-3.1-8B-Instruct"
+
+
 class TestMockJudgeHandler:
-    """Tests for MockJudgeHandler."""
+    """Tests for MockJudgeHandler (UNIT TESTING ONLY)."""
 
     @pytest.mark.asyncio
     async def test_mock_handler_returns_default(self):

@@ -641,9 +1035,15 @@ dependencies = [
     "pydantic-ai>=0.0.16",
     "openai>=1.0.0",
     "anthropic>=0.18.0",
+    "huggingface-hub>=0.20.0",  # For HFInferenceJudgeHandler (FREE LLM)
 ]
 ```
 
+**Note**: `huggingface-hub` is required for the free tier to work. It:
+- Provides `InferenceClient` for API calls
+- Auto-reads `HF_TOKEN` from environment (optional, for gated models)
+- Works without any token for ungated models like `zephyr-7b-beta`
+
 ---
 
 ## 7. Configuration (`src/utils/config.py`)
docs/implementation/04_phase_ui.md
CHANGED

@@ -408,33 +408,65 @@ from typing import AsyncGenerator
 
 from src.orchestrator import Orchestrator
 from src.tools.pubmed import PubMedTool
+from src.tools.clinicaltrials import ClinicalTrialsTool
+from src.tools.biorxiv import BioRxivTool
 from src.tools.search_handler import SearchHandler
+from src.agent_factory.judges import JudgeHandler, HFInferenceJudgeHandler
 from src.utils.models import OrchestratorConfig, AgentEvent
 
 
-def create_orchestrator(use_mock: bool = False) -> Orchestrator:
+def create_orchestrator(
+    user_api_key: str | None = None,
+    api_provider: str = "openai",
+) -> tuple[Orchestrator, str]:
     """
     Create an orchestrator instance.
 
     Args:
+        user_api_key: Optional user-provided API key (BYOK)
+        api_provider: API provider ("openai" or "anthropic")
 
     Returns:
-        Configured Orchestrator instance
+        Tuple of (Configured Orchestrator instance, backend_name)
+
+        Priority:
+        1. User-provided API key → JudgeHandler (OpenAI/Anthropic)
+        2. Environment API key → JudgeHandler (OpenAI/Anthropic)
+        3. No key → HFInferenceJudgeHandler (FREE, automatic fallback chain)
+
+        HF Inference Fallback Chain:
+        1. Llama 3.1 8B (requires HF_TOKEN for gated model)
+        2. Mistral 7B (may require token)
+        3. Zephyr 7B (ungated, always works)
     """
+    import os
+
     # Create search tools
     search_handler = SearchHandler(
+        tools=[PubMedTool(), ClinicalTrialsTool(), BioRxivTool()],
        timeout=30.0,
     )
 
+    # Determine which judge to use
+    has_env_key = bool(os.getenv("OPENAI_API_KEY") or os.getenv("ANTHROPIC_API_KEY"))
+    has_user_key = bool(user_api_key)
+    has_hf_token = bool(os.getenv("HF_TOKEN"))
+
+    if has_user_key:
+        # User provided their own key
+        judge_handler = JudgeHandler(model=None)
+        backend_name = f"your {api_provider.upper()} API key"
+    elif has_env_key:
+        # Environment has API key configured
+        judge_handler = JudgeHandler(model=None)
+        backend_name = "configured API key"
     else:
+        # Use FREE HuggingFace Inference with automatic fallback
+        judge_handler = HFInferenceJudgeHandler()
+        if has_hf_token:
+            backend_name = "HuggingFace Inference (Llama 3.1)"
+        else:
+            backend_name = "HuggingFace Inference (free tier)"
 
     # Create orchestrator
     config = OrchestratorConfig(

@@ -446,12 +478,14 @@ def create_orchestrator(use_mock: bool = False) -> Orchestrator:
         search_handler=search_handler,
         judge_handler=judge_handler,
         config=config,
-    )
+    ), backend_name
 
 
 async def research_agent(
     message: str,
     history: list[dict],
+    api_key: str = "",
+    api_provider: str = "openai",
 ) -> AsyncGenerator[str, None]:
     """
     Gradio chat function that runs the research agent.

@@ -459,6 +493,8 @@ async def research_agent(
     Args:
         message: User's research question
         history: Chat history (Gradio format)
+        api_key: Optional user-provided API key (BYOK)
+        api_provider: API provider ("openai" or "anthropic")
 
     Yields:
         Markdown-formatted responses for streaming

@@ -467,10 +503,31 @@ async def research_agent(
         yield "Please enter a research question."
         return
 
-    # Create orchestrator (use mock if no API key)
     import os
+
+    # Clean user-provided API key
+    user_api_key = api_key.strip() if api_key else None
+
+    # Create orchestrator with appropriate judge
+    orchestrator, backend_name = create_orchestrator(
+        user_api_key=user_api_key,
+        api_provider=api_provider,
+    )
+
+    # Determine icon based on backend
+    has_hf_token = bool(os.getenv("HF_TOKEN"))
+    if "HuggingFace" in backend_name:
+        icon = "🤗"
+        extra_note = (
+            "\n*For premium analysis, enter an OpenAI or Anthropic API key.*"
+            if not has_hf_token else ""
+        )
+    else:
+        icon = "🔑"
+        extra_note = ""
+
+    # Inform user which backend is being used
+    yield f"{icon} **Using {backend_name}**{extra_note}\n\n"
 
     # Run the agent and stream events
     response_parts = []

@@ -952,15 +1009,22 @@ uv run python -m src.app
 import asyncio
 from src.orchestrator import Orchestrator
 from src.tools.pubmed import PubMedTool
+from src.tools.biorxiv import BioRxivTool
+from src.tools.clinicaltrials import ClinicalTrialsTool
 from src.tools.search_handler import SearchHandler
-from src.agent_factory.judges import MockJudgeHandler
+from src.agent_factory.judges import HFInferenceJudgeHandler, MockJudgeHandler
 from src.utils.models import OrchestratorConfig
 
 async def test_full_flow():
     # Create components
+    search_handler = SearchHandler([PubMedTool(), ClinicalTrialsTool(), BioRxivTool()])
+
+    # Option 1: Use FREE HuggingFace Inference (real AI analysis)
+    judge_handler = HFInferenceJudgeHandler()
+
+    # Option 2: Use MockJudgeHandler for UNIT TESTING ONLY
+    # judge_handler = MockJudgeHandler()
+
     config = OrchestratorConfig(max_iterations=3)
 
     # Create orchestrator

@@ -980,6 +1044,8 @@ async def test_full_flow():
     asyncio.run(test_full_flow())
 ```
 
+**Important**: `MockJudgeHandler` is for **unit testing only**. For actual demo/production use, always use `HFInferenceJudgeHandler` (free) or `JudgeHandler` (with API key).
+
 ---
 
 ## 10. Deployment Verification
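As a complement to the **Important** note above, a unit test built on `MockJudgeHandler` might look like this (a sketch based on the default behavior shown in the diff; the test name is made up):

```python
# Sketch: MockJudgeHandler in a unit test (name and assertions illustrative).
import pytest

from src.agent_factory.judges import MockJudgeHandler


@pytest.mark.asyncio
async def test_mock_judge_default_recommendation():
    handler = MockJudgeHandler()
    result = await handler.assess("test question", [])  # no evidence yet
    assert handler.call_count == 1
    assert result.recommendation == "continue"  # fewer than 3 evidence items
```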
pyproject.toml
CHANGED

@@ -16,6 +16,7 @@ dependencies = [
     "httpx>=0.27",  # Async HTTP client (PubMed)
     "beautifulsoup4>=4.12",  # HTML parsing
     "xmltodict>=0.13",  # PubMed XML -> dict
+    "huggingface-hub>=0.20.0",  # Hugging Face Inference API
     # UI
     "gradio[mcp]>=6.0.0",  # Chat interface with MCP server support (6.0 required for css in launch())
     # Utils
src/agent_factory/judges.py
CHANGED

@@ -1,13 +1,17 @@
 """Judge handler for evidence assessment using PydanticAI."""
 
+import asyncio
+import json
+from typing import Any, ClassVar
 
 import structlog
+from huggingface_hub import InferenceClient
 from pydantic_ai import Agent
 from pydantic_ai.models.anthropic import AnthropicModel
 from pydantic_ai.models.openai import OpenAIModel
 from pydantic_ai.providers.anthropic import AnthropicProvider
 from pydantic_ai.providers.openai import OpenAIProvider
+from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential
 
 from src.prompts.judge import (
     SYSTEM_PROMPT,

@@ -146,6 +150,200 @@ class JudgeHandler:
 )
 
 
+class HFInferenceJudgeHandler:
+    """
+    JudgeHandler using HuggingFace Inference API for FREE LLM calls.
+    Defaults to Llama-3.1-8B-Instruct (requires HF_TOKEN) or falls back to public models.
+    """
+
+    FALLBACK_MODELS: ClassVar[list[str]] = [
+        "meta-llama/Llama-3.1-8B-Instruct",  # Primary (Gated)
+        "mistralai/Mistral-7B-Instruct-v0.3",  # Secondary
+        "HuggingFaceH4/zephyr-7b-beta",  # Fallback (Ungated)
+    ]
+
+    def __init__(self, model_id: str | None = None) -> None:
+        """
+        Initialize with HF Inference client.
+
+        Args:
+            model_id: Optional specific model ID. If None, uses FALLBACK_MODELS chain.
+        """
+        self.model_id = model_id
+        # Will automatically use HF_TOKEN from env if available
+        self.client = InferenceClient()
+        self.call_count = 0
+        self.last_question: str | None = None
+        self.last_evidence: list[Evidence] | None = None
+
+    async def assess(
+        self,
+        question: str,
+        evidence: list[Evidence],
+    ) -> JudgeAssessment:
+        """
+        Assess evidence using HuggingFace Inference API.
+        Attempts models in order until one succeeds.
+        """
+        self.call_count += 1
+        self.last_question = question
+        self.last_evidence = evidence
+
+        # Format the user prompt
+        if evidence:
+            user_prompt = format_user_prompt(question, evidence)
+        else:
+            user_prompt = format_empty_evidence_prompt(question)
+
+        models_to_try = [self.model_id] if self.model_id else self.FALLBACK_MODELS
+        last_error = None
+
+        for model in models_to_try:
+            try:
+                return await self._call_with_retry(model, user_prompt, question)
+            except Exception as e:
+                logger.warning(f"Model {model} failed", error=str(e))
+                last_error = e
+                continue
+
+        # All models failed
+        logger.error("All HF models failed", error=str(last_error))
+        return self._create_fallback_assessment(question, str(last_error))
+
+    @retry(
+        stop=stop_after_attempt(3),
+        wait=wait_exponential(multiplier=1, min=1, max=4),
+        retry=retry_if_exception_type(Exception),
+        reraise=True,
+    )
+    async def _call_with_retry(self, model: str, prompt: str, question: str) -> JudgeAssessment:
+        """Make API call with retry logic using chat_completion."""
+        loop = asyncio.get_running_loop()
+
+        # Build messages for chat_completion (model-agnostic)
+        messages = [
+            {
+                "role": "system",
+                "content": f"""{SYSTEM_PROMPT}
+
+IMPORTANT: Respond with ONLY valid JSON matching this schema:
+{{
+    "details": {{
+        "mechanism_score": <int 0-10>,
+        "mechanism_reasoning": "<string>",
+        "clinical_evidence_score": <int 0-10>,
+        "clinical_reasoning": "<string>",
+        "drug_candidates": ["<string>", ...],
+        "key_findings": ["<string>", ...]
+    }},
+    "sufficient": <bool>,
+    "confidence": <float 0-1>,
+    "recommendation": "continue" | "synthesize",
+    "next_search_queries": ["<string>", ...],
+    "reasoning": "<string>"
+}}""",
+            },
+            {"role": "user", "content": prompt},
+        ]
+
+        # Use chat_completion (conversational task - supported by all models)
+        response = await loop.run_in_executor(
+            None,
+            lambda: self.client.chat_completion(
+                messages=messages,
+                model=model,
+                max_tokens=1024,
+                temperature=0.1,
+            ),
+        )
+
+        # Extract content from response
+        content = response.choices[0].message.content
+        if not content:
+            raise ValueError("Empty response from model")
+
+        # Extract and parse JSON
+        json_data = self._extract_json(content)
+        if not json_data:
+            raise ValueError("No valid JSON found in response")
+
+        return JudgeAssessment(**json_data)
+
+    def _extract_json(self, text: str) -> dict[str, Any] | None:
+        """
+        Robust JSON extraction that handles markdown blocks and nested braces.
+        """
+        text = text.strip()
+
+        # Remove markdown code blocks if present
+        if "```json" in text:
+            text = text.split("```json")[1].split("```")[0]
+        elif "```" in text:
+            text = text.split("```")[1].split("```")[0]
+
+        text = text.strip()
+
+        # Find first '{'
+        start_idx = text.find("{")
+        if start_idx == -1:
+            return None
+
+        # Stack-based parsing ignoring chars in strings
+        count = 0
+        in_string = False
+        escape = False
+
+        for i, char in enumerate(text[start_idx:], start=start_idx):
+            if in_string:
+                if escape:
+                    escape = False
+                elif char == "\\":
+                    escape = True
+                elif char == '"':
+                    in_string = False
+            elif char == '"':
+                in_string = True
+            elif char == "{":
+                count += 1
+            elif char == "}":
+                count -= 1
+                if count == 0:
+                    try:
+                        result = json.loads(text[start_idx : i + 1])
+                        if isinstance(result, dict):
+                            return result
+                        return None
+                    except json.JSONDecodeError:
+                        return None
+
+        return None
+
+    def _create_fallback_assessment(
+        self,
+        question: str,
+        error: str,
+    ) -> JudgeAssessment:
+        """Create a fallback assessment when inference fails."""
+        return JudgeAssessment(
+            details=AssessmentDetails(
+                mechanism_score=0,
+                mechanism_reasoning=f"Assessment failed: {error}",
+                clinical_evidence_score=0,
+                clinical_reasoning=f"Assessment failed: {error}",
+                drug_candidates=[],
+                key_findings=[],
+            ),
+            sufficient=False,
+            confidence=0.0,
+            recommendation="continue",
+            next_search_queries=[
+                f"{question} mechanism",
+                f"{question} clinical trials",
+            ],
+            reasoning=f"HF Inference failed: {error}. Recommend configuring OpenAI/Anthropic key.",
+        )
+
+
 class MockJudgeHandler:
     """
     Mock JudgeHandler for demo mode without LLM calls.
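For a quick manual check of the new handler, something like the following should exercise the full fallback chain (a sketch; the question is illustrative, and the empty evidence list takes the `format_empty_evidence_prompt` path, so it makes a real network call):

```python
# Sketch: one-off smoke run of HFInferenceJudgeHandler (illustrative).
import asyncio

from src.agent_factory.judges import HFInferenceJudgeHandler


async def main() -> None:
    handler = HFInferenceJudgeHandler()
    assessment = await handler.assess("Can metformin be repurposed for glioblastoma?", [])
    print(assessment.recommendation, assessment.confidence)


asyncio.run(main())
```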
src/app.py
CHANGED

@@ -10,7 +10,7 @@ from pydantic_ai.models.openai import OpenAIModel
 from pydantic_ai.providers.anthropic import AnthropicProvider
 from pydantic_ai.providers.openai import OpenAIProvider
 
-from src.agent_factory.judges import JudgeHandler, MockJudgeHandler
+from src.agent_factory.judges import HFInferenceJudgeHandler, JudgeHandler, MockJudgeHandler
 from src.mcp_tools import (
     analyze_hypothesis,
     search_all_sources,

@@ -32,7 +32,7 @@ def configure_orchestrator(
     mode: str = "simple",
     user_api_key: str | None = None,
     api_provider: str = "openai",
-) -> Any:
+) -> tuple[Any, str]:
     """
     Create an orchestrator instance.
 

@@ -43,7 +43,7 @@ def configure_orchestrator(
     api_provider: API provider ("openai" or "anthropic")
 
     Returns:
+        Tuple of (Orchestrator instance, backend_name)
     """
     # Create orchestrator config
     config = OrchestratorConfig(

@@ -57,12 +57,21 @@ def configure_orchestrator(
         timeout=config.search_timeout,
     )
 
-    judge_handler: JudgeHandler | MockJudgeHandler
+    # Create judge (mock, real, or free tier)
+    judge_handler: JudgeHandler | MockJudgeHandler | HFInferenceJudgeHandler
+    backend_info = "Unknown"
+
+    # 1. Forced Mock (Unit Testing)
     if use_mock:
         judge_handler = MockJudgeHandler()
+        backend_info = "Mock (Testing)"
+
+    # 2. Paid API Key (User provided or Env)
+    elif (
+        user_api_key
+        or (api_provider == "openai" and os.getenv("OPENAI_API_KEY"))
+        or (api_provider == "anthropic" and os.getenv("ANTHROPIC_API_KEY"))
+    ):
         model: AnthropicModel | OpenAIModel | None = None
         if user_api_key:
             if api_provider == "anthropic":

@@ -71,17 +80,26 @@ def configure_orchestrator(
             elif api_provider == "openai":
                 openai_provider = OpenAIProvider(api_key=user_api_key)
                 model = OpenAIModel(settings.openai_model, provider=openai_provider)
+            backend_info = f"Paid API ({api_provider.upper()})"
+        else:
+            backend_info = "Paid API (Env Config)"
+
         judge_handler = JudgeHandler(model=model)
 
+    # 3. Free Tier (HuggingFace Inference)
+    else:
+        judge_handler = HFInferenceJudgeHandler()
+        backend_info = "Free Tier (Llama 3.1 / Mistral)"
+
+    orchestrator = create_orchestrator(
         search_handler=search_handler,
         judge_handler=judge_handler,
         config=config,
         mode=mode,  # type: ignore
     )
 
+    return orchestrator, backend_info
+
 
 async def research_agent(
     message: str,

@@ -110,54 +128,47 @@ async def research_agent(
     # Clean user-provided API key
     user_api_key = api_key.strip() if api_key else None
 
+    # Check available keys
     has_openai = bool(os.getenv("OPENAI_API_KEY"))
     has_anthropic = bool(os.getenv("ANTHROPIC_API_KEY"))
     has_user_key = bool(user_api_key)
+    has_paid_key = has_openai or has_anthropic or has_user_key
 
-        use_mock = not (has_openai or (has_user_key and api_provider == "openai"))
-    else:
-        # Simple mode can work with either provider
-        use_mock = not (has_openai or has_anthropic or has_user_key)
-
-    # If magentic mode requested but no OpenAI key, fallback/warn
-    if mode == "magentic" and use_mock:
+    # Magentic mode requires OpenAI specifically
+    if mode == "magentic" and not (has_openai or (has_user_key and api_provider == "openai")):
         yield (
-            "⚠️ **Warning**: Magentic mode requires OpenAI API key. "
-            "Falling back to demo mode.\n\n"
+            "⚠️ **Warning**: Magentic mode requires OpenAI API key. Falling back to simple mode.\n\n"
         )
         mode = "simple"
 
     # Inform user about their key being used
-    if has_user_key
+    if has_user_key:
         yield (
             f"🔑 **Using your {api_provider.upper()} API key** - "
             "Your key is used only for this session and is never stored.\n\n"
         )
-
-    if use_mock:
+    elif not has_paid_key:
+        # No paid keys - will use FREE HuggingFace Inference
         yield (
-            "**To unlock full AI analysis:**\n"
-            "- Enter your OpenAI or Anthropic API key below, OR\n"
-            "- Configure secrets in HuggingFace Space settings\n\n"
-            "---\n\n"
+            "🤗 **Free Tier**: Using HuggingFace Inference (Llama 3.1 / Mistral) for AI analysis.\n"
+            "For premium models, enter an OpenAI or Anthropic API key below.\n\n"
         )
 
     # Run the agent and stream events
     response_parts: list[str] = []
 
     try:
+        # use_mock=False - let configure_orchestrator decide based on available keys
+        # It will use: Paid API > HF Inference (free tier)
+        orchestrator, backend_name = configure_orchestrator(
+            use_mock=False,  # Never use mock in production - HF Inference is the free fallback
             mode=mode,
             user_api_key=user_api_key,
             api_provider=api_provider,
         )
+
+        yield f"🧠 **Backend**: {backend_name}\n\n"
+
         async for event in orchestrator.run(message):
             # Format event as markdown
             event_md = event.to_markdown()
src/prompts/report.py
CHANGED

@@ -124,13 +124,13 @@ async def format_report_prompt(
 {hypotheses_summary}
 
 ## Assessment Scores
+- Mechanism Score: {assessment.get("mechanism_score", "N/A")}/10
+- Clinical Evidence Score: {assessment.get("clinical_score", "N/A")}/10
+- Overall Confidence: {assessment.get("confidence", 0):.0%}
 
 ## Metadata
 - Sources Searched: {sources}
+- Search Iterations: {metadata.get("iterations", 0)}
 
 Generate a complete ResearchReport with all sections filled in.
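For reference, the two formatting idioms the fixed prompt leans on behave like this (plain Python, nothing project-specific):

```python
# dict.get() supplies a default when a score is missing; :.0% renders a
# 0-1 float as a whole percentage.
assessment = {"mechanism_score": 8, "confidence": 0.85}
print(f'- Mechanism Score: {assessment.get("mechanism_score", "N/A")}/10')         # -> 8/10
print(f'- Clinical Evidence Score: {assessment.get("clinical_score", "N/A")}/10')  # -> N/A/10
print(f'- Overall Confidence: {assessment.get("confidence", 0):.0%}')              # -> 85%
```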
tests/unit/agent_factory/test_judges_hf.py
ADDED

@@ -0,0 +1,137 @@
+"""Unit tests for HFInferenceJudgeHandler."""
+
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import pytest
+
+from src.agent_factory.judges import HFInferenceJudgeHandler
+from src.utils.models import Citation, Evidence
+
+
+class TestHFInferenceJudgeHandler:
+    """Tests for HFInferenceJudgeHandler."""
+
+    @pytest.fixture
+    def mock_client(self):
+        """Mock HuggingFace InferenceClient."""
+        with patch("src.agent_factory.judges.InferenceClient") as mock:
+            client_instance = MagicMock()
+            mock.return_value = client_instance
+            yield client_instance
+
+    @pytest.fixture
+    def handler(self, mock_client):
+        """Create a handler instance with mocked client."""
+        return HFInferenceJudgeHandler()
+
+    @pytest.mark.asyncio
+    async def test_assess_success(self, handler, mock_client):
+        """Test successful assessment with primary model."""
+        import json
+
+        # Construct valid JSON payload
+        data = {
+            "details": {
+                "mechanism_score": 8,
+                "mechanism_reasoning": "Good mechanism",
+                "clinical_evidence_score": 7,
+                "clinical_reasoning": "Good clinical",
+                "drug_candidates": ["Drug A"],
+                "key_findings": ["Finding 1"],
+            },
+            "sufficient": True,
+            "confidence": 0.85,
+            "recommendation": "synthesize",
+            "next_search_queries": [],
+            "reasoning": (
+                "Sufficient evidence provided to support the hypothesis with high confidence."
+            ),
+        }
+
+        # Mock chat_completion response structure
+        mock_message = MagicMock()
+        mock_message.content = f"""Here is the analysis:
+```json
+{json.dumps(data)}
+```"""
+        mock_choice = MagicMock()
+        mock_choice.message = mock_message
+        mock_response = MagicMock()
+        mock_response.choices = [mock_choice]
+
+        # Setup async mock for run_in_executor
+        with patch("asyncio.get_running_loop") as mock_loop:
+            mock_loop.return_value.run_in_executor = AsyncMock(return_value=mock_response)
+
+            evidence = [
+                Evidence(
+                    content="test", citation=Citation(source="pubmed", title="t", url="u", date="d")
+                )
+            ]
+            result = await handler.assess("test question", evidence)
+
+            assert result.sufficient is True
+            assert result.confidence == 0.85
+            assert result.details.drug_candidates == ["Drug A"]
+
+    @pytest.mark.asyncio
+    async def test_assess_fallback_logic(self, handler, mock_client):
+        """Test fallback to secondary model when primary fails."""
+
+        # Setup async mock to fail first, then succeed
+        with patch("asyncio.get_running_loop"):
+            # We need to mock the _call_with_retry method directly to test the loop in assess
+            # but _call_with_retry is decorated with tenacity,
+            # which makes it harder to mock partial failures easily
+            # without triggering the tenacity retry loop first.
+            # Instead, let's mock run_in_executor to raise exception on first call
+
+            # This is tricky because assess loops over models,
+            # and for each model _call_with_retry retries.
+            # We want to simulate: Model 1 fails (retries exhausted) -> Model 2 succeeds.
+
+            # Let's patch _call_with_retry to avoid waiting for real retries
+            side_effect = [
+                Exception("Model 1 failed"),
+                Exception("Model 2 failed"),
+                Exception("Model 3 failed"),
+            ]
+            with patch.object(handler, "_call_with_retry", side_effect=side_effect) as mock_call:
+                evidence = []
+                result = await handler.assess("test", evidence)
+
+                # Should have tried all 3 fallback models
+                assert mock_call.call_count == 3
+                assert result.sufficient is False  # Fallback assessment
+                error_msg = "All HF models failed"
+                assert error_msg in str(mock_call.side_effect) or "failed" in result.reasoning
+
+    def test_extract_json_robustness(self, handler):
+        """Test JSON extraction with various inputs."""
+
+        # 1. Clean JSON
+        assert handler._extract_json('{"a": 1}') == {"a": 1}
+
+        # 2. Markdown block
+        assert handler._extract_json('```json\n{"a": 1}\n```') == {"a": 1}
+
+        # 3. Text preamble/postamble
+        text = """
+        Sure, here is the JSON:
+        {
+            "a": 1,
+            "b": {
+                "c": 2
+            }
+        }
+        Hope that helps!
+        """
+        assert handler._extract_json(text) == {"a": 1, "b": {"c": 2}}
+
+        # 4. Nested braces
+        nested = '{"a": {"b": "}"}}'
+        assert handler._extract_json(nested) == {"a": {"b": "}"}}
+
+        # 5. Invalid JSON
+        assert handler._extract_json("Not JSON") is None
+        assert handler._extract_json("{Incomplete") is None
uv.lock
CHANGED

@@ -1065,6 +1065,7 @@ dependencies = [
 { name = "beautifulsoup4" },
 { name = "gradio", extra = ["mcp"] },
 { name = "httpx" },
+{ name = "huggingface-hub" },
 { name = "openai" },
 { name = "pydantic" },
 { name = "pydantic-ai" },

@@ -1114,6 +1115,7 @@ requires-dist = [
 { name = "chromadb", marker = "extra == 'modal'", specifier = ">=0.4.0" },
 { name = "gradio", extras = ["mcp"], specifier = ">=6.0.0" },
 { name = "httpx", specifier = ">=0.27" },
+{ name = "huggingface-hub", specifier = ">=0.20.0" },
 { name = "llama-index", marker = "extra == 'modal'", specifier = ">=0.11.0" },
 { name = "llama-index-embeddings-openai", marker = "extra == 'modal'" },
 { name = "llama-index-llms-openai", marker = "extra == 'modal'" },