Spaces:

DataQuests
/

DeepCritical

Running

VibecoderMcSwaggins committed 13 days ago

Commit

3139749

1 Parent(s): e993253

fix: apply lazy init pattern and pydantic-ai fixes to ReportAgent (#12)

Applies the same fixes as Phase 7 HypothesisAgent:
- Lazy initialization via _get_agent() to avoid API key requirement at import
- Use output_type instead of result_type (pydantic-ai API)
- Use result.output instead of result.data
- Fix line length issues in tests
- Proper mocking of get_model in tests

Files changed (6) hide show

src/agents/report_agent.py +136 -0
src/orchestrator_magentic.py +22 -2
src/prompts/report.py +111 -0
src/utils/citation_validator.py +75 -0
src/utils/models.py +99 -0
tests/unit/agents/test_report_agent.py +228 -0

src/agents/report_agent.py ADDED Viewed

	@@ -0,0 +1,136 @@

+"""Report agent for generating structured research reports."""
+from collections.abc import AsyncIterable
+from typing import TYPE_CHECKING, Any
+from agent_framework import (
+    AgentRunResponse,
+    AgentRunResponseUpdate,
+    AgentThread,
+    BaseAgent,
+    ChatMessage,
+    Role,
+)
+from pydantic_ai import Agent
+from src.agent_factory.judges import get_model
+from src.prompts.report import SYSTEM_PROMPT, format_report_prompt
+from src.utils.citation_validator import validate_references
+from src.utils.models import Evidence, ResearchReport
+if TYPE_CHECKING:
+    from src.services.embeddings import EmbeddingService
+class ReportAgent(BaseAgent):  # type: ignore[misc]
+    """Generates structured scientific reports from evidence and hypotheses."""
+    def __init__(
+        self,
+        evidence_store: dict[str, Any],
+        embedding_service: "EmbeddingService | None" = None,  # For diverse selection
+    ) -> None:
+        super().__init__(
+            name="ReportAgent",
+            description="Generates structured scientific research reports with citations",
+        )
+        self._evidence_store = evidence_store
+        self._embeddings = embedding_service
+        self._agent: Agent[None, ResearchReport] | None = None  # Lazy init
+    def _get_agent(self) -> Agent[None, ResearchReport]:
+        """Lazy initialization of LLM agent to avoid requiring API keys at import."""
+        if self._agent is None:
+            self._agent = Agent(
+                model=get_model(),
+                output_type=ResearchReport,
+                system_prompt=SYSTEM_PROMPT,
+            )
+        return self._agent
+    async def run(
+        self,
+        messages: str | ChatMessage | list[str] | list[ChatMessage] | None = None,
+        *,
+        thread: AgentThread | None = None,
+        **kwargs: Any,
+    ) -> AgentRunResponse:
+        """Generate research report."""
+        query = self._extract_query(messages)
+        # Gather all context
+        evidence: list[Evidence] = self._evidence_store.get("current", [])
+        hypotheses = self._evidence_store.get("hypotheses", [])
+        assessment = self._evidence_store.get("last_assessment", {})
+        if not evidence:
+            return AgentRunResponse(
+                messages=[
+                    ChatMessage(
+                        role=Role.ASSISTANT,
+                        text="Cannot generate report: No evidence collected.",
+                    )
+                ],
+                response_id="report-no-evidence",
+            )
+        # Build metadata
+        metadata = {
+            "sources": list(set(e.citation.source for e in evidence)),
+            "iterations": self._evidence_store.get("iteration_count", 0),
+        }
+        # Generate report (format_report_prompt is now async)
+        prompt = await format_report_prompt(
+            query=query,
+            evidence=evidence,
+            hypotheses=hypotheses,
+            assessment=assessment,
+            metadata=metadata,
+            embeddings=self._embeddings,
+        )
+        result = await self._get_agent().run(prompt)
+        report = result.output
+        # ═══════════════════════════════════════════════════════════════════
+        # 🚨 CRITICAL: Validate citations to prevent hallucination
+        # ═══════════════════════════════════════════════════════════════════
+        report = validate_references(report, evidence)
+        # Store validated report
+        self._evidence_store["final_report"] = report
+        # Return markdown version
+        return AgentRunResponse(
+            messages=[ChatMessage(role=Role.ASSISTANT, text=report.to_markdown())],
+            response_id="report-complete",
+            additional_properties={"report": report.model_dump()},
+        )
+    def _extract_query(
+        self, messages: str | ChatMessage | list[str] | list[ChatMessage] | None
+    ) -> str:
+        """Extract query from messages."""
+        if isinstance(messages, str):
+            return messages
+        elif isinstance(messages, ChatMessage):
+            return messages.text or ""
+        elif isinstance(messages, list):
+            for msg in reversed(messages):
+                if isinstance(msg, ChatMessage) and msg.role == Role.USER:
+                    return msg.text or ""
+                elif isinstance(msg, str):
+                    return msg
+        return ""
+    async def run_stream(
+        self,
+        messages: str | ChatMessage | list[str] | list[ChatMessage] | None = None,
+        *,
+        thread: AgentThread | None = None,
+        **kwargs: Any,
+    ) -> AsyncIterable[AgentRunResponseUpdate]:
+        """Streaming wrapper."""
+        result = await self.run(messages, thread=thread, **kwargs)
+        yield AgentRunResponseUpdate(messages=result.messages, response_id=result.response_id)

src/orchestrator_magentic.py CHANGED Viewed

@@ -25,6 +25,7 @@ from agent_framework.openai import OpenAIChatClient
 from src.agents.hypothesis_agent import HypothesisAgent
 from src.agents.judge_agent import JudgeAgent
 from src.agents.search_agent import SearchAgent
 from src.orchestrator import JudgeHandlerProtocol, SearchHandlerProtocol
 from src.utils.config import settings
@@ -81,6 +82,7 @@ class MagenticOrchestrator:
         search_agent: SearchAgent,
         hypothesis_agent: HypothesisAgent,
         judge_agent: JudgeAgent,
     ) -> Any:
         """Build the Magentic workflow with participants."""
         if not settings.openai_api_key:
@@ -95,6 +97,7 @@ class MagenticOrchestrator:
                 searcher=search_agent,
                 hypothesizer=hypothesis_agent,
                 judge=judge_agent,
             )
             .with_standard_manager(
                 chat_client=OpenAIChatClient(
@@ -124,12 +127,22 @@ Workflow:
 2. HypothesisAgent: Generate mechanistic hypotheses (Drug -> Target -> Pathway -> Effect).
 3. SearcherAgent: Use hypothesis-suggested queries for targeted search.
 4. JudgeAgent: Evaluate if evidence supports hypotheses.
-5. Repeat until confident or max rounds.
 Focus on:
 - Identifying specific molecular targets
 - Understanding mechanism of action
 - Finding supporting/contradicting evidence for hypotheses
 """
     async def run(self, query: str) -> AsyncGenerator[AgentEvent, None]:
@@ -155,9 +168,10 @@ Focus on:
         hypothesis_agent = HypothesisAgent(
             self._evidence_store, embedding_service=embedding_service
         )
         # Build workflow and task
-        workflow = self._build_workflow(search_agent, hypothesis_agent, judge_agent)
         task = self._format_task(query, embedding_service is not None)
         iteration = 0
@@ -249,6 +263,12 @@ Focus on:
                 message=f"Judge agent: {_truncate(msg_text)}",
                 iteration=iteration,
             )
         return AgentEvent(
             type="judging",
             message=f"{agent_name}: {_truncate(msg_text)}",

 from src.agents.hypothesis_agent import HypothesisAgent
 from src.agents.judge_agent import JudgeAgent
+from src.agents.report_agent import ReportAgent
 from src.agents.search_agent import SearchAgent
 from src.orchestrator import JudgeHandlerProtocol, SearchHandlerProtocol
 from src.utils.config import settings
         search_agent: SearchAgent,
         hypothesis_agent: HypothesisAgent,
         judge_agent: JudgeAgent,
+        report_agent: ReportAgent,
     ) -> Any:
         """Build the Magentic workflow with participants."""
         if not settings.openai_api_key:
                 searcher=search_agent,
                 hypothesizer=hypothesis_agent,
                 judge=judge_agent,
+                reporter=report_agent,
             )
             .with_standard_manager(
                 chat_client=OpenAIChatClient(
 2. HypothesisAgent: Generate mechanistic hypotheses (Drug -> Target -> Pathway -> Effect).
 3. SearcherAgent: Use hypothesis-suggested queries for targeted search.
 4. JudgeAgent: Evaluate if evidence supports hypotheses.
+5. If sufficient -> ReportAgent: Generate structured research report.
+6. If not sufficient -> Repeat from step 1 with refined queries.
 Focus on:
 - Identifying specific molecular targets
 - Understanding mechanism of action
 - Finding supporting/contradicting evidence for hypotheses
+The final output should be a complete research report with:
+- Executive summary
+- Methodology
+- Hypotheses tested
+- Mechanistic and clinical findings
+- Drug candidates
+- Limitations
+- Conclusion with references
 """
     async def run(self, query: str) -> AsyncGenerator[AgentEvent, None]:
         hypothesis_agent = HypothesisAgent(
             self._evidence_store, embedding_service=embedding_service
         )
+        report_agent = ReportAgent(self._evidence_store, embedding_service=embedding_service)
         # Build workflow and task
+        workflow = self._build_workflow(search_agent, hypothesis_agent, judge_agent, report_agent)
         task = self._format_task(query, embedding_service is not None)
         iteration = 0
                 message=f"Judge agent: {_truncate(msg_text)}",
                 iteration=iteration,
             )
+        elif "report" in agent_name.lower():
+            return AgentEvent(
+                type="synthesizing",
+                message="Report generated successfully.",
+                iteration=iteration,
+            )
         return AgentEvent(
             type="judging",
             message=f"{agent_name}: {_truncate(msg_text)}",

src/prompts/report.py ADDED Viewed

	@@ -0,0 +1,111 @@

+"""Prompts for Report Agent."""
+from typing import TYPE_CHECKING, Any
+from src.utils.text_utils import select_diverse_evidence, truncate_at_sentence
+if TYPE_CHECKING:
+    from src.services.embeddings import EmbeddingService
+    from src.utils.models import Evidence, MechanismHypothesis
+# System prompt for the report-writing LLM: lists the nine parts of a good report
+# and the citation rules that apply to the References section.
+# NOTE(review): rule 1 says "the Evidence section above", but the evidence list is
+# emitted by format_report_prompt() in the user prompt, not in this system prompt.
+# Confirm the wording is intended before changing the string.
+SYSTEM_PROMPT = """You are a scientific writer specializing in drug repurposing research reports.
+Your role is to synthesize evidence and hypotheses into a clear, structured report.
+A good report:
+1. Has a clear EXECUTIVE SUMMARY (one paragraph, key takeaways)
+2. States the RESEARCH QUESTION clearly
+3. Describes METHODOLOGY (what was searched, how)
+4. Evaluates HYPOTHESES with evidence counts
+5. Separates MECHANISTIC and CLINICAL findings
+6. Lists specific DRUG CANDIDATES
+7. Acknowledges LIMITATIONS honestly
+8. Provides a balanced CONCLUSION
+9. Includes properly formatted REFERENCES
+Write in scientific but accessible language. Be specific about evidence strength.
+─────────────────────────────────────────────────────────────────────────────
+🚨 CRITICAL CITATION REQUIREMENTS 🚨
+─────────────────────────────────────────────────────────────────────────────
+You MUST follow these rules for the References section:
+1. You may ONLY cite papers that appear in the Evidence section above
+2. Every reference URL must EXACTLY match a provided evidence URL
+3. Do NOT invent, fabricate, or hallucinate any references
+4. Do NOT modify paper titles, authors, dates, or URLs
+5. If unsure about a citation, OMIT it rather than guess
+6. Copy URLs exactly as provided - do not create similar-looking URLs
+VIOLATION OF THESE RULES PRODUCES DANGEROUS MISINFORMATION.
+─────────────────────────────────────────────────────────────────────────────"""
+async def format_report_prompt(
+    query: str,
+    evidence: list["Evidence"],
+    hypotheses: list["MechanismHypothesis"],
+    assessment: dict[str, Any],
+    metadata: dict[str, Any],
+    embeddings: "EmbeddingService | None" = None,
+) -> str:
+    """Format prompt for report generation.
+    Includes full evidence details for accurate citation.
+    """
+    # Select diverse evidence (not arbitrary truncation)
+    selected = await select_diverse_evidence(evidence, n=20, query=query, embeddings=embeddings)
+    # Include FULL citation details for each evidence item
+    # This helps the LLM create accurate references
+    evidence_lines = []
+    for e in selected:
+        authors = ", ".join(e.citation.authors or ["Unknown"])
+        evidence_lines.append(
+            f"- **Title**: {e.citation.title}\n"
+            f"  **URL**: {e.citation.url}\n"
+            f"  **Authors**: {authors}\n"
+            f"  **Date**: {e.citation.date or 'n.d.'}\n"
+            f"  **Source**: {e.citation.source}\n"
+            f"  **Content**: {truncate_at_sentence(e.content, 200)}\n"
+        )
+    evidence_summary = "\n".join(evidence_lines)
+    if hypotheses:
+        hypotheses_lines = []
+        for h in hypotheses:
+            hypotheses_lines.append(
+                f"- {h.drug} -> {h.target} -> {h.pathway} -> {h.effect} "
+                f"(Confidence: {h.confidence:.0%})"
+            )
+        hypotheses_summary = "\n".join(hypotheses_lines)
+    else:
+        hypotheses_summary = "No hypotheses generated yet."
+    sources = ", ".join(metadata.get("sources", []))
+    return f"""Generate a structured research report for the following query.
+## Original Query
+{query}
+## Evidence Collected ({len(selected)} papers, selected for diversity)
+{evidence_summary}
+## Hypotheses Generated
+{hypotheses_summary}
+## Assessment Scores
+- Mechanism Score: {assessment.get('mechanism_score', 'N/A')}/10
+- Clinical Evidence Score: {assessment.get('clinical_score', 'N/A')}/10
+- Overall Confidence: {assessment.get('confidence', 0):.0%}
+## Metadata
+- Sources Searched: {sources}
+- Search Iterations: {metadata.get('iterations', 0)}
+Generate a complete ResearchReport with all sections filled in.
+REMINDER: Only cite papers from the Evidence section above. Copy URLs exactly."""

src/utils/citation_validator.py ADDED Viewed

	@@ -0,0 +1,75 @@

+"""Citation validation to prevent LLM hallucination.
+CRITICAL: Medical research requires accurate citations.
+This module validates that all references exist in collected evidence.
+"""
+import logging
+from typing import TYPE_CHECKING
+if TYPE_CHECKING:
+    from src.utils.models import Evidence, ResearchReport
+logger = logging.getLogger(__name__)
+def validate_references(report: "ResearchReport", evidence: list["Evidence"]) -> "ResearchReport":
+    """Ensure all references actually exist in collected evidence.
+    CRITICAL: Prevents LLM hallucination of citations.
+    Args:
+        report: The generated research report
+        evidence: All evidence collected during research
+    Returns:
+        Report with only valid references (hallucinated ones removed)
+    """
+    # Build set of valid URLs from evidence
+    valid_urls = {e.citation.url for e in evidence}
+    # Also check titles (case-insensitive) as fallback
+    valid_titles = {e.citation.title.lower() for e in evidence}
+    validated_refs = []
+    removed_count = 0
+    for ref in report.references:
+        ref_url = ref.get("url", "")
+        ref_title = ref.get("title", "").lower()
+        # Check if URL matches collected evidence
+        if ref_url in valid_urls:
+            validated_refs.append(ref)
+        # Fallback: check title match (URLs might differ slightly)
+        elif ref_title and any(ref_title in t or t in ref_title for t in valid_titles):
+            validated_refs.append(ref)
+        else:
+            removed_count += 1
+            logger.warning(
+                f"Removed hallucinated reference: '{ref.get('title', 'Unknown')}' "
+                f"(URL: {ref_url[:50]}...)"
+            )
+    if removed_count > 0:
+        logger.info(
+            f"Citation validation removed {removed_count} hallucinated references. "
+            f"{len(validated_refs)} valid references remain."
+        )
+    # Update report with validated references
+    report.references = validated_refs
+    return report
+def build_reference_from_evidence(evidence: "Evidence") -> dict[str, str]:
+    """Build a properly formatted reference from evidence.
+    Use this to ensure references match the original evidence exactly.
+    """
+    return {
+        "title": evidence.citation.title,
+        "authors": ", ".join(evidence.citation.authors or ["Unknown"]),
+        "source": evidence.citation.source,
+        "date": evidence.citation.date or "n.d.",
+        "url": evidence.citation.url,
+    }

src/utils/models.py CHANGED Viewed

@@ -172,6 +172,105 @@ class HypothesisAssessment(BaseModel):
     recommended_searches: list[str] = Field(description="Searches to fill knowledge gaps")
 class OrchestratorConfig(BaseModel):
     """Configuration for the orchestrator."""

     recommended_searches: list[str] = Field(description="Searches to fill knowledge gaps")
+class ReportSection(BaseModel):
+    """A section of the research report."""
+    # Heading of the section.
+    title: str
+    # Body text of the section; this is the part ResearchReport.to_markdown() renders.
+    content: str
+    # Optional; defaults to a fresh empty list for each instance.
+    # presumably citation identifiers or URLs backing this section — TODO confirm
+    citations: list[str] = Field(default_factory=list)
+class ResearchReport(BaseModel):
+    """Structured scientific report."""
+    title: str = Field(description="Report title")
+    executive_summary: str = Field(
+        description="One-paragraph summary for quick reading", min_length=100, max_length=1000
+    )
+    research_question: str = Field(description="Clear statement of what was investigated")
+    methodology: ReportSection = Field(description="How the research was conducted")
+    hypotheses_tested: list[dict[str, Any]] = Field(
+        description="Hypotheses with supporting/contradicting evidence counts"
+    )
+    mechanistic_findings: ReportSection = Field(description="Findings about drug mechanisms")
+    clinical_findings: ReportSection = Field(
+        description="Findings from clinical/preclinical studies"
+    )
+    drug_candidates: list[str] = Field(description="Identified drug candidates")
+    limitations: list[str] = Field(description="Study limitations")
+    conclusion: str = Field(description="Overall conclusion")
+    references: list[dict[str, str]] = Field(
+        description="Formatted references with title, authors, source, URL"
+    )
+    # Metadata
+    sources_searched: list[str] = Field(default_factory=list)
+    total_papers_reviewed: int = 0
+    search_iterations: int = 0
+    confidence_score: float = Field(ge=0, le=1)
+    def to_markdown(self) -> str:
+        """Render report as markdown."""
+        sections = [
+            f"# {self.title}\n",
+            f"## Executive Summary\n{self.executive_summary}\n",
+            f"## Research Question\n{self.research_question}\n",
+            f"## Methodology\n{self.methodology.content}\n",
+        ]
+        # Hypotheses
+        sections.append("## Hypotheses Tested\n")
+        for h in self.hypotheses_tested:
+            supported = h.get("supported", 0)
+            contradicted = h.get("contradicted", 0)
+            status = "✅ Supported" if supported > contradicted else "⚠️ Mixed"
+            sections.append(
+                f"- **{h.get('mechanism', 'Unknown')}** ({status}): "
+                f"{supported} supporting, {contradicted} contradicting\n"
+            )
+        # Findings
+        sections.append(f"## Mechanistic Findings\n{self.mechanistic_findings.content}\n")
+        sections.append(f"## Clinical Findings\n{self.clinical_findings.content}\n")
+        # Drug candidates
+        sections.append("## Drug Candidates\n")
+        for drug in self.drug_candidates:
+            sections.append(f"- **{drug}**\n")
+        # Limitations
+        sections.append("## Limitations\n")
+        for lim in self.limitations:
+            sections.append(f"- {lim}\n")
+        # Conclusion
+        sections.append(f"## Conclusion\n{self.conclusion}\n")
+        # References
+        sections.append("## References\n")
+        for i, ref in enumerate(self.references, 1):
+            sections.append(
+                f"{i}. {ref.get('authors', 'Unknown')}. "
+                f"*{ref.get('title', 'Untitled')}*. "
+                f"{ref.get('source', '')} ({ref.get('date', '')}). "
+                f"[Link]({ref.get('url', '#')})\n"
+            )
+        # Metadata footer
+        sections.append("\n---\n")
+        sections.append(
+            f"*Report generated from {self.total_papers_reviewed} papers "
+            f"across {self.search_iterations} search iterations. "
+            f"Confidence: {self.confidence_score:.0%}*"
+        )
+        return "\n".join(sections)
 class OrchestratorConfig(BaseModel):
     """Configuration for the orchestrator."""

tests/unit/agents/test_report_agent.py ADDED Viewed

	@@ -0,0 +1,228 @@

+"""Unit tests for ReportAgent."""
+from typing import Any
+from unittest.mock import AsyncMock, MagicMock, patch
+import pytest
+from src.agents.report_agent import ReportAgent
+from src.utils.models import (
+    Citation,
+    Evidence,
+    MechanismHypothesis,
+    ReportSection,
+    ResearchReport,
+)
+@pytest.fixture
+def sample_evidence() -> list[Evidence]:
+    return [
+        Evidence(
+            content="Metformin activates AMPK...",
+            citation=Citation(
+                source="pubmed",
+                title="Metformin mechanisms",
+                url="https://pubmed.ncbi.nlm.nih.gov/12345/",
+                date="2023",
+                authors=["Smith J", "Jones A"],
+            ),
+        )
+    ]
+@pytest.fixture
+def sample_hypotheses() -> list[MechanismHypothesis]:
+    return [
+        MechanismHypothesis(
+            drug="Metformin",
+            target="AMPK",
+            pathway="mTOR inhibition",
+            effect="Neuroprotection",
+            confidence=0.8,
+            search_suggestions=[],
+        )
+    ]
+@pytest.fixture
+def mock_report() -> ResearchReport:
+    return ResearchReport(
+        title="Drug Repurposing Analysis: Metformin for Alzheimer's",
+        executive_summary=(
+            "This report analyzes metformin as a potential candidate for "
+            "repurposing in Alzheimer's disease treatment. It summarizes "
+            "findings from mechanistic studies showing AMPK activation effects "
+            "and reviews clinical data. The evidence suggests a potential "
+            "neuroprotective role, although clinical trials are still limited."
+        ),
+        research_question="Can metformin be repurposed for Alzheimer's disease?",
+        methodology=ReportSection(
+            title="Methodology", content="Searched PubMed and web sources..."
+        ),
+        hypotheses_tested=[
+            {"mechanism": "Metformin -> AMPK -> neuroprotection", "supported": 5, "contradicted": 1}
+        ],
+        mechanistic_findings=ReportSection(
+            title="Mechanistic Findings", content="Evidence suggests AMPK activation..."
+        ),
+        clinical_findings=ReportSection(
+            title="Clinical Findings", content="Limited clinical data available..."
+        ),
+        drug_candidates=["Metformin"],
+        limitations=["Abstract-level analysis only"],
+        conclusion="Metformin shows promise...",
+        references=[],
+        sources_searched=["pubmed", "web"],
+        total_papers_reviewed=10,
+        search_iterations=3,
+        confidence_score=0.75,
+    )
+@pytest.mark.asyncio
+async def test_report_agent_generates_report(
+    sample_evidence: list[Evidence],
+    sample_hypotheses: list[MechanismHypothesis],
+    mock_report: ResearchReport,
+) -> None:
+    """ReportAgent should generate structured report."""
+    store: dict[str, Any] = {
+        "current": sample_evidence,
+        "hypotheses": sample_hypotheses,
+        "last_assessment": {"mechanism_score": 8, "clinical_score": 6},
+    }
+    with (
+        patch("src.agents.report_agent.get_model") as mock_get_model,
+        patch("src.agents.report_agent.Agent") as mock_agent_class,
+    ):
+        mock_get_model.return_value = MagicMock()
+        mock_result = MagicMock()
+        mock_result.output = mock_report
+        mock_agent_class.return_value.run = AsyncMock(return_value=mock_result)
+        agent = ReportAgent(store)
+        response = await agent.run("metformin alzheimer")
+        assert response.messages[0].text is not None
+        assert "Executive Summary" in response.messages[0].text
+        assert "Methodology" in response.messages[0].text
+        assert "References" in response.messages[0].text
+@pytest.mark.asyncio
+async def test_report_agent_no_evidence() -> None:
+    """ReportAgent should handle empty evidence gracefully."""
+    store: dict[str, Any] = {"current": [], "hypotheses": []}
+    # Lazy init means no patching needed - agent only instantiated when run() has evidence
+    agent = ReportAgent(store)
+    response = await agent.run("test query")
+    assert response.messages[0].text is not None
+    assert "Cannot generate report" in response.messages[0].text
+# ═══════════════════════════════════════════════════════════════════════════
+# 🚨 CRITICAL: Citation Validation Tests
+# ═══════════════════════════════════════════════════════════════════════════
+@pytest.mark.asyncio
+async def test_report_agent_removes_hallucinated_citations(
+    sample_evidence: list[Evidence],
+) -> None:
+    """ReportAgent should remove citations not in evidence."""
+    from src.utils.citation_validator import validate_references
+    # Create report with mix of valid and hallucinated references
+    report_with_hallucinations = ResearchReport(
+        title="Test Report",
+        executive_summary=(
+            "This is a test report for citation validation. It needs to be "
+            "sufficiently long to pass validation. We are ensuring that the "
+            "system correctly identifies and removes citations that do not "
+            "appear in collected evidence. This prevents hallucinations."
+        ),
+        research_question="Testing citation validation",
+        methodology=ReportSection(title="Methodology", content="Test"),
+        hypotheses_tested=[],
+        mechanistic_findings=ReportSection(title="Mechanistic", content="Test"),
+        clinical_findings=ReportSection(title="Clinical", content="Test"),
+        drug_candidates=["TestDrug"],
+        limitations=["Test limitation"],
+        conclusion="Test conclusion",
+        references=[
+            # Valid reference (matches sample_evidence)
+            {
+                "title": "Metformin mechanisms",
+                "url": "https://pubmed.ncbi.nlm.nih.gov/12345/",
+                "authors": "Smith J, Jones A",
+                "date": "2023",
+                "source": "pubmed",
+            },
+            # HALLUCINATED reference (URL doesn't exist in evidence)
+            {
+                "title": "Fake Paper That Doesn't Exist",
+                "url": "https://fake-journal.com/made-up-paper",
+                "authors": "Hallucinated A",
+                "date": "2024",
+                "source": "fake",
+            },
+            # Another HALLUCINATED reference
+            {
+                "title": "Invented Research",
+                "url": "https://pubmed.ncbi.nlm.nih.gov/99999999/",
+                "authors": "NotReal B",
+                "date": "2025",
+                "source": "pubmed",
+            },
+        ],
+        sources_searched=["pubmed"],
+        total_papers_reviewed=1,
+        search_iterations=1,
+        confidence_score=0.5,
+    )
+    # Validate - should remove hallucinated references
+    validated_report = validate_references(report_with_hallucinations, sample_evidence)
+    # Only the valid reference should remain
+    assert len(validated_report.references) == 1
+    assert validated_report.references[0]["title"] == "Metformin mechanisms"
+    # Check that "Fake Paper" is NOT in the string representation of the references list
+    # (This is a bit safer than checking presence in list of dicts if structure varies)
+    ref_urls = [r.get("url") for r in validated_report.references]
+    assert "https://fake-journal.com/made-up-paper" not in ref_urls
+def test_citation_validator_handles_empty_references() -> None:
+    """Citation validator should handle reports with no references."""
+    from src.utils.citation_validator import validate_references
+    report = ResearchReport(
+        title="Empty Refs Report",
+        executive_summary=(
+            "This report has no references. It is designed to test the "
+            "validator's handling of empty reference lists. We must ensure "
+            "that the system does not crash when a report contains no "
+            "citations. This is a valid edge case in early-stage research."
+        ),
+        research_question="Testing empty refs",
+        methodology=ReportSection(title="Methodology", content="Test"),
+        hypotheses_tested=[],
+        mechanistic_findings=ReportSection(title="Mechanistic", content="Test"),
+        clinical_findings=ReportSection(title="Clinical", content="Test"),
+        drug_candidates=[],
+        limitations=[],
+        conclusion="Test",
+        references=[],  # Empty!
+        sources_searched=[],
+        total_papers_reviewed=0,
+        search_iterations=0,
+        confidence_score=0.0,
+    )
+    validated = validate_references(report, [])
+    assert validated.references == []