Spaces:

kylemontgomery
/

imo-viewer

Sleeping

imo-viewer / app.py

Kyle

fix latex rendering

af74d42 3 months ago

16.7 kB

	import html
	import json
	import os
	import re
	from glob import glob
	from typing import Any, Dict, List

	import gradio as gr

	# Optional LaTeX → MathML conversion (install: pip install latex2mathml)
	try:
	from latex2mathml.converter import convert as _latex_to_mathml
	except Exception: # pragma: no cover
	_latex_to_mathml = None

	# Optional Markdown renderer (install: pip install markdown)
	try:
	import markdown as _markdown
	except Exception: # pragma: no cover
	_markdown = None

	# Compiled once at import time and shared by every render call.
	# The pattern is an alternation of four math forms; for any match exactly one
	# of the inner-content groups below is non-None:
	#   1) $$ ... $$   -> group 2  (display math; DOTALL lets it span lines)
	#   2) $ ... $     -> group 4  (inline math; single line, no "$" inside)
	#   3) \[ ... \]   -> group 6  (display math; DOTALL lets it span lines)
	#   4) \( ... \)   -> group 8  (inline math; single line)
	MATH_PATTERN = re.compile(
	    r"(\$\$(.*?)\$\$)"  # $$block$$
	    r"|(\$([^\$\n]+?)\$)"  # $inline$
	    r"|(\\\[(.*?)\\\])"  # \[block\]
	    r"|(\\\(([^\n]+?)\\\))",  # \(inline\)
	    re.DOTALL,
	)


	def _normalize_tex_helpers(text: str, emph_template: str) -> str:
	    r"""Rewrite a few TeX text macros that appear outside math spans.

	    Args:
	        text: Non-math text segment.
	        emph_template: ``re.sub`` replacement applied to ``\emph{...}``.

	    Returns:
	        Text with ``\text{X}`` unwrapped to ``X``, ``\emph{X}`` rewritten via
	        ``emph_template`` and ``\Bbb{X}`` renamed to ``\mathbb{X}``.
	    """
	    text = re.sub(r"\\text\{([^}]*)\}", r"\1", text)
	    text = re.sub(r"\\emph\{([^}]*)\}", emph_template, text)
	    return re.sub(r"\\Bbb\{([^}]*)\}", r"\\mathbb{\1}", text)


	def _escape_literal(text: str) -> str:
	    """HTML-escape text and turn newlines into ``<br>`` tags."""
	    return html.escape(text).replace("\n", "<br>")


	def _render_plain_text(segment: str, for_markdown: bool) -> str:
	    """Prepare one non-math segment for the Markdown or the plain-HTML path."""
	    if for_markdown:
	        # Output is Markdown source, so emphasis becomes *...* and the text is
	        # handed to the Markdown renderer as-is.
	        return _normalize_tex_helpers(segment, r"*\1*")
	    # Escape first: escaping after inserting <em> would turn the tags
	    # themselves into visible "&lt;em&gt;" text.
	    escaped = html.escape(segment)
	    return _normalize_tex_helpers(escaped, r"<em>\1</em>").replace("\n", "<br>")


	def _math_match_to_html(m: "re.Match[str]") -> str:
	    """Convert one MATH_PATTERN match to MathML, or to escaped literal LaTeX.

	    The literal fallback is used when no content group matched, when the
	    optional converter is not installed, or when conversion raises.
	    """
	    # Content groups of MATH_PATTERN paired with whether they are display math.
	    for group_index, display in ((2, True), (4, False), (6, True), (8, False)):
	        latex_src = m.group(group_index)
	        if latex_src is not None:
	            break
	    else:
	        # Should not happen; show the raw match safely.
	        return _escape_literal(m.group(0))

	    if _latex_to_mathml is None:
	        return _escape_literal(m.group(0))

	    try:
	        mathml = _latex_to_mathml(latex_src)
	        # Ensure block math is laid out as a block.
	        if display and mathml.startswith("<math") and " display=" not in mathml:
	            mathml = mathml.replace("<math", '<math display="block"', 1)
	        return mathml
	    except Exception:
	        # Best effort: on any converter failure show the LaTeX literally.
	        return _escape_literal(m.group(0))


	def format_step_content(content: str) -> str:
	    """Render content to HTML with Markdown and LaTeX support.

	    If the optional `markdown` package is available, math spans are swapped
	    for placeholder tokens, the remaining text is rendered as Markdown (so
	    headings/lists/emphasis work), and the tokens are then replaced with
	    MathML (via latex2mathml). Otherwise every non-math segment is
	    HTML-escaped and math spans are converted inline.

	    NOTE(review): on the Markdown path non-math text is not HTML-escaped
	    before rendering; Python-Markdown passes raw HTML through by default, so
	    this presumably relies on trusted input - confirm.

	    Args:
	        content: Raw text content possibly containing LaTeX snippets.

	    Returns:
	        HTML string for display; "" for empty/None content.
	    """
	    if not content:
	        return ""

	    use_markdown = _markdown is not None
	    parts: list[str] = []
	    token_to_html: dict[str, str] = {}
	    last_idx = 0

	    for m in MATH_PATTERN.finditer(content):
	        prefix = content[last_idx : m.start()]
	        if prefix:
	            parts.append(_render_plain_text(prefix, use_markdown))

	        math_html = _math_match_to_html(m)
	        if use_markdown:
	            # Hide the math behind a token so Markdown cannot mangle it.
	            token = f"[[[MATH_TOKEN_{len(token_to_html)}]]]"
	            token_to_html[token] = math_html
	            parts.append(token)
	        else:
	            parts.append(math_html)
	        last_idx = m.end()

	    # Trailing non-math segment
	    tail = content[last_idx:]
	    if tail:
	        parts.append(_render_plain_text(tail, use_markdown))

	    joined = "".join(parts)
	    if not use_markdown:
	        return joined

	    try:
	        html_out = _markdown.markdown(joined, extensions=["extra", "sane_lists", "nl2br"])
	    except Exception:
	        # Tokens contain no escapable characters, so they survive this fallback.
	        html_out = html.escape(joined).replace("\n", "<br>")

	    for token, token_html in token_to_html.items():
	        html_out = html_out.replace(token, token_html)
	    return html_out


	def create_step_html(step: Dict[str, Any], step_num: int, border_color: str) -> str:
	    """Create HTML for a single step (no token counting).

	    Observation, thought and action are coerced to ``str`` before rendering so
	    non-string values (e.g. a dict or list in the JSON) do not break the
	    text formatter; the reward is HTML-escaped.

	    Args:
	        step: The step dictionary to render. Reads the keys "observation",
	            "thought", "action" and "reward"; all are optional.
	        step_num: 1-based step index.
	        border_color: CSS color for the step border.

	    Returns:
	        HTML string for the step: a collapsible block with collapsible
	        observation/thought sections plus the action and reward.
	    """
	    # Missing or falsy observation/thought is shown as the literal text "None".
	    observation = step.get("observation")
	    observation_text = str(observation) if observation else "None"

	    thought = step.get("thought")
	    thought_text = str(thought) if thought else "None"

	    # An action may be a dict wrapping the real action under "action".
	    action_text = ""
	    action = step.get("action")
	    if action:
	        if isinstance(action, dict) and "action" in action:
	            inner_action = action["action"]
	            action_text = str(inner_action) if inner_action else ""
	        else:
	            action_text = str(action)

	    reward = html.escape(str(step.get("reward", 0.0)))

	    html_out = f"""
	<details style="margin: 8px 0;">
	<summary style="cursor: pointer; color: {border_color}; font-weight: bold;">
	Step {step_num}
	</summary>
	<div style="
	background-color: white;
	border-radius: 6px;
	padding: 12px;
	margin: 8px 0;
	border-left: 4px solid {border_color};
	">
	"""

	    html_out += f"""
	<details style="margin: 8px 0;">
	<summary style="cursor: pointer; color: #666; font-size: 0.9em;">Observation</summary>
	<div style="background-color: #f8f9fa; padding: 8px; border-radius: 4px; margin-top: 4px; color: #333;">
	{format_step_content(observation_text)}
	</div>
	</details>
	"""

	    html_out += f"""
	<details style="margin: 8px 0;">
	<summary style="cursor: pointer; color: #666; font-size: 0.9em;">Thought</summary>
	<div style="background-color: #f8f9fa; padding: 8px; border-radius: 4px; margin-top: 4px; color: #333;">
	{format_step_content(thought_text)}
	</div>
	</details>
	"""

	    html_out += f"""
	<div style="margin: 8px 0;">
	<strong style="color: #333;">Action:</strong>
	<div style="background-color: #f8f9fa; padding: 8px; border-radius: 4px; margin-top: 4px; color: #333;">
	{format_step_content(action_text)}
	</div>
	</div>
	"""

	    html_out += f"""
	<div style="margin: 8px 0;">
	<strong style="color: #333;">Reward:</strong> <span style="color: #333;">{reward}</span>
	</div>
	"""

	    # Close the inner <div> and the outer <details> opened at the top.
	    html_out += "</div></details>"
	    return html_out


	def get_agent_color(agent_name: str) -> tuple[str, str]:
	    """Assign a stable color pair to an agent name.

	    The hue comes from a CRC-32 of the name rather than the built-in
	    ``hash()``: string hashes are salted per interpreter process, so
	    ``hash()`` would give the same agent a different color after a restart.

	    Args:
	        agent_name: Agent identifier string.

	    Returns:
	        Tuple of (background_color, border_color) as CSS ``hsl(...)`` strings
	        that share one hue.
	    """
	    import zlib  # local import: only this function needs it

	    # Deterministic name -> hue in [0, 360)
	    hue = zlib.crc32(str(agent_name).encode("utf-8")) % 360
	    bg_color = f"hsl({hue}, 40%, 92%)"
	    border_color = f"hsl({hue}, 60%, 40%)"
	    return bg_color, border_color


	def create_trajectory_html(agent_name: str, trajectory: Dict[str, Any]) -> str:
	    """Create HTML for a trajectory (no tokenization or chat rendering).

	    Args:
	        agent_name: Agent identifier; selects the color pair and is shown
	            (HTML-escaped) as the card heading.
	        trajectory: Trajectory dictionary. Only the optional "steps" list is
	            read; a missing or null value renders a card with no steps.

	    Returns:
	        HTML string: a colored card holding one collapsible block per step.
	    """
	    bg_color, border_color = get_agent_color(agent_name)

	    # The name comes from the episode file, so escape it before embedding.
	    agent_display = html.escape(str(agent_name))

	    header = f"""
	<div style="
	background-color: {bg_color};
	border-radius: 8px;
	padding: 16px;
	margin: 10px 0;
	border-left: 4px solid {border_color};
	">
	<div style="font-weight: bold; color: {border_color}; margin-bottom: 10px;">{agent_display}</div>
	"""

	    steps = trajectory.get("steps") or []
	    step_blocks = [
	        create_step_html(step, step_num, border_color)
	        for step_num, step in enumerate(steps, start=1)
	    ]
	    return header + "".join(step_blocks) + "</div>"


	def create_episode_view(episode: Dict[str, Any]) -> str:
	    """Create HTML view for an episode (lightweight).

	    Args:
	        episode: Episode dictionary to render. Reads the optional keys "id",
	            "termination_reason", "task" (a dict whose "question" is shown as
	            the problem statement) and "trajectories" (an iterable of
	            ``(agent_name, trajectory)`` pairs; a mapping of agent name to
	            trajectory is accepted as well).

	    Returns:
	        HTML string with a header card, an optional problem-statement card
	        and one card per trajectory.
	    """
	    # Both values come from the episode file; escape before embedding.
	    episode_id = html.escape(str(episode.get("id", "")))
	    termination_reason = html.escape(str(episode.get("termination_reason", "")))

	    html_out = f"""
	<div style="max-width: 1200px; margin: 0 auto; font-family: Arial, sans-serif;">
	<div style="
	background-color: #f8f9fa;
	border-radius: 8px;
	padding: 20px;
	margin-bottom: 20px;
	border-left: 4px solid #007bff;
	">
	<h2 style="margin: 0 0 12px 0; color: #333;">
	Episode: {episode_id}
	</h2>
	<div style="color:#333; margin-bottom: 16px;">
	<strong>Termination:</strong> <span>{termination_reason}</span>
	</div>
	</div>
	"""

	    task = episode.get("task", {})
	    # Only a dict task can carry a "question"; anything else is skipped.
	    if isinstance(task, dict) and "question" in task:
	        question = task["question"]
	        question_text = str(question) if question else ""
	        html_out += f"""
	<div style="
	background-color: #e3f2fd;
	border-radius: 8px;
	padding: 16px;
	margin-bottom: 20px;
	border-left: 4px solid #1976d2;
	">
	<h3 style="margin: 0 0 12px 0; color: #1976d2;">Problem Statement</h3>
	<div style="background-color: white; padding: 12px; border-radius: 4px; color: #333;">
	{format_step_content(question_text)}
	</div>
	</div>
	"""

	    html_out += """
	<div style="
	border: 2px solid #ddd;
	border-radius: 8px;
	padding: 20px;
	background-color: white;
	">
	<h3 style="margin: 0 0 16px 0; color: #333;">Trajectories</h3>
	"""

	    trajectories = episode.get("trajectories") or []
	    if isinstance(trajectories, dict):
	        # Iterating a dict yields bare keys, which cannot be unpacked as pairs.
	        trajectories = trajectories.items()
	    for agent_name, trajectory in trajectories:
	        html_out += create_trajectory_html(agent_name, trajectory)

	    # Close the trajectories container and the outer wrapper.
	    html_out += "</div></div>"
	    return html_out


	def list_runs(data_dir: str) -> list[str]:
	    """Return the names of the run directories found directly under data_dir.

	    Args:
	        data_dir: Path to the parent data directory.

	    Returns:
	        Alphabetically sorted subdirectory names; an empty list when
	        data_dir is empty or is not an existing directory.
	    """
	    if data_dir and os.path.isdir(data_dir):
	        run_names = []
	        for candidate in os.listdir(data_dir):
	            # Keep directories only; plain files are not runs.
	            if os.path.isdir(os.path.join(data_dir, candidate)):
	                run_names.append(candidate)
	        run_names.sort()
	        return run_names
	    return []


	def list_episode_files(run_dir: str) -> list[str]:
	    """List episode JSON files inside a run directory.

	    Args:
	        run_dir: Absolute path to a run directory.

	    Returns:
	        Sorted list of file basenames matching ``*.json``; an empty list when
	        run_dir is empty or is not an existing directory.
	    """
	    from glob import escape as glob_escape  # local: only needed here

	    if not run_dir or not os.path.isdir(run_dir):
	        return []
	    # Escape the directory part so names containing "[", "*" or "?" are taken
	    # literally instead of being interpreted as glob wildcards.
	    pattern = os.path.join(glob_escape(run_dir), "*.json")
	    return [os.path.basename(path) for path in sorted(glob(pattern))]


	def load_episode_from_file(file_path: str) -> Dict[str, Any]:
	    """Load a single-episode JSON file into a dictionary.

	    Args:
	        file_path: Absolute path to JSON file containing one episode dict.

	    Returns:
	        Episode dictionary. If the file holds a list, its first element is
	        returned, or an empty dict when the list is empty.

	    Raises:
	        OSError: If the file cannot be opened.
	        json.JSONDecodeError: If the file is not valid JSON.
	    """
	    # Read as UTF-8 explicitly rather than relying on the platform's locale
	    # default encoding.
	    with open(file_path, encoding="utf-8") as f:
	        data = json.load(f)
	    # If file accidentally contains a list, take first element
	    if isinstance(data, list):
	        data = data[0] if data else {}
	    return data


	def create_gradio_interface(data_dir: str):
	    """Create the lightweight viewer with two dropdowns.

	    Builds a Blocks app with a run dropdown, an episode dropdown and an HTML
	    display area. Changing the run repopulates the episode dropdown; changing
	    the episode loads that JSON file and renders it.

	    Args:
	        data_dir: Parent directory that contains run subdirectories.

	    Returns:
	        Gradio Blocks app.
	    """
	    # Runs are listed once, when the interface is built.
	    runs = list_runs(data_dir)

	    custom_css = """
	/* ─── force global light theme & readable text ─────────────── */
	:root, html, body, #root, .gradio-container{
	background:#ffffff !important;
	color-scheme:light;
	color:#111 !important;
	}
	.gradio-container{
	--body-background-fill:#ffffff;
	--background-fill-primary:#ffffff;
	--background-fill-secondary:#ffffff;
	--block-background-fill:#ffffff;
	--panel-background-fill:#ffffff;
	}

	/* ─── normalize MathML text color to match content ─────────── */
	math, math *{
	color:#333 !important;
	fill:#333 !important;
	stroke:#333 !important;
	}
	"""

	    with gr.Blocks(title="Simple Episode Viewer", css=custom_css) as demo:
	        with gr.Group():
	            with gr.Row():
	                run_dropdown = gr.Dropdown(choices=runs, value=None, label="Run (subdirectory)")
	                episode_dropdown = gr.Dropdown(choices=[], value=None, label="Episode (file)")

	        # NOTE(review): nesting of display_area relative to the Group above is
	        # reconstructed; confirm against the intended layout.
	        display_area = gr.HTML(label="Episode View", value="<p>Select a run and episode.</p>")

	        # NOTE(review): update_runs is defined but not wired to any event in
	        # this function; it looks like a leftover. Confirm before removing.
	        def update_runs(dir_value: str):
	            available_runs = list_runs(dir_value)
	            # Start with no run selected by default
	            return (
	                gr.Dropdown(choices=available_runs, value=None),
	                gr.Dropdown(choices=[], value=None),
	                "<p>Select a run and episode.</p>",
	            )

	        def on_run_change(selected_run: str):
	            # Repopulate the episode dropdown for the chosen run, preselecting
	            # the first episode file when there is one.
	            run_path = os.path.join(data_dir, selected_run) if selected_run else None
	            episodes = list_episode_files(run_path) if run_path else []
	            return gr.Dropdown(choices=episodes, value=(episodes[0] if episodes else None)), "<p>Select an episode.</p>"

	        def on_episode_change(selected_episode: str, selected_run: str):
	            # Both selections are needed to build the file path.
	            if not selected_episode or not selected_run:
	                return "<p>Select a run and episode.</p>"
	            file_path = os.path.join(data_dir, selected_run, selected_episode)
	            try:
	                episode = load_episode_from_file(file_path)
	                return create_episode_view(episode)
	            except Exception as e:
	                # Show the failure in the page (escaped) instead of raising.
	                return f"<p>Error loading episode: {html.escape(str(e))}</p>"

	        # Wire events
	        run_dropdown.change(on_run_change, inputs=[run_dropdown], outputs=[episode_dropdown, display_area])
	        episode_dropdown.change(on_episode_change, inputs=[episode_dropdown, run_dropdown], outputs=[display_area])

	    return demo


	# Script entry point: build the viewer over the local "./data" directory and
	# start the Gradio server.
	if __name__ == "__main__":
	    demo = create_gradio_interface("./data")
	    demo.launch()