ZENLLC committed on
Commit
31b73d6
Β·
verified Β·
1 Parent(s): 777003e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +274 -120
app.py CHANGED
@@ -1,5 +1,5 @@
1
- import os, io, json, zipfile, hashlib
2
- from typing import List, Dict, Any, Optional
3
  import gradio as gr
4
  from pydantic import BaseModel
5
  from tenacity import retry, stop_after_attempt, wait_exponential, RetryError
@@ -55,6 +55,9 @@ def _listify(x) -> List[Any]:
55
  return x
56
  return [x]
57
 
 
 
 
58
  # -------------------- keys --------------------
59
  class Keys(BaseModel):
60
  openai: Optional[str] = None
@@ -79,10 +82,8 @@ def fc_client(s: Keys) -> Firecrawl:
79
  def fc_search(s: Keys, query: str, limit: int = 5, scrape_formats: Optional[List[str]] = None, location: Optional[str] = None) -> Dict[str, Any]:
80
  fc = fc_client(s)
81
  kwargs: Dict[str, Any] = {"query": query, "limit": limit}
82
- if location:
83
- kwargs["location"] = location
84
- if scrape_formats:
85
- kwargs["scrape_options"] = {"formats": scrape_formats}
86
  res = fc.search(**kwargs)
87
  return _to_dict(res)
88
 
@@ -90,13 +91,9 @@ def fc_search(s: Keys, query: str, limit: int = 5, scrape_formats: Optional[List
90
  def fc_scrape(s: Keys, url: str, formats: Optional[List[str]] = None, timeout_ms: Optional[int] = None, mobile: bool = False) -> Dict[str, Any]:
91
  fc = fc_client(s)
92
  kwargs: Dict[str, Any] = {"url": url}
93
- if formats:
94
- kwargs["formats"] = formats
95
- # give slow pages more time; cap at 40s
96
- if timeout_ms:
97
- kwargs["timeout"] = min(int(timeout_ms), 40000)
98
- if mobile:
99
- kwargs["mobile"] = True
100
  res = fc.scrape(**kwargs)
101
  return _to_dict(res)
102
 
@@ -104,8 +101,7 @@ def fc_scrape(s: Keys, url: str, formats: Optional[List[str]] = None, timeout_ms
104
  def fc_crawl(s: Keys, url: str, max_pages: int = 25, formats: Optional[List[str]] = None) -> Dict[str, Any]:
105
  fc = fc_client(s)
106
  kwargs: Dict[str, Any] = {"url": url, "limit": max_pages}
107
- if formats:
108
- kwargs["scrape_options"] = {"formats": formats}
109
  res = fc.crawl(**kwargs)
110
  return _to_dict(res)
111
 
@@ -118,102 +114,91 @@ SYSTEM_STEER = (
118
 
119
  def use_openai(s: Keys):
120
  k = resolve_keys(s)
121
- if not k.openai:
122
- raise gr.Error("Missing OPENAI_API_KEY.")
123
- if OpenAI is None:
124
- raise gr.Error("OpenAI SDK not installed.")
125
  return OpenAI(api_key=k.openai)
126
 
127
  def use_anthropic(s: Keys):
128
  k = resolve_keys(s)
129
- if not k.anthropic:
130
- raise gr.Error("Missing ANTHROPIC_API_KEY.")
131
- if anthropic is None:
132
- raise gr.Error("Anthropic SDK not installed.")
133
  return anthropic.Anthropic(api_key=k.anthropic)
134
 
135
  ANTHROPIC_FALLBACKS = [
136
- # try a few known-good Sonnet identifiers
137
- "claude-3-7-sonnet-2025-06-13", # example new tag
138
  "claude-3-7-sonnet",
139
  "claude-3-5-sonnet-20241022",
140
  "claude-3-5-sonnet-20240620",
141
  ]
 
 
 
 
 
 
 
 
 
 
142
 
143
- OPENAI_FALLBACKS = [
144
- "gpt-5", # user-preferred
145
- "gpt-4.1", # safe fallback
146
- "gpt-4o", # vision-capable fallback
147
- "gpt-4o-mini", # economical fallback
148
- ]
 
 
 
 
 
149
 
150
  def llm_summarize(s: Keys, provider: str, model_name: str, prompt: str, ctx_md: str, temp: float=0.4) -> str:
151
  ctx = (ctx_md or "")[:150000]
152
  if provider == "openai":
153
- client = use_openai(s)
154
  candidates = [model_name] + OPENAI_FALLBACKS if model_name else OPENAI_FALLBACKS
155
- last_err = None
156
  for m in candidates:
157
- try:
158
- resp = client.chat.completions.create(
159
- model=m,
160
- temperature=temp,
161
- messages=[
162
- {"role": "system", "content": SYSTEM_STEER},
163
- {"role": "user", "content": f"{prompt}\n\n=== SOURCE (markdown) ===\n{ctx}"},
164
- ],
165
- )
166
- return (resp.choices[0].message.content or "").strip()
167
- except Exception as e:
168
- last_err = e
169
- continue
170
- raise gr.Error(f"OpenAI failed across fallbacks: {last_err}")
171
  else:
172
- client = use_anthropic(s)
173
  candidates = [model_name] + ANTHROPIC_FALLBACKS if model_name else ANTHROPIC_FALLBACKS
174
- last_err = None
175
  for m in candidates:
176
- try:
177
- resp = client.messages.create(
178
- model=m,
179
- max_tokens=4000,
180
- temperature=temp,
181
- system=SYSTEM_STEER,
182
- messages=[{"role": "user", "content": f"{prompt}\n\n=== SOURCE (markdown) ===\n{ctx}"}],
183
- )
184
- chunks = []
185
- for blk in resp.content:
186
- t = getattr(blk, "text", None)
187
- if t:
188
- chunks.append(t)
189
- return "".join(chunks).strip()
190
- except AnthropicNotFound as e:
191
- last_err = e
192
- continue
193
- except Exception as e:
194
- last_err = e
195
- continue
196
- raise gr.Error(f"Anthropic failed across fallbacks: {last_err}")
197
-
198
- # -------------------- ZIP export --------------------
199
- def pack_zip(pages: List[Dict[str, Any]]) -> bytes:
200
  mem = io.BytesIO()
201
  with zipfile.ZipFile(mem, mode="w", compression=zipfile.ZIP_DEFLATED) as zf:
202
  manifest = []
203
  for i, p in enumerate(pages, start=1):
204
  url = p.get("url") or p.get("metadata", {}).get("sourceURL") or f"page_{i}"
205
- slug = hashlib.sha1(str(url).encode("utf-8")).hexdigest()[:10]
206
  md = p.get("markdown") or p.get("data", {}).get("markdown") or p.get("content") or ""
207
  html = p.get("html") or p.get("data", {}).get("html") or ""
208
  links = p.get("links") or p.get("data", {}).get("links") or []
 
209
  if md: zf.writestr(f"{i:03d}_{slug}.md", md)
210
  if html: zf.writestr(f"{i:03d}_{slug}.html", html)
211
- manifest.append({"url": url, "title": p.get("title") or p.get("metadata", {}).get("title"), "links": links})
212
  zf.writestr("manifest.json", json.dumps(manifest, indent=2))
213
- mem.seek(0)
214
- return mem.read()
215
 
216
- # -------------------- actions --------------------
 
 
 
 
 
 
 
 
 
217
  def save_keys(openai_key, anthropic_key, firecrawl_key):
218
  return Keys(
219
  openai=(openai_key or "").strip() or None,
@@ -222,8 +207,7 @@ def save_keys(openai_key, anthropic_key, firecrawl_key):
222
  ), gr.Info("Keys saved to this session. (Env vars still apply if set.)")
223
 
224
  def action_search(sess: Keys, query: str, limit: int, scrape_content: bool, location: str):
225
- if not query.strip():
226
- raise gr.Error("Enter a search query.")
227
  formats = ["markdown", "links"] if scrape_content else None
228
  res = fc_search(sess, query=query.strip(), limit=limit, scrape_formats=formats, location=(location or None))
229
  data = res.get("data", res)
@@ -238,53 +222,151 @@ def action_search(sess: Keys, query: str, limit: int, scrape_content: bool, loca
238
  else:
239
  items = _listify(_to_dict(data))
240
  if not items:
241
- return _pretty_json(res) # show raw result if buckets are empty
242
- return json.dumps(items, indent=2)
243
 
244
  def action_scrape(sess: Keys, url: str, mobile: bool, formats_sel: List[str], timeout_ms: int):
245
- if not url.strip():
246
- raise gr.Error("Enter a URL.")
247
  formats = formats_sel or ["markdown", "links"]
248
  try:
249
  out = fc_scrape(sess, url.strip(), formats=formats, timeout_ms=(timeout_ms or 15000), mobile=mobile)
250
  pretty = _pretty_json(out)
251
  md = out.get("markdown") or out.get("data", {}).get("markdown") or out.get("content") or ""
252
- return pretty, md
253
  except RetryError as e:
254
- return f"<!> Scrape timed out after retries. Try increasing timeout, unchecking 'mobile', or limiting formats.\n\n{e}", ""
255
  except Exception as e:
256
- return f"<!> Scrape error: {e}", ""
257
 
258
  def action_crawl(sess: Keys, base_url: str, max_pages: int, formats_sel: List[str]):
259
- if not base_url.strip():
260
- raise gr.Error("Enter a base URL to crawl.")
261
  formats = formats_sel or ["markdown", "links"]
262
  try:
263
  out = fc_crawl(sess, base_url.strip(), max_pages=max_pages, formats=formats)
264
  pages = out.get("data")
265
- if not isinstance(pages, list) or not pages:
266
- raise gr.Error("Crawl returned no pages.")
267
- zip_bytes = pack_zip(pages)
268
- return gr.File.update(value=io.BytesIO(zip_bytes), visible=True, filename="site_clone.zip"), f"Crawled {len(pages)} pages. ZIP is ready."
269
  except RetryError as e:
270
- return gr.File.update(visible=False), f"<!> Crawl timed out after retries. Reduce Max Pages or try again.\n\n{e}"
271
  except Exception as e:
272
- return gr.File.update(visible=False), f"<!> Crawl error: {e}"
273
 
274
  def action_generate(sess: Keys, provider: str, model_name: str, sys_prompt: str, user_prompt: str, context_md: str, temp: float):
275
- if not user_prompt.strip():
276
- raise gr.Error("Enter a prompt or click a starter tile.")
277
  model = (model_name or "").strip()
278
  steer = (sys_prompt or "").strip()
279
  prompt = (("SYSTEM:\n" + steer + "\n\n") if steer else "") + user_prompt.strip()
280
  out = llm_summarize(sess, provider, model, prompt, context_md or "", temp=temp)
281
  return out
282
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
283
  # -------------------- UI --------------------
284
  with gr.Blocks(css="#keys .wrap.svelte-1ipelgc { filter: none !important; }") as demo:
285
  gr.Markdown("## ZEN VibeCoder β€” Web Clone & Research Foundry")
286
  session_state = gr.State(Keys())
287
 
 
 
 
 
 
 
 
288
  with gr.Accordion("πŸ” Keys (session)", open=True):
289
  with gr.Row():
290
  openai_key = gr.Textbox(label="OPENAI_API_KEY (GPT-5 / fallbacks)", type="password", placeholder="sk-...", value=os.getenv("OPENAI_API_KEY") or "")
@@ -295,6 +377,7 @@ with gr.Blocks(css="#keys .wrap.svelte-1ipelgc { filter: none !important; }") as
295
  save_btn.click(save_keys, [openai_key, anthropic_key, firecrawl_key], [session_state, save_msg])
296
 
297
  with gr.Tabs():
 
298
  with gr.Tab("πŸ”Ž Search"):
299
  query = gr.Textbox(label="Query", placeholder='ex: site:docs "vector database" 2025')
300
  with gr.Row():
@@ -303,8 +386,13 @@ with gr.Blocks(css="#keys .wrap.svelte-1ipelgc { filter: none !important; }") as
303
  location = gr.Textbox(label="Location (optional)", placeholder="ex: Germany")
304
  go_search = gr.Button("Run Search", variant="primary")
305
  search_json = gr.Code(label="Results JSON", language="json")
306
- go_search.click(action_search, [session_state, query, limit, scrape_content, location], [search_json])
307
 
 
 
 
 
 
 
308
  with gr.Tab("πŸ•ΈοΈ Scrape β€’ Crawl β€’ Clone"):
309
  with gr.Row():
310
  target_url = gr.Textbox(label="URL to Scrape", placeholder="https://example.com")
@@ -315,7 +403,7 @@ with gr.Blocks(css="#keys .wrap.svelte-1ipelgc { filter: none !important; }") as
315
  run_scrape = gr.Button("Scrape URL", variant="primary")
316
  scrape_json = gr.Code(label="Raw Response (JSON)", language="json")
317
  scrape_md = gr.Markdown(label="Markdown Preview")
318
- run_scrape.click(action_scrape, [session_state, target_url, mobile, formats_sel, timeout_ms], [scrape_json, scrape_md])
319
 
320
  gr.Markdown("---")
321
 
@@ -326,49 +414,115 @@ with gr.Blocks(css="#keys .wrap.svelte-1ipelgc { filter: none !important; }") as
326
  run_crawl = gr.Button("Crawl & Build ZIP", variant="primary")
327
  zip_file = gr.File(label="Clone ZIP", visible=False)
328
  crawl_status = gr.Markdown()
329
- run_crawl.click(action_crawl, [session_state, base_url, max_pages, formats_crawl], [zip_file, crawl_status])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
330
 
 
331
  with gr.Tab("✨ Vibe Code (Synthesis)"):
332
  with gr.Row():
333
  provider = gr.Radio(choices=["openai","anthropic"], value="openai", label="Provider")
334
  model_name = gr.Textbox(label="Model (override)", placeholder="(blank = auto fallback)")
335
  temp = gr.Slider(0.0, 1.2, value=0.4, step=0.05, label="Temperature")
336
-
337
  sys_prompt = gr.Textbox(label="System Style (optional)",
338
  value="Return structured outputs with file trees, code blocks and ordered steps. Be concise and concrete.")
339
  user_prompt = gr.Textbox(label="User Prompt", lines=6)
340
- ctx_md = gr.Textbox(label="Context (paste markdown from Scrape/Crawl)", lines=10)
341
-
342
- with gr.Row():
343
- gen_btn = gr.Button("Generate", variant="primary")
344
- out_md = gr.Markdown()
345
-
346
  gr.Markdown("**Starter Tiles**")
347
  with gr.Row():
348
- t1 = gr.Button("πŸ”§ Clone Docs ➜ Clean Markdown ➜ README")
349
- t2 = gr.Button("🧭 Competitor Teardown ➜ Features β€’ Pricing β€’ Moats")
350
- t3 = gr.Button("πŸ§ͺ API Wrapper ➜ Python Client (requests + retries)")
351
- t4 = gr.Button("πŸ“ Landing Page Rewrite ➜ ZEN Tone")
352
- t5 = gr.Button("πŸ“Š Dataset Outline ➜ Schema + Fields + ETL")
353
-
354
  def fill_tile(tile: str):
355
  tiles = {
356
- "t1": "Create a clean knowledge pack from the context, then output a README.md with:\n- Overview\n- Key features\n- Quickstart\n- API endpoints (if any)\n- Notes & gotchas\n- License\nAlso produce a /docs/ tree outline with suggested pages and headings.",
357
- "t2": "From the context, produce a feature matrix, pricing table, ICP notes, moats/risks, and a market POV. Conclude with a ZEN playbook: 5 lever moves for advantage.",
358
- "t3": "Using the context, design a Python client that wraps the target API with retry/backoff and typed responses. Output:\n- package layout\n- requirements\n- client.py\n- examples/\n- README with usage.\nInclude robust error handling.",
359
- "t4": "Rewrite the landing page in ZEN brand voice: crisp headline, 3 value props, social proof, CTA, and a concise FAQ. Provide HTML sections and copy blocks.",
360
- "t5": "Propose a dataset schema based on the context. Output a table of fields, types, constraints, and an ETL plan (sources, transforms, validation, freshness, monitoring).",
361
  }
362
  return tiles[tile]
363
-
364
  t1.click(lambda: fill_tile("t1"), outputs=[user_prompt])
365
  t2.click(lambda: fill_tile("t2"), outputs=[user_prompt])
366
  t3.click(lambda: fill_tile("t3"), outputs=[user_prompt])
367
  t4.click(lambda: fill_tile("t4"), outputs=[user_prompt])
368
  t5.click(lambda: fill_tile("t5"), outputs=[user_prompt])
369
-
370
  gen_btn.click(action_generate, [session_state, provider, model_name, sys_prompt, user_prompt, ctx_md, temp], [out_md])
371
 
 
 
 
 
 
 
 
 
 
 
 
 
 
372
  gr.Markdown("Built for **ZEN Arena** pipelines. Export ZIPs β†’ ingest β†’ credentialize via ZEN Cards.")
373
 
374
  if __name__ == "__main__":
 
1
+ import os, io, json, zipfile, hashlib, time
2
+ from typing import List, Dict, Any, Optional, Tuple
3
  import gradio as gr
4
  from pydantic import BaseModel
5
  from tenacity import retry, stop_after_attempt, wait_exponential, RetryError
 
55
  return x
56
  return [x]
57
 
58
+ def _hash(s: str) -> str:
59
+ return hashlib.sha1(s.encode("utf-8")).hexdigest()[:10]
60
+
61
  # -------------------- keys --------------------
62
  class Keys(BaseModel):
63
  openai: Optional[str] = None
 
82
  def fc_search(s: Keys, query: str, limit: int = 5, scrape_formats: Optional[List[str]] = None, location: Optional[str] = None) -> Dict[str, Any]:
83
  fc = fc_client(s)
84
  kwargs: Dict[str, Any] = {"query": query, "limit": limit}
85
+ if location: kwargs["location"] = location
86
+ if scrape_formats: kwargs["scrape_options"] = {"formats": scrape_formats}
 
 
87
  res = fc.search(**kwargs)
88
  return _to_dict(res)
89
 
 
91
  def fc_scrape(s: Keys, url: str, formats: Optional[List[str]] = None, timeout_ms: Optional[int] = None, mobile: bool = False) -> Dict[str, Any]:
92
  fc = fc_client(s)
93
  kwargs: Dict[str, Any] = {"url": url}
94
+ if formats: kwargs["formats"] = formats
95
+ if timeout_ms: kwargs["timeout"] = min(int(timeout_ms), 40000) # cap 40s
96
+ if mobile: kwargs["mobile"] = True
 
 
 
 
97
  res = fc.scrape(**kwargs)
98
  return _to_dict(res)
99
 
 
101
  def fc_crawl(s: Keys, url: str, max_pages: int = 25, formats: Optional[List[str]] = None) -> Dict[str, Any]:
102
  fc = fc_client(s)
103
  kwargs: Dict[str, Any] = {"url": url, "limit": max_pages}
104
+ if formats: kwargs["scrape_options"] = {"formats": formats}
 
105
  res = fc.crawl(**kwargs)
106
  return _to_dict(res)
107
 
 
114
 
115
  def use_openai(s: Keys):
116
  k = resolve_keys(s)
117
+ if not k.openai: raise gr.Error("Missing OPENAI_API_KEY.")
118
+ if OpenAI is None: raise gr.Error("OpenAI SDK not installed.")
 
 
119
  return OpenAI(api_key=k.openai)
120
 
121
  def use_anthropic(s: Keys):
122
  k = resolve_keys(s)
123
+ if not k.anthropic: raise gr.Error("Missing ANTHROPIC_API_KEY.")
124
+ if anthropic is None: raise gr.Error("Anthropic SDK not installed.")
 
 
125
  return anthropic.Anthropic(api_key=k.anthropic)
126
 
127
  ANTHROPIC_FALLBACKS = [
128
+ "claude-3-7-sonnet-2025-06-13",
 
129
  "claude-3-7-sonnet",
130
  "claude-3-5-sonnet-20241022",
131
  "claude-3-5-sonnet-20240620",
132
  ]
133
+ OPENAI_FALLBACKS = ["gpt-5", "gpt-4.1", "gpt-4o", "gpt-4o-mini"]
134
+
135
+ def llm_once_openai(s: Keys, model: str, prompt: str, ctx: str, temp: float) -> str:
136
+ client = use_openai(s)
137
+ resp = client.chat.completions.create(
138
+ model=model, temperature=temp,
139
+ messages=[{"role":"system","content":SYSTEM_STEER},
140
+ {"role":"user","content":f"{prompt}\n\n=== SOURCE (markdown) ===\n{ctx}"}]
141
+ )
142
+ return (resp.choices[0].message.content or "").strip()
143
 
144
+ def llm_once_anthropic(s: Keys, model: str, prompt: str, ctx: str, temp: float) -> str:
145
+ client = use_anthropic(s)
146
+ resp = client.messages.create(
147
+ model=model, max_tokens=4000, temperature=temp, system=SYSTEM_STEER,
148
+ messages=[{"role":"user","content":f"{prompt}\n\n=== SOURCE (markdown) ===\n{ctx}"}],
149
+ )
150
+ out=[]
151
+ for blk in resp.content:
152
+ t=getattr(blk,"text",None)
153
+ if t: out.append(t)
154
+ return "".join(out).strip()
155
 
156
  def llm_summarize(s: Keys, provider: str, model_name: str, prompt: str, ctx_md: str, temp: float=0.4) -> str:
157
  ctx = (ctx_md or "")[:150000]
158
  if provider == "openai":
 
159
  candidates = [model_name] + OPENAI_FALLBACKS if model_name else OPENAI_FALLBACKS
160
+ last=None
161
  for m in candidates:
162
+ try: return llm_once_openai(s, m, prompt, ctx, temp)
163
+ except Exception as e: last=e; continue
164
+ raise gr.Error(f"OpenAI failed across fallbacks: {last}")
 
 
 
 
 
 
 
 
 
 
 
165
  else:
 
166
  candidates = [model_name] + ANTHROPIC_FALLBACKS if model_name else ANTHROPIC_FALLBACKS
167
+ last=None
168
  for m in candidates:
169
+ try: return llm_once_anthropic(s, m, prompt, ctx, temp)
170
+ except AnthropicNotFound as e: last=e; continue
171
+ except Exception as e: last=e; continue
172
+ raise gr.Error(f"Anthropic failed across fallbacks: {last}")
173
+
174
+ # -------------------- ZIP export helpers --------------------
175
+ def pack_zip_pages(pages: List[Dict[str, Any]]) -> bytes:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
176
  mem = io.BytesIO()
177
  with zipfile.ZipFile(mem, mode="w", compression=zipfile.ZIP_DEFLATED) as zf:
178
  manifest = []
179
  for i, p in enumerate(pages, start=1):
180
  url = p.get("url") or p.get("metadata", {}).get("sourceURL") or f"page_{i}"
181
+ slug = _hash(str(url))
182
  md = p.get("markdown") or p.get("data", {}).get("markdown") or p.get("content") or ""
183
  html = p.get("html") or p.get("data", {}).get("html") or ""
184
  links = p.get("links") or p.get("data", {}).get("links") or []
185
+ title = p.get("title") or p.get("metadata", {}).get("title")
186
  if md: zf.writestr(f"{i:03d}_{slug}.md", md)
187
  if html: zf.writestr(f"{i:03d}_{slug}.html", html)
188
+ manifest.append({"url": url, "title": title, "links": links})
189
  zf.writestr("manifest.json", json.dumps(manifest, indent=2))
190
+ mem.seek(0); return mem.read()
 
191
 
192
+ def pack_zip_corpus(corpus: List[Dict[str, Any]], merged_md: str, extras: Dict[str,str]) -> bytes:
193
+ mem = io.BytesIO()
194
+ with zipfile.ZipFile(mem, mode="w", compression=zipfile.ZIP_DEFLATED) as zf:
195
+ zf.writestr("corpus_merged.md", merged_md or "")
196
+ zf.writestr("corpus_manifest.json", json.dumps(corpus, indent=2))
197
+ for name,content in extras.items():
198
+ zf.writestr(name, content)
199
+ mem.seek(0); return mem.read()
200
+
201
+ # -------------------- actions: keys/search/scrape/crawl/generate --------------------
202
  def save_keys(openai_key, anthropic_key, firecrawl_key):
203
  return Keys(
204
  openai=(openai_key or "").strip() or None,
 
207
  ), gr.Info("Keys saved to this session. (Env vars still apply if set.)")
208
 
209
  def action_search(sess: Keys, query: str, limit: int, scrape_content: bool, location: str):
210
+ if not query.strip(): raise gr.Error("Enter a search query.")
 
211
  formats = ["markdown", "links"] if scrape_content else None
212
  res = fc_search(sess, query=query.strip(), limit=limit, scrape_formats=formats, location=(location or None))
213
  data = res.get("data", res)
 
222
  else:
223
  items = _listify(_to_dict(data))
224
  if not items:
225
+ return _pretty_json(res), res # return raw and obj (store for later)
226
+ return json.dumps(items, indent=2), items
227
 
228
  def action_scrape(sess: Keys, url: str, mobile: bool, formats_sel: List[str], timeout_ms: int):
229
+ if not url.strip(): raise gr.Error("Enter a URL.")
 
230
  formats = formats_sel or ["markdown", "links"]
231
  try:
232
  out = fc_scrape(sess, url.strip(), formats=formats, timeout_ms=(timeout_ms or 15000), mobile=mobile)
233
  pretty = _pretty_json(out)
234
  md = out.get("markdown") or out.get("data", {}).get("markdown") or out.get("content") or ""
235
+ return pretty, md, out
236
  except RetryError as e:
237
+ return f"<!> Scrape timed out after retries. Try increasing timeout, unchecking 'mobile', or limiting formats.\n\n{e}", "", {}
238
  except Exception as e:
239
+ return f"<!> Scrape error: {e}", "", {}
240
 
241
  def action_crawl(sess: Keys, base_url: str, max_pages: int, formats_sel: List[str]):
242
+ if not base_url.strip(): raise gr.Error("Enter a base URL to crawl.")
 
243
  formats = formats_sel or ["markdown", "links"]
244
  try:
245
  out = fc_crawl(sess, base_url.strip(), max_pages=max_pages, formats=formats)
246
  pages = out.get("data")
247
+ if not isinstance(pages, list) or not pages: raise gr.Error("Crawl returned no pages.")
248
+ zip_bytes = pack_zip_pages(pages)
249
+ return gr.File.update(value=io.BytesIO(zip_bytes), visible=True, filename="site_clone.zip"), f"Crawled {len(pages)} pages. ZIP is ready.", pages
 
250
  except RetryError as e:
251
+ return gr.File.update(visible=False), f"<!> Crawl timed out after retries. Reduce Max Pages or try again.\n\n{e}", []
252
  except Exception as e:
253
+ return gr.File.update(visible=False), f"<!> Crawl error: {e}", []
254
 
255
  def action_generate(sess: Keys, provider: str, model_name: str, sys_prompt: str, user_prompt: str, context_md: str, temp: float):
256
+ if not user_prompt.strip(): raise gr.Error("Enter a prompt or click a starter tile.")
 
257
  model = (model_name or "").strip()
258
  steer = (sys_prompt or "").strip()
259
  prompt = (("SYSTEM:\n" + steer + "\n\n") if steer else "") + user_prompt.strip()
260
  out = llm_summarize(sess, provider, model, prompt, context_md or "", temp=temp)
261
  return out
262
 
263
+ # -------------------- Corpus features --------------------
264
+ def corpus_normalize_items(items: Any) -> List[Dict[str, Any]]:
265
+ """Accepts list/dict/raw and returns a list of page-like dicts with url/title/markdown/html/links."""
266
+ out=[]
267
+ if isinstance(items, dict): items=[items]
268
+ for it in _listify(items):
269
+ d=_to_dict(it)
270
+ if not isinstance(d, dict): continue
271
+ url = d.get("url") or d.get("metadata",{}).get("sourceURL") or d.get("link") or ""
272
+ title = d.get("title") or d.get("metadata",{}).get("title") or d.get("name") or ""
273
+ md = d.get("markdown") or d.get("data",{}).get("markdown") or d.get("content") or ""
274
+ html = d.get("html") or d.get("data",{}).get("html") or ""
275
+ links = d.get("links") or d.get("data",{}).get("links") or []
276
+ out.append({"url":url,"title":title,"markdown":md,"html":html,"links":links})
277
+ return out
278
+
279
+ def corpus_add(corpus: List[Dict[str,Any]], items: Any, include_filter: str, exclude_filter: str, dedupe: bool) -> Tuple[List[Dict[str,Any]], str]:
280
+ added=0
281
+ existing = set(_hash(x.get("url","")) for x in corpus if x.get("url"))
282
+ inc = (include_filter or "").strip().lower()
283
+ exc = (exclude_filter or "").strip().lower()
284
+ for rec in corpus_normalize_items(items):
285
+ url = (rec.get("url") or "").lower()
286
+ title = (rec.get("title") or "").lower()
287
+ if inc and (inc not in url and inc not in title): continue
288
+ if exc and (exc in url or exc in title): continue
289
+ if dedupe and rec.get("url") and _hash(rec["url"]) in existing: continue
290
+ corpus.append(rec); added+=1
291
+ if rec.get("url"): existing.add(_hash(rec["url"]))
292
+ return corpus, f"Added {added} item(s). Corpus size: {len(corpus)}."
293
+
294
+ def corpus_list(corpus: List[Dict[str,Any]]) -> str:
295
+ lines=[]
296
+ for i,rec in enumerate(corpus,1):
297
+ url = rec.get("url") or "(no url)"
298
+ title = rec.get("title") or "(no title)"
299
+ mlen = len(rec.get("markdown") or "")
300
+ lines.append(f"{i:03d}. {title} β€” {url} [md:{mlen} chars]")
301
+ if not lines: return "_(empty)_"
302
+ return "\n".join(lines)
303
+
304
+ def corpus_clear() -> Tuple[List[Dict[str,Any]], str]:
305
+ return [], "Corpus cleared."
306
+
307
+ def corpus_merge_md(corpus: List[Dict[str,Any]]) -> str:
308
+ parts=[]
309
+ for rec in corpus:
310
+ hdr = f"### {rec.get('title') or rec.get('url') or 'Untitled'}"
311
+ md = rec.get("markdown") or ""
312
+ if md: parts.append(hdr+"\n\n"+md.strip())
313
+ return "\n\n---\n\n".join(parts)
314
+
315
+ def corpus_export(corpus: List[Dict[str,Any]], merged: str, extras: Dict[str,str]):
316
+ data = pack_zip_corpus(corpus, merged, extras)
317
+ return gr.File.update(value=io.BytesIO(data), visible=True, filename=f"corpus_{int(time.time())}.zip")
318
+
319
+ def dual_generate(sess: Keys, model_openai: str, model_anthropic: str, sys_prompt: str, user_prompt: str, ctx_md: str, temp: float):
320
+ if not user_prompt.strip(): raise gr.Error("Enter a prompt or use a tile.")
321
+ steer = (sys_prompt or "").strip()
322
+ prompt = (("SYSTEM:\n" + steer + "\n\n") if steer else "") + user_prompt.strip()
323
+ ctx = ctx_md or ""
324
+ # OpenAI
325
+ oa_txt, an_txt = "", ""
326
+ try:
327
+ oa_txt = llm_summarize(sess, "openai", model_openai or "", prompt, ctx, temp)
328
+ except Exception as e:
329
+ oa_txt = f"<!> OpenAI error: {e}"
330
+ try:
331
+ an_txt = llm_summarize(sess, "anthropic", model_anthropic or "", prompt, ctx, temp)
332
+ except Exception as e:
333
+ an_txt = f"<!> Anthropic error: {e}"
334
+ # render side-by-side
335
+ md = (
336
+ "### OpenAI\n\n" + (oa_txt or "_(empty)_") +
337
+ "\n\n---\n\n" +
338
+ "### Anthropic\n\n" + (an_txt or "_(empty)_")
339
+ )
340
+ return md
341
+
342
+ def scaffold_from_corpus(corpus_md: str, site_name: str = "zen-scan"):
343
+ """
344
+ Produce a tiny site/docs scaffold as a ZIP:
345
+ /README.md
346
+ /docs/index.md (from corpus)
347
+ /docs/summary.md (brief)
348
+ """
349
+ summary = (corpus_md[:1800] + ("..." if len(corpus_md) > 1800 else "")) if corpus_md else "No content."
350
+ mem = io.BytesIO()
351
+ with zipfile.ZipFile(mem, "w", zipfile.ZIP_DEFLATED) as zf:
352
+ zf.writestr("README.md", f"# {site_name}\n\nAuto-generated scaffold from ZEN VibeCoder corpus.\n")
353
+ zf.writestr("docs/index.md", corpus_md or "# Empty\n")
354
+ zf.writestr("docs/summary.md", f"# Summary\n\n{summary}\n")
355
+ mem.seek(0)
356
+ return gr.File.update(value=mem, visible=True, filename=f"{site_name}_scaffold.zip")
357
+
358
  # -------------------- UI --------------------
359
  with gr.Blocks(css="#keys .wrap.svelte-1ipelgc { filter: none !important; }") as demo:
360
  gr.Markdown("## ZEN VibeCoder β€” Web Clone & Research Foundry")
361
  session_state = gr.State(Keys())
362
 
363
+ # keep stateful objects
364
+ last_search_obj = gr.State({})
365
+ last_scrape_obj = gr.State({})
366
+ last_crawl_pages = gr.State([])
367
+ corpus_state = gr.State([]) # list of dicts
368
+ merged_md_state = gr.State("") # merged markdown cache
369
+
370
  with gr.Accordion("πŸ” Keys (session)", open=True):
371
  with gr.Row():
372
  openai_key = gr.Textbox(label="OPENAI_API_KEY (GPT-5 / fallbacks)", type="password", placeholder="sk-...", value=os.getenv("OPENAI_API_KEY") or "")
 
377
  save_btn.click(save_keys, [openai_key, anthropic_key, firecrawl_key], [session_state, save_msg])
378
 
379
  with gr.Tabs():
380
+ # --- SEARCH ---
381
  with gr.Tab("πŸ”Ž Search"):
382
  query = gr.Textbox(label="Query", placeholder='ex: site:docs "vector database" 2025')
383
  with gr.Row():
 
386
  location = gr.Textbox(label="Location (optional)", placeholder="ex: Germany")
387
  go_search = gr.Button("Run Search", variant="primary")
388
  search_json = gr.Code(label="Results JSON", language="json")
 
389
 
390
+ def _search(sess, q, lmt, scp, loc):
391
+ txt, obj = action_search(sess, q, lmt, scp, loc)
392
+ return txt, obj
393
+ go_search.click(_search, [session_state, query, limit, scrape_content, location], [search_json, last_search_obj])
394
+
395
+ # --- SCRAPE / CRAWL ---
396
  with gr.Tab("πŸ•ΈοΈ Scrape β€’ Crawl β€’ Clone"):
397
  with gr.Row():
398
  target_url = gr.Textbox(label="URL to Scrape", placeholder="https://example.com")
 
403
  run_scrape = gr.Button("Scrape URL", variant="primary")
404
  scrape_json = gr.Code(label="Raw Response (JSON)", language="json")
405
  scrape_md = gr.Markdown(label="Markdown Preview")
406
+ run_scrape.click(action_scrape, [session_state, target_url, mobile, formats_sel, timeout_ms], [scrape_json, scrape_md, last_scrape_obj])
407
 
408
  gr.Markdown("---")
409
 
 
414
  run_crawl = gr.Button("Crawl & Build ZIP", variant="primary")
415
  zip_file = gr.File(label="Clone ZIP", visible=False)
416
  crawl_status = gr.Markdown()
417
+ run_crawl.click(action_crawl, [session_state, base_url, max_pages, formats_crawl], [zip_file, crawl_status, last_crawl_pages])
418
+
419
+ # --- CORPUS & BUILD ---
420
+ with gr.Tab("πŸ“¦ Corpus & Build"):
421
+ with gr.Row():
422
+ include_filter = gr.Textbox(label="Include filter (substring)", placeholder="docs, api, blog...")
423
+ exclude_filter = gr.Textbox(label="Exclude filter (substring)", placeholder="cdn, tracking, terms...")
424
+ dedupe = gr.Checkbox(label="Dedupe by URL", value=True)
425
+ with gr.Row():
426
+ add_from_search = gr.Button("Add from Last Search")
427
+ add_from_scrape = gr.Button("Add from Last Scrape")
428
+ add_from_crawl = gr.Button("Add from Last Crawl")
429
+ status_corpus = gr.Markdown()
430
+ corpus_list_md = gr.Markdown(label="Corpus Items")
431
+
432
+ def do_add_from_search(corpus, items, inc, exc, dd):
433
+ corpus, msg = corpus_add(corpus or [], items, inc, exc, dd)
434
+ return corpus, msg, corpus_list(corpus)
435
+ def do_add_from_scrape(corpus, obj, inc, exc, dd):
436
+ corpus, msg = corpus_add(corpus or [], obj, inc, exc, dd)
437
+ return corpus, msg, corpus_list(corpus)
438
+ def do_add_from_crawl(corpus, pages, inc, exc, dd):
439
+ corpus, msg = corpus_add(corpus or [], pages, inc, exc, dd)
440
+ return corpus, msg, corpus_list(corpus)
441
+
442
+ add_from_search.click(do_add_from_search, [corpus_state, last_search_obj, include_filter, exclude_filter, dedupe], [corpus_state, status_corpus, corpus_list_md])
443
+ add_from_scrape.click(do_add_from_scrape, [corpus_state, last_scrape_obj, include_filter, exclude_filter, dedupe], [corpus_state, status_corpus, corpus_list_md])
444
+ add_from_crawl.click(do_add_from_crawl, [corpus_state, last_crawl_pages, include_filter, exclude_filter, dedupe], [corpus_state, status_corpus, corpus_list_md])
445
+
446
+ with gr.Row():
447
+ merge_btn = gr.Button("Merge ➜ Markdown", variant="primary")
448
+ clear_btn = gr.Button("Clear Corpus", variant="secondary")
449
+ merged_md = gr.Textbox(label="Merged Markdown (editable)", lines=12)
450
+
451
def do_merge(corpus):
    """Merge the corpus into a single markdown document.

    Returns the merged text twice: once for the editable textbox and once
    for the session-level merged-markdown state.
    """
    merged = corpus_merge_md(corpus or [])
    return merged, merged

def do_clear():
    """Reset the corpus and blank out status, listing, and merged markdown."""
    cleared, note = corpus_clear()
    return cleared, note, corpus_list(cleared), ""
457
+ merge_btn.click(do_merge, [corpus_state], [merged_md, merged_md_state])
458
+ clear_btn.click(do_clear, [], [corpus_state, status_corpus, corpus_list_md, merged_md])
459
+
460
+ gr.Markdown("---")
461
+ with gr.Row():
462
+ site_name = gr.Textbox(label="Scaffold Name", value="zen-scan")
463
+ scaffold_btn = gr.Button("Generate Minimal Site Scaffold (ZIP)")
464
+ scaffold_zip = gr.File(visible=False)
465
+ scaffold_btn.click(lambda md, name: scaffold_from_corpus(md, name or "zen-scan"),
466
+ [merged_md], [scaffold_zip])
467
+
468
+ gr.Markdown("---")
469
+ with gr.Row():
470
+ export_zip_btn = gr.Button("Export Corpus (ZIP)")
471
+ export_zip_file = gr.File(visible=False)
472
+
473
def do_export(corpus, merged):
    """Bundle the corpus plus the merged markdown into a downloadable ZIP,
    attaching a small provenance note alongside the exported content."""
    extra_files = {"README.txt": "Exported by ZEN VibeCoder"}
    return corpus_export(corpus or [], merged or "", extra_files)
476
+ export_zip_btn.click(do_export, [corpus_state, merged_md], [export_zip_file])
477
 
478
+ # --- VIBE CODE (single provider) ---
479
  with gr.Tab("✨ Vibe Code (Synthesis)"):
480
  with gr.Row():
481
  provider = gr.Radio(choices=["openai","anthropic"], value="openai", label="Provider")
482
  model_name = gr.Textbox(label="Model (override)", placeholder="(blank = auto fallback)")
483
  temp = gr.Slider(0.0, 1.2, value=0.4, step=0.05, label="Temperature")
 
484
  sys_prompt = gr.Textbox(label="System Style (optional)",
485
  value="Return structured outputs with file trees, code blocks and ordered steps. Be concise and concrete.")
486
  user_prompt = gr.Textbox(label="User Prompt", lines=6)
487
+ ctx_md = gr.Textbox(label="Context (paste markdown or click Merge first)", lines=10)
488
+ gen_btn = gr.Button("Generate", variant="primary")
489
+ out_md = gr.Markdown()
 
 
 
490
  gr.Markdown("**Starter Tiles**")
491
  with gr.Row():
492
+ t1 = gr.Button("πŸ”§ Clone Docs ➜ Clean README")
493
+ t2 = gr.Button("🧭 Competitor Matrix")
494
+ t3 = gr.Button("πŸ§ͺ Python API Client")
495
+ t4 = gr.Button("πŸ“ ZEN Landing Rewrite")
496
+ t5 = gr.Button("πŸ“Š Dataset & ETL Plan")
 
497
def fill_tile(tile: str) -> str:
    """Return the starter prompt text for a tile id ("t1".."t5").

    Previously this indexed the dict directly, so any unknown tile id
    raised KeyError inside a UI callback; unknown ids now yield an empty
    prompt instead, which Gradio renders as a blank textbox.
    """
    tiles = {
        "t1": "Create a clean knowledge pack from the context, then output a README.md with: Overview, Key features, Quickstart, API endpoints, Notes & gotchas, License. Include a /docs/ outline.",
        "t2": "Produce a feature matrix, pricing table, ICP notes, moats/risks, and a market POV. End with a ZEN playbook: 5 lever moves.",
        "t3": "Design a Python client that wraps the target API with retry/backoff and typed responses. Provide package layout, requirements, client.py, examples/, and README.",
        "t4": "Rewrite the landing content in ZEN brand voice: headline, 3 value props, social proof, CTA, concise FAQ. Provide HTML sections and copy.",
        "t5": "Propose a dataset schema. Output a table of fields, types, constraints, plus an ETL plan (sources, transforms, validation, freshness, monitoring).",
    }
    return tiles.get(tile, "")
506
  t1.click(lambda: fill_tile("t1"), outputs=[user_prompt])
507
  t2.click(lambda: fill_tile("t2"), outputs=[user_prompt])
508
  t3.click(lambda: fill_tile("t3"), outputs=[user_prompt])
509
  t4.click(lambda: fill_tile("t4"), outputs=[user_prompt])
510
  t5.click(lambda: fill_tile("t5"), outputs=[user_prompt])
 
511
  gen_btn.click(action_generate, [session_state, provider, model_name, sys_prompt, user_prompt, ctx_md, temp], [out_md])
512
 
513
+ # --- DUAL (side-by-side router) ---
514
+ with gr.Tab("πŸ§ͺ Dual Synth (OpenAI vs Anthropic)"):
515
+ with gr.Row():
516
+ model_openai = gr.Textbox(label="OpenAI Model", placeholder="(blank = auto fallback)")
517
+ model_anthropic = gr.Textbox(label="Anthropic Model", placeholder="(blank = auto fallback)")
518
+ temp2 = gr.Slider(0.0, 1.2, value=0.4, step=0.05, label="Temperature")
519
+ sys2 = gr.Textbox(label="System Style (optional)", value="Return structured outputs with file trees and clear steps.")
520
+ user2 = gr.Textbox(label="User Prompt", lines=6, value="Summarize the corpus and propose a 5-step execution plan.")
521
+ ctx2 = gr.Textbox(label="Context (tip: click Merge in Corpus tab)", lines=10)
522
+ dual_btn = gr.Button("Run Dual Synthesis", variant="primary")
523
+ dual_md = gr.Markdown()
524
+ dual_btn.click(dual_generate, [session_state, model_openai, model_anthropic, sys2, user2, ctx2, temp2], [dual_md])
525
+
526
  gr.Markdown("Built for **ZEN Arena** pipelines. Export ZIPs β†’ ingest β†’ credentialize via ZEN Cards.")
527
 
528
  if __name__ == "__main__":