import json
from typing import Any, Dict, List, Optional, Union

import gradio as gr
import httpx
from cachetools import TTLCache, cached
from gradio_client import Client
from toolz import groupby

CACHE_TIME = 60 * 60 * 1  # 1 hour

client = Client("https://librarian-bots-collection-papers-extractor.hf.space/")

def get_arxiv_ids_from_slug(
    slug: str,
) -> Dict[str, Union[None, Dict[str, Dict[str, List[str]]]]]:
    # call the collection-papers-extractor Space; it returns the path to a JSON file
    result = client.predict(slug, api_name="/predict")
    with open(result) as f:
        data = json.load(f)
    return data

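# Illustrative shape of the extractor's payload (the repo ids and arXiv ids below are
# made-up examples, and a top-level value may also be None); format_ids only relies on
# the nested "arxiv_ids" lists:
#   {
#       "model papers": {"org/some-model": {"arxiv_ids": ["2106.00001"]}},
#       "dataset papers": {"org/some-dataset": {"arxiv_ids": ["2203.00002"]}},
#       "papers": {"2305.00003": {"arxiv_ids": ["2305.00003"]}},
#   }
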
def format_arxiv_id_for_semantic_scholar(arxiv_id: str) -> str:
    return f"ArXiv:{arxiv_id}"

def format_ids(data, exclude_keys: Optional[list[str]] = None) -> list[str]:
    arxiv_ids = []
    if exclude_keys is not None:
        data = {k: v for k, v in data.items() if k not in exclude_keys}
        # check if the dict is now empty
        if not data:
            return []
    for repo in data.values():
        if repo is None:
            continue
        for item in repo.values():
            arxiv_ids.extend(item["arxiv_ids"])
    # format for Semantic Scholar
    return [format_arxiv_id_for_semantic_scholar(id) for id in arxiv_ids]

# Caching here is an assumption: cachetools is imported above but otherwise unused,
# and the tuple argument only matters for making the call hashable; maxsize is illustrative.
@cached(cache=TTLCache(maxsize=100, ttl=CACHE_TIME))
def get_recommendations_from_semantic_scholar(paper_ids: tuple[str, ...]):
    paper_ids = list(paper_ids)
    print(paper_ids)
    r = httpx.post(
        "https://api.semanticscholar.org/recommendations/v1/papers/",
        json={
            "positivePaperIds": paper_ids,
        },
        params={"fields": "externalIds,title,year", "limit": 10},
        timeout=30,
    )
    print(r.text)
    return r.json()

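# Illustrative response shape for the request above (values are made up); each
# recommended paper carries only the fields requested via the "fields" parameter:
#   {
#       "recommendedPapers": [
#           {"externalIds": {"ArXiv": "2106.00001"}, "title": "...", "year": 2021},
#           ...
#       ]
#   }
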
def is_arxiv_paper(recommendation: Dict[str, Any]) -> bool:
    return recommendation["externalIds"].get("ArXiv", None) is not None


def group_by_is_arxiv_paper(
    recommendations: List[Dict[str, Any]]
) -> Dict[bool, List[Dict[str, Any]]]:
    return groupby(is_arxiv_paper, recommendations)

def format_recommendation_into_markdown(
    grouped_recommendations: Dict[bool, List[Dict[str, Any]]]
):
    comment = "The following papers were recommended by the Semantic Scholar API \n\n"
    arxiv_papers = grouped_recommendations.get(True)
    if arxiv_papers:
        comment += "## Papers available on Hugging Face Papers:\n\n"
        for r in arxiv_papers:
            hub_paper_url = f"https://huggingface.co/papers/{r['externalIds']['ArXiv']}"
            comment += f"* [{r['title']}]({hub_paper_url}) ({r['year']})\n"
    other_papers = grouped_recommendations.get(False)
    if other_papers:
        comment += "\n\n## Other papers:\n\n"
        for r in other_papers:
            comment += f"* {r['title']} ({r['year']})\n"
    return comment

def map_repo_name_to_api_key(repo_name: str) -> str:
    return {
        "datasets": "dataset papers",
        "models": "model papers",
        "papers": "papers",
    }[repo_name]

def get_recommendations_from_slug(
    slug: str, excluded_repo_types: Optional[list[str]] = None
):
    # convert the list to a tuple (or None) so the argument is hashable for the cached
    # helper below; this also guards against an empty or missing selection from the UI
    excluded_repo_types = tuple(excluded_repo_types) if excluded_repo_types else None
    return _get_recommendations_from_slug(slug, excluded_repo_types=excluded_repo_types)

# Caching this helper is an assumption (the public wrapper exists only to pass a
# hashable tuple, which suggests it); the maxsize value is illustrative.
@cached(cache=TTLCache(maxsize=100, ttl=CACHE_TIME))
def _get_recommendations_from_slug(
    slug: str, excluded_repo_types: Optional[tuple[str, ...]] = None
):
    data = get_arxiv_ids_from_slug(slug)
    if excluded_repo_types:
        excluded_repo_types = [map_repo_name_to_api_key(k) for k in excluded_repo_types]
        print(f"excluded_repo_types_remapped={excluded_repo_types}")
    ids = format_ids(data, exclude_keys=excluded_repo_types)
    if not ids:
        return (
            "Based on your collection and exclusions"
            f" ({', '.join(excluded_repo_types or [])}), there are no papers to"
            " recommend. Try removing some excluded repo types or adding more items"
            " to your collection."
        )
    ids = tuple(ids)
    recommendations = get_recommendations_from_semantic_scholar(ids)
    recommendations = recommendations.get("recommendedPapers")
    if recommendations is None:
        raise gr.Error("Something went wrong with the Semantic Scholar API")
    grouped = group_by_is_arxiv_paper(recommendations)
    return format_recommendation_into_markdown(grouped)

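# Illustrative direct call (the slug is one of the demo examples below); the function
# returns a Markdown string like the one rendered in the UI:
#   get_recommendations_from_slug(
#       "merve/video-classification-models-6509edd0a6f657faa425e8c3",
#       excluded_repo_types=["datasets"],
#   )
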
title = """Collections Reading List Generator"""
description = """<img src="https://huggingface.co/datasets/librarian-bots/images/raw/main/Mascot%20Bookie.svg"
alt="Mascot Bookie" width="200" style="float:left; margin-right:20px; margin-bottom:20px;">
\n\n
Hugging Face Collections allow you to curate models, datasets, Spaces,
and papers from the Hugging Face Hub.

This Space will generate a reading list based on the items in your collection.
This can be a great way to find papers related to the models and datasets in your collection and to dive more deeply into a topic!

The Space works by:

- finding any papers in your collection
- finding papers related to the models and datasets in your collection
- requesting recommendations from the [Semantic Scholar API](https://api.semanticscholar.org/api-docs/recommendations#tag/Paper-Recommendations/operation/post_papers) for these papers.

You can optionally exclude certain repo types from consideration when generating the reading list.
"""

slug_input = gr.Textbox(
    lines=1,
    label="Collection Slug",
    placeholder="merve/video-classification-models-6509edd0a6f657faa425e8c3",
)

example_slugs = [
    ["merve/video-classification-models-6509edd0a6f657faa425e8c3", []],
    ["osanseviero/model-merging-65097893623330a3a51ead66", []],
    ["hf4h/clinical-language-models-64f9c1cd0cedc04f3caca264", []],
]

gr.Interface(
    get_recommendations_from_slug,
    inputs=[
        slug_input,
        gr.Dropdown(
            label="Repos to exclude from contributing to recommendations",
            choices=["datasets", "models", "papers"],
            multiselect=True,
        ),
    ],
    outputs="markdown",
    description=description,
    title=title,
    allow_flagging="never",
    examples=example_slugs,
).launch()