| | import os |
| | from datetime import datetime |
| |
|
| | import pytz |
| | from huggingface_hub import HfApi |
| |
|
| | GENERATED_BELOW_MARKER = "--- Generated Part of README Below ---" |
| |
|
| |
|
def update_dataset_readme(dataset_name: str, subreddit: str, new_rows: int, hf_token=None) -> None:
    """
    Update the README file of a specified dataset repository with new information.

    Args:
        dataset_name (str): Name of the dataset repository.
        subreddit (str): Name of the subreddit being used for dataset creation.
        new_rows (int): Number of new rows added in the latest update.
        hf_token (str, optional): Hugging Face auth token. When omitted, falls
            back to the ``HF_TOKEN`` environment variable.
    """
    # BUG FIX: `hf_token` was previously an undefined name here, so every call
    # raised NameError. It is now an optional parameter with an env-var
    # fallback.  # NOTE(review): HF_TOKEN is the huggingface_hub convention —
    # confirm this matches the deployment's actual variable name.
    if hf_token is None:
        hf_token = os.environ.get("HF_TOKEN")

    api = HfApi()

    # Fetch the current README so the generated section can be (re)appended.
    readme_path = api.hf_hub_download(repo_id=dataset_name, repo_type="dataset", filename="README.md", token=hf_token)

    with open(readme_path, "r") as file:
        old_readme = file.read()

    new_readme = append_to_readme(subreddit=subreddit, new_rows=new_rows, old_readme=old_readme)

    # Push the updated README back to the dataset repo in a single commit.
    api.upload_file(
        path_or_fileobj=new_readme.encode(),
        path_in_repo="README.md",
        repo_id=dataset_name,
        repo_type="dataset",
        token=hf_token,
        commit_message=f'Pushing {new_rows} new rows'
    )
| |
|
| |
|
def append_to_readme(subreddit: str, new_rows: int, old_readme: str) -> str:
    """
    Append new information to the existing README content.

    Everything after GENERATED_BELOW_MARKER is regenerated on each call; any
    content before the marker (or the whole README on first run) is preserved.

    Args:
        subreddit (str): Name of the subreddit.
        new_rows (int): Number of new rows added.
        old_readme (str): Existing README content.

    Returns:
        str: Updated README content.
    """
    # Stdlib timezone.utc replaces the third-party pytz dependency:
    # strftime('%Z%z') renders identically ("UTC+0000") for both.
    from datetime import timezone

    # Truncate to the top of the current hour for a stable "last run" stamp.
    latest_hour = datetime.now(timezone.utc).replace(minute=0, second=0, microsecond=0)
    latest_hour_str = latest_hour.strftime('%Y-%m-%d %H:00:00 %Z%z')

    readme_text = f"""
## Dataset Overview
This dataset is based on [derek-thomas/dataset-creator-reddit-{subreddit}](https://huggingface.co/datasets/derek-thomas/dataset-creator-reddit-{subreddit})
and will add [nomic-ai/nomic-embed-text-v1](https://huggingface.co/nomic-ai/nomic-embed-text-v1) embeddings based on the
`content` field.

The goal is to be able to have an automatic and free semantic/neural tool for any subreddit.

The last run was on {latest_hour_str} and updated {new_rows} new rows.

## Creation Details
This is done by triggering [derek-thomas/processing-bestofredditorupdates](https://huggingface.co/spaces/derek-thomas/processing-bestofredditorupdates)
based on a repository update [webhook](https://huggingface.co/docs/hub/en/webhooks) to calculate the embeddings and update the [nomic atlas](https://docs.nomic.ai)
visualization. This is done by this [processing space](https://huggingface.co/spaces/derek-thomas/processing-bestofredditorupdates).

## Update Frequency
The dataset is updated based on a [webhook](https://huggingface.co/docs/hub/en/webhooks) trigger, so each time [derek-thomas/dataset-creator-reddit-{subreddit}](https://huggingface.co/datasets/derek-thomas/dataset-creator-reddit-{subreddit})
is updated, this dataset will be updated.

## Opt-out
To opt-out of this dataset please make a request in the community tab
"""

    if GENERATED_BELOW_MARKER in old_readme:
        # Keep everything up to and including the marker; regenerate the rest.
        index = old_readme.index(GENERATED_BELOW_MARKER) + len(GENERATED_BELOW_MARKER)
        new_readme = old_readme[:index] + "\n\n" + readme_text
    else:
        # First run: append the marker, then the generated section.
        new_readme = old_readme + "\n\n" + GENERATED_BELOW_MARKER + "\n\n" + readme_text + "\n"

    return new_readme
| |
|